Source code for rambutan.rambutan

# rambutan.py
# Contact: Jacob Schreiber
#          jmschr@cs.washington.edu

import os, numpy, pandas

try:
	from sklearn.metrics import roc_auc_score
except:
	roc_auc_score = 'acc'

from joblib import Parallel, delayed

from .io import TrainingGenerator, ValidationGenerator
from .utils import bedgraph_to_dense, fasta_to_dense
from .utils import encode_dnase, extract_regions

[docs]def extract_sequence(filename, verbose=False):
	"""Extract a nucleotide sequence from a file and encode it.

	This function will read in a FastA formatted DNA file and convert it to be
	a one-hot encoded numpy array for internal use. If a one-hot encoded file
	is passed in, it is simply returned. This function is a convenient wrapper
	for joblib to parallelize the unzipping portion.

	Parameters
	----------
	filename : str or numpy.ndarray
		The name of the fasta file to open or the one-hot encoded sequence.

	verbose: bool, optional
		Whether to report the status while extracting sequence. This does not
		look good when done in parallel, so it is suggested it is set to false
		in that case.

	Returns
	-------
	sequence : numpy.ndarray, shape=(n, 4)
		The one-hot encoded DNA sequence.
	"""

	if isinstance(filename, str):
		if verbose:
			print("Converting {}".format(filename))

		return fasta_to_dense(filename, verbose)
	return filename


[docs]def extract_dnase(filename, verbose=False):
	"""Extract a DNaseI file and encode it.

	This function will read in a bedgraph format file and convert it to the
	one-hot encoded numpy array used internally. If a one-hot encoded file is
	passed in, it is simple returned. This function is a convenient wrapper for
	joblib to parallelize the unzipping portion.

	Parameters
	----------
	filename : str or numpy.ndarray
		The name of the bedgraph file to open or the one-hot encoded sequence.

	verbose: bool, optional
		Whether to report the status while extracting sequence. This does not
		look good when done in parallel, so it is suggested it is set to false
		in that case.

	Returns
	-------
	sequence : numpy.ndarray, shape=(n, 8)
		The one-hot encoded DNaseI sequence.
	"""

	if isinstance(filename, str):
		if verbose:
			print("Converting {}".format(filename))

		dnase_dense = bedgraph_to_dense(filename, verbose)

		if verbose:
			print("Encoding {}".format(filename))

		dnase_ohe = encode_dnase(dnase_dense, verbose)
		return dnase_ohe
	return filename


[docs]class Rambutan(object):
	"""Rambutan: a predictor of mid-range DNA-DNA contacts.

	This serves as a wrapper for all functionality involving the use of Rambutan.
	There are two main functions to use, fit and predict. Fit involves taking in
	nucleotide sequence, DNaseI sensitivity, and a contact map, and training the
	model. Predict involves taking in nucleotide sequence and DNaseI sensitivity
	and predicting significant contacts.

	Note: Due to a limitation of mxnets part, you cannot fit and predict in the
	same program. You must fit the model and save the parameters during training,
	and then load the pre-fit model and make predictions.

	Parameters
	----------
	name : str, optional
		The name of the model, necessary for saving or loading parameters.
		Default is 'rambutan'.

	iteration : int or None, optional
		The iteration of training to load model parameters from, if using Rambutan
		in predict mode. Default is None.

	model : mxnet.symbol or None
		An alternate neural network can be passed in if one wishes to train that
		using the same framework instead of the original Rambutan model.

	learning_rate : float, optional
		The learning rate for the optimizer. Default is 0.01.

	num_epoch : int, optional
		The number of epochs to train the model for. Default is 25.

	epoch_size : int, optional
		The number of batches which comprise an 'epoch'. Default is 500.

	wd : float, optional
		The weight decay. This is equivalent to L2 regularization. Default is
		0.0.

	optimizer : str, optional
		The optimizer to use for training. Default is 'adam'.

	batch_size : int, optional
		The number of samples to use in each batch. Default is 1024.

	min_dist : int, optional
		The minimum distance to consider contacts for. Default is 50kb.

	max_dist : int, optional
		The maximum distance to consider contacts for. Default is 1mb.

	use_seq : bool, optional
		Whether to use nucleotide sequence as an input to the model in the 
		training step. Default is True.

	use_dnase : bool, optional
		Whether to use DNaseI sensitivity as an input to the model in the
		training step. Default is True.

	use_dist : bool, optional
		Whether to use genomic distance as an input to the model in the
		training step. Default is True.

	verbose : bool, optional
		Whether to output information during training and prediction. Default
		is True.

	Example 
	-------
	>>> from rambutan import Rambutan
	>>> import numpy
	>>> y_pred = Rambutan(iteration=25).predict('chr21.fa', 'chr21.GM12878.dnase.bedgraph', ctxs=[0, 1, 2, 3])
	>>> numpy.save("chr21.predictions.npy", y_pred)
	"""

	def __init__(self, name='rambutan', iteration=None, model=None, 
		learning_rate=0.01,  num_epoch=25, epoch_size=500, wd=0.0, 
		optimizer='adam', batch_size=1024, min_dist=50000, max_dist=1000000, 
		use_seq=True, use_dnase=True, use_dist=True, verbose=True):

		self.name = name
		self.iteration = iteration
		self.model = model
		self.learning_rate = learning_rate
		self.num_epoch = num_epoch
		self.epoch_size = epoch_size
		self.wd = wd
		self.optimizer = optimizer
		self.batch_size = batch_size
		self.min_dist = min_dist
		self.max_dist = max_dist
		self.use_seq = use_seq
		self.use_dnase = use_dnase
		self.use_dist = use_dist
		self.verbose = verbose

[docs]	def predict(self, sequence, dnase, regions=None, ctxs=[0], sparse=False):
		"""Make predictions and return the matrix of probabilities.

		Rambutan will make a prediction for each pair of genomic loci defined in
		`regions' which fall between `min_dist' and `max_dist'. Inputs can either
		be appropriately encoded sequence and dnase files, or fasta files and
		bedgraph files for the nucleotide sequence and DNaseI sensitivity
		respectively. Note: fasta files and bedgraph files must be made up of
		a single chromosome, not one entry per chromosome.

		Parameters
		----------
		sequence : numpy.ndarray, shape (n, 4) or str
			The nucleotide sequence. Either a one hot encoded matrix of 
			nucleotides with n being the size of the chromosome, or a file 
			name for a fasta file.

		dnase : numpy.ndarray, shape (n, 8) or str
			The DNaseI fold change sensitivity. Either an encoded matrix in 
			the manner described in the manuscript or the file name of a 
			bedgraph file.

		regions : numpy.ndarray or None, optional
			The regions of interest to look at. All other regions will be
			ignored. If set to none, the regions of interest are defined
			to be 1kb bins for which all nucleotides are mappable, i.e.
			where there are no n or N symbols in the fasta file. Default
			is None.

		ctxs : list, optional
			The contexts of the gpus to use for prediction. Currently
			prediction is only supported on gpus and not cpus due to
			the time it would take for prediction. For example, if you
			wanted to use three gpus of index 0 1 and 3 (because 2
			is busy doing something else) you would just pass in
			ctxs=[0, 1, 3] and the prediction task will be naturally
			parallelized across your 3 gpus with a linear speedup.

		sparse : bool, optional
			Whether to return three arrays, the rows, columns, and values,
			or the full dense matrix. Sparse is useful for large matrices.

		Returns
		-------
		y : numpy.ndarray, shape=(m, m)
			A matrix of predictions of shape (m, m) where m is the number of
			1kb loci in the chromosome. The predictions will reside in the 
			upper triangle of the matrix since predictions are symmetric.
		"""

		if isinstance(sequence, str) and isinstance(dnase, str):
			if self.verbose:
				print("Converting FASTA")

			sequence = fasta_to_dense(sequence, self.verbose)

			if self.verbose:
				print("Converting DNase")

			dnase = bedgraph_to_dense(dnase, self.verbose)

			if self.verbose:
				print("Encoding DNase")

			dnase = encode_dnase(dnase, self.verbose)

		if regions is None:
			regions = extract_regions(sequence)

		os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
		from .models import predict_task

		Parallel(n_jobs=len(ctxs))( delayed(predict_task)(self.name, 
			self.iteration, ctx, len(ctxs), sequence, dnase, regions, 
			self.use_seq, self.use_dnase, self.use_dist, self.min_dist, 
			self.max_dist, self.batch_size, self.verbose) for ctx in ctxs)

		if sparse == False:
			n = int(regions.max()) / 1000 + 1
			y = numpy.zeros((n, n))

			for ctx in ctxs:
				with open('.rambutan.predictions.{}.txt'.format(ctx), 'r') as infile:
					for line in infile:
						mid1, mid2, p = line.split()
						mid1 = (int(float(mid1)) - 500) / 1000
						mid2 = (int(float(mid2)) - 500) / 1000
						p = float(p)
						y[mid1, mid2] = p

				os.system('rm .rambutan.predictions.{}.txt'.format(ctx))

			return y

		else:
			rows, cols, values = [], [], []
			for ctx in ctxs:
				with open('.rambutan.predictions.{}.txt'.format(ctx), 'r') as infile:
					for line in infile:
						mid1, mid2, p = line.split()
						mid1, mid2, p = int(mid1), int(mid2), float(p)

						rows.append(mid1)
						cols.append(mid2)
						values.append(p)

				os.system('rm .rambutan.predictions.{}.txt'.format(ctx))

			rows = numpy.array(rows)
			cols = numpy.array(cols)
			values = numpy.array(values)
			return rows, cols, values

[docs]	def fit(self, sequence, dnase, contacts, regions=None, validation_contacts=None,
		training_chromosome=None, validation_chromosome=None, ctxs=[0], 
		eval_metric=roc_auc_score, symbol=None, n_jobs=1):
		"""Fit the model to sequence, DNaseI, and Hi-C data.

		You can fit the Rambutan model to new data. One must pass in sequence
		data, DNaseI data, and Hi-C contact maps. The sequence data can come
		either in the form of FastA files or one-hot encoded numpy arrays. The
		DNaseI data can likewise come as either bedgraph files or numpy arrays.
		The Hi-C data must come in the traditional 7 column format. Validation
		data can optionally be passed in to report a validation set error during
		the training process. NOTE: Regardless of if they are used or not, all
		chromosomes should be passed in to the `sequence` and `dnase` parameters.
		The contacts specified in `contacts` will dictate which are used. This is
		to make the internals easier.

		Parameters for training such as the number of epochs and batches are
		set in the initial constructor, following with the sklearn format for
		estimators.

		Parameters
		----------
		sequence : numpy.ndarray, shape (n, 4) or str
			The nucleotide sequence. Either a one hot encoded matrix of 
			nucleotides with n being the size of the chromosome, or a file 
			name for a fasta file.

		dnase : numpy.ndarray, shape (n, 8) or str
			The DNaseI fold change sensitivity. Either an encoded matrix in 
			the manner described in the manuscript or the file name of a 
			bedgraph file.

		regions : numpy.ndarray or None, optional
			The regions of interest to look at. All other regions will be
			ignored. If set to none, the regions of interest are defined
			to be 1kb bins for which all nucleotides are mappable, i.e.
			where there are no n or N symbols in the fasta file. Default
			is None.

		ctxs: list, optional
			The contexts of the gpus to use for prediction. Currently
			prediction is only supported on gpus and not cpus due to
			the time it would take for prediction. For example, if you
			wanted to use three gpus of index 0 1 and 3 (because 2
			is busy doing something else) you would just pass in
			ctxs=[0, 1, 3] and the prediction task will be naturally
			parallelized across your 3 gpus with a linear speedup.

		Returns
		-------
		y : numpy.ndarray, shape=(m, m)
			A matrix of predictions of shape (m, m) where m is the number of
			1kb loci in the chromosome. The predictions will reside in the 
			upper triangle of the matrix since predictions are symmetric.
		"""

		if not isinstance(sequence, list):
			raise ValueError("sequence must be a list of FastA file names or pre-encoded numpy arrays.")
		if not isinstance(dnase, list):
			raise ValueError("DNase must be a list of bedgraph file names or pre-encoded numpy arrays.")

		if isinstance(contacts, str):
			contacts = pandas.read_csv(contacts, sep='\t')

		with Parallel(n_jobs=n_jobs) as parallel:
			sequences = parallel( delayed(extract_sequence)(filename, self.verbose) for filename in sequence )
			dnases = parallel( delayed(extract_dnase)(filename, self.verbose) for filename in dnase )

			if regions is None:
				if self.verbose:
					print("Extracting regions")

				regions = parallel( delayed(extract_regions)(sequence) for sequence in sequences )

		sequences = numpy.array(sequences)
		dnases = numpy.array(dnases)
		regions = numpy.array(regions)

		if isinstance(validation_contacts, str):
			validation_contacts = pandas.read_csv(validation_contacts, sep='\t')
			validation_chromosome = int(validation_contacts.ix[0][0][3:])

		import mxnet as mx
		from .models import RambutanSymbol

		if symbol is None:
			symbol = RambutanSymbol

		model = symbol(ctx=map(mx.gpu, ctxs),
			                   epoch_size=self.epoch_size,
			                   num_epoch=self.num_epoch,
			                   learning_rate=self.learning_rate,
			                   wd=self.wd,
			                   optimizer=self.optimizer
			    )	

		training_contacts = numpy.empty((contacts.shape[0], 3), dtype='float64')
		training_contacts[:,0] = [int(chrom[3:])-1 for chrom in contacts['chr1']]
		training_contacts[:,1] = contacts['fragmentMid1'].values
		training_contacts[:,2] = contacts['fragmentMid2'].values

		if self.verbose:
			print("Training on {} contacts".format(training_contacts.shape[0]))
 
		X_train = TrainingGenerator(sequences, dnases, training_contacts, regions,
			self.batch_size, min_dist=self.min_dist, max_dist=self.max_dist,
			use_seq=self.use_seq, use_dnase=self.use_dnase, 
			use_dist=self.use_dist)

		if validation_contacts is not None:
			validation_contacts = validation_contacts[['fragmentMid1', 'fragmentMid2']].values

			if self.verbose:
				print("Validating on {} contacts from chromosome {}".format(validation_contacts.shape[0], validation_chromosome))

			X_validation = ValidationGenerator(sequences[validation_chromosome-1],
				dnases[validation_chromosome-1], validation_contacts, regions[validation_chromosome-1],
				batch_size=self.batch_size, min_dist=self.min_dist, 
				max_dist=self.max_dist, use_seq=self.use_seq, 
				use_dnase=self.use_dnase, use_dist=self.use_dist)

		model.fit( X=X_train,
				   eval_data=X_validation if validation_contacts is not None else None,
				   eval_metric=eval_metric,
				   batch_end_callback=mx.callback.Speedometer(self.batch_size),
				   kvstore='device',
				   epoch_end_callback=mx.callback.do_checkpoint(self.name)
		)

		self.iteration = self.num_epoch
		return model