import math
import os
import re
import cv2
import random
import pickle
import numpy as np
import tensorflow.keras.backend as K
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
import matplotlib.cm
import scipy.sparse
from scipy.sparse import coo_matrix, csr_matrix, triu, tril
import scipy.ndimage

# Mapping of chromosome name -> integer label (autosomes 1-22, then X and Y)
chromosome_labels = {'chr1': 0, 'chr2': 1, 'chr3': 2, 'chr4': 3, 'chr5': 4, 'chr6': 5,
                     'chr7': 6, 'chr8': 7, 'chr9': 8, 'chr10': 9, 'chr11': 10, 'chr12': 11,
                     'chr13': 12, 'chr14': 13, 'chr15': 14, 'chr16': 15, 'chr17': 16,
                     'chr18': 17, 'chr19': 18, 'chr20': 19, 'chr21': 20, 'chr22': 21,
                     'chrX': 22, 'chrY': 23}

data_dir = 'data/'          # scratch directory for saved samples and pickled cutoffs
sparse_data_dir = 'data/sparse/'  # cache directory for pre-compiled sparse matrices

try:
    os.mkdir(data_dir)
except FileExistsError:
    pass
try:
    os.mkdir(sparse_data_dir)
except FileExistsError:
    pass


def open_anchor_to_anchor(filename):
    '''
    Read a tab delimited anchor to anchor file as a DataFrame

    Args:
        filename (:obj:`str`) : full path to anchor to anchor file

    Returns:
        ``pandas.DataFrame``: if reading a normalized anchor to anchor file, columns are
        ``a1 a2 obs exp ratio`` and if reading a denoised or enhanced anchor to anchor
        file, columns are ``a1 a2 ratio``
    '''
    df = pd.read_csv(filename, sep='\t')
    n_cols = len(df.columns)
    if n_cols == 4:  # if before denoise top loops
        df = pd.read_csv(filename, sep='\t', names=['anchor1', 'anchor2', 'obs', 'exp'])
        df['ratio'] = (df['obs'] + 5) / (df['exp'] + 5)
    elif n_cols == 5:  # includes p-value
        df = pd.read_csv(filename, sep='\t', names=['anchor1', 'anchor2', 'obs', 'exp', 'p_val'])
        df['ratio'] = (df['obs'] + 5) / (df['exp'] + 5)
    else:  # after denoise has no obs or exp
        df = pd.read_csv(filename, sep='\t', names=['anchor1', 'anchor2', 'ratio'])
    df = df[['anchor1', 'anchor2', 'ratio']]
    return df


def open_full_genome(data_dir):
    '''
    Concatenate every per-chromosome anchor to anchor file in a directory into one DataFrame.

    Args:
        data_dir (:obj:`str`) : directory containing ``anchor_2_anchor`` or
            ``denoised.anchor.to.anchor`` files

    Returns:
        ``pandas.DataFrame``: all anchor to anchor rows across the genome
    '''
    genome = pd.DataFrame()
    print('Opening genome-wide anchor to anchor...')
    for chr_file in os.listdir(data_dir):
        if 'anchor_2_anchor' in chr_file or 'denoised.anchor.to.anchor' in chr_file:
            print(chr_file)
            genome = pd.concat([genome, open_anchor_to_anchor(data_dir + '/' + chr_file)])
    return genome


def get_chromosome_from_filename(filename):
    """
    Extract the chromosome string from any of the file name formats we use

    Args:
        filename (:obj:`str`) : name of anchor to anchor file

    Returns:
        Chromosome string of form chr<>
    """
    chr_index = filename.find('chr')  # index of chromosome name
    if chr_index == 0:  # if chromosome name is file prefix
        return filename[:filename.find('.')]
    file_ending_index = filename.rfind('.')  # index of file ending
    if chr_index > file_ending_index:  # if chromosome name is file ending
        return filename[chr_index:]
    else:
        return filename[chr_index: file_ending_index]


def locus_to_anchor(chr_name, locus, anchor_dir):
    """
    Find the index of the anchor whose [start, end] interval contains a genomic locus.

    Args:
        chr_name (:obj:`str`) : chromosome name of form chr<>
        locus (:obj:`int`) : genomic coordinate to look up
        anchor_dir (:obj:`str`) : directory containing anchor reference ``.bed`` files

    Returns:
        ``int`` : row index of the first anchor containing the locus
    """
    anchor_list = pd.read_csv(anchor_dir + '%s.bed' % chr_name, sep='\t',
                              names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
    loci_indices = (anchor_list['start'] <= locus) & (locus <= anchor_list['end']) & (
        anchor_list['chr'] == chr_name)
    print(np.where(loci_indices)[0][0])
    return int(np.where(loci_indices)[0][0])


def save_samples(input_dir, target_dir, matrix_size, multi_input=False, dir_3=None,
                 combined_dir=None, anchor_dir=None, name='sample', chr_name='chr6',
                 locus_start=25922605, locus_end=26709867, force_size=128, force_symmetry=True):
    """
    Saves sample matrices for use in training visualizations

    Args:
        input_dir (:obj:`str`) : directory containing input anchor to anchor files
        target_dir (:obj:`str`) : directory containing target anchor to anchor files
        matrix_size (:obj:`int`) : size of each sample matrix
        multi_input (:obj:`bool`) : set to True to save samples from each of the multiple
            input sets in ``input_dir``
        dir_3 (:obj:`str`) : optional directory containing third set of input anchor to anchor files
        combined_dir (:obj:`str`) : optional directory containing combined target anchor to anchor files
        anchor_dir (:obj:`str`) : directory containing anchor reference ``.bed`` files
        name (:obj:`str`) : each saved sample file will begin with this string
        chr_name (:obj:`str`) : chromosome to save samples from
        locus_start (:obj:`int`) : genomic coordinate of the start of the sampled window
        locus_end (:obj:`int`) : genomic coordinate of the end of the sampled window
        force_size (:obj:`int`) : size of the square tile extracted around the window midpoint
        force_symmetry (:obj:`bool`) : passed through to matrix loading to symmetrize matrices
    """
    global data_dir
    global sparse_data_dir
    try:
        os.mkdir(sparse_data_dir)
    except FileExistsError:
        pass
    if multi_input:
        input_folder_1 = os.listdir(input_dir)[0] + '/'
        input_folder_2 = os.listdir(input_dir)[1] + '/'
        try:
            input_folder_3 = os.listdir(input_dir)[2] + '/'
        except IndexError:
            pass
    # choose the directory whose file listing is indexed by chromosome; using
    # input_folder_1 unconditionally would raise NameError when multi_input=False
    file_list_dir = input_dir + input_folder_1 if multi_input else input_dir
    chr_index = min(int(chr_name.replace('chr', '')), len(os.listdir(file_list_dir)) - 1)
    print('Saving samples from', chr_name, '...')
    if (name == 'enhance' or name == 'val_enhance') and multi_input:
        matrix_1 = load_chr_ratio_matrix_from_sparse(
            input_dir + input_folder_1, os.listdir(input_dir + input_folder_1)[chr_index],
            anchor_dir, force_symmetry=force_symmetry)
        matrix_2 = load_chr_ratio_matrix_from_sparse(
            target_dir, os.listdir(target_dir)[chr_index], anchor_dir,
            force_symmetry=force_symmetry)
        matrix_3 = None
        combined_matrix = None
    else:
        if multi_input:
            matrix_1 = load_chr_ratio_matrix_from_sparse(
                input_dir + input_folder_1, os.listdir(input_dir + input_folder_1)[chr_index],
                anchor_dir, force_symmetry=force_symmetry)
            matrix_2 = load_chr_ratio_matrix_from_sparse(
                input_dir + input_folder_2, os.listdir(input_dir + input_folder_2)[chr_index],
                anchor_dir, force_symmetry=force_symmetry)
            matrix_3 = load_chr_ratio_matrix_from_sparse(
                input_dir + input_folder_3, os.listdir(input_dir + input_folder_3)[chr_index],
                anchor_dir, force_symmetry=force_symmetry)
            combined_matrix = load_chr_ratio_matrix_from_sparse(
                target_dir, os.listdir(target_dir)[chr_index], anchor_dir,
                force_symmetry=force_symmetry)
        else:
            matrix_1 = load_chr_ratio_matrix_from_sparse(
                input_dir, os.listdir(input_dir)[chr_index], anchor_dir,
                force_symmetry=force_symmetry)
            matrix_2 = load_chr_ratio_matrix_from_sparse(
                target_dir, os.listdir(target_dir)[chr_index], anchor_dir,
                force_symmetry=force_symmetry)
            if dir_3 is not None:
                matrix_3 = load_chr_ratio_matrix_from_sparse(
                    dir_3, os.listdir(dir_3)[chr_index], anchor_dir,
                    force_symmetry=force_symmetry)
            else:
                matrix_3 = None
            if combined_dir is not None:
                combined_matrix = load_chr_ratio_matrix_from_sparse(
                    combined_dir, os.listdir(combined_dir)[chr_index], anchor_dir,
                    force_symmetry=force_symmetry)
            else:
                combined_matrix = None
    # center a force_size x force_size window on the midpoint between the two loci
    i = locus_to_anchor(chr_name, locus_start, anchor_dir)
    j = locus_to_anchor(chr_name, locus_end, anchor_dir)
    mid = int((i + j) / 2)
    i = max(0, mid - int(force_size / 2))
    j = i + force_size
    rows = slice(i, j)
    cols = slice(i, j)
    tile_1 = matrix_1[rows, cols].A
    tile_2 = matrix_2[rows, cols].A
    tile_1 = np.expand_dims(tile_1, -1)  # add channel dimension
    tile_1 = np.expand_dims(tile_1, 0)   # model expects a list of inputs
    tile_2 = np.expand_dims(tile_2, -1)
    tile_2 = np.expand_dims(tile_2, 0)
    if matrix_3 is not None:
        tile_3 = matrix_3[i:i + matrix_size, j:j + matrix_size].A
        tile_3 = np.expand_dims(tile_3, -1)
        tile_3 = np.expand_dims(tile_3, 0)
        np.save('%s%s_3' % (data_dir, name), tile_3)
    if combined_matrix is not None:
        combined_tile = combined_matrix[i:i + matrix_size, j:j + matrix_size].A
        combined_tile = np.expand_dims(combined_tile, -1)
        combined_tile = np.expand_dims(combined_tile, 0)
        np.save('%s%s_combined' % (data_dir, name), combined_tile)
    np.save('%s%s_1' % (data_dir, name), tile_1)
    np.save('%s%s_2' % (data_dir, name), tile_2)


def load_chr_ratio_matrix_from_sparse(dir_name, file_name, anchor_dir, sparse_dir=None,
                                      anchor_list=None, chr_name=None, dummy=5,
                                      ignore_sparse=False, force_symmetry=True, use_raw=False):
    """
    Loads data as a sparse matrix by either reading a precompiled sparse matrix or an anchor
    to anchor file which is converted to sparse CSR format. Ratio values are computed using
    the observed (obs) and expected (exp) values:

    .. math::
        ratio = \\frac{obs + dummy}{exp + dummy}

    Args:
        dir_name (:obj:`str`) : directory containing the anchor to anchor or precompiled
            (.npz) sparse matrix file
        file_name (:obj:`str`) : name of anchor to anchor or precompiled (.npz) sparse matrix file
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        sparse_dir (:obj:`str`) : optional override for the pre-compiled sparse cache directory
        anchor_list (:obj:`pandas.DataFrame`) : optional pre-loaded anchor reference list
        chr_name (:obj:`str`) : chromosome name; inferred from ``file_name`` when None
        dummy (:obj:`int`) : dummy value to used when computing ratio values
        ignore_sparse (:obj:`bool`) : set to True to ignore precompiled sparse matrices even
            if they exist
        force_symmetry (:obj:`bool`) : set to True to symmetrize the matrix
        use_raw (:obj:`bool`) : set to True to store raw observed counts instead of ratios
            (only possible when the file provides an ``obs`` column)

    Returns:
        ``scipy.sparse.csr_matrix``: sparse matrix of ratio values
    """
    global data_dir
    global sparse_data_dir
    if chr_name is None:
        chr_name = get_chromosome_from_filename(file_name)
    # directory where the pre-compiled sparse matrices are saved
    sparse_rep_dir = dir_name[dir_name[: -1].rfind('/') + 1:]
    if sparse_dir is not None:
        # NOTE: intentionally rebinds the module-level cache directory for later calls
        sparse_data_dir = sparse_dir
    os.makedirs(os.path.join(sparse_data_dir, sparse_rep_dir), exist_ok=True)
    if file_name.endswith('.npz'):  # loading pre-combined and pre-compiled sparse data
        sparse_matrix = scipy.sparse.load_npz(dir_name + file_name)
    else:  # load from file name
        if file_name + '.npz' in os.listdir(os.path.join(sparse_data_dir, sparse_rep_dir)) \
                and not ignore_sparse:  # check if pre-compiled data already exists
            sparse_matrix = scipy.sparse.load_npz(
                os.path.join(sparse_data_dir, sparse_rep_dir, file_name + '.npz'))
        else:  # otherwise generate sparse matrix from anchor2anchor file and save pre-compiled data
            if anchor_list is None:
                if anchor_dir is None:
                    # previously `assert '<message>'`, which is a no-op (non-empty strings
                    # are truthy and asserts are stripped under -O); raise explicitly
                    raise ValueError(
                        'You must supply either an anchor reference list or the directory containing one')
                anchor_list = pd.read_csv(os.path.join(anchor_dir, '%s.bed' % chr_name), sep='\t',
                                          names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
            matrix_size = len(anchor_list)  # matrix size is needed to construct sparse CSR matrix
            anchor_dict = anchor_list_to_dict(anchor_list['anchor'].values)  # convert to anchor --> index dictionary
            try:  # first try reading anchor to anchor file with obs/exp columns
                chr_anchor_file = pd.read_csv(
                    os.path.join(dir_name, file_name),
                    delimiter='\t',
                    names=['anchor1', 'anchor2', 'obs', 'exp'],
                    usecols=['anchor1', 'anchor2', 'obs', 'exp'])  # read chromosome anchor to anchor file
                rows = np.vectorize(anchor_to_locus(anchor_dict))(
                    chr_anchor_file['anchor1'].values)  # convert anchor names to row indices
                cols = np.vectorize(anchor_to_locus(anchor_dict))(
                    chr_anchor_file['anchor2'].values)  # convert anchor names to column indices
                if use_raw:
                    # raw observed counts are only available in this branch; the old code
                    # looked for 'obs' in the ratio-only branch below and raised KeyError
                    sparse_matrix = scipy.sparse.csr_matrix(
                        (chr_anchor_file['obs'], (rows, cols)),
                        shape=(matrix_size, matrix_size))  # construct sparse CSR matrix
                else:
                    ratio = (chr_anchor_file['obs'] + dummy) / (chr_anchor_file['exp'] + dummy)  # compute matrix ratio value
                    sparse_matrix = scipy.sparse.csr_matrix(
                        (ratio, (rows, cols)),
                        shape=(matrix_size, matrix_size))  # construct sparse CSR matrix
            except (ValueError, KeyError):  # otherwise read anchor to anchor file with a ratio column
                chr_anchor_file = pd.read_csv(
                    os.path.join(dir_name, file_name),
                    delimiter='\t',
                    names=['anchor1', 'anchor2', 'ratio'],
                    usecols=['anchor1', 'anchor2', 'ratio'])
                rows = np.vectorize(anchor_to_locus(anchor_dict))(
                    chr_anchor_file['anchor1'].values)  # convert anchor names to row indices
                cols = np.vectorize(anchor_to_locus(anchor_dict))(
                    chr_anchor_file['anchor2'].values)  # convert anchor names to column indices
                sparse_matrix = scipy.sparse.csr_matrix(
                    (chr_anchor_file['ratio'], (rows, cols)),
                    shape=(matrix_size, matrix_size))  # construct sparse CSR matrix
            if force_symmetry:
                upper_sum = triu(sparse_matrix, k=1).sum()
                lower_sum = tril(sparse_matrix, k=-1).sum()
                if upper_sum == 0 or lower_sum == 0:  # only one triangle is populated
                    sparse_matrix = sparse_matrix + sparse_matrix.transpose()
                # NOTE(review): triu(k=0) + transpose doubles the diagonal — preserved
                # from the original implementation; confirm this is intended
                sparse_triu = scipy.sparse.triu(sparse_matrix)
                sparse_matrix = sparse_triu + sparse_triu.transpose()
            if not ignore_sparse:
                scipy.sparse.save_npz(
                    os.path.join(sparse_data_dir, sparse_rep_dir, file_name),
                    sparse_matrix)  # save precompiled data
    return sparse_matrix


def split_matrix(input_filename, input_matrix, target_matrix, input_batch, target_batch,
                 matrix_size, step_size, batch_size, n_matrices, start_index, normalize,
                 shuffle, random_steps, diagonal_only, upper_triangular_only):
    """
    Generator function to split input and target sparse matrices into patches which are used
    for training and prediction.

    Args:
        input_filename (:obj:`str`): name of file which is being used to generate ratio matrix patches
        input_matrix (:obj:`scipy.sparse.csr_matrix`) : sparse CSR input matrix
        target_matrix (:obj:`scipy.sparse.csr_matrix`) : sparse CSR target matrix
        input_batch (:obj:`numpy.array`) : current array of samples in the input batch being generated
        target_batch (:obj:`numpy.array`) : current array of samples in the target batch being generated
        matrix_size (:obj:`int`) : size of each patch; -1 yields the full chromosome as one sample
        step_size (:obj:`int`) : size of steps used when generating batches. Values less than
            ``matrix size`` will include overlapping regions
        batch_size (:obj:`int`) : number of patches to use in each batch
        n_matrices (:obj:`int`) : current number of matrix patches in the batch being generated
        start_index (:obj:`int`) : starting anchor index of the matrix splitting, ensures batches
            are not identical across epochs
        normalize (:obj:`bool`) : set to True to normalize all ratio values between ``[0, 1]``
        shuffle (:obj:`bool`) : set to True to randomly split the matrix instead of sliding
            across sequentially
        random_steps (:obj:`bool`) : set to True add a random offset to each step between patch indices
        diagonal_only (:obj:`bool`) : set to True to only generate patches along the diagonal
            of the matrix
        upper_triangular_only (:obj:`bool`) : set to True to only generate patches from the
            upper triangle of the matrix

    Returns:
        (``numpy.array``, ``numpy.array``, ``str``): input batch, target batch, and batch label
    """
    if matrix_size == -1:  # special case: emit the entire chromosome as one sample
        input_matrix = np.expand_dims(np.expand_dims(input_matrix.A, 0), -1)
        target_matrix = np.expand_dims(np.expand_dims(target_matrix.A, 0), -1)
        yield input_matrix, target_matrix, input_filename + '_full_chr'
    else:
        if random_steps:  # random offset from step size intervals
            start_index = np.random.randint(0, step_size)
        row_indices = np.arange(start_index, input_matrix.shape[0], step_size)
        col_indices = np.arange(start_index, input_matrix.shape[1], step_size)
        if shuffle:  # shuffle slicing indices
            np.random.shuffle(row_indices)
            np.random.shuffle(col_indices)
        for i in row_indices:
            for j in col_indices:
                if abs(i - j) > 384:  # max distance from diagonal with actual values
                    continue
                if diagonal_only and i != j:
                    continue
                if upper_triangular_only and i < j:
                    continue
                input_tile = input_matrix[i:i + matrix_size, j:j + matrix_size].A
                target_tile = target_matrix[i:i + matrix_size, j:j + matrix_size].A
                input_batch.append(input_tile)
                target_batch.append(target_tile)
                n_matrices += 1
                if n_matrices == batch_size:
                    try:
                        stacked_input = np.reshape(np.array(input_batch),
                                                   (n_matrices, matrix_size, matrix_size, 1))
                        stacked_target = np.reshape(np.array(target_batch),
                                                    (n_matrices, matrix_size, matrix_size, 1))
                        if normalize:
                            stacked_input = normalize_matrix(stacked_input)
                            stacked_target = normalize_matrix(stacked_target)
                        yield stacked_input, stacked_target, input_filename + '_' + str(i)
                    except ValueError:
                        # reached end of valid values: edge tiles are ragged and cannot
                        # be stacked, so the partial batch is dropped
                        pass
                    input_batch = []
                    target_batch = []
                    n_matrices = 0


def generate_batches_from_chr(input_dir, target_dir, matrix_size, batch_size, anchor_dir=None,
                              step_size=64, multi_input=False, shuffle=False, random_steps=False,
                              normalize=False, diagonal_only=False, upper_triangular_only=False,
                              force_symmetry=True, ignore_XY=True, ignore_even_chr=False,
                              ignore_odd_chr=False):
    """
    Generator function which generates batches of input target pairs to train the model:

    .. code-block:: python
        :linenos:

        for epoch_i in range(epochs):
            for input_batch, target_batch, batch_label in generate_batches_from_chr(input_dir,
                    target_dir, matrix_size=128, batch_size=64, step_size=64, shuffle=True,
                    random_steps=True, anchor_dir=anchor_dir):
                step_start_time = time.time()
                loss = model.train_on_batch(noisy_batch, target_batch)
                print("%d-%d %ds [Loss: %.3f][PSNR: %.3f, Jaccard: %.3f]" % (epoch_i,
                    step_i, time.time() - step_start_time, loss[0], loss[1], loss[2]))
                step_i += 1

    Args:
        input_dir (:obj:`str`) : directory containing all input data to be generated
        target_dir (:obj:`str`) : directory containing all target data to be generated
        matrix_size (:obj:`int`) : size of each patch that the full ratio matrix is divided into
        batch_size (:obj:`int`) : number of patches to use in each batch
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        step_size (:obj:`int`) : size of steps used when generating batches. Values less than
            ``matrix size`` will include overlapping regions
        multi_input (:obj:`bool`) : set to True to save samples from each of the multiple
            input sets in ``input_dir``
        shuffle (:obj:`bool`) : set to True to randomly split the matrix instead of sliding
            across sequentially
        random_steps (:obj:`bool`) : set to True add a random offset to each step between
            patch indices
        normalize (:obj:`bool`) : set to True to normalize all ratio values between ``[0, 1]``
        diagonal_only (:obj:`bool`) : set to True to only generate patches along the diagonal
            of the matrix
        upper_triangular_only (:obj:`bool`) : set to True to only generate patches from the
            upper triangle of the matrix
        force_symmetry (:obj:`bool`) : passed through to matrix loading to symmetrize matrices
        ignore_XY (:obj:`bool`) : set to True to ignore chromosomes X and Y when generating batches
        ignore_even_chr (:obj:`bool`) : set to True to ignore all even numbered chromosomes
        ignore_odd_chr (:obj:`bool`) : set to True to ignore all odd numbered chromosomes

    Returns:
        (``numpy.array``, ``numpy.array``, ``str``): input batch, target batch, and batch label
    """
    input_batch = []
    target_batch = []
    if multi_input:
        input_folders = os.listdir(input_dir)  # get list of all folders in input dir
        # assume all inputs have same name pattern
        input_files = sorted(os.listdir(input_dir + input_folders[0]))
        target_files = sorted(os.listdir(target_dir))
    else:
        input_files = sorted(os.listdir(input_dir))
        target_files = sorted(os.listdir(target_dir))
    if shuffle:  # shuffle chromosome file order, keeping input/target pairs aligned
        c = list(zip(input_files, target_files))
        random.shuffle(c)
        input_files, target_files = zip(*c)
    if ignore_XY:
        remove_XY = lambda files: [f for f in files if 'chrX' not in f and 'chrY' not in f]
        input_files = remove_XY(input_files)
        target_files = remove_XY(target_files)
    if ignore_odd_chr:  # fun one-liner to remove all odd-numbered chromosomes
        remove_odds = lambda files: [f for f in files
                                     if f[f.index('chr') + 3:f.index('.matrix')].isdigit()
                                     and int(f[f.index('chr') + 3:f.index('.matrix')]) % 2 == 0]
        input_files = remove_odds(input_files)
        target_files = remove_odds(target_files)
    elif ignore_even_chr:
        remove_evens = lambda files: [f for f in files
                                      if f[f.index('chr') + 3:f.index('.matrix')].isdigit()
                                      and int(f[f.index('chr') + 3:f.index('.matrix')]) % 2 != 0]
        input_files = remove_evens(input_files)
        target_files = remove_evens(target_files)
    for input_file, target_file in zip(input_files, target_files):
        n_matrices = 0
        start_index = 0
        if multi_input:
            target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, target_file, anchor_dir,
                                                              force_symmetry=force_symmetry)
            for input_folder in input_folders:
                input_folder += '/'
                input_matrix = load_chr_ratio_matrix_from_sparse(input_dir + input_folder,
                                                                 input_file, anchor_dir,
                                                                 force_symmetry=force_symmetry)
                for input_batch, target_batch, figure_title in split_matrix(
                        input_filename=input_folder + input_file,
                        input_matrix=input_matrix,
                        target_matrix=target_matrix,
                        input_batch=input_batch,
                        target_batch=target_batch,
                        matrix_size=matrix_size,
                        step_size=step_size,
                        batch_size=batch_size,
                        n_matrices=n_matrices,
                        start_index=start_index,
                        normalize=normalize,
                        shuffle=shuffle,
                        random_steps=random_steps,
                        diagonal_only=diagonal_only,
                        upper_triangular_only=upper_triangular_only):
                    yield input_batch, target_batch, figure_title
                    input_batch = []
                    target_batch = []
                    n_matrices = 0
        else:
            input_matrix = load_chr_ratio_matrix_from_sparse(input_dir, input_file, anchor_dir,
                                                             force_symmetry=force_symmetry)
            target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, target_file, anchor_dir,
                                                              force_symmetry=force_symmetry)
            for input_batch, target_batch, figure_title in split_matrix(
                    input_filename=input_file,
                    input_matrix=input_matrix,
                    target_matrix=target_matrix,
                    input_batch=input_batch,
                    target_batch=target_batch,
                    matrix_size=matrix_size,
                    step_size=step_size,
                    batch_size=batch_size,
                    n_matrices=n_matrices,
                    start_index=start_index,
                    normalize=normalize,
                    shuffle=shuffle,
                    random_steps=random_steps,
                    diagonal_only=diagonal_only,
                    upper_triangular_only=upper_triangular_only):
                yield input_batch, target_batch, figure_title
                input_batch = []
                target_batch = []
                n_matrices = 0


def get_matrices_from_loci(input_dir, target_dir, matrix_size, loci, anchor_dir=None):
    """
    Generator function for getting sample matrices at specific loci

    Args:
        input_dir (:obj:`str`) : directory containing all input data to be generated
        target_dir (:obj:`str`) : directory containing all target data to be generated
        matrix_size (:obj:`int`) : size of each patch that the full ratio matrix is divided into
        loci (:obj:`dict`) : dictionary of chromosome locus pairs
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files

    Returns:
        (``numpy.array``, ``numpy.array``, ``str``, ``int``, ``int``): input matrix, target matrix,
        chromosome name, locus, and anchor index
    """
    input_files = sorted_nicely(os.listdir(input_dir))
    target_files = sorted_nicely(os.listdir(target_dir))
    for file_1, file_2 in zip(input_files, target_files):
        chr_name = get_chromosome_from_filename(file_1)
        if chr_name in loci.keys():
            anchor_list = pd.read_csv(anchor_dir + '%s.bed' % chr_name, sep='\t',
                                      names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
        else:
            continue
        input_matrix = load_chr_ratio_matrix_from_sparse(input_dir, file_1, anchor_dir)
        target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, file_2, anchor_dir)
        loci_indices = (anchor_list['start'] <= loci[chr_name]) \
            & (loci[chr_name] <= anchor_list['end']) \
            & (anchor_list['chr'] == chr_name)
        for i, locus in enumerate(loci_indices):
            if locus:
                input_tile = input_matrix[i:i + matrix_size, i:i + matrix_size].A
                target_tile = target_matrix[i:i + matrix_size, i:i + matrix_size].A
                input_tile = np.expand_dims(input_tile, axis=-1)
                target_tile = np.expand_dims(target_tile, axis=-1)
                input_tile = np.expand_dims(input_tile, axis=0)
                target_tile = np.expand_dims(target_tile, axis=0)
                yield input_tile, target_tile, chr_name, loci[chr_name], i


def get_top_loops(matrix_data_dir, reference_dir, num_top_loops=None, q=None, dummy=5):
    """
    Ranks the ratio values of all chromosomes and computes the cutoff value for taking the
    top ``num_top_loops`` or the ``q`` th quantile

    Args:
        matrix_data_dir (:obj:`str`) : directory containing the anchor to anchor files used
            to count loops
        reference_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        num_top_loops (:obj:`int`) : number of top loops to consider
        q (:obj:`float`) : quantile range of loops to consider
        dummy (:obj:`int`) : dummy value to use to calculate each ratio value

    Returns:
        ``float`` : cutoff value for top loops
    """
    global data_dir
    # cutoffs are cached on disk keyed by directory + parameter
    if 'top_loop_values.pickle' in os.listdir(data_dir):
        with open(data_dir + 'top_loop_values.pickle', 'rb') as handle:
            top_loop_values = pickle.load(handle)
    else:
        top_loop_values = {}
    if q is not None:  # select top loops based on quantile not quantity
        if matrix_data_dir + str(q) in top_loop_values.keys():
            genome_min_loop_value = top_loop_values[matrix_data_dir + str(q)]
        else:
            top_loops = np.array([])
            for file in os.listdir(matrix_data_dir):
                sparse = load_chr_ratio_matrix_from_sparse(matrix_data_dir, file,
                                                           reference_dir, dummy=dummy)
                sparse = scipy.sparse.triu(sparse)
                nonzero_indices = sparse.nonzero()
                top_loops = np.append(top_loops, sparse.tocsr()[nonzero_indices].A)
            genome_min_loop_value = np.quantile(top_loops, q=q)
            top_loop_values[matrix_data_dir + str(q)] = genome_min_loop_value
        print('%s %.4f quantile loops cutoff value: %f' % (matrix_data_dir, q,
                                                           genome_min_loop_value))
    else:  # select top loops based on rank
        if matrix_data_dir + str(num_top_loops) in top_loop_values.keys():
            genome_min_loop_value = top_loop_values[matrix_data_dir + str(num_top_loops)]
        else:
            top_loops = np.array([])
            for file in os.listdir(matrix_data_dir):
                sparse = load_chr_ratio_matrix_from_sparse(matrix_data_dir, file,
                                                           reference_dir, dummy=dummy)
                sparse = scipy.sparse.triu(sparse)
                # keep only the strongest num_top_loops values seen so far
                loop_list = np.append(top_loops, sparse.data)
                top_loops = loop_list[np.argsort(-loop_list)[:num_top_loops]]
            genome_min_loop_value = top_loops[-1]
            top_loop_values[matrix_data_dir + str(num_top_loops)] = genome_min_loop_value
        print('%s top %d loops cutoff value: %f' % (matrix_data_dir, num_top_loops,
                                                    genome_min_loop_value))
    with open(data_dir + 'top_loop_values.pickle', 'wb') as handle:
        pickle.dump(top_loop_values, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return genome_min_loop_value


def anchor_list_to_dict(anchors):
    """
    Converts the array of anchor names to a dictionary mapping each anchor to its
    chromosomal index

    Args:
        anchors (:obj:`numpy.array`) : array of anchor name values

    Returns:
        `dict` : dictionary mapping each anchor to its index from the array
    """
    anchor_dict = {}
    for i, anchor in enumerate(anchors):
        anchor_dict[anchor] = i
    return anchor_dict


def anchor_to_locus(anchor_dict):
    """
    Function to convert an anchor name to its genomic locus which can be easily vectorized

    Args:
        anchor_dict (:obj:`dict`) : dictionary mapping each anchor to its chromosomal index

    Returns:
        `function` : function which returns the locus of an anchor name
    """
    def f(anchor):
        return anchor_dict[anchor]
    return f


def sorted_nicely(l):
    """
    Sorts an iterable object according to file system defaults

    Args:
        l (:obj:`iterable`) : iterable object containing items which can be interpreted as text

    Returns:
        `iterable` : sorted iterable
    """
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    return sorted(l, key=alphanum_key)


def normalize_matrix(matrix):
    """
    Normalize ratio values between ``[0, 1]`` using the following function:

    .. math::
        f(x) = 1 - \\frac{1}{1 + x}

    .. image:: _static/normalization_function_plot.PNG
        :scale: 100 %
        :align: center

    Args:
        matrix (:obj:`numpy.array`) : matrix of ratio values

    Returns:
        ``numpy.array`` : matrix of normalized ratio values between ``[0, 1]``
    """
    return 1 - (1 / (1 + matrix))


def denormalize_matrix(matrix):
    """
    Reverse the normalization of a matrix to set all valid normalized values back to their
    original ratio values using the following function:

    .. math::
        f^{-1}(x) = \\frac{1}{1 - g(x)} - 1 &\\quad \\mbox{where} &\\quad g(x) =
        \\begin{cases} 0.98, & \\mbox{if } x > 1 \\\\ 0, & \\mbox{if } x < 0 \\\\ x &
        \\mbox{ otherwise} \\end{cases}

    We apply the function :math:`g(x)` to remove invalid values that could be in a predicted
    result and because :math:`f^{-1}(x)` blows up as we approach 1:

    .. image:: _static/denormalization_function_plot.PNG
        :scale: 100 %
        :align: center

    Args:
        matrix (:obj:`numpy.array`) : matrix of normalized ratio values

    Returns:
        ``numpy.array`` : matrix of ratio values
    """
    matrix[matrix > 1] = 0.98
    matrix[matrix < 0] = 0
    return (1 / (1 - matrix)) - 1


def _ratio_color_mapping(matrix, color_scale, quantile):
    """
    Build the discrete white-to-red colormap and boundary norm shared by the heatmap helpers.

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix the breaks are computed from
        color_scale (:obj:`int`) : max ratio value to be considered strongest by color
            mapping; 0 derives the scale from the matrix itself
        quantile (:obj:`float`) : quantile used to pick the upper break when deriving the
            scale from the matrix

    Returns:
        (``matplotlib.colors.LinearSegmentedColormap``, ``matplotlib.colors.BoundaryNorm``)
    """
    if color_scale != 0:
        breaks = np.append(np.arange(1.001, color_scale, (color_scale - 1.001) / 18),
                           np.max(matrix))
    elif np.max(matrix) < 2:
        breaks = np.arange(1.001, np.max(matrix), (np.max(matrix) - 1.001) / 19)
    else:
        step = (np.quantile(matrix, q=quantile) - 1) / 18
        up = np.quantile(matrix, q=quantile) + 0.011
        if up < 2:
            up = 2
            step = 0.999 / 18
        breaks = np.append(np.arange(1.001, up, step), np.max(matrix))
    n_bin = 20  # Discretizes the interpolation into bins
    colors = ["#FFFFFF", "#FFE4E4", "#FFD7D7", "#FFC9C9", "#FFBCBC", "#FFAEAE", "#FFA1A1",
              "#FF9494", "#FF8686", "#FF7979", "#FF6B6B", "#FF5E5E", "#FF5151", "#FF4343",
              "#FF3636", "#FF2828", "#FF1B1B", "#FF0D0D", "#FF0000"]
    cmap_name = 'my_list'
    # Create the colormap
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(cmap_name, colors, N=n_bin)
    # Fewer bins will result in "coarser" colomap interpolation
    norm = matplotlib.colors.BoundaryNorm(breaks, 20)
    return cmap, norm


def draw_heatmap(matrix, color_scale, ax=None, return_image=False):
    """
    Display ratio heatmap containing only strong signals (values > 1 or 0.98th quantile)

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix to be displayed
        color_scale (:obj:`int`) : max ratio value to be considered strongest by color mapping
        ax (:obj:`matplotlib.axes.Axes`) : axes which will contain the heatmap. If None, new
            axes are created
        return_image (:obj:`bool`) : set to True to return the image obtained from drawing
            the heatmap with the generated color map

    Returns:
        ``numpy.array`` : if ``return_image`` is set to True, return the heatmap as an array
    """
    cmap, norm = _ratio_color_mapping(matrix, color_scale, quantile=0.95)
    if ax is None:
        _, ax = plt.subplots()
    img = ax.imshow(matrix, cmap=cmap, norm=norm, interpolation='nearest')
    if return_image:
        plt.close()
        return img.get_array()


def get_heatmap(matrix, color_scale):
    """
    Render a ratio matrix to an RGBA array, masking out weak signals via the alpha channel.

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix to be rendered
        color_scale (:obj:`int`) : max ratio value to be considered strongest by color mapping

    Returns:
        ``numpy.array`` : RGBA heatmap where only values > 1.2 are opaque
    """
    cmap, norm = _ratio_color_mapping(matrix, color_scale, quantile=0.98)
    m = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap)
    heatmap = m.to_rgba(matrix)
    mask = matrix > 1.2
    heatmap[..., -1] = np.ones_like(mask) * mask  # alpha: show only strong signals
    return heatmap


def save_images_to_video(output_name, out_dir):
    """
    Saves all training visualization images to a video file

    Args:
        output_name (:obj:`str`) : filename for the saved video file
        out_dir (:obj:`str`) : directory in which the video file is written
    """
    image_folder = 'images'
    video_name = out_dir + output_name + '.avi'
    images = [img for img in sorted(os.listdir(image_folder)) if img.endswith(".png")]
    frame = cv2.imread(os.path.join(image_folder, images[0]))
    height, width, layers = frame.shape
    video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                            29.94, (width, height))
    for image in images:
        video.write(cv2.imread(os.path.join(image_folder, image)))
    # hold the final frame for ~5 seconds so the video does not end abruptly
    last_frame = cv2.imread(os.path.join(image_folder, images[-1]))
    for _ in range(150):
        video.write(last_frame)
    cv2.destroyAllWindows()
    video.release()


def get_model_memory_usage(batch_size, model):
    """
    Estimates the amount of memory required to train the model using the current batch size.

    Args:
        batch_size (:obj:`int`) : number of training samples in each batch
        model (:obj:`keras.models.Model`) : uncompiled Keras model to be trained

    Returns:
        ``float`` : estimated memory usage in GB
    """
    shapes_mem_count = 0
    for l in model.layers:
        single_layer_mem = 1
        for s in l.output_shape:
            if s is None:  # skip the unknown batch dimension
                continue
            single_layer_mem *= s
        shapes_mem_count += single_layer_mem
    trainable_count = np.sum([K.count_params(p) for p in set(model.trainable_weights)])
    non_trainable_count = np.sum([K.count_params(p) for p in set(model.non_trainable_weights)])
    number_size = 4.0  # bytes per parameter (float32 default)
    if K.floatx() == 'float16':
        number_size = 2.0
    if K.floatx() == 'float64':
        number_size = 8.0
    total_memory = number_size * (batch_size * shapes_mem_count
                                  + trainable_count + non_trainable_count)
    gbytes = np.round(total_memory / (1024.0 ** 3), 3)
    return gbytes