Spaces:
Sleeping
Sleeping
File size: 41,667 Bytes
b3dae8f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 | import math
import os
import re
import cv2
import random
import pickle
import numpy as np
import tensorflow.keras.backend as K
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
import matplotlib.cm
import scipy.sparse
from scipy.sparse import coo_matrix, csr_matrix, triu, tril
import scipy.ndimage
# Zero-based integer label for each chromosome name (autosomes first, then X and Y).
chromosome_labels = {
    'chr1': 0, 'chr2': 1, 'chr3': 2, 'chr4': 3, 'chr5': 4, 'chr6': 5,
    'chr7': 6, 'chr8': 7, 'chr9': 8, 'chr10': 9, 'chr11': 10, 'chr12': 11,
    'chr13': 12, 'chr14': 13, 'chr15': 14, 'chr16': 15, 'chr17': 16, 'chr18': 17,
    'chr19': 18, 'chr20': 19, 'chr21': 20, 'chr22': 21, 'chrX': 22, 'chrY': 23,
}

# Output locations, relative to the current working directory.
data_dir = 'data/'
sparse_data_dir = 'data/sparse/'

# Make sure both directories exist as soon as the module is imported;
# directories that are already present are left untouched.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(sparse_data_dir, exist_ok=True)
def open_anchor_to_anchor(filename):
    '''
    Read a tab delimited, header-less anchor to anchor file as a DataFrame.

    The layout is chosen from the number of columns found on the first line:

    * 4 columns are read as ``anchor1 anchor2 obs exp``
    * 5 columns are read as ``anchor1 anchor2 obs exp p_val``
    * any other count is read as ``anchor1 anchor2 ratio`` (denoised or enhanced files)

    When ``obs`` and ``exp`` are present the ratio is ``(obs + 5) / (exp + 5)``.

    Args:
        filename (:obj:`str`) : full path to anchor to anchor file
    Returns:
        ``pandas.DataFrame``: frame with exactly the columns ``anchor1 anchor2 ratio``
    '''
    # Only the first row is needed to count the columns, so the file is not fully parsed twice.
    n_cols = pd.read_csv(filename, sep='\t', header=None, nrows=1).shape[1]
    if n_cols == 4:  # normalized file, before denoising
        names = ['anchor1', 'anchor2', 'obs', 'exp']
    elif n_cols == 5:  # normalized file which also includes a p-value
        names = ['anchor1', 'anchor2', 'obs', 'exp', 'p_val']
    else:  # denoised output has no obs or exp
        names = ['anchor1', 'anchor2', 'ratio']
    df = pd.read_csv(filename, sep='\t', names=names)
    if 'obs' in names:
        df['ratio'] = (df['obs'] + 5) / (df['exp'] + 5)
    return df[['anchor1', 'anchor2', 'ratio']]
def open_full_genome(data_dir):
    '''
    Read every anchor to anchor file found in a directory into a single DataFrame.

    A file is included when its name contains ``anchor_2_anchor`` or
    ``denoised.anchor.to.anchor``. Files are visited in ``os.listdir`` order and each
    file name is printed as it is read.

    Args:
        data_dir (:obj:`str`) : directory containing the per-chromosome anchor to anchor files
    Returns:
        ``pandas.DataFrame``: the concatenated frames (original row indices are kept), or an
        empty DataFrame when no file matches
    '''
    print('Opening genome-wide anchor to anchor...')
    frames = []
    for chr_file in os.listdir(data_dir):
        if 'anchor_2_anchor' in chr_file or 'denoised.anchor.to.anchor' in chr_file:
            print(chr_file)
            frames.append(open_anchor_to_anchor(os.path.join(data_dir, chr_file)))
    if not frames:  # pd.concat refuses an empty list
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the growing frame for every file.
    return pd.concat(frames)
def get_chromosome_from_filename(filename):
    """
    Extract the chromosome string from any of the file name formats we use

    Three placements of the ``chr`` token are handled:

    * prefix, e.g. ``chr1.matrix`` gives ``chr1`` (everything before the first dot)
    * suffix, e.g. ``loops.chr3`` gives ``chr3`` (everything from ``chr`` to the end)
    * middle, e.g. ``sample.chr12.txt`` gives ``chr12`` (from ``chr`` up to the last dot)

    Args:
        filename (:obj:`str`) : name of anchor to anchor file
    Returns:
        Chromosome string of form chr<>, or an empty string when ``filename`` does not contain ``chr``
    """
    chr_index = filename.find('chr')  # index of chromosome name
    if chr_index == -1:  # no chromosome token at all
        return ''
    if chr_index == 0:  # if chromosome name is file prefix
        # partition keeps the whole name when there is no dot, instead of chopping the last character
        return filename.partition('.')[0]
    file_ending_index = filename.rfind('.')  # index of file ending
    if chr_index > file_ending_index:  # if chromosome name is file ending
        return filename[chr_index:]
    return filename[chr_index:file_ending_index]
def locus_to_anchor(chr_name, locus, anchor_dir):
    """
    Find the row index of the first anchor on ``chr_name`` whose ``[start, end]`` interval contains ``locus``

    The matching index is also printed.

    Args:
        chr_name (:obj:`str`) : chromosome name, also used to locate ``<chr_name>.bed``
        locus (:obj:`int`) : genomic position to look up
        anchor_dir (:obj:`str`) : directory prefix of the anchor ``.bed`` files (concatenated directly with the file name)
    Returns:
        ``int`` : zero-based row index of the first matching anchor
    Raises:
        IndexError: if no anchor contains ``locus``
    """
    bed_path = anchor_dir + '%s.bed' % chr_name
    anchors = pd.read_csv(bed_path, sep='\t', names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
    contains_locus = (anchors['start'] <= locus) & (locus <= anchors['end'])
    on_chromosome = anchors['chr'] == chr_name
    matching_rows = np.where(contains_locus & on_chromosome)[0]
    first_match = matching_rows[0]
    print(first_match)
    return int(first_match)
def save_samples(input_dir, target_dir, matrix_size, multi_input=False, dir_3=None, combined_dir=None, anchor_dir=None, name='sample', chr_name='chr6', locus_start=25922605, locus_end=26709867, force_size=128, force_symmetry=True):
    """
    Saves sample matrices for use in training visualizations

    A square window of ``force_size`` anchors, centred between the anchors containing
    ``locus_start`` and ``locus_end``, is cut from each loaded matrix and written with
    ``numpy.save`` to ``<data_dir><name>_1``, ``_2`` and, when available, ``_3`` and
    ``_combined``. Every saved array has shape ``(1, n, n, 1)``.

    Args:
        input_dir (:obj:`str`) : directory containing input anchor to anchor files (or, with ``multi_input``, one sub-folder per input set)
        target_dir (:obj:`str`) : directory containing target anchor to anchor files
        matrix_size (:obj:`int`) : kept for backward compatibility; the window size is controlled by ``force_size``
        multi_input (:obj:`bool`) : set to True to save samples from each of the multiple input sets in ``input_dir``
        dir_3 (:obj:`str`) : optional directory containing third set of input anchor to anchor files
        combined_dir (:obj:`str`) : optional directory containing combined target anchor to anchor files
        anchor_dir (:obj:`str`) : directory containing anchor reference ``.bed`` files
        name (:obj:`str`) : each saved sample file will begin with this string
        chr_name (:obj:`str`) : chromosome to sample from, of the form ``chr<number>``
        locus_start (:obj:`int`) : genomic position marking the start of the region of interest
        locus_end (:obj:`int`) : genomic position marking the end of the region of interest
        force_size (:obj:`int`) : number of anchors along each side of the saved window
        force_symmetry (:obj:`bool`) : forwarded to ``load_chr_ratio_matrix_from_sparse``
    """
    os.makedirs(sparse_data_dir, exist_ok=True)

    def load_nth(directory, index):
        # Load the ``index``-th file of ``directory`` in ``os.listdir`` order.
        return load_chr_ratio_matrix_from_sparse(directory, os.listdir(directory)[index], anchor_dir, force_symmetry=force_symmetry)

    # With multiple inputs every sub-folder of input_dir is one input set.
    input_folders = [folder + '/' for folder in os.listdir(input_dir)] if multi_input else []
    # The directory whose file count bounds the chromosome index; previously this always
    # referenced the first sub-folder, which does not exist for single-input runs.
    reference_dir = input_dir + input_folders[0] if multi_input else input_dir
    # NOTE(review): the file is picked by position in os.listdir using the chromosome
    # number, so this only works for numbered chromosomes and depends on listing order.
    chr_index = min(int(chr_name.replace('chr', '')), len(os.listdir(reference_dir)) - 1)
    print('Saving samples from', chr_name, '...')

    matrix_3 = None
    combined_matrix = None
    if multi_input:
        matrix_1 = load_nth(input_dir + input_folders[0], chr_index)
        if name == 'enhance' or name == 'val_enhance':
            matrix_2 = load_nth(target_dir, chr_index)
        else:
            matrix_2 = load_nth(input_dir + input_folders[1], chr_index)
            if len(input_folders) > 2:  # the third input set is optional
                matrix_3 = load_nth(input_dir + input_folders[2], chr_index)
            combined_matrix = load_nth(target_dir, chr_index)
    else:
        matrix_1 = load_nth(input_dir, chr_index)
        matrix_2 = load_nth(target_dir, chr_index)
        if dir_3 is not None:
            matrix_3 = load_nth(dir_3, chr_index)
        if combined_dir is not None:
            combined_matrix = load_nth(combined_dir, chr_index)

    # Centre a force_size window on the midpoint of the two anchors.
    first_anchor = locus_to_anchor(chr_name, locus_start, anchor_dir)
    last_anchor = locus_to_anchor(chr_name, locus_end, anchor_dir)
    mid = int((first_anchor + last_anchor) / 2)
    window_start = max(0, mid - int(force_size / 2))
    window = slice(window_start, window_start + force_size)

    def cut_tile(matrix):
        # Dense window with a trailing channel axis and a leading batch axis.
        dense = matrix[window, window].toarray()
        return np.expand_dims(np.expand_dims(dense, -1), 0)

    # All samples are cut from the same window so they can be compared side by side.
    if matrix_3 is not None:
        np.save('%s%s_3' % (data_dir, name), cut_tile(matrix_3))
    if combined_matrix is not None:
        np.save('%s%s_combined' % (data_dir, name), cut_tile(combined_matrix))
    np.save('%s%s_1' % (data_dir, name), cut_tile(matrix_1))
    np.save('%s%s_2' % (data_dir, name), cut_tile(matrix_2))
def load_chr_ratio_matrix_from_sparse(dir_name, file_name, anchor_dir, sparse_dir=None, anchor_list=None, chr_name=None, dummy=5, ignore_sparse=False, force_symmetry=True, use_raw=False):
    """
    Loads data as a sparse matrix by either reading a precompiled sparse matrix or an anchor to anchor file which is converted to sparse CSR format.
    Ratio values are computed using the observed (obs) and expected (exp) values:

    .. math::
        ratio = \\frac{obs + dummy}{exp + dummy}

    Lookup order: a ``file_name`` ending in ``.npz`` is loaded directly; otherwise a cached
    ``<file_name>.npz`` under the sparse data directory is used when present (unless
    ``ignore_sparse``); otherwise the anchor to anchor file is parsed, optionally
    symmetrized and, unless ``ignore_sparse``, written to that cache.

    Args:
        dir_name (:obj:`str`) : directory containing the anchor to anchor or precompiled (.npz) sparse matrix file
        file_name (:obj:`str`) : name of anchor to anchor or precompiled (.npz) sparse matrix file
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        sparse_dir (:obj:`str`) : if given, replaces the module-level ``sparse_data_dir`` for this and all later calls
        anchor_list (:obj:`pandas.DataFrame`) : anchor reference table with an ``anchor`` column; read from ``anchor_dir`` when omitted
        chr_name (:obj:`str`) : chromosome name; derived from ``file_name`` when omitted
        dummy (:obj:`int`) : dummy value to used when computing ratio values
        ignore_sparse (:obj:`bool`) : set to True to ignore precompiled sparse matrices even if they exist
        force_symmetry (:obj:`bool`) : set to True to mirror the upper triangle of a freshly parsed matrix onto the lower triangle
        use_raw (:obj:`bool`) : only consulted for ratio-only files, where it cannot be honoured (see Raises)
    Returns:
        ``scipy.sparse.csr_matrix``: sparse matrix of ratio values
    Raises:
        ValueError: if the matrix must be parsed and neither ``anchor_list`` nor ``anchor_dir`` is given
        KeyError: if an anchor name is missing from the reference, or ``use_raw`` is set for a ratio-only file
    """
    global sparse_data_dir
    if chr_name is None:
        chr_name = get_chromosome_from_filename(file_name)
    sparse_rep_dir = dir_name[dir_name[: -1].rfind('/') + 1:]  # directory where the pre-compiled sparse matrices are saved
    if sparse_dir is not None:
        sparse_data_dir = sparse_dir  # deliberately persists at module level
    cache_dir = os.path.join(sparse_data_dir, sparse_rep_dir)
    os.makedirs(cache_dir, exist_ok=True)

    if file_name.endswith('.npz'):  # loading pre-combined and pre-compiled sparse data
        return scipy.sparse.load_npz(os.path.join(dir_name, file_name))

    cache_path = os.path.join(cache_dir, file_name + '.npz')
    if not ignore_sparse and os.path.isfile(cache_path):  # pre-compiled data already exists
        return scipy.sparse.load_npz(cache_path)

    # Otherwise generate the sparse matrix from the anchor to anchor file.
    if anchor_list is None:
        if anchor_dir is None:
            raise ValueError('You must supply either an anchor reference list or the directory containing one')
        anchor_list = pd.read_csv(os.path.join(anchor_dir, '%s.bed' % chr_name), sep='\t',
                                  names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
    matrix_size = len(anchor_list)  # matrix size is needed to construct sparse CSR matrix
    anchor_dict = anchor_list_to_dict(anchor_list['anchor'].values)  # convert to anchor --> index dictionary
    # otypes lets the vectorized lookup accept an empty column as well
    to_index = np.vectorize(anchor_to_locus(anchor_dict), otypes=[np.int64])

    file_path = os.path.join(dir_name, file_name)
    # Decide the layout from the first row rather than by catching a failed parse.
    n_cols = pd.read_csv(file_path, delimiter='\t', header=None, nrows=1).shape[1]
    has_counts = n_cols >= 4  # <a1> <a2> <obs> <exp> [extra columns are ignored]
    if has_counts:
        names = ['anchor1', 'anchor2', 'obs', 'exp']
    else:  # <a1> <a2> <ratio>
        names = ['anchor1', 'anchor2', 'ratio']
    chr_anchor_file = pd.read_csv(file_path, delimiter='\t', names=names, usecols=names)
    rows = to_index(chr_anchor_file['anchor1'].values)  # convert anchor names to row indices
    cols = to_index(chr_anchor_file['anchor2'].values)  # convert anchor names to column indices

    if has_counts:
        values = (chr_anchor_file['obs'] + dummy) / (chr_anchor_file['exp'] + dummy)  # compute matrix ratio value
    elif use_raw:
        # NOTE(review): raw counts were only ever requested on this ratio-only path, where
        # no 'obs' column exists; confirm whether use_raw was meant for the obs/exp layout.
        raise KeyError("use_raw=True needs an 'obs' column but %s only has a ratio column" % file_name)
    else:
        values = chr_anchor_file['ratio']
    sparse_matrix = scipy.sparse.csr_matrix((values, (rows, cols)), shape=(matrix_size, matrix_size))  # construct sparse CSR matrix

    if force_symmetry:
        upper_sum = triu(sparse_matrix, k=1).sum()
        lower_sum = tril(sparse_matrix, k=-1).sum()
        if upper_sum == 0 or lower_sum == 0:  # only one triangle is populated
            sparse_matrix = sparse_matrix + sparse_matrix.transpose()
        # NOTE(review): triu keeps the diagonal, so diagonal values are doubled here
        # (and doubled again by the step above) — confirm this is intended.
        sparse_triu = scipy.sparse.triu(sparse_matrix)
        sparse_matrix = sparse_triu + sparse_triu.transpose()
    if not ignore_sparse:
        scipy.sparse.save_npz(os.path.join(cache_dir, file_name), sparse_matrix)  # save precompiled data
    return sparse_matrix
def split_matrix(input_filename,
                 input_matrix,
                 target_matrix,
                 input_batch,
                 target_batch,
                 matrix_size,
                 step_size,
                 batch_size,
                 n_matrices,
                 start_index,
                 normalize,
                 shuffle,
                 random_steps,
                 diagonal_only,
                 upper_triangular_only,
                 max_distance=384):
    """
    Generator function to split input and target sparse matrices into patches which are used for training and prediction.

    Patches are accumulated until ``batch_size`` is reached and then yielded with shape
    ``(batch_size, matrix_size, matrix_size, 1)``. A batch containing a patch that is
    cut short by the matrix edge cannot be reshaped and is dropped. Patches left over
    when the matrix is exhausted are not yielded.

    Args:
        input_filename (:obj:`str`): name of file which is being used to generate ratio matrix patches
        input_matrix (:obj:`scipy.sparse.csr_matrix`) : sparse CSR input matrix
        target_matrix (:obj:`scipy.sparse.csr_matrix`) : sparse CSR target matrix
        input_batch (:obj:`list`) : samples already collected for the input batch being generated (appended to in place)
        target_batch (:obj:`list`) : samples already collected for the target batch being generated (appended to in place)
        matrix_size (:obj:`int`) : size of each patch, or ``-1`` to yield the whole matrix as a single sample
        step_size (:obj:`int`) : size of steps used when generating batches. Values less than ``matrix size`` will include overlapping regions
        batch_size (:obj:`int`) : number of patches to use in each batch
        n_matrices (:obj:`int`) : current number of matrix patches in the batch being generated
        start_index (:obj:`int`) : starting anchor index of the matrix splitting, ensures batches are not identical across epochs
        normalize (:obj:`bool`) : set to True to normalize all ratio values between ``[0, 1]``
        shuffle (:obj:`bool`) : set to True to randomly split the matrix instead of sliding across sequentially
        random_steps (:obj:`bool`) : set to True to replace ``start_index`` with a random offset in ``[0, step_size)``
        diagonal_only (:obj:`bool`) : set to True to only generate patches along the diagonal of the matrix
        upper_triangular_only (:obj:`bool`) : set to True to skip every patch whose row index is smaller than its column index
        max_distance (:obj:`int`) : patches whose row and column start indices differ by more than this are skipped
    Returns:
        (``numpy.array``, ``numpy.array``, ``str``): input batch, target batch, and batch label
    """
    if matrix_size == -1:  # whole chromosome as one sample
        full_input = np.expand_dims(np.expand_dims(input_matrix.toarray(), 0), -1)
        full_target = np.expand_dims(np.expand_dims(target_matrix.toarray(), 0), -1)
        yield full_input, full_target, input_filename + '_full_chr'
        return

    if random_steps:  # random offset from step size intervals
        start_index = np.random.randint(0, step_size)
    row_indices = np.arange(start_index, input_matrix.shape[0], step_size)
    col_indices = np.arange(start_index, input_matrix.shape[1], step_size)
    if shuffle:  # shuffle slicing indices
        np.random.shuffle(row_indices)
        np.random.shuffle(col_indices)

    batch_shape = (batch_size, matrix_size, matrix_size, 1)
    for i in row_indices:
        for j in col_indices:
            if abs(i - j) > max_distance:  # max distance from diagonal with actual values
                continue
            if diagonal_only and i != j:
                continue
            # NOTE(review): this keeps patches with row >= column, i.e. the lower
            # triangle of the patch grid — confirm against the flag's name.
            if upper_triangular_only and i < j:
                continue
            input_batch.append(input_matrix[i:i + matrix_size, j:j + matrix_size].toarray())
            target_batch.append(target_matrix[i:i + matrix_size, j:j + matrix_size].toarray())
            n_matrices += 1
            if n_matrices != batch_size:
                continue
            try:
                inputs = np.reshape(np.array(input_batch), batch_shape)
                targets = np.reshape(np.array(target_batch), batch_shape)
            except ValueError:  # a patch ran past the matrix edge; drop this batch
                pass
            else:
                if normalize:
                    inputs = normalize_matrix(inputs)
                    targets = normalize_matrix(targets)
                yield inputs, targets, input_filename + '_' + str(i)
            # start collecting the next batch in fresh lists
            input_batch = []
            target_batch = []
            n_matrices = 0
def generate_batches_from_chr(input_dir,
                              target_dir,
                              matrix_size,
                              batch_size,
                              anchor_dir=None,
                              step_size=64,
                              multi_input=False,
                              shuffle=False,
                              random_steps=False,
                              normalize=False,
                              diagonal_only=False,
                              upper_triangular_only=False,
                              force_symmetry=True,
                              ignore_XY=True,
                              ignore_even_chr=False,
                              ignore_odd_chr=False):
    """
    Generator function which generates batches of input target pairs to train the model:

    .. code-block:: python

        for epoch_i in range(epochs):
            for input_batch, target_batch, batch_label in generate_batches_from_chr(input_dir,
                                                                                    target_dir,
                                                                                    matrix_size=128,
                                                                                    batch_size=64,
                                                                                    step_size=64,
                                                                                    shuffle=True,
                                                                                    random_steps=True,
                                                                                    anchor_dir=anchor_dir):
                loss = model.train_on_batch(input_batch, target_batch)
                print(batch_label, loss)

    Input and target files are paired by position after sorting both directory listings.
    Every file starts a new batch, so patches left over at the end of a chromosome are
    not carried into the next one.

    Args:
        input_dir (:obj:`str`) : directory containing all input data to be generated
        target_dir (:obj:`str`) : directory containing all target data to be generated
        matrix_size (:obj:`int`) : size of each patch that the full ratio matrix is divided into
        batch_size (:obj:`int`) : number of patches to use in each batch
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        step_size (:obj:`int`) : size of steps used when generating batches. Values less than ``matrix size`` will include overlapping regions
        multi_input (:obj:`bool`) : set to True when ``input_dir`` holds one sub-folder per input set; file names are taken from the first sub-folder
        shuffle (:obj:`bool`) : set to True to shuffle the chromosome order and randomly split each matrix instead of sliding across sequentially
        random_steps (:obj:`bool`) : set to True add a random offset to each step between patch indices
        normalize (:obj:`bool`) : set to True to normalize all ratio values between ``[0, 1]``
        diagonal_only (:obj:`bool`) : set to True to only generate patches along the diagonal of the matrix
        upper_triangular_only (:obj:`bool`) : forwarded to ``split_matrix``
        force_symmetry (:obj:`bool`) : forwarded to ``load_chr_ratio_matrix_from_sparse``
        ignore_XY (:obj:`bool`) : set to True to ignore chromosomes X and Y when generating batches
        ignore_even_chr (:obj:`bool`) : set to True to ignore all even numbered chromosomes (only used when ``ignore_odd_chr`` is False)
        ignore_odd_chr (:obj:`bool`) : set to True to ignore all odd numbered chromosomes
    Returns:
        (``numpy.array``, ``numpy.array``, ``str``): input batch, target batch, and batch label
    """
    def chr_number(file_name):
        # Chromosome number found between 'chr' and '.matrix', or None when absent or not numeric.
        start = file_name.find('chr')
        end = file_name.find('.matrix')
        if start == -1 or end == -1:
            return None
        digits = file_name[start + 3:end]
        return int(digits) if digits.isdigit() else None

    def keep_parity(files, remainder):
        # Keep numbered chromosomes whose number modulo 2 equals ``remainder``.
        numbered = [(f, chr_number(f)) for f in files]
        return [f for f, number in numbered if number is not None and number % 2 == remainder]

    # Directory prefixes are concatenated with names below, so make sure there is a separator.
    input_root = input_dir if input_dir.endswith('/') else input_dir + '/'
    if multi_input:
        input_folders = os.listdir(input_dir)  # get list of all folders in input dir
        input_files = sorted(os.listdir(input_root + input_folders[0]))  # assume all inputs have same name pattern
    else:
        input_folders = []
        input_files = sorted(os.listdir(input_dir))
    target_files = sorted(os.listdir(target_dir))

    if shuffle:  # shuffle chromosome file order, keeping input/target pairs together
        paired = list(zip(input_files, target_files))
        random.shuffle(paired)
        input_files = [pair[0] for pair in paired]
        target_files = [pair[1] for pair in paired]
    if ignore_XY:
        input_files = [f for f in input_files if 'chrX' not in f and 'chrY' not in f]
        target_files = [f for f in target_files if 'chrX' not in f and 'chrY' not in f]
    if ignore_odd_chr:
        input_files = keep_parity(input_files, 0)
        target_files = keep_parity(target_files, 0)
    elif ignore_even_chr:
        input_files = keep_parity(input_files, 1)
        target_files = keep_parity(target_files, 1)

    for input_file, target_file in zip(input_files, target_files):
        target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, target_file, anchor_dir, force_symmetry=force_symmetry)
        if multi_input:
            # one (directory, batch label) pair per input set
            sources = [(input_root + folder + '/', folder + '/' + input_file) for folder in input_folders]
        else:
            sources = [(input_dir, input_file)]
        for source_dir, label in sources:
            input_matrix = load_chr_ratio_matrix_from_sparse(source_dir, input_file, anchor_dir, force_symmetry=force_symmetry)
            # Fresh lists for every matrix: split_matrix appends to the lists it is given.
            yield from split_matrix(input_filename=label,
                                    input_matrix=input_matrix,
                                    target_matrix=target_matrix,
                                    input_batch=[],
                                    target_batch=[],
                                    matrix_size=matrix_size,
                                    step_size=step_size,
                                    batch_size=batch_size,
                                    n_matrices=0,
                                    start_index=0,
                                    normalize=normalize,
                                    shuffle=shuffle,
                                    random_steps=random_steps,
                                    diagonal_only=diagonal_only,
                                    upper_triangular_only=upper_triangular_only)
def get_matrices_from_loci(input_dir,
                           target_dir,
                           matrix_size,
                           loci,
                           anchor_dir=None):
    """
    Generator function for getting sample matrices at specific loci

    Input and target files are paired by position after natural sorting. For every
    chromosome present in ``loci``, one pair of diagonal tiles is yielded for each anchor
    whose ``[start, end]`` interval contains the requested locus. Tiles have a leading
    batch axis and a trailing channel axis.

    Args:
        input_dir (:obj:`str`) : directory containing all input data to be generated
        target_dir (:obj:`str`) : directory containing all target data to be generated
        matrix_size (:obj:`int`) : size of each patch that the full ratio matrix is divided into
        loci (:obj:`dict`) : dictionary of chromosome locus pairs
        anchor_dir (:obj:`str`) : directory prefix of the reference anchor ``.bed`` files
    Returns:
        (``numpy.array``, ``numpy.array``, ``str``, ``int``, ``int``): input matrix, target matrix, chromosome name, locus, and anchor index
    """
    def as_sample(sparse_tile):
        # dense copy with channel axis last and batch axis first
        return np.expand_dims(np.expand_dims(sparse_tile.toarray(), axis=-1), axis=0)

    file_pairs = zip(sorted_nicely(os.listdir(input_dir)), sorted_nicely(os.listdir(target_dir)))
    for file_1, file_2 in file_pairs:
        chr_name = get_chromosome_from_filename(file_1)
        if chr_name not in loci:  # nothing requested on this chromosome
            continue
        anchor_list = pd.read_csv(anchor_dir + '%s.bed' % chr_name, sep='\t',
                                  names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
        input_matrix = load_chr_ratio_matrix_from_sparse(input_dir, file_1, anchor_dir)
        target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, file_2, anchor_dir)
        contains_locus = (anchor_list['start'] <= loci[chr_name]) & (loci[chr_name] <= anchor_list['end'])
        loci_indices = contains_locus & (anchor_list['chr'] == chr_name)
        for i, is_hit in enumerate(loci_indices):
            if not is_hit:
                continue
            window = slice(i, i + matrix_size)
            input_tile = as_sample(input_matrix[window, window])
            target_tile = as_sample(target_matrix[window, window])
            yield input_tile, target_tile, chr_name, loci[chr_name], i
def get_top_loops(matrix_data_dir, reference_dir, num_top_loops=None, q=None, dummy=5):
    """
    Ranks the ratio values of all chromosomes and computes the cutoff value for taking the top ``num_top_loops`` or the ``q`` th quantile

    Only the upper triangle (diagonal included) of each chromosome matrix is considered.
    Cutoffs are cached in ``<data_dir>top_loop_values.pickle`` keyed by
    ``matrix_data_dir`` followed by ``q`` or ``num_top_loops``; the cache file is rewritten
    on every call and the cutoff is printed.

    Args:
        matrix_data_dir (:obj:`str`) : directory containing the anchor to anchor files used to count loops
        reference_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        num_top_loops (:obj:`int`) : number of top loops to consider (used when ``q`` is None)
        q (:obj:`float`) : quantile of the non-zero ratio values to use as the cutoff; takes precedence over ``num_top_loops``
        dummy (:obj:`int`) : dummy value to use to calculate each ratio value
    Returns:
        ``float`` : cutoff value for top loops
    """
    cache_path = data_dir + 'top_loop_values.pickle'
    if os.path.isfile(cache_path):
        # NOTE: unpickling runs arbitrary code; this is only safe because the cache is
        # written by this function. Do not point data_dir at untrusted files.
        with open(cache_path, 'rb') as handle:
            top_loop_values = pickle.load(handle)
    else:
        top_loop_values = {}

    def upper_triangle_values():
        # Stored values of the upper triangle of every chromosome matrix, one array per file.
        for file in os.listdir(matrix_data_dir):
            sparse = load_chr_ratio_matrix_from_sparse(matrix_data_dir, file, reference_dir, dummy=dummy)
            yield scipy.sparse.triu(sparse).data

    if q is not None:  # select top loops based on quantile not quantity
        cache_key = matrix_data_dir + str(q)
        if cache_key in top_loop_values:
            genome_min_loop_value = top_loop_values[cache_key]
        else:
            # Gather the non-zero values per file and join them once.
            chunks = [values[values != 0] for values in upper_triangle_values()]
            top_loops = np.concatenate(chunks) if chunks else np.array([])
            genome_min_loop_value = np.quantile(top_loops, q=q)
            top_loop_values[cache_key] = genome_min_loop_value
        print('%s %.4f quantile loops cutoff value: %f' % (matrix_data_dir, q, genome_min_loop_value))
    else:  # select top loops based on rank
        cache_key = matrix_data_dir + str(num_top_loops)
        if cache_key in top_loop_values:
            genome_min_loop_value = top_loop_values[cache_key]
        else:
            top_loops = np.array([])
            for values in upper_triangle_values():
                # keep only the current best num_top_loops after every chromosome
                loop_list = np.append(top_loops, values)
                top_loops = loop_list[np.argsort(-loop_list)[:num_top_loops]]
            genome_min_loop_value = top_loops[-1]
            top_loop_values[cache_key] = genome_min_loop_value
        print('%s top %d loops cutoff value: %f' % (matrix_data_dir, num_top_loops, genome_min_loop_value))

    with open(cache_path, 'wb') as handle:
        pickle.dump(top_loop_values, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return genome_min_loop_value
def anchor_list_to_dict(anchors):
    """
    Build a lookup table from anchor name to its position in ``anchors``

    If a name occurs more than once, its last position wins.

    Args:
        anchors (:obj:`numpy.array`) : array of anchor name values
    Returns:
        `dict` : dictionary mapping each anchor to its index from the array
    """
    return {anchor: position for position, anchor in enumerate(anchors)}
def anchor_to_locus(anchor_dict):
    """
    Wrap an anchor lookup table in a single-argument function, which is convenient to vectorize

    Args:
        anchor_dict (:obj:`dict`) : dictionary mapping each anchor to its chromosomal index
    Returns:
        `function` : function which returns ``anchor_dict[anchor]`` and raises ``KeyError`` for unknown anchors
    """
    return lambda anchor: anchor_dict[anchor]
def sorted_nicely(l):
    """
    Sort text items so that runs of digits compare as numbers rather than character by character

    Args:
        l (:obj:`iterable`) : iterable object containing items which can be interpreted as text
    Returns:
        `list` : new sorted list, e.g. ``chr2`` is placed before ``chr10``
    """
    def natural_key(item):
        # split into alternating text / digit chunks and compare digit chunks as integers
        chunks = re.split('([0-9]+)', item)
        return [int(chunk) if chunk.isdigit() else chunk for chunk in chunks]

    return sorted(l, key=natural_key)
def normalize_matrix(matrix):
    """
    Squash ratio values into ``[0, 1]`` with the following function:

    .. math::
        f(x) = 1 - \\frac{1}{1 + x}

    .. image:: _static/normalization_function_plot.PNG
        :scale: 100 %
        :align: center

    Args:
        matrix (:obj:`numpy.array`) : matrix of ratio values
    Returns:
        ``numpy.array`` : matrix of normalized ratio values between ``[0, 1]``
    """
    shifted = 1 + matrix
    return 1 - (1 / shifted)
def denormalize_matrix(matrix):
    """
    Reverse the normalization of a matrix to set all valid normalized values back to their original ratio values using the following function:

    .. math::
        f^{-1}(x) = \\frac{1}{1 - g(x)} - 1 &\\quad \\mbox{where} &\\quad g(x) = \\begin{cases} 0.98, & \\mbox{if } x \\geq 1 \\\\ 0, & \\mbox{if } x < 0 \\\\ x & \\mbox{ otherwise} \\end{cases}

    We apply the function :math:`g(x)` to remove invalid values that could be in a predicted result and because :math:`f^{-1}(x)` blows up as we approach 1:

    .. image:: _static/denormalization_function_plot.PNG
        :scale: 100 %
        :align: center

    The input is not modified; the clamping is applied to a copy.

    Args:
        matrix (:obj:`numpy.array`) : matrix of normalized ratio values
    Returns:
        ``numpy.array`` : matrix of ratio values
    """
    values = np.array(matrix, copy=True)  # never clamp the caller's array in place
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(float)  # 0.98 cannot be stored in an integer array
    values[values >= 1] = 0.98  # includes exactly 1, which would otherwise divide by zero
    values[values < 0] = 0
    return (1 / (1 - values)) - 1
def draw_heatmap(matrix, color_scale, ax=None, return_image=False):
    """
    Draw a ratio matrix as a white-to-red heatmap whose colour bins start just above 1

    The bin boundaries depend on ``color_scale``: when it is non-zero the bins span
    ``1.001`` to ``color_scale``; otherwise they are derived from the matrix maximum
    (if it is below 2) or from the 0.95 quantile of the matrix.

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix to be displayed
        color_scale (:obj:`int`) : max ratio value to be considered strongest by color mapping, or 0 to derive it from ``matrix``
        ax (:obj:`matplotlib.axes.Axes`) : axes which will contain the heatmap. If None, new axes are created
        return_image (:obj:`bool`) : set to True to close the current figure and return the array held by the drawn image
    Returns:
        ``numpy.array`` : if ``return_image`` is set to True, the result of ``get_array()`` on the drawn image; otherwise None
    """
    peak = np.max(matrix)
    if color_scale != 0:
        inner_edges = np.arange(1.001, color_scale, (color_scale - 1.001) / 18)
        breaks = np.append(inner_edges, peak)
    elif peak < 2:
        breaks = np.arange(1.001, peak, (peak - 1.001) / 19)
    else:
        q95 = np.quantile(matrix, q=0.95)
        step = (q95 - 1) / 18
        up = q95 + 0.011
        if up < 2:  # fall back to a fixed 1..2 range for weak signals
            up, step = 2, 0.999 / 18
        breaks = np.append(np.arange(1.001, up, step), peak)

    palette = ["#FFFFFF", "#FFE4E4", "#FFD7D7", "#FFC9C9", "#FFBCBC", "#FFAEAE", "#FFA1A1", "#FF9494", "#FF8686",
               "#FF7979", "#FF6B6B", "#FF5E5E", "#FF5151", "#FF4343", "#FF3636", "#FF2828", "#FF1B1B", "#FF0D0D",
               "#FF0000"]
    # 20 discrete bins; fewer bins would give a coarser colormap interpolation
    cm = matplotlib.colors.LinearSegmentedColormap.from_list('my_list', palette, N=20)
    norm = matplotlib.colors.BoundaryNorm(breaks, 20)

    if ax is None:
        ax = plt.subplots()[1]
    img = ax.imshow(matrix, cmap=cm, norm=norm, interpolation='nearest')
    if not return_image:
        return None
    plt.close()
    return img.get_array()
def get_heatmap(matrix, color_scale):
    """
    Convert a ratio matrix to an RGBA heatmap array without drawing it

    The bin boundaries depend on ``color_scale``: when it is non-zero the bins span
    ``1.001`` to ``color_scale``; otherwise they are derived from the matrix maximum
    (if it is below 2) or from the 0.98 quantile of the matrix. The alpha channel is 1
    where the ratio is greater than 1.2 and 0 elsewhere.

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix to be colour mapped
        color_scale (:obj:`int`) : max ratio value to be considered strongest by color mapping, or 0 to derive it from ``matrix``
    Returns:
        ``numpy.array`` : RGBA values with one extra trailing axis of length 4
    """
    peak = np.max(matrix)
    if color_scale != 0:
        inner_edges = np.arange(1.001, color_scale, (color_scale - 1.001) / 18)
        breaks = np.append(inner_edges, peak)
    elif peak < 2:
        breaks = np.arange(1.001, peak, (peak - 1.001) / 19)
    else:
        q98 = np.quantile(matrix, q=0.98)
        step = (q98 - 1) / 18
        up = q98 + 0.011
        if up < 2:  # fall back to a fixed 1..2 range for weak signals
            up, step = 2, 0.999 / 18
        breaks = np.append(np.arange(1.001, up, step), peak)

    palette = ["#FFFFFF", "#FFE4E4", "#FFD7D7", "#FFC9C9", "#FFBCBC", "#FFAEAE", "#FFA1A1", "#FF9494", "#FF8686",
               "#FF7979", "#FF6B6B", "#FF5E5E", "#FF5151", "#FF4343", "#FF3636", "#FF2828", "#FF1B1B", "#FF0D0D",
               "#FF0000"]
    # 20 discrete bins; fewer bins would give a coarser colormap interpolation
    cm = matplotlib.colors.LinearSegmentedColormap.from_list('my_list', palette, N=20)
    norm = matplotlib.colors.BoundaryNorm(breaks, 20)

    mapper = matplotlib.cm.ScalarMappable(norm=norm, cmap=cm)
    heatmap = mapper.to_rgba(matrix)
    strong_signal = matrix > 1.2
    heatmap[..., -1] = np.ones_like(strong_signal) * strong_signal  # opaque only where the signal is strong
    return heatmap
def save_images_to_video(output_name, out_dir, image_folder='images', fps=29.94, hold_frames=150):
    """
    Saves all training visualization images to a video file

    Every ``.png`` file in ``image_folder`` is written as one frame, in sorted file name
    order, to an MJPG-encoded ``.avi`` file. The final frame is then repeated so the video
    lingers on the last image.

    Args:
        output_name (:obj:`str`) : filename for the saved video file, without extension
        out_dir (:obj:`str`) : directory prefix of the video file (concatenated directly with ``output_name``)
        image_folder (:obj:`str`) : folder containing the ``.png`` frames
        fps (:obj:`float`) : frame rate of the written video
        hold_frames (:obj:`int`) : number of extra copies of the last frame appended at the end
    Raises:
        IndexError: if ``image_folder`` contains no ``.png`` files
    """
    video_name = out_dir + output_name + '.avi'
    images = [img for img in sorted(os.listdir(image_folder)) if img.endswith(".png")]
    if not images:
        raise IndexError('no .png images found in %s' % image_folder)
    first_frame = cv2.imread(os.path.join(image_folder, images[0]))
    height, width = first_frame.shape[:2]  # the first image fixes the video resolution
    video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), fps, (width, height))
    try:
        frame = first_frame
        for image in images:
            frame = cv2.imread(os.path.join(image_folder, image))
            video.write(frame)
        for _ in range(hold_frames):  # hold on the last image
            video.write(frame)
    finally:
        video.release()  # finalize the file even if a frame fails to load or write
def get_model_memory_usage(batch_size, model):
    """
    Estimates the amount of memory required to train the model using the current batch size.

    The estimate is ``bytes per number * (batch_size * total layer output elements +
    trainable parameters + non-trainable parameters)``, where the bytes per number follow
    the Keras backend float type.

    Args:
        batch_size (:obj:`int`) : number of training samples in each batch
        model (:obj:`keras.models.Model`) : uncompiled Keras model to be trained
    Returns:
        ``float`` : estimated memory usage in GB
    """
    def shape_size(shape):
        # Number of elements in one output shape, ignoring unknown (None) dimensions.
        # Layers may report a list of shapes (one per output); those are summed.
        if len(shape) > 0 and isinstance(shape[0], (list, tuple)):
            return sum(shape_size(single_shape) for single_shape in shape)
        size = 1
        for dim in shape:
            if dim is not None:
                size *= dim
        return size

    def count_unique_params(weights):
        # Count each weight once even if it is shared between layers. Variables are
        # de-duplicated by identity because they are not guaranteed to be hashable.
        unique_weights = {id(weight): weight for weight in weights}
        return np.sum([K.count_params(weight) for weight in unique_weights.values()])

    shapes_mem_count = sum(shape_size(layer.output_shape) for layer in model.layers)
    trainable_count = count_unique_params(model.trainable_weights)
    non_trainable_count = count_unique_params(model.non_trainable_weights)

    bytes_per_number = {'float16': 2.0, 'float64': 8.0}.get(K.floatx(), 4.0)
    total_memory = bytes_per_number * (batch_size * shapes_mem_count + trainable_count + non_trainable_count)
    gbytes = np.round(total_memory / (1024.0 ** 3), 3)
    return gbytes
|