dylanplummer committed on
Commit
f4051e1
·
1 Parent(s): 883a832

add dependencies

Browse files
Files changed (3) hide show
  1. app.py +174 -1
  2. requirements.txt +5 -0
  3. utils/utils.py +0 -819
app.py CHANGED
@@ -1,11 +1,184 @@
1
  import gradio as gr
2
  import os
 
 
3
  import numpy as np
4
  import pandas as pd
 
5
  import matplotlib.pyplot as plt
6
  from pathlib import Path
7
  from tensorflow.keras.models import model_from_json
8
- from utils.utils import open_anchor_to_anchor, draw_heatmap, load_chr_ratio_matrix_from_sparse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  model_depths = ['1.5M', '2M', '2.4M', '4.88M', '5M', '6.29M', '8.5M', '12.5M', '16.5M', '25M', '32M', '50M', '100M', '150M']
 
1
  import gradio as gr
2
  import os
3
+ import scipy
4
+ from scipy.sparse import tril, triu
5
  import numpy as np
6
  import pandas as pd
7
+ import matplotlib
8
  import matplotlib.pyplot as plt
9
  from pathlib import Path
10
  from tensorflow.keras.models import model_from_json
11
+ from utils.utils import draw_heatmap, load_chr_ratio_matrix_from_sparse
12
+
13
+
14
+
15
+ def get_chromosome_from_filename(filename):
16
+ """
17
+ Extract the chromosome string from any of the file name formats we use
18
+
19
+ Args:
20
+ filename (:obj:`str`) : name of anchor to anchor file
21
+
22
+ Returns:
23
+ Chromosome string of form chr<>
24
+ """
25
+ chr_index = filename.find('chr') # index of chromosome name
26
+ if chr_index == 0: # if chromosome name is file prefix
27
+ return filename[:filename.find('.')]
28
+ file_ending_index = filename.rfind('.') # index of file ending
29
+ if chr_index > file_ending_index: # if chromosome name is file ending
30
+ return filename[chr_index:]
31
+ else:
32
+ return filename[chr_index: file_ending_index]
33
+
34
+
35
+ def draw_heatmap(matrix, color_scale, ax=None, return_image=False):
36
+ """
37
+ Display ratio heatmap containing only strong signals (values > 1 or 0.98th quantile)
38
+
39
+ Args:
40
+ matrix (:obj:`numpy.array`) : ratio matrix to be displayed
41
+ color_scale (:obj:`int`) : max ratio value to be considered strongest by color mapping
42
+ ax (:obj:`matplotlib.axes.Axes`) : axes which will contain the heatmap. If None, new axes are created
43
+ return_image (:obj:`bool`) : set to True to return the image obtained from drawing the heatmap with the generated color map
44
+
45
+ Returns:
46
+ ``numpy.array`` : if ``return_image`` is set to True, return the heatmap as an array
47
+ """
48
+ if color_scale != 0:
49
+ breaks = np.append(np.arange(1.001, color_scale, (color_scale - 1.001) / 18), np.max(matrix))
50
+ elif np.max(matrix) < 2:
51
+ breaks = np.arange(1.001, np.max(matrix), (np.max(matrix) - 1.001) / 19)
52
+ else:
53
+ step = (np.quantile(matrix, q=0.95) - 1) / 18
54
+ up = np.quantile(matrix, q=0.95) + 0.011
55
+ if up < 2:
56
+ up = 2
57
+ step = 0.999 / 18
58
+ breaks = np.append(np.arange(1.001, up, step), np.max(matrix))
59
+
60
+ n_bin = 20 # Discretizes the interpolation into bins
61
+ colors = ["#FFFFFF", "#FFE4E4", "#FFD7D7", "#FFC9C9", "#FFBCBC", "#FFAEAE", "#FFA1A1", "#FF9494", "#FF8686",
62
+ "#FF7979", "#FF6B6B", "#FF5E5E", "#FF5151", "#FF4343", "#FF3636", "#FF2828", "#FF1B1B", "#FF0D0D",
63
+ "#FF0000"]
64
+ cmap_name = 'my_list'
65
+ # Create the colormap
66
+ cm = matplotlib.colors.LinearSegmentedColormap.from_list(
67
+ cmap_name, colors, N=n_bin)
68
+ norm = matplotlib.colors.BoundaryNorm(breaks, 20)
69
+ # Fewer bins will result in "coarser" colomap interpolation
70
+ if ax is None:
71
+ _, ax = plt.subplots()
72
+ img = ax.imshow(matrix, cmap=cm, norm=norm, interpolation='nearest')
73
+ if return_image:
74
+ plt.close()
75
+ return img.get_array()
76
+
77
+
78
+ def anchor_list_to_dict(anchors):
79
+ """
80
+ Converts the array of anchor names to a dictionary mapping each anchor to its chromosomal index
81
+
82
+ Args:
83
+ anchors (:obj:`numpy.array`) : array of anchor name values
84
+
85
+ Returns:
86
+ `dict` : dictionary mapping each anchor to its index from the array
87
+ """
88
+ anchor_dict = {}
89
+ for i, anchor in enumerate(anchors):
90
+ anchor_dict[anchor] = i
91
+ return anchor_dict
92
+
93
+
94
+ def anchor_to_locus(anchor_dict):
95
+ """
96
+ Function to convert an anchor name to its genomic locus which can be easily vectorized
97
+
98
+ Args:
99
+ anchor_dict (:obj:`dict`) : dictionary mapping each anchor to its chromosomal index
100
+
101
+ Returns:
102
+ `function` : function which returns the locus of an anchor name
103
+ """
104
+ def f(anchor):
105
+ return anchor_dict[anchor]
106
+ return f
107
+
108
+
109
+
110
+ def load_chr_ratio_matrix_from_sparse(dir_name, file_name, anchor_dir, sparse_dir=None, anchor_list=None, chr_name=None, dummy=5, ignore_sparse=False, force_symmetry=True, use_raw=False):
111
+ """
112
+ Loads data as a sparse matrix by either reading a precompiled sparse matrix or an anchor to anchor file which is converted to sparse CSR format.
113
+ Ratio values are computed using the observed (obs) and expected (exp) values:
114
+
115
+ .. math::
116
+ ratio = \\frac{obs + dummy}{exp + dummy}
117
+
118
+ Args:
119
+ dir_name (:obj:`str`) : directory containing the anchor to anchor or precompiled (.npz) sparse matrix file
120
+ file_name (:obj:`str`) : name of anchor to anchor or precompiled (.npz) sparse matrix file
121
+ anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
122
+ dummy (:obj:`int`) : dummy value to used when computing ratio values
123
+ ignore_sparse (:obj:`bool`) : set to True to ignore precompiled sparse matrices even if they exist
124
+
125
+ Returns:
126
+ ``scipy.sparse.csr_matrix``: sparse matrix of ratio values
127
+ """
128
+ global data_dir
129
+ global sparse_data_dir
130
+ if chr_name is None:
131
+ chr_name = get_chromosome_from_filename(file_name)
132
+ sparse_rep_dir = dir_name[dir_name[: -1].rfind('/') + 1:] # directory where the pre-compiled sparse matrices are saved
133
+ if sparse_dir is not None:
134
+ sparse_data_dir = sparse_dir
135
+ os.makedirs(os.path.join(sparse_data_dir, sparse_rep_dir), exist_ok=True)
136
+ if file_name.endswith('.npz'): # loading pre-combined and pre-compiled sparse data
137
+ sparse_matrix = scipy.sparse.load_npz(dir_name + file_name)
138
+ else: # load from file name
139
+ if file_name + '.npz' in os.listdir(os.path.join(sparse_data_dir, sparse_rep_dir)) and not ignore_sparse: # check if pre-compiled data already exists
140
+ sparse_matrix = scipy.sparse.load_npz(os.path.join(sparse_data_dir, sparse_rep_dir, file_name + '.npz'))
141
+ else: # otherwise generate sparse matrix from anchor2anchor file and save pre-compiled data
142
+ if anchor_list is None:
143
+ if anchor_dir is None:
144
+ assert 'You must supply either an anchor reference list or the directory containing one'
145
+ anchor_list = pd.read_csv(os.path.join(anchor_dir, '%s.bed' % chr_name), sep='\t',
146
+ names=['chr', 'start', 'end', 'anchor']) # read anchor list file
147
+ matrix_size = len(anchor_list) # matrix size is needed to construct sparse CSR matrix
148
+ anchor_dict = anchor_list_to_dict(anchor_list['anchor'].values) # convert to anchor --> index dictionary
149
+ try: # first try reading anchor to anchor file as <a1> <a2> <obs> <exp>
150
+ chr_anchor_file = pd.read_csv(
151
+ os.path.join(dir_name, file_name),
152
+ delimiter='\t',
153
+ names=['anchor1', 'anchor2', 'obs', 'exp'],
154
+ usecols=['anchor1', 'anchor2', 'obs', 'exp']) # read chromosome anchor to anchor file
155
+ rows = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor1'].values) # convert anchor names to row indices
156
+ cols = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor2'].values) # convert anchor names to column indices
157
+ ratio = (chr_anchor_file['obs'] + dummy) / (chr_anchor_file['exp'] + dummy) # compute matrix ratio value
158
+ sparse_matrix = scipy.sparse.csr_matrix((ratio, (rows, cols)), shape=(matrix_size, matrix_size)) # construct sparse CSR matrix
159
+ except: # otherwise read anchor to anchor file as <a1> <a2> <ratio>
160
+ chr_anchor_file = pd.read_csv(
161
+ os.path.join(dir_name, file_name),
162
+ delimiter='\t',
163
+ names=['anchor1', 'anchor2', 'ratio'],
164
+ usecols=['anchor1', 'anchor2', 'ratio'])
165
+ rows = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor1'].values) # convert anchor names to row indices
166
+ cols = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor2'].values) # convert anchor names to column indices
167
+ if use_raw:
168
+ sparse_matrix = scipy.sparse.csr_matrix((chr_anchor_file['obs'], (rows, cols)), shape=(
169
+ matrix_size, matrix_size)) # construct sparse CSR matrix
170
+ else:
171
+ sparse_matrix = scipy.sparse.csr_matrix((chr_anchor_file['ratio'], (rows, cols)), shape=(matrix_size, matrix_size)) # construct sparse CSR matrix
172
+ if force_symmetry:
173
+ upper_sum = triu(sparse_matrix, k=1).sum()
174
+ lower_sum = tril(sparse_matrix, k=-1).sum()
175
+ if upper_sum == 0 or lower_sum == 0:
176
+ sparse_matrix = sparse_matrix + sparse_matrix.transpose()
177
+ sparse_triu = scipy.sparse.triu(sparse_matrix)
178
+ sparse_matrix = sparse_triu + sparse_triu.transpose()
179
+ if not ignore_sparse:
180
+ scipy.sparse.save_npz(os.path.join(sparse_data_dir, sparse_rep_dir, file_name), sparse_matrix) # save precompiled data
181
+ return sparse_matrix
182
 
183
 
184
  model_depths = ['1.5M', '2M', '2.4M', '4.88M', '5M', '6.29M', '8.5M', '12.5M', '16.5M', '25M', '32M', '50M', '100M', '150M']
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ tensorflow
2
+ numpy<2.0
3
+ pandas
4
+ matplotlib
5
+ scipy
utils/utils.py DELETED
@@ -1,819 +0,0 @@
1
- import math
2
- import os
3
- import re
4
- import cv2
5
- import random
6
- import pickle
7
- import numpy as np
8
- import tensorflow.keras.backend as K
9
- import pandas as pd
10
- import matplotlib.pyplot as plt
11
- import matplotlib.colors
12
- import matplotlib.cm
13
- import scipy.sparse
14
- from scipy.sparse import coo_matrix, csr_matrix, triu, tril
15
- import scipy.ndimage
16
-
17
- chromosome_labels = {'chr1': 0, 'chr2': 1, 'chr3': 2, 'chr4': 3, 'chr5': 4, 'chr6': 5, 'chr7': 6, 'chr8': 7, 'chr9': 8,
18
- 'chr10': 9, 'chr11': 10, 'chr12': 11, 'chr13': 12, 'chr14': 13, 'chr15': 14, 'chr16': 15, 'chr17': 16, 'chr18': 17,
19
- 'chr19': 18, 'chr20': 19, 'chr21': 20, 'chr22': 21, 'chrX': 22, 'chrY': 23}
20
-
21
- data_dir = 'data/'
22
- sparse_data_dir = 'data/sparse/'
23
- try:
24
- os.mkdir(data_dir)
25
- except FileExistsError:
26
- pass
27
- try:
28
- os.mkdir(sparse_data_dir)
29
- except FileExistsError:
30
- pass
31
-
32
-
33
- def open_anchor_to_anchor(filename):
34
- '''
35
- Read a tab delimited anchor to anchor file as a DataFrame
36
- Args:
37
- filename (:obj:`str`) : full path to anchor to anchor file
38
-
39
- Returns:
40
- ``pandas.DataFrame``: if reading a normalized anchor to anchor file, columns are ``a1 a2 obs exp ratio``
41
- and if reading a denoised or enhanced anchor to anchor file, columns are ``a1 a2 ratio``
42
- '''
43
- df = pd.read_csv(filename, sep='\t')
44
- n_cols = len(df.columns)
45
- if n_cols == 4: # if before denoise top loops
46
- df = pd.read_csv(filename,
47
- sep='\t',
48
- names=['anchor1', 'anchor2', 'obs', 'exp'])
49
- df['ratio'] = (df['obs'] + 5) / (df['exp'] + 5)
50
- elif n_cols == 5: # includes p-value
51
- df = pd.read_csv(filename,
52
- sep='\t',
53
- names=['anchor1', 'anchor2', 'obs', 'exp', 'p_val'])
54
- df['ratio'] = (df['obs'] + 5) / (df['exp'] + 5)
55
- else: # after denoise has no obs or exp
56
- df = pd.read_csv(filename,
57
- sep='\t',
58
- names=['anchor1', 'anchor2', 'ratio'])
59
- df = df[['anchor1', 'anchor2', 'ratio']]
60
- return df
61
-
62
-
63
- def open_full_genome(data_dir):
64
- '''
65
-
66
- Args:
67
- data_dir:
68
-
69
- Returns:
70
-
71
- '''
72
- genome = pd.DataFrame()
73
- print('Opening genome-wide anchor to anchor...')
74
- for chr_file in os.listdir(data_dir):
75
- if 'anchor_2_anchor' in chr_file or 'denoised.anchor.to.anchor' in chr_file:
76
- print(chr_file)
77
- genome = pd.concat([genome, open_anchor_to_anchor(data_dir + '/' + chr_file)])
78
- return genome
79
-
80
-
81
- def get_chromosome_from_filename(filename):
82
- """
83
- Extract the chromosome string from any of the file name formats we use
84
-
85
- Args:
86
- filename (:obj:`str`) : name of anchor to anchor file
87
-
88
- Returns:
89
- Chromosome string of form chr<>
90
- """
91
- chr_index = filename.find('chr') # index of chromosome name
92
- if chr_index == 0: # if chromosome name is file prefix
93
- return filename[:filename.find('.')]
94
- file_ending_index = filename.rfind('.') # index of file ending
95
- if chr_index > file_ending_index: # if chromosome name is file ending
96
- return filename[chr_index:]
97
- else:
98
- return filename[chr_index: file_ending_index]
99
-
100
-
101
- def locus_to_anchor(chr_name, locus, anchor_dir):
102
- anchor_list = pd.read_csv(anchor_dir + '%s.bed' % chr_name, sep='\t',
103
- names=['chr', 'start', 'end', 'anchor']) # read anchor list file
104
- loci_indices = (anchor_list['start'] <= locus) & (locus <= anchor_list['end']) & (
105
- anchor_list['chr'] == chr_name)
106
- print(np.where(loci_indices)[0][0])
107
- return int(np.where(loci_indices)[0][0])
108
-
109
-
110
- def save_samples(input_dir, target_dir, matrix_size, multi_input=False, dir_3=None, combined_dir=None, anchor_dir=None, name='sample', chr_name='chr6', locus_start=25922605, locus_end=26709867, force_size=128, force_symmetry=True):
111
- """
112
- Saves sample matrices for use in training visualizations
113
-
114
- Args:
115
- input_dir (:obj:`str`) : directory containing input anchor to anchor files
116
- target_dir (:obj:`str`) : directory containing target anchor to anchor files
117
- matrix_size (:obj:`int`) : size of each sample matrix
118
- multi_input (:obj:`bool`) : set to True to save samples from each of the multiple input sets in ``input_dir``
119
- dir_3 (:obj:`str`) : optional directory containing third set of input anchor to anchor files
120
- combined_dir (:obj:`str`) : optional directory containing combined target anchor to anchor files
121
- anchor_dir (:obj:`str`) : directory containing anchor reference ``.bed`` files
122
- name (:obj:`str`) : each saved sample file will begin with this string
123
- chr_index (:obj:`int`) : index of chromosome to save samples from
124
- locus (:obj:`int`) : index of anchor to save samples from
125
- """
126
- global data_dir
127
- global sparse_data_dir
128
- try:
129
- os.mkdir(sparse_data_dir)
130
- except FileExistsError as e:
131
- pass
132
- if multi_input:
133
- input_folder_1 = os.listdir(input_dir)[0] + '/'
134
- input_folder_2 = os.listdir(input_dir)[1] + '/'
135
- try:
136
- input_folder_3 = os.listdir(input_dir)[2] + '/'
137
- except IndexError:
138
- pass
139
- chr_index = min(int(chr_name.replace('chr', '')), len(os.listdir(input_dir + input_folder_1)) - 1)
140
- print('Saving samples from', chr_name, '...')
141
- if (name == 'enhance' or name == 'val_enhance') and multi_input:
142
- matrix_1 = load_chr_ratio_matrix_from_sparse(input_dir + input_folder_1, os.listdir(input_dir + input_folder_1)[chr_index], anchor_dir, force_symmetry=force_symmetry)
143
- matrix_2 = load_chr_ratio_matrix_from_sparse(target_dir, os.listdir(target_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
144
- matrix_3 = None
145
- combined_matrix = None
146
- else:
147
- if multi_input:
148
- matrix_1 = load_chr_ratio_matrix_from_sparse(input_dir + input_folder_1, os.listdir(input_dir + input_folder_1)[chr_index], anchor_dir, force_symmetry=force_symmetry)
149
- matrix_2 = load_chr_ratio_matrix_from_sparse(input_dir + input_folder_2, os.listdir(input_dir + input_folder_2)[chr_index], anchor_dir, force_symmetry=force_symmetry)
150
- matrix_3 = load_chr_ratio_matrix_from_sparse(input_dir + input_folder_3, os.listdir(input_dir + input_folder_3)[chr_index], anchor_dir, force_symmetry=force_symmetry)
151
- combined_matrix = load_chr_ratio_matrix_from_sparse(target_dir, os.listdir(target_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
152
- else:
153
- matrix_1 = load_chr_ratio_matrix_from_sparse(input_dir, os.listdir(input_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
154
- matrix_2 = load_chr_ratio_matrix_from_sparse(target_dir, os.listdir(target_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
155
- if dir_3 is not None:
156
- matrix_3 = load_chr_ratio_matrix_from_sparse(dir_3, os.listdir(dir_3)[chr_index], anchor_dir, force_symmetry=force_symmetry)
157
- else:
158
- matrix_3 = None
159
- if combined_dir is not None:
160
- combined_matrix = load_chr_ratio_matrix_from_sparse(combined_dir, os.listdir(combined_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
161
- else:
162
- combined_matrix = None
163
- i = locus_to_anchor(chr_name, locus_start, anchor_dir)
164
- j = locus_to_anchor(chr_name, locus_end, anchor_dir)
165
- mid = int((i + j) / 2)
166
- i = max(0, mid - int(force_size / 2))
167
- j = i + force_size
168
- rows = slice(i, j)
169
- cols = slice(i, j)
170
- tile_1 = matrix_1[rows, cols].A
171
- tile_2 = matrix_2[rows, cols].A
172
- tile_1 = np.expand_dims(tile_1, -1) # add channel dimension
173
- tile_1 = np.expand_dims(tile_1, 0) # model expects a list of inputs
174
- tile_2 = np.expand_dims(tile_2, -1)
175
- tile_2 = np.expand_dims(tile_2, 0)
176
- if matrix_3 is not None:
177
- tile_3 = matrix_3[i:i + matrix_size, j:j + matrix_size].A
178
- tile_3 = np.expand_dims(tile_3, -1)
179
- tile_3 = np.expand_dims(tile_3, 0)
180
- np.save('%s%s_3' % (data_dir, name), tile_3)
181
- if combined_matrix is not None:
182
- combined_tile = combined_matrix[i:i + matrix_size, j:j + matrix_size].A
183
- combined_tile = np.expand_dims(combined_tile, -1)
184
- combined_tile = np.expand_dims(combined_tile, 0)
185
- np.save('%s%s_combined' % (data_dir, name), combined_tile)
186
- np.save('%s%s_1' % (data_dir, name), tile_1)
187
- np.save('%s%s_2' % (data_dir, name), tile_2)
188
-
189
-
190
- def load_chr_ratio_matrix_from_sparse(dir_name, file_name, anchor_dir, sparse_dir=None, anchor_list=None, chr_name=None, dummy=5, ignore_sparse=False, force_symmetry=True, use_raw=False):
191
- """
192
- Loads data as a sparse matrix by either reading a precompiled sparse matrix or an anchor to anchor file which is converted to sparse CSR format.
193
- Ratio values are computed using the observed (obs) and expected (exp) values:
194
-
195
- .. math::
196
- ratio = \\frac{obs + dummy}{exp + dummy}
197
-
198
- Args:
199
- dir_name (:obj:`str`) : directory containing the anchor to anchor or precompiled (.npz) sparse matrix file
200
- file_name (:obj:`str`) : name of anchor to anchor or precompiled (.npz) sparse matrix file
201
- anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
202
- dummy (:obj:`int`) : dummy value to used when computing ratio values
203
- ignore_sparse (:obj:`bool`) : set to True to ignore precompiled sparse matrices even if they exist
204
-
205
- Returns:
206
- ``scipy.sparse.csr_matrix``: sparse matrix of ratio values
207
- """
208
- global data_dir
209
- global sparse_data_dir
210
- if chr_name is None:
211
- chr_name = get_chromosome_from_filename(file_name)
212
- sparse_rep_dir = dir_name[dir_name[: -1].rfind('/') + 1:] # directory where the pre-compiled sparse matrices are saved
213
- if sparse_dir is not None:
214
- sparse_data_dir = sparse_dir
215
- os.makedirs(os.path.join(sparse_data_dir, sparse_rep_dir), exist_ok=True)
216
- if file_name.endswith('.npz'): # loading pre-combined and pre-compiled sparse data
217
- sparse_matrix = scipy.sparse.load_npz(dir_name + file_name)
218
- else: # load from file name
219
- if file_name + '.npz' in os.listdir(os.path.join(sparse_data_dir, sparse_rep_dir)) and not ignore_sparse: # check if pre-compiled data already exists
220
- sparse_matrix = scipy.sparse.load_npz(os.path.join(sparse_data_dir, sparse_rep_dir, file_name + '.npz'))
221
- else: # otherwise generate sparse matrix from anchor2anchor file and save pre-compiled data
222
- if anchor_list is None:
223
- if anchor_dir is None:
224
- assert 'You must supply either an anchor reference list or the directory containing one'
225
- anchor_list = pd.read_csv(os.path.join(anchor_dir, '%s.bed' % chr_name), sep='\t',
226
- names=['chr', 'start', 'end', 'anchor']) # read anchor list file
227
- matrix_size = len(anchor_list) # matrix size is needed to construct sparse CSR matrix
228
- anchor_dict = anchor_list_to_dict(anchor_list['anchor'].values) # convert to anchor --> index dictionary
229
- try: # first try reading anchor to anchor file as <a1> <a2> <obs> <exp>
230
- chr_anchor_file = pd.read_csv(
231
- os.path.join(dir_name, file_name),
232
- delimiter='\t',
233
- names=['anchor1', 'anchor2', 'obs', 'exp'],
234
- usecols=['anchor1', 'anchor2', 'obs', 'exp']) # read chromosome anchor to anchor file
235
- rows = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor1'].values) # convert anchor names to row indices
236
- cols = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor2'].values) # convert anchor names to column indices
237
- ratio = (chr_anchor_file['obs'] + dummy) / (chr_anchor_file['exp'] + dummy) # compute matrix ratio value
238
- sparse_matrix = scipy.sparse.csr_matrix((ratio, (rows, cols)), shape=(matrix_size, matrix_size)) # construct sparse CSR matrix
239
- except: # otherwise read anchor to anchor file as <a1> <a2> <ratio>
240
- chr_anchor_file = pd.read_csv(
241
- os.path.join(dir_name, file_name),
242
- delimiter='\t',
243
- names=['anchor1', 'anchor2', 'ratio'],
244
- usecols=['anchor1', 'anchor2', 'ratio'])
245
- rows = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor1'].values) # convert anchor names to row indices
246
- cols = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor2'].values) # convert anchor names to column indices
247
- if use_raw:
248
- sparse_matrix = scipy.sparse.csr_matrix((chr_anchor_file['obs'], (rows, cols)), shape=(
249
- matrix_size, matrix_size)) # construct sparse CSR matrix
250
- else:
251
- sparse_matrix = scipy.sparse.csr_matrix((chr_anchor_file['ratio'], (rows, cols)), shape=(matrix_size, matrix_size)) # construct sparse CSR matrix
252
- if force_symmetry:
253
- upper_sum = triu(sparse_matrix, k=1).sum()
254
- lower_sum = tril(sparse_matrix, k=-1).sum()
255
- if upper_sum == 0 or lower_sum == 0:
256
- sparse_matrix = sparse_matrix + sparse_matrix.transpose()
257
- sparse_triu = scipy.sparse.triu(sparse_matrix)
258
- sparse_matrix = sparse_triu + sparse_triu.transpose()
259
- if not ignore_sparse:
260
- scipy.sparse.save_npz(os.path.join(sparse_data_dir, sparse_rep_dir, file_name), sparse_matrix) # save precompiled data
261
- return sparse_matrix
262
-
263
-
264
- def split_matrix(input_filename,
265
- input_matrix,
266
- target_matrix,
267
- input_batch,
268
- target_batch,
269
- matrix_size,
270
- step_size,
271
- batch_size,
272
- n_matrices,
273
- start_index,
274
- normalize,
275
- shuffle,
276
- random_steps,
277
- diagonal_only,
278
- upper_triangular_only):
279
- """
280
- Generator function to split input and target sparse matrices into patches which are used for training and prediction.
281
-
282
- Args:
283
- input_filename (:obj:`str`): name of file which is being used to generate ratio matrix patches
284
- input_matrix (:obj:`scipy.sparse.csr_matrix`) : sparse CSR input matrix
285
- target_matrix (:obj:`scipy.sparse.csr_matrix`) : sparse CSR target matrix
286
- input_batch (:obj:`numpy.array`) : current array of samples in the input batch being generated
287
- target_batch (:obj:`numpy.array`) : current array of samples in the target batch being generated
288
- matrix_size (:obj:`int`) : size of each patch
289
- step_size (:obj:`int`) : size of steps used when generating batches. Values less than ``matrix size`` will include overlapping regions
290
- batch_size (:obj:`int`) : number of patches to use in each batch
291
- n_matrices (:obj:`int`) : current number of matrix patches in the batch being generated
292
- start_index (:obj:`int`) : starting anchor index of the matrix splitting, ensures batches are not identical across epochs
293
- normalize (:obj:`bool`) : set to True to normalize all ratio values between ``[0, 1]``
294
- shuffle (:obj:`bool`) : set to True to randomly split the matrix instead of sliding across sequentially
295
- random_steps (:obj:`bool`) : set to True add a random offset to each step between patch indices
296
- diagonal_only (:obj:`bool`) : set to True to only generate patches along the diagonal of the matrix
297
-
298
- Returns:
299
- (``numpy.array``, ``numpy.array``, ``str``): input batch, target batch, and batch label
300
- """
301
- if matrix_size == -1:
302
- input_matrix = np.expand_dims(np.expand_dims(input_matrix.A, 0), -1)
303
- target_matrix = np.expand_dims(np.expand_dims(target_matrix.A, 0), -1)
304
- yield input_matrix, target_matrix, input_filename + '_full_chr'
305
- else:
306
- if random_steps: # random offset from step size intervals
307
- start_index = np.random.randint(0, step_size)
308
- row_indices = np.arange(start_index, input_matrix.shape[0], step_size)
309
- col_indices = np.arange(start_index, input_matrix.shape[1], step_size)
310
- if shuffle: # shuffle slicing indices
311
- np.random.shuffle(row_indices)
312
- np.random.shuffle(col_indices)
313
- for i in row_indices:
314
- for j in col_indices:
315
- if abs(i - j) > 384: # max distance from diagonal with actual values
316
- continue
317
- if diagonal_only and i != j:
318
- continue
319
- if upper_triangular_only and i < j:
320
- continue
321
- input_tile = input_matrix[i:i + matrix_size, j:j + matrix_size].A
322
- target_tile = target_matrix[i:i + matrix_size, j:j + matrix_size].A
323
- #input_tile = np.expand_dims(input_tile, axis=-1)
324
- #target_tile = np.expand_dims(target_tile, axis=-1)
325
- input_batch.append(input_tile)
326
- target_batch.append(target_tile)
327
- n_matrices += 1
328
- if n_matrices == batch_size:
329
- try:
330
- input_batch = np.reshape(np.array(input_batch), (n_matrices, matrix_size, matrix_size, 1))
331
- target_batch = np.reshape(np.array(target_batch), (n_matrices, matrix_size, matrix_size, 1))
332
- if normalize:
333
- input_batch = normalize_matrix(input_batch)
334
- target_batch = normalize_matrix(target_batch)
335
-
336
- yield input_batch, target_batch, input_filename + '_' + str(i)
337
- except ValueError as e: # reached end of valid values
338
- input_batch = []
339
- target_batch = []
340
- n_matrices = 0
341
- pass
342
- input_batch = []
343
- target_batch = []
344
- n_matrices = 0
345
-
346
-
347
-
348
- def generate_batches_from_chr(input_dir,
349
- target_dir,
350
- matrix_size,
351
- batch_size,
352
- anchor_dir=None,
353
- step_size=64,
354
- multi_input=False,
355
- shuffle=False,
356
- random_steps=False,
357
- normalize=False,
358
- diagonal_only=False,
359
- upper_triangular_only=False,
360
- force_symmetry=True,
361
- ignore_XY=True,
362
- ignore_even_chr=False,
363
- ignore_odd_chr=False):
364
- """
365
- Generator function which generates batches of input target pairs to train the model:
366
-
367
- .. code-block:: python
368
- :linenos:
369
-
370
- for epoch_i in range(epochs):
371
- for input_batch, target_batch, batch_label in generate_batches_from_chr(input_dir,
372
- target_dir,
373
- matrix_size=128,
374
- batch_size=64,
375
- step_size=64,
376
- shuffle=True,
377
- random_steps=True,
378
- anchor_dir=anchor_dir):
379
- step_start_time = time.time()
380
- loss = model.train_on_batch(noisy_batch, target_batch)
381
- print("%d-%d %ds [Loss: %.3f][PSNR: %.3f, Jaccard: %.3f]" %
382
- (epoch_i,
383
- step_i,
384
- time.time() - step_start_time,
385
- loss[0],
386
- loss[1],
387
- loss[2]
388
- ))
389
- step_i += 1
390
-
391
- Args:
392
- input_dir (:obj:`str`) : directory containing all input data to be generated
393
- target_dir (:obj:`str`) : directory containing all target data to be generated
394
- matrix_size (:obj:`int`) : size of each patch that the full ratio matrix is divided into
395
- batch_size (:obj:`int`) : number of patches to use in each batch
396
- anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
397
- step_size (:obj:`int`) : size of steps used when generating batches. Values less than ``matrix size`` will include overlapping regions
398
- multi_input (:obj:`bool`) : set to True to save samples from each of the multiple input sets in ``input_dir``
399
- shuffle (:obj:`bool`) : set to True to randomly split the matrix instead of sliding across sequentially
400
- random_steps (:obj:`bool`) : set to True add a random offset to each step between patch indices
401
- diagonal_only (:obj:`bool`) : set to True to only generate patches along the diagonal of the matrix
402
- normalize (:obj:`bool`) : set to True to normalize all ratio values between ``[0, 1]``
403
- ignore_XY (:obj:`bool`) : set to True to ignore chromosomes X and Y when generating batches
404
- ignore_even_chr (:obj:`bool`) : set to True to ignore all even numbered chromosomes
405
- ignore_odd_chr (:obj:`bool`) : set to True to ignore all odd numbered chromosomes
406
-
407
- Returns:
408
- (``numpy.array``, ``numpy.array``, ``str``): input batch, target batch, and batch label
409
- """
410
- input_batch = []
411
- target_batch = []
412
- if multi_input:
413
- input_folders = os.listdir(input_dir) # get list of all folders in input dir
414
- input_files = sorted(os.listdir(input_dir + input_folders[0])) # get list of input files (assume all inputs have same name pattern)
415
- target_files = sorted(os.listdir(target_dir))
416
- '''
417
- # remove duplicates of chromosomes
418
- tmp = []
419
- for f in input_files:
420
- if '.p_val' in f and f.replace('.p_val', '') in input_files:
421
- tmp.append(f.replace('.p_val', ''))
422
- if len(tmp) > 0:
423
- input_files = tmp
424
- print(input_files)
425
- '''
426
- else:
427
- input_files = sorted(os.listdir(input_dir))
428
- target_files = sorted(os.listdir(target_dir))
429
-
430
- if shuffle: # shuffle chromosome file order
431
- c = list(zip(input_files, target_files))
432
- random.shuffle(c)
433
- input_files, target_files = zip(*c)
434
-
435
- if ignore_XY:
436
- remove_XY = lambda files: [f for f in files if 'chrX' not in f and 'chrY' not in f]
437
- input_files = remove_XY(input_files)
438
- target_files = remove_XY(target_files)
439
-
440
- if ignore_odd_chr:
441
- # fun one-liner to remove all odd-numbered chromosomes
442
- remove_odds = lambda files: [f for f in files if f[f.index('chr') + 3:f.index('.matrix')].isdigit() and int(f[f.index('chr') + 3:f.index('.matrix')]) % 2 == 0]
443
- input_files = remove_odds(input_files)
444
- target_files = remove_odds(target_files)
445
- elif ignore_even_chr:
446
- remove_evens = lambda files: [f for f in files if f[f.index('chr') + 3:f.index('.matrix')].isdigit() and int(f[f.index('chr') + 3:f.index('.matrix')]) % 2 != 0]
447
- input_files = remove_evens(input_files)
448
- target_files = remove_evens(target_files)
449
-
450
- for input_file, target_file in zip(input_files, target_files):
451
- n_matrices = 0
452
- start_index = 0
453
- if multi_input:
454
- target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, target_file, anchor_dir, force_symmetry=force_symmetry)
455
- for input_folder in input_folders:
456
- input_folder += '/'
457
- input_matrix = load_chr_ratio_matrix_from_sparse(input_dir + input_folder, input_file, anchor_dir, force_symmetry=force_symmetry)
458
- for input_batch, target_batch, figure_title in split_matrix(input_filename=input_folder + input_file,
459
- input_matrix=input_matrix,
460
- target_matrix=target_matrix,
461
- input_batch=input_batch,
462
- target_batch=target_batch,
463
- matrix_size=matrix_size,
464
- step_size=step_size,
465
- batch_size=batch_size,
466
- n_matrices=n_matrices,
467
- start_index=start_index,
468
- normalize=normalize,
469
- shuffle=shuffle,
470
- random_steps=random_steps,
471
- diagonal_only=diagonal_only,
472
- upper_triangular_only=upper_triangular_only):
473
- yield input_batch, target_batch, figure_title
474
- input_batch = []
475
- target_batch = []
476
- n_matrices = 0
477
- else:
478
- input_matrix = load_chr_ratio_matrix_from_sparse(input_dir, input_file, anchor_dir, force_symmetry=force_symmetry)
479
- target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, target_file, anchor_dir, force_symmetry=force_symmetry)
480
- for input_batch, target_batch, figure_title in split_matrix(input_filename=input_file,
481
- input_matrix=input_matrix,
482
- target_matrix=target_matrix,
483
- input_batch=input_batch,
484
- target_batch=target_batch,
485
- matrix_size=matrix_size,
486
- step_size=step_size,
487
- batch_size=batch_size,
488
- n_matrices=n_matrices,
489
- start_index=start_index,
490
- normalize=normalize,
491
- shuffle=shuffle,
492
- random_steps=random_steps,
493
- diagonal_only=diagonal_only,
494
- upper_triangular_only=upper_triangular_only):
495
- yield input_batch, target_batch, figure_title
496
- input_batch = []
497
- target_batch = []
498
- n_matrices = 0
499
-
500
-
501
def get_matrices_from_loci(input_dir,
                           target_dir,
                           matrix_size,
                           loci,
                           anchor_dir=None):
    """
    Generator function for getting sample matrices at specific loci

    Args:
        input_dir (:obj:`str`) : directory containing all input data to be generated
        target_dir (:obj:`str`) : directory containing all target data to be generated
        matrix_size (:obj:`int`) : size of each patch that the full ratio matrix is divided into
        loci (:obj:`dict`) : dictionary mapping chromosome names to a genomic locus
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files

    Returns:
        (``numpy.array``, ``numpy.array``, ``str``, ``int``, ``int``): input matrix, target matrix, chromosome name, locus, and anchor index
    """
    # Pair input/target chromosome files by natural-sort order -- assumes both
    # directories hold matching sets of chromosome files; TODO confirm with callers
    input_files = sorted_nicely(os.listdir(input_dir))
    target_files = sorted_nicely(os.listdir(target_dir))

    for file_1, file_2 in zip(input_files, target_files):
        chr_name = get_chromosome_from_filename(file_1)
        if chr_name in loci.keys():
            anchor_list = pd.read_csv(anchor_dir + '%s.bed' % chr_name, sep='\t',
                                      names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
        else:
            continue  # no requested locus on this chromosome
        input_matrix = load_chr_ratio_matrix_from_sparse(input_dir, file_1, anchor_dir)
        target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, file_2, anchor_dir)

        # Boolean mask over anchors: True where the anchor interval [start, end]
        # contains the requested locus for this chromosome
        loci_indices = (anchor_list['start'] <= loci[chr_name]) & (loci[chr_name] <= anchor_list['end']) & (anchor_list['chr'] == chr_name)

        for i, locus in enumerate(loci_indices):
            if locus:
                # Square patch starting at the matching anchor index; .A densifies the sparse tile
                input_tile = input_matrix[i:i + matrix_size, i:i + matrix_size].A
                target_tile = target_matrix[i:i + matrix_size, i:i + matrix_size].A
                # Add channel then batch dimensions -> shape (1, matrix_size, matrix_size, 1)
                input_tile = np.expand_dims(input_tile, axis=-1)
                target_tile = np.expand_dims(target_tile, axis=-1)
                input_tile = np.expand_dims(input_tile, axis=0)
                target_tile = np.expand_dims(target_tile, axis=0)

                yield input_tile, target_tile, chr_name, loci[chr_name], i
544
-
545
-
546
def get_top_loops(matrix_data_dir, reference_dir, num_top_loops=None, q=None, dummy=5):
    """
    Ranks the ratio values of all chromosomes and computes the cutoff value for taking the top ``num_top_loops`` or the ``q`` th quantile

    Args:
        matrix_data_dir (:obj:`str`) : directory containing the anchor to anchor files used to count loops
        reference_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        num_top_loops (:obj:`int`) : number of top loops to consider (used when ``q`` is None)
        q (:obj:`float`) : quantile of loops to consider (takes precedence over ``num_top_loops``)
        dummy (:obj:`int`) : dummy value to use to calculate each ratio value

    Returns:
        ``float`` : cutoff value for top loops
    """
    global data_dir  # module-level cache directory -- assumed set before this is called; TODO confirm
    # Cutoff values are memoized on disk so repeated calls skip the full genome scan
    if 'top_loop_values.pickle' in os.listdir(data_dir):
        with open(data_dir + 'top_loop_values.pickle', 'rb') as handle:
            top_loop_values = pickle.load(handle)
    else:
        top_loop_values = {}
    if q is not None:  # select top loops based on quantile not quantity
        # Cache key combines the data directory and the quantile value
        if matrix_data_dir + str(q) in top_loop_values.keys():
            genome_min_loop_value = top_loop_values[matrix_data_dir + str(q)]
        else:
            top_loops = np.array([])
            for file in os.listdir(matrix_data_dir):
                sparse = load_chr_ratio_matrix_from_sparse(matrix_data_dir, file, reference_dir, dummy=dummy)
                # Keep only the upper triangle so each loop is counted once
                sparse = scipy.sparse.triu(sparse)
                nonzero_indices = sparse.nonzero()
                top_loops = np.append(top_loops, sparse.tocsr()[nonzero_indices].A)
            genome_min_loop_value = np.quantile(top_loops, q=q)
            top_loop_values[matrix_data_dir + str(q)] = genome_min_loop_value
            print('%s %.4f quantile loops cutoff value: %f' % (matrix_data_dir, q, genome_min_loop_value))
    else:  # select top loops based on rank
        if matrix_data_dir + str(num_top_loops) in top_loop_values.keys():
            genome_min_loop_value = top_loop_values[matrix_data_dir + str(num_top_loops)]
        else:
            top_loops = np.array([])
            for file in os.listdir(matrix_data_dir):
                sparse = load_chr_ratio_matrix_from_sparse(matrix_data_dir, file, reference_dir, dummy=dummy)
                sparse = scipy.sparse.triu(sparse)
                # Merge this chromosome's values with the running top list, then keep
                # only the strongest num_top_loops values (sorted descending)
                loop_list = np.append(top_loops, sparse.data)
                top_loops = loop_list[np.argsort(-loop_list)[:num_top_loops]]
            # Smallest value that still makes the top-N list is the cutoff
            genome_min_loop_value = top_loops[-1]
            top_loop_values[matrix_data_dir + str(num_top_loops)] = genome_min_loop_value
            print('%s top %d loops cutoff value: %f' % (matrix_data_dir, num_top_loops, genome_min_loop_value))
    # NOTE(review): the diff view loses this statement's indent level; persisting the
    # cache after every call (even cache hits) is harmless but re-writes an unchanged
    # file -- confirm intended placement against the original source
    with open(data_dir + 'top_loop_values.pickle', 'wb') as handle:
        pickle.dump(top_loop_values, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return genome_min_loop_value
596
-
597
-
598
def anchor_list_to_dict(anchors):
    """
    Build a lookup table from anchor name to chromosomal index.

    Args:
        anchors (:obj:`numpy.array`) : array of anchor name values

    Returns:
        `dict` : dictionary mapping each anchor to its index from the array
    """
    return {anchor: index for index, anchor in enumerate(anchors)}
612
-
613
-
614
def anchor_to_locus(anchor_dict):
    """
    Create a vectorizable lookup function from anchor name to chromosomal index.

    Args:
        anchor_dict (:obj:`dict`) : dictionary mapping each anchor to its chromosomal index

    Returns:
        `function` : function which returns the locus of an anchor name
    """
    def lookup(anchor):
        return anchor_dict[anchor]

    return lookup
627
-
628
-
629
def sorted_nicely(l):
    """
    Sort an iterable in natural (file-system) order, so ``chr2`` precedes ``chr10``.

    Args:
        l (:obj:`iterable`) : iterable object containing items which can be interpreted as text

    Returns:
        `iterable` : sorted iterable
    """
    def natural_key(item):
        # Split into digit / non-digit runs; digit runs compare numerically
        return [int(chunk) if chunk.isdigit() else chunk
                for chunk in re.split('([0-9]+)', item)]

    return sorted(l, key=natural_key)
641
-
642
-
643
def normalize_matrix(matrix):
    """
    Normalize ratio values between ``[0, 1]`` using the following function:

    .. math::
        f(x) = 1 - \\frac{1}{1 + x}

    .. image:: _static/normalization_function_plot.PNG
        :scale: 100 %
        :align: center

    Args:
        matrix (:obj:`numpy.array`) : matrix of ratio values

    Returns:
        ``numpy.array`` : matrix of normalized ratio values between ``[0, 1]``
    """
    shifted = 1 + matrix
    return 1 - (1 / shifted)
661
-
662
-
663
def denormalize_matrix(matrix):
    """
    Reverse the normalization of a matrix to set all valid normalized values back to their original ratio values using the following function:

    .. math::

        f^{-1}(x) = \\frac{1}{1 - g(x)} - 1 &\\quad \\mbox{where} &\\quad g(x) = \\begin{cases} 0.98, & \\mbox{if } x > 1 \\\\ 0, & \\mbox{if } x < 0 \\\\ x & \\mbox{ otherwise} \\end{cases}

    We apply the function :math:`g(x)` to remove invalid values that could be in a predicted result and because :math:`f^{-1}(x)` blows up as we approach 1:

    .. image:: _static/denormalization_function_plot.PNG
        :scale: 100 %
        :align: center

    Args:
        matrix (:obj:`numpy.array`) : matrix of normalized ratio values

    Returns:
        ``numpy.array`` : matrix of ratio values
    """
    # Work on a float copy: the previous implementation clamped the caller's array
    # in place (and silently truncated 0.98 to 0 on integer arrays)
    clamped = np.array(matrix, dtype=float, copy=True)
    clamped[clamped > 1] = 0.98  # cap invalid values; f^-1 diverges as x -> 1
    clamped[clamped < 0] = 0     # floor invalid negative predictions
    return (1 / (1 - clamped)) - 1
686
-
687
-
688
def draw_heatmap(matrix, color_scale, ax=None, return_image=False):
    """
    Display ratio heatmap containing only strong signals (values > 1 or the 0.95 quantile when auto-scaling)

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix to be displayed
        color_scale (:obj:`int`) : max ratio value to be considered strongest by color mapping; pass 0 to derive the scale from the matrix itself
        ax (:obj:`matplotlib.axes.Axes`) : axes which will contain the heatmap. If None, new axes are created
        return_image (:obj:`bool`) : set to True to return the image obtained from drawing the heatmap with the generated color map

    Returns:
        ``numpy.array`` : if ``return_image`` is set to True, return the heatmap as an array
    """
    # Compute the color-bin boundaries for BoundaryNorm; everything at or below the
    # lowest edge (1.001) renders as white, so only ratios > 1 get color
    if color_scale != 0:
        # Caller-supplied fixed upper bound for the ramp
        breaks = np.append(np.arange(1.001, color_scale, (color_scale - 1.001) / 18), np.max(matrix))
    elif np.max(matrix) < 2:
        # Weak matrix: spread the bins between 1 and the matrix maximum
        breaks = np.arange(1.001, np.max(matrix), (np.max(matrix) - 1.001) / 19)
    else:
        # Auto-scale against the 0.95 quantile so a few outliers don't flatten the ramp
        step = (np.quantile(matrix, q=0.95) - 1) / 18
        up = np.quantile(matrix, q=0.95) + 0.011
        if up < 2:
            # Enforce a minimum upper bound of 2 so the ramp keeps useful resolution
            up = 2
            step = 0.999 / 18
        breaks = np.append(np.arange(1.001, up, step), np.max(matrix))

    n_bin = 20  # Discretizes the interpolation into bins
    # White-to-red ramp (19 colors, one per interval between the 20 boundaries)
    colors = ["#FFFFFF", "#FFE4E4", "#FFD7D7", "#FFC9C9", "#FFBCBC", "#FFAEAE", "#FFA1A1", "#FF9494", "#FF8686",
              "#FF7979", "#FF6B6B", "#FF5E5E", "#FF5151", "#FF4343", "#FF3636", "#FF2828", "#FF1B1B", "#FF0D0D",
              "#FF0000"]
    cmap_name = 'my_list'
    # Create the colormap
    cm = matplotlib.colors.LinearSegmentedColormap.from_list(
        cmap_name, colors, N=n_bin)
    norm = matplotlib.colors.BoundaryNorm(breaks, 20)
    # Fewer bins will result in "coarser" colomap interpolation
    if ax is None:
        _, ax = plt.subplots()
    img = ax.imshow(matrix, cmap=cm, norm=norm, interpolation='nearest')
    if return_image:
        # Close the figure so headless/batch callers don't accumulate open figures
        plt.close()
        return img.get_array()
729
-
730
-
731
def get_heatmap(matrix, color_scale):
    """
    Render a ratio matrix to an RGBA image using the same white-to-red ramp as ``draw_heatmap``,
    with pixels whose ratio value is <= 1.2 made fully transparent.

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix to be rendered
        color_scale (:obj:`int`) : max ratio value mapped to the strongest color; pass 0 to derive the scale from the matrix itself

    Returns:
        ``numpy.array`` : RGBA image array of the same height/width as ``matrix``
    """
    # Compute the color-bin boundaries; note this variant auto-scales against the
    # 0.98 quantile, whereas draw_heatmap uses 0.95 -- intentional? TODO confirm
    if color_scale != 0:
        breaks = np.append(np.arange(1.001, color_scale, (color_scale - 1.001) / 18), np.max(matrix))
    elif np.max(matrix) < 2:
        breaks = np.arange(1.001, np.max(matrix), (np.max(matrix) - 1.001) / 19)
    else:
        step = (np.quantile(matrix, q=0.98) - 1) / 18
        up = np.quantile(matrix, q=0.98) + 0.011
        if up < 2:
            # Enforce a minimum upper bound of 2 so the ramp keeps useful resolution
            up = 2
            step = 0.999 / 18
        breaks = np.append(np.arange(1.001, up, step), np.max(matrix))

    n_bin = 20  # Discretizes the interpolation into bins
    # White-to-red ramp (19 colors, one per interval between the 20 boundaries)
    colors = ["#FFFFFF", "#FFE4E4", "#FFD7D7", "#FFC9C9", "#FFBCBC", "#FFAEAE", "#FFA1A1", "#FF9494", "#FF8686",
              "#FF7979", "#FF6B6B", "#FF5E5E", "#FF5151", "#FF4343", "#FF3636", "#FF2828", "#FF1B1B", "#FF0D0D",
              "#FF0000"]
    cmap_name = 'my_list'
    # Create the colormap
    cm = matplotlib.colors.LinearSegmentedColormap.from_list(
        cmap_name, colors, N=n_bin)
    norm = matplotlib.colors.BoundaryNorm(breaks, 20)
    # Fewer bins will result in "coarser" colomap interpolation
    m = matplotlib.cm.ScalarMappable(norm=norm, cmap=cm)
    heatmap = m.to_rgba(matrix)
    # Use the alpha channel to hide weak signals: only values > 1.2 stay opaque
    mask = matrix > 1.2
    heatmap[..., -1] = np.ones_like(mask) * mask
    return heatmap
759
-
760
-
761
def save_images_to_video(output_name, out_dir, image_folder='images', fps=29.94, hold_frames=150):
    """
    Saves all training visualization images to a video file

    Args:
        output_name (:obj:`str`) : filename (without extension) for the saved ``.avi`` video file
        out_dir (:obj:`str`) : directory where the video file is written
        image_folder (:obj:`str`) : directory containing the ``.png`` frames to stitch together
        fps (:obj:`float`) : frame rate of the output video
        hold_frames (:obj:`int`) : number of times the final frame is repeated so the video ends on a still
    """
    video_name = out_dir + output_name + '.avi'

    # Frames are written in lexicographic filename order
    images = [img for img in sorted(os.listdir(image_folder)) if img.endswith(".png")]
    if not images:
        # Nothing to encode; previously this crashed with an IndexError
        return
    frame = cv2.imread(os.path.join(image_folder, images[0]))
    height, width, _ = frame.shape  # all frames assumed to share this size -- TODO confirm

    video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), fps, (width, height))
    try:
        for image in images:
            video.write(cv2.imread(os.path.join(image_folder, image)))

        # Repeat the last frame so the final state stays on screen for a few seconds
        last_frame = cv2.imread(os.path.join(image_folder, images[-1]))
        for _ in range(hold_frames):
            video.write(last_frame)
    finally:
        # Always finalize the container, even if a frame write raises
        video.release()
786
-
787
-
788
def get_model_memory_usage(batch_size, model):
    """
    Estimates the amount of memory required to train the model using the current batch size.

    Args:
        batch_size (:obj:`int`) : number of training samples in each batch
        model (:obj:`keras.models.Model`) : uncompiled Keras model to be trained

    Returns:
        ``float`` : estimated memory usage in GB
    """
    # Number of activation values produced per sample, summed over all layer outputs
    shapes_mem_count = 0
    for layer in model.layers:
        layer_output_size = 1
        for dim in layer.output_shape:
            if dim is None:  # skip the unknown batch dimension
                continue
            layer_output_size *= dim
        shapes_mem_count += layer_output_size

    # Parameter counts; set() de-duplicates shared weight tensors
    trainable_count = np.sum([K.count_params(w) for w in set(model.trainable_weights)])
    non_trainable_count = np.sum([K.count_params(w) for w in set(model.non_trainable_weights)])

    # Bytes per number follows the backend float precision (float32 by default)
    number_size = {'float16': 2.0, 'float64': 8.0}.get(K.floatx(), 4.0)

    total_memory = number_size * (batch_size * shapes_mem_count + trainable_count + non_trainable_count)
    return np.round(total_memory / (1024.0 ** 3), 3)