astirn committed on
Commit
79e1e8e
·
1 Parent(s): ecb653b

cleanup and bug fixes

Browse files
Files changed (1) hide show
  1. tiger.py +15 -14
tiger.py CHANGED
@@ -20,8 +20,7 @@ SEQ_COL = 'Sequence'
20
  TARGET_COL = 'Target Sequence'
21
  GUIDE_COL = 'Guide Sequence'
22
  SCORE_COL = 'Guide Score'
23
- RUN_MODE_ALL_PM = 'All on-target guides per transcript'
24
- RUN_MODE_TITRATION = 'Top guides per transcript'
25
  REFERENCE_TRANSCRIPTS = ('gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz')
26
  BATCH_SIZE_COMPUTE = 500
27
  BATCH_SIZE_SCAN = 20
@@ -167,10 +166,10 @@ def get_on_target_predictions(transcripts: pd.DataFrame, model: tf.keras.Model,
167
  # progress update
168
  percent_complete = 100 * min((i + 1) / len(transcripts), 1)
169
  update_text = 'Evaluating on-target guides for each transcript: {:.2f}%'.format(percent_complete)
 
170
  if status_bar:
171
  status_text.text()
172
- status_bar.progress(int(100 * min((i + 1) / len(transcripts), 1)))
173
- print('\r' + update_text, end='')
174
  print('')
175
 
176
  return predictions
@@ -178,6 +177,7 @@ def get_on_target_predictions(transcripts: pd.DataFrame, model: tf.keras.Model,
178
 
179
  def top_guides_per_transcript(predictions: pd.DataFrame):
180
 
 
181
  top_guides = pd.DataFrame()
182
  for transcript in predictions[ID_COL].unique():
183
  df = predictions.loc[predictions[ID_COL] == transcript]
@@ -193,7 +193,7 @@ def find_off_targets(top_guides: pd.DataFrame, status_bar, status_text):
193
  reference_transcripts = load_transcripts([os.path.join('transcripts', f) for f in REFERENCE_TRANSCRIPTS])
194
 
195
  # one-hot encode guides to form a filter
196
- guide_filter = one_hot_encode_sequence(sequence_complement(top_guides['Guide']), add_context_padding=False)
197
  guide_filter = tf.transpose(guide_filter, [1, 2, 0])
198
 
199
  # loop over transcripts in batches
@@ -241,10 +241,12 @@ def find_off_targets(top_guides: pd.DataFrame, status_bar, status_text):
241
  off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
242
 
243
  # progress update
 
 
 
244
  if status_bar:
245
- status_text.text("Scanning for off-targets Percent complete: {:.2f}%".format(int(100 * min(i / len(reference_transcripts), 1))))
246
- status_bar.progress(int(100 * min(i / len(reference_transcripts), 1)))
247
- print('\rPercent complete: {:.2f}%'.format(100 * min(i / len(reference_transcripts), 1)), end='')
248
  print('')
249
 
250
  return off_targets
@@ -264,7 +266,7 @@ def predict_off_target(off_targets: pd.DataFrame, model: tf.keras.Model):
264
  return off_targets.sort_values(SCORE_COL)
265
 
266
 
267
- def tiger_exhibit(transcripts: pd.DataFrame, run_mode: str, check_off_targets: bool, status_bar=None, status_text=None):
268
 
269
  # load model
270
  if os.path.exists('model'):
@@ -279,10 +281,10 @@ def tiger_exhibit(transcripts: pd.DataFrame, run_mode: str, check_off_targets: b
279
  # initialize other outputs
280
  off_target_predictions = pd.DataFrame()
281
 
282
- if run_mode == RUN_MODE_ALL_PM:
283
  return on_target_predictions, off_target_predictions
284
 
285
- elif run_mode == RUN_MODE_TITRATION: # TODO: and titration candidates
286
  on_target_predictions = top_guides_per_transcript(on_target_predictions)
287
 
288
  else:
@@ -305,6 +307,7 @@ if __name__ == '__main__':
305
 
306
  # common arguments
307
  parser = argparse.ArgumentParser()
 
308
  parser.add_argument('--check_off_targets', action='store_true', default=False)
309
  parser.add_argument('--fasta_path', type=str, default=None)
310
  args = parser.parse_args()
@@ -334,9 +337,7 @@ if __name__ == '__main__':
334
 
335
  # run batch
336
  idx_stop = min(idx + BATCH_SIZE_TRANSCRIPTS, len(df_transcripts))
337
- df_on_target, df_off_target = tiger_exhibit(df_transcripts[idx:idx_stop],
338
- run_mode=RUN_MODE_TITRATION,
339
- check_off_targets=args.check_off_targets)
340
 
341
  # save batch results
342
  df_on_target.to_csv('on_target.csv', header=batch == 1, index=False, mode='a')
 
20
  TARGET_COL = 'Target Sequence'
21
  GUIDE_COL = 'Guide Sequence'
22
  SCORE_COL = 'Guide Score'
23
+ RUN_MODES = dict(all='All on-target guides per transcript', titration='Top guides per transcript')
 
24
  REFERENCE_TRANSCRIPTS = ('gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz')
25
  BATCH_SIZE_COMPUTE = 500
26
  BATCH_SIZE_SCAN = 20
 
166
  # progress update
167
  percent_complete = 100 * min((i + 1) / len(transcripts), 1)
168
  update_text = 'Evaluating on-target guides for each transcript: {:.2f}%'.format(percent_complete)
169
+ print('\r' + update_text, end='')
170
  if status_bar:
171
  status_text.text()
172
+ status_bar.progress(percent_complete)
 
173
  print('')
174
 
175
  return predictions
 
177
 
178
  def top_guides_per_transcript(predictions: pd.DataFrame):
179
 
180
+ # select and sort top guides for each transcript
181
  top_guides = pd.DataFrame()
182
  for transcript in predictions[ID_COL].unique():
183
  df = predictions.loc[predictions[ID_COL] == transcript]
 
193
  reference_transcripts = load_transcripts([os.path.join('transcripts', f) for f in REFERENCE_TRANSCRIPTS])
194
 
195
  # one-hot encode guides to form a filter
196
+ guide_filter = one_hot_encode_sequence(sequence_complement(top_guides[GUIDE_COL]), add_context_padding=False)
197
  guide_filter = tf.transpose(guide_filter, [1, 2, 0])
198
 
199
  # loop over transcripts in batches
 
241
  off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
242
 
243
  # progress update
244
+ percent_complete = 100 * min((i + 1) / len(reference_transcripts), 1)
245
+ update_text = 'Scanning for off-targets: {:.2f}%'.format(percent_complete)
246
+ print('\r' + update_text, end='')
247
  if status_bar:
248
+ status_text.text()
249
+ status_bar.progress(percent_complete)
 
250
  print('')
251
 
252
  return off_targets
 
266
  return off_targets.sort_values(SCORE_COL)
267
 
268
 
269
+ def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool, status_bar=None, status_text=None):
270
 
271
  # load model
272
  if os.path.exists('model'):
 
281
  # initialize other outputs
282
  off_target_predictions = pd.DataFrame()
283
 
284
+ if mode == 'all':
285
  return on_target_predictions, off_target_predictions
286
 
287
+ elif mode == 'titration': # TODO: and titration candidates
288
  on_target_predictions = top_guides_per_transcript(on_target_predictions)
289
 
290
  else:
 
307
 
308
  # common arguments
309
  parser = argparse.ArgumentParser()
310
+ parser.add_argument('--mode', type=str, default='titration')
311
  parser.add_argument('--check_off_targets', action='store_true', default=False)
312
  parser.add_argument('--fasta_path', type=str, default=None)
313
  args = parser.parse_args()
 
337
 
338
  # run batch
339
  idx_stop = min(idx + BATCH_SIZE_TRANSCRIPTS, len(df_transcripts))
340
+ df_on_target, df_off_target = tiger_exhibit(df_transcripts[idx:idx_stop], args.mode, args.check_off_targets)
 
 
341
 
342
  # save batch results
343
  df_on_target.to_csv('on_target.csv', header=batch == 1, index=False, mode='a')