Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
cleanup and bug fixes
Browse files
tiger.py
CHANGED
|
@@ -20,8 +20,7 @@ SEQ_COL = 'Sequence'
|
|
| 20 |
TARGET_COL = 'Target Sequence'
|
| 21 |
GUIDE_COL = 'Guide Sequence'
|
| 22 |
SCORE_COL = 'Guide Score'
|
| 23 |
-
|
| 24 |
-
RUN_MODE_TITRATION = 'Top guides per transcript'
|
| 25 |
REFERENCE_TRANSCRIPTS = ('gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz')
|
| 26 |
BATCH_SIZE_COMPUTE = 500
|
| 27 |
BATCH_SIZE_SCAN = 20
|
|
@@ -167,10 +166,10 @@ def get_on_target_predictions(transcripts: pd.DataFrame, model: tf.keras.Model,
|
|
| 167 |
# progress update
|
| 168 |
percent_complete = 100 * min((i + 1) / len(transcripts), 1)
|
| 169 |
update_text = 'Evaluating on-target guides for each transcript: {:.2f}%'.format(percent_complete)
|
|
|
|
| 170 |
if status_bar:
|
| 171 |
status_text.text()
|
| 172 |
-
status_bar.progress(
|
| 173 |
-
print('\r' + update_text, end='')
|
| 174 |
print('')
|
| 175 |
|
| 176 |
return predictions
|
|
@@ -178,6 +177,7 @@ def get_on_target_predictions(transcripts: pd.DataFrame, model: tf.keras.Model,
|
|
| 178 |
|
| 179 |
def top_guides_per_transcript(predictions: pd.DataFrame):
|
| 180 |
|
|
|
|
| 181 |
top_guides = pd.DataFrame()
|
| 182 |
for transcript in predictions[ID_COL].unique():
|
| 183 |
df = predictions.loc[predictions[ID_COL] == transcript]
|
|
@@ -193,7 +193,7 @@ def find_off_targets(top_guides: pd.DataFrame, status_bar, status_text):
|
|
| 193 |
reference_transcripts = load_transcripts([os.path.join('transcripts', f) for f in REFERENCE_TRANSCRIPTS])
|
| 194 |
|
| 195 |
# one-hot encode guides to form a filter
|
| 196 |
-
guide_filter = one_hot_encode_sequence(sequence_complement(top_guides[
|
| 197 |
guide_filter = tf.transpose(guide_filter, [1, 2, 0])
|
| 198 |
|
| 199 |
# loop over transcripts in batches
|
|
@@ -241,10 +241,12 @@ def find_off_targets(top_guides: pd.DataFrame, status_bar, status_text):
|
|
| 241 |
off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
|
| 242 |
|
| 243 |
# progress update
|
|
|
|
|
|
|
|
|
|
| 244 |
if status_bar:
|
| 245 |
-
status_text.text(
|
| 246 |
-
status_bar.progress(
|
| 247 |
-
print('\rPercent complete: {:.2f}%'.format(100 * min(i / len(reference_transcripts), 1)), end='')
|
| 248 |
print('')
|
| 249 |
|
| 250 |
return off_targets
|
|
@@ -264,7 +266,7 @@ def predict_off_target(off_targets: pd.DataFrame, model: tf.keras.Model):
|
|
| 264 |
return off_targets.sort_values(SCORE_COL)
|
| 265 |
|
| 266 |
|
| 267 |
-
def tiger_exhibit(transcripts: pd.DataFrame,
|
| 268 |
|
| 269 |
# load model
|
| 270 |
if os.path.exists('model'):
|
|
@@ -279,10 +281,10 @@ def tiger_exhibit(transcripts: pd.DataFrame, run_mode: str, check_off_targets: b
|
|
| 279 |
# initialize other outputs
|
| 280 |
off_target_predictions = pd.DataFrame()
|
| 281 |
|
| 282 |
-
if
|
| 283 |
return on_target_predictions, off_target_predictions
|
| 284 |
|
| 285 |
-
elif
|
| 286 |
on_target_predictions = top_guides_per_transcript(on_target_predictions)
|
| 287 |
|
| 288 |
else:
|
|
@@ -305,6 +307,7 @@ if __name__ == '__main__':
|
|
| 305 |
|
| 306 |
# common arguments
|
| 307 |
parser = argparse.ArgumentParser()
|
|
|
|
| 308 |
parser.add_argument('--check_off_targets', action='store_true', default=False)
|
| 309 |
parser.add_argument('--fasta_path', type=str, default=None)
|
| 310 |
args = parser.parse_args()
|
|
@@ -334,9 +337,7 @@ if __name__ == '__main__':
|
|
| 334 |
|
| 335 |
# run batch
|
| 336 |
idx_stop = min(idx + BATCH_SIZE_TRANSCRIPTS, len(df_transcripts))
|
| 337 |
-
df_on_target, df_off_target = tiger_exhibit(df_transcripts[idx:idx_stop],
|
| 338 |
-
run_mode=RUN_MODE_TITRATION,
|
| 339 |
-
check_off_targets=args.check_off_targets)
|
| 340 |
|
| 341 |
# save batch results
|
| 342 |
df_on_target.to_csv('on_target.csv', header=batch == 1, index=False, mode='a')
|
|
|
|
| 20 |
TARGET_COL = 'Target Sequence'
|
| 21 |
GUIDE_COL = 'Guide Sequence'
|
| 22 |
SCORE_COL = 'Guide Score'
|
| 23 |
+
RUN_MODES = dict(all='All on-target guides per transcript', titration='Top guides per transcript')
|
|
|
|
| 24 |
REFERENCE_TRANSCRIPTS = ('gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz')
|
| 25 |
BATCH_SIZE_COMPUTE = 500
|
| 26 |
BATCH_SIZE_SCAN = 20
|
|
|
|
| 166 |
# progress update
|
| 167 |
percent_complete = 100 * min((i + 1) / len(transcripts), 1)
|
| 168 |
update_text = 'Evaluating on-target guides for each transcript: {:.2f}%'.format(percent_complete)
|
| 169 |
+
print('\r' + update_text, end='')
|
| 170 |
if status_bar:
|
| 171 |
status_text.text()
|
| 172 |
+
status_bar.progress(percent_complete)
|
|
|
|
| 173 |
print('')
|
| 174 |
|
| 175 |
return predictions
|
|
|
|
| 177 |
|
| 178 |
def top_guides_per_transcript(predictions: pd.DataFrame):
|
| 179 |
|
| 180 |
+
# select and sort top guides for each transcript
|
| 181 |
top_guides = pd.DataFrame()
|
| 182 |
for transcript in predictions[ID_COL].unique():
|
| 183 |
df = predictions.loc[predictions[ID_COL] == transcript]
|
|
|
|
| 193 |
reference_transcripts = load_transcripts([os.path.join('transcripts', f) for f in REFERENCE_TRANSCRIPTS])
|
| 194 |
|
| 195 |
# one-hot encode guides to form a filter
|
| 196 |
+
guide_filter = one_hot_encode_sequence(sequence_complement(top_guides[GUIDE_COL]), add_context_padding=False)
|
| 197 |
guide_filter = tf.transpose(guide_filter, [1, 2, 0])
|
| 198 |
|
| 199 |
# loop over transcripts in batches
|
|
|
|
| 241 |
off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
|
| 242 |
|
| 243 |
# progress update
|
| 244 |
+
percent_complete = 100 * min((i + 1) / len(reference_transcripts), 1)
|
| 245 |
+
update_text = 'Scanning for off-targets: {:.2f}%'.format(percent_complete)
|
| 246 |
+
print('\r' + update_text, end='')
|
| 247 |
if status_bar:
|
| 248 |
+
status_text.text()
|
| 249 |
+
status_bar.progress(percent_complete)
|
|
|
|
| 250 |
print('')
|
| 251 |
|
| 252 |
return off_targets
|
|
|
|
| 266 |
return off_targets.sort_values(SCORE_COL)
|
| 267 |
|
| 268 |
|
| 269 |
+
def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool, status_bar=None, status_text=None):
|
| 270 |
|
| 271 |
# load model
|
| 272 |
if os.path.exists('model'):
|
|
|
|
| 281 |
# initialize other outputs
|
| 282 |
off_target_predictions = pd.DataFrame()
|
| 283 |
|
| 284 |
+
if mode == 'all':
|
| 285 |
return on_target_predictions, off_target_predictions
|
| 286 |
|
| 287 |
+
elif mode == 'titration': # TODO: and titration candidates
|
| 288 |
on_target_predictions = top_guides_per_transcript(on_target_predictions)
|
| 289 |
|
| 290 |
else:
|
|
|
|
| 307 |
|
| 308 |
# common arguments
|
| 309 |
parser = argparse.ArgumentParser()
|
| 310 |
+
parser.add_argument('--mode', type=str, default='titration')
|
| 311 |
parser.add_argument('--check_off_targets', action='store_true', default=False)
|
| 312 |
parser.add_argument('--fasta_path', type=str, default=None)
|
| 313 |
args = parser.parse_args()
|
|
|
|
| 337 |
|
| 338 |
# run batch
|
| 339 |
idx_stop = min(idx + BATCH_SIZE_TRANSCRIPTS, len(df_transcripts))
|
| 340 |
+
df_on_target, df_off_target = tiger_exhibit(df_transcripts[idx:idx_stop], args.mode, args.check_off_targets)
|
|
|
|
|
|
|
| 341 |
|
| 342 |
# save batch results
|
| 343 |
df_on_target.to_csv('on_target.csv', header=batch == 1, index=False, mode='a')
|