Spaces:

ChatterjeeLab
/

zero_shot_mutation_prediction

Running

App Files Files Community

Kseniia-Kholina committed on Aug 5, 2024

Commit

3025e1c

verified ·

1 Parent(s): dff19c2

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -20

app.py CHANGED Viewed

@@ -29,26 +29,37 @@ def process_sequence(sequence, domain_bounds, n):
     all_logits = []
     for i in range(len(sequence)):
-        if start_index <= i <= (end_index - 1):
-            masked_seq = sequence[:i] + '<mask>' + sequence[i+1:]
-            inputs = tokenizer(masked_seq, return_tensors="pt", padding=True, truncation=True, max_length=2000)
-            inputs = {k: v.to(device) for k, v in inputs.items()}
-            with torch.no_grad():
-                logits = model(**inputs).logits
-            mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
-            mask_token_logits = logits[0, mask_token_index, :]
-            # filter out non-amino acid tokens
-            filtered_indices = list(range(4, 23 + 1))
-            filtered_logits = mask_token_logits[:, filtered_indices]
-            # Decode top n tokens
-            top_n_tokens = torch.topk(filtered_logits, n, dim=1).indices[0].tolist()
-            mutation = [tokenizer.decode([token]) for token in top_n_tokens]
-            top_n_mutations[(sequence[i], i)] = mutation
-            logits_array = mask_token_logits.cpu().numpy()
-            filtered_logits = logits_array[:, filtered_indices]
-            all_logits.append(filtered_logits)
     token_indices = torch.arange(logits.size(-1))
     tokens = [tokenizer.decode([idx]) for idx in token_indices]
@@ -63,7 +74,7 @@ def process_sequence(sequence, domain_bounds, n):
     x_tick_labels = [str(pos + 1) for pos in x_tick_positions]
     plt.figure(figsize=(15, 8))
-    plt.rcParams.update({'font.size': 16})
     sns.heatmap(transposed_logits_array, cmap='plasma', xticklabels=x_tick_labels, yticklabels=filtered_tokens)
     plt.title('Token Probability Heatmap')

     all_logits = []
     for i in range(len(sequence)):
+          if start_index <= i <= (end_index - 1):
+              masked_seq = sequence[:i] + '<mask>' + sequence[i+1:]
+              inputs = tokenizer(masked_seq, return_tensors="pt", padding=True, truncation=True, max_length=2000)
+              inputs = {k: v.to(device) for k, v in inputs.items()}
+              with torch.no_grad():
+                  logits = model(**inputs).logits
+              mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+              mask_token_logits = logits[0, mask_token_index, :]
+              # Define amino acid tokens
+              AAs_tokens = ['L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C']
+              all_tokens_logits = mask_token_logits.squeeze(0)
+              top_tokens_indices = torch.argsort(all_tokens_logits, dim=0, descending=True)
+              top_tokens_logits = all_tokens_logits[top_tokens_indices]
+              mutation = []
+              # make sure we don't include non-AA tokens
+              for token_index in top_tokens_indices:
+                  decoded_token = tokenizer.decode([token_index.item()])
+                  if decoded_token in AAs_tokens:
+                      mutation.append(decoded_token)
+                      if len(mutation) == n:
+                          break
+              top_n_mutations[(sequence[i], i)] = mutation
+              # collecting logits for the heatmap
+              logits_array = mask_token_logits.cpu().numpy()
+              # filter out non-amino acid tokens
+              filtered_indices = list(range(4, 23 + 1))
+              filtered_logits = logits_array[:, filtered_indices]
+              all_logits.append(filtered_logits)
     token_indices = torch.arange(logits.size(-1))
     tokens = [tokenizer.decode([idx]) for idx in token_indices]
     x_tick_labels = [str(pos + 1) for pos in x_tick_positions]
     plt.figure(figsize=(15, 8))
+    plt.rcParams.update({'font.size': 18})
     sns.heatmap(transposed_logits_array, cmap='plasma', xticklabels=x_tick_labels, yticklabels=filtered_tokens)
     plt.title('Token Probability Heatmap')