Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -123,19 +123,15 @@ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
|
|
| 123 |
sequence = protein_seq + binder_seq
|
| 124 |
original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
|
| 125 |
length_of_binder = len(binder_seq)
|
| 126 |
-
print("line 126, protein_seq",len(protein_seq))
|
| 127 |
-
print("line 127, length_of_binder",length_of_binder)
|
| 128 |
|
| 129 |
# Prepare a batch with each row having one masked token from the binder sequence
|
| 130 |
masked_inputs = original_input.repeat(length_of_binder, 1)
|
| 131 |
positions_to_mask = torch.arange(-length_of_binder - 1, -1, device=model.device)
|
| 132 |
masked_inputs[torch.arange(length_of_binder), positions_to_mask] = tokenizer.mask_token_id
|
| 133 |
-
print("line 131 : masked_inputs:", masked_inputs.shape)
|
| 134 |
|
| 135 |
# Prepare labels for the masked tokens
|
| 136 |
labels = torch.full_like(masked_inputs, -100)
|
| 137 |
labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
|
| 138 |
-
print("line 136 : labels:", labels.shape)
|
| 139 |
|
| 140 |
# Get model predictions and calculate loss
|
| 141 |
with torch.no_grad():
|
|
@@ -170,16 +166,10 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
|
|
| 170 |
|
| 171 |
# Apply top-k sampling
|
| 172 |
top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
|
| 176 |
-
print("line 172 : probabilities:", probabilities)
|
| 177 |
-
predicted_indices = Categorical(probabilities).sample()
|
| 178 |
-
print("line 174 : predicted_indices:", predicted_indices)
|
| 179 |
predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
|
| 180 |
-
print("line 176 : predicted_token_ids:", predicted_token_ids)
|
| 181 |
generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
|
| 182 |
-
print("line 178 : generated_binder:", generated_binder)
|
| 183 |
# Compute PPL for the generated binder
|
| 184 |
ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
|
| 185 |
|
|
@@ -212,7 +202,7 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
|
|
| 212 |
results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'Pseudo Perplexity'])
|
| 213 |
print(results_df)
|
| 214 |
|
| 215 |
-
#combine target protein and peptide with 20 G amino acids.
|
| 216 |
separator = 'G' * 20
|
| 217 |
peptide_lp = results_df['Binder'][results_df['Pseudo Perplexity'].idxmin()] #Choosing the one with the lowest perplexity
|
| 218 |
print("lowest perplesity:", peptide_lp)
|
|
|
|
| 123 |
sequence = protein_seq + binder_seq
|
| 124 |
original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
|
| 125 |
length_of_binder = len(binder_seq)
|
|
|
|
|
|
|
| 126 |
|
| 127 |
# Prepare a batch with each row having one masked token from the binder sequence
|
| 128 |
masked_inputs = original_input.repeat(length_of_binder, 1)
|
| 129 |
positions_to_mask = torch.arange(-length_of_binder - 1, -1, device=model.device)
|
| 130 |
masked_inputs[torch.arange(length_of_binder), positions_to_mask] = tokenizer.mask_token_id
|
|
|
|
| 131 |
|
| 132 |
# Prepare labels for the masked tokens
|
| 133 |
labels = torch.full_like(masked_inputs, -100)
|
| 134 |
labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
|
|
|
|
| 135 |
|
| 136 |
# Get model predictions and calculate loss
|
| 137 |
with torch.no_grad():
|
|
|
|
| 166 |
|
| 167 |
# Apply top-k sampling
|
| 168 |
top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
|
| 169 |
+
probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
|
| 170 |
+
predicted_indices = Categorical(probabilities).sample()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
|
|
|
|
| 172 |
generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
|
|
|
|
| 173 |
# Compute PPL for the generated binder
|
| 174 |
ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
|
| 175 |
|
|
|
|
| 202 |
results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'Pseudo Perplexity'])
|
| 203 |
print(results_df)
|
| 204 |
|
| 205 |
+
# Combine the target protein and the predicted peptide, joined by a separator of 20 G amino acids.
|
| 206 |
separator = 'G' * 20
|
| 207 |
peptide_lp = results_df['Binder'][results_df['Pseudo Perplexity'].idxmin()] #Choosing the one with the lowest perplexity
|
| 208 |
print("lowest perplesity:", peptide_lp)
|