Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -122,8 +122,6 @@ def finetune(base_model_path, peptide_length): #, train_dataset, test_dataset)
|
|
| 122 |
def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
|
| 123 |
sequence = protein_seq + binder_seq
|
| 124 |
original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
|
| 125 |
-
print("125: original_input:", original_input)
|
| 126 |
-
print("126: original_input.size:", original_input.shape)
|
| 127 |
length_of_binder = len(binder_seq)
|
| 128 |
|
| 129 |
# Prepare a batch with each row having one masked token from the binder sequence
|
|
@@ -134,14 +132,11 @@ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
|
|
| 134 |
# Prepare labels for the masked tokens
|
| 135 |
labels = torch.full_like(masked_inputs, -100)
|
| 136 |
labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
|
| 137 |
-
|
| 138 |
-
print("136: masked_inputs.shape:",masked_inputs.shape)
|
| 139 |
# Get model predictions and calculate loss
|
| 140 |
with torch.no_grad():
|
| 141 |
outputs = model(masked_inputs, labels=labels)
|
| 142 |
loss = outputs.loss
|
| 143 |
-
print("140: logits:", outputs.logits)
|
| 144 |
-
print("141: logits.size:", outputs.logits.shape)
|
| 145 |
|
| 146 |
# Loss is already averaged by the model
|
| 147 |
avg_loss = loss.item()
|
|
@@ -163,23 +158,17 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
|
|
| 163 |
masked_peptide = '<mask>' * peptide_length
|
| 164 |
input_sequence = protein_seq + masked_peptide
|
| 165 |
inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
|
| 166 |
-
|
| 167 |
-
#print("165: inputs.shape:",inputs.size)
|
| 168 |
with torch.no_grad():
|
| 169 |
logits = model(**inputs).logits
|
| 170 |
-
|
| 171 |
-
print("167: logits.size:", logits.shape)
|
| 172 |
mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
|
| 173 |
logits_at_masks = logits[0, mask_token_indices]
|
| 174 |
|
| 175 |
# Apply top-k sampling
|
| 176 |
top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
|
| 177 |
-
print("171:top_k_logits, top_k_indices:", top_k_logits, top_k_indices)
|
| 178 |
-
print("174: top_k_logits.shape:", top_k_logits.shape)
|
| 179 |
probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
|
| 180 |
predicted_indices = Categorical(probabilities).sample()
|
| 181 |
-
print("174:predicted_indices:", predicted_indices)
|
| 182 |
-
print("178: predicted_indices.shape:", predicted_indices.shape)
|
| 183 |
predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
|
| 184 |
generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
|
| 185 |
# Compute PPL for the generated binder
|
|
@@ -293,7 +282,7 @@ with demo:
|
|
| 293 |
interactive = True,
|
| 294 |
)
|
| 295 |
with gr.Row():
|
| 296 |
-
peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=
|
| 297 |
num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=4)
|
| 298 |
with gr.Column(scale=5, variant="compact"):
|
| 299 |
name = gr.Dropdown(
|
|
|
|
| 122 |
def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
|
| 123 |
sequence = protein_seq + binder_seq
|
| 124 |
original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
|
|
|
|
|
|
|
| 125 |
length_of_binder = len(binder_seq)
|
| 126 |
|
| 127 |
# Prepare a batch with each row having one masked token from the binder sequence
|
|
|
|
| 132 |
# Prepare labels for the masked tokens
|
| 133 |
labels = torch.full_like(masked_inputs, -100)
|
| 134 |
labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
|
| 135 |
+
|
|
|
|
| 136 |
# Get model predictions and calculate loss
|
| 137 |
with torch.no_grad():
|
| 138 |
outputs = model(masked_inputs, labels=labels)
|
| 139 |
loss = outputs.loss
|
|
|
|
|
|
|
| 140 |
|
| 141 |
# Loss is already averaged by the model
|
| 142 |
avg_loss = loss.item()
|
|
|
|
| 158 |
masked_peptide = '<mask>' * peptide_length
|
| 159 |
input_sequence = protein_seq + masked_peptide
|
| 160 |
inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
|
| 161 |
+
|
|
|
|
| 162 |
with torch.no_grad():
|
| 163 |
logits = model(**inputs).logits
|
| 164 |
+
|
|
|
|
| 165 |
mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
|
| 166 |
logits_at_masks = logits[0, mask_token_indices]
|
| 167 |
|
| 168 |
# Apply top-k sampling
|
| 169 |
top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
|
|
|
|
|
|
|
| 170 |
probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
|
| 171 |
predicted_indices = Categorical(probabilities).sample()
|
|
|
|
|
|
|
| 172 |
predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
|
| 173 |
generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
|
| 174 |
# Compute PPL for the generated binder
|
|
|
|
| 282 |
interactive = True,
|
| 283 |
)
|
| 284 |
with gr.Row():
|
| 285 |
+
peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=15)
|
| 286 |
num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=4)
|
| 287 |
with gr.Column(scale=5, variant="compact"):
|
| 288 |
name = gr.Dropdown(
|