wangjin2000 committed · Commit fa9d40f · verified · 1 Parent(s): 04fa7ee

Update app.py

Files changed (1):
  1. app.py  +4 -15
app.py CHANGED
@@ -122,8 +122,6 @@ def finetune(base_model_path, peptide_length): #, train_dataset, test_dataset)
 def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
     sequence = protein_seq + binder_seq
     original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
-    print("125: original_input:", original_input)
-    print("126: original_input.size:", original_input.shape)
     length_of_binder = len(binder_seq)

     # Prepare a batch with each row having one masked token from the binder sequence
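Note on the masked-batch step named by the comment above: app.py builds one input row per binder position, with that single position replaced by the mask token. The code that does this falls between the hunks and is not shown in this diff, so the snippet below is only a minimal sketch of the idea, not the app's code; the helper name build_masked_batch and the assumption that binder tokens sit just before the trailing special token are illustrative.

import torch

def build_masked_batch(original_input, length_of_binder, mask_token_id):
    # original_input: (1, seq_len) token ids for protein + binder.
    # Illustrative assumption: binder tokens occupy the last positions
    # before the final special token, hence the -1 offset.
    seq_len = original_input.shape[1]
    positions_to_mask = torch.arange(seq_len - 1 - length_of_binder, seq_len - 1)
    masked_inputs = original_input.repeat(length_of_binder, 1)
    masked_inputs[torch.arange(length_of_binder), positions_to_mask] = mask_token_id
    return masked_inputs, positions_to_mask

ids = torch.randint(5, 30, (1, 12))          # stand-in for tokenizer.encode(...)
batch, positions = build_masked_batch(ids, length_of_binder=4, mask_token_id=32)
print(batch.shape)                           # (4, 12): one masked row per binder position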
@@ -134,14 +132,11 @@ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
     # Prepare labels for the masked tokens
     labels = torch.full_like(masked_inputs, -100)
     labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
-    print("135: masked_inputs:",masked_inputs)
-    print("136: masked_inputs.shape:",masked_inputs.shape)
+
     # Get model predictions and calculate loss
     with torch.no_grad():
         outputs = model(masked_inputs, labels=labels)
         loss = outputs.loss
-    print("140: logits:", outputs.logits)
-    print("141: logits.size:", outputs.logits.shape)

     # Loss is already averaged by the model
     avg_loss = loss.item()
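Aside on the labels and loss lines kept by this hunk: filling labels with -100 and overwriting only the masked binder positions works because Hugging Face masked-LM heads compute cross-entropy with ignore_index=-100, so only those positions contribute to the averaged loss read out as avg_loss above. A toy, self-contained illustration (shapes and values are made up, not taken from app.py):

import torch
import torch.nn.functional as F

logits = torch.randn(3, 5, 10)            # (batch, seq_len, vocab), random stand-in for model output
labels = torch.full((3, 5), -100)         # -100 = position ignored by the loss
labels[torch.arange(3), torch.tensor([1, 2, 3])] = torch.tensor([4, 7, 2])  # true ids at the masked spots
loss = F.cross_entropy(logits.view(-1, 10), labels.view(-1), ignore_index=-100)
print(loss.item())                        # mean loss over the three labelled positions only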
@@ -163,23 +158,17 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
     masked_peptide = '<mask>' * peptide_length
     input_sequence = protein_seq + masked_peptide
     inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
-    print("164: inputs:",inputs)
-    #print("165: inputs.shape:",inputs.size)
+
     with torch.no_grad():
         logits = model(**inputs).logits
-    print("166: logits:", logits)
-    print("167: logits.size:", logits.shape)
+
     mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
     logits_at_masks = logits[0, mask_token_indices]

     # Apply top-k sampling
     top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
-    print("171:top_k_logits, top_k_indices:", top_k_logits, top_k_indices)
-    print("174: top_k_logits.shape:", top_k_logits.shape)
     probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
     predicted_indices = Categorical(probabilities).sample()
-    print("174:predicted_indices:", predicted_indices)
-    print("178: predicted_indices.shape:", predicted_indices.shape)
     predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
     generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
     # Compute PPL for the generated binder
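The top-k sampling block kept by this hunk can be exercised in isolation. The standalone sketch below substitutes random logits for the model output purely to show the shapes flowing through topk, softmax, Categorical, and gather; apart from the made-up tensor values, it mirrors the lines retained above.

import torch
from torch.distributions import Categorical

top_k = 3
logits_at_masks = torch.randn(4, 20)                      # (num_masked_positions, vocab_size), random stand-in
top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
predicted_indices = Categorical(probabilities).sample()   # one draw per masked position
predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
print(predicted_token_ids)                                # vocabulary ids, one per masked position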
@@ -293,7 +282,7 @@ with demo:
                 interactive = True,
             )
         with gr.Row():
-            peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=50)
+            peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=15)
             num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=4)
     with gr.Column(scale=5, variant="compact"):
         name = gr.Dropdown(
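For context on the one-line UI change in the last hunk (default peptide length lowered from 50 to 15), here is a minimal Gradio sketch containing only the two sliders this commit touches; the surrounding layout, dropdown, and callbacks from app.py are omitted, so this is an illustration rather than the app's actual interface code.

import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        peptide_length = gr.Slider(minimum=10, maximum=100, step=1,
                                   label="Peptide Maximum Length", value=15)
        num_pred_peptides = gr.Slider(minimum=1, maximum=10, step=1,
                                      label="Number of Predicted Peptides", value=4)

# demo.launch()  # uncomment to preview the two sliders locally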