wangjin2000 committed on
Commit
6ffebf7
·
verified ·
1 Parent(s): e3c897c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -13
app.py CHANGED
@@ -123,19 +123,15 @@ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
123
  sequence = protein_seq + binder_seq
124
  original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
125
  length_of_binder = len(binder_seq)
126
- print("line 126, protein_seq",len(protein_seq))
127
- print("line 127, length_of_binder",length_of_binder)
128
 
129
  # Prepare a batch with each row having one masked token from the binder sequence
130
  masked_inputs = original_input.repeat(length_of_binder, 1)
131
  positions_to_mask = torch.arange(-length_of_binder - 1, -1, device=model.device)
132
  masked_inputs[torch.arange(length_of_binder), positions_to_mask] = tokenizer.mask_token_id
133
- print("line 131 : masked_inputs:", masked_inputs.shape)
134
 
135
  # Prepare labels for the masked tokens
136
  labels = torch.full_like(masked_inputs, -100)
137
  labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
138
- print("line 136 : labels:", labels.shape)
139
 
140
  # Get model predictions and calculate loss
141
  with torch.no_grad():
@@ -170,16 +166,10 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
170
 
171
  # Apply top-k sampling
172
  top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
173
- print("line 169 : top_k_logits:", top_k_logits)
174
- print("line 170 : top_k_indices:", top_k_indices)
175
- probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
176
- print("line 172 : probabilities:", probabilities)
177
- predicted_indices = Categorical(probabilities).sample()
178
- print("line 174 : predicted_indices:", predicted_indices)
179
  predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
180
- print("line 176 : predicted_token_ids:", predicted_token_ids)
181
  generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
182
- print("line 178 : generated_binder:", generated_binder)
183
  # Compute PPL for the generated binder
184
  ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
185
 
@@ -212,7 +202,7 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
212
  results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'Pseudo Perplexity'])
213
  print(results_df)
214
 
215
- #combine target protein and peptide with 20 G amino acids.
216
  separator = 'G' * 20
217
  peptide_lp = results_df['Binder'][results_df['Pseudo Perplexity'].idxmin()] #Choosing the one with the lowest perplexity
218
  print("lowest perplesity:", peptide_lp)
 
123
  sequence = protein_seq + binder_seq
124
  original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
125
  length_of_binder = len(binder_seq)
 
 
126
 
127
  # Prepare a batch with each row having one masked token from the binder sequence
128
  masked_inputs = original_input.repeat(length_of_binder, 1)
129
  positions_to_mask = torch.arange(-length_of_binder - 1, -1, device=model.device)
130
  masked_inputs[torch.arange(length_of_binder), positions_to_mask] = tokenizer.mask_token_id
 
131
 
132
  # Prepare labels for the masked tokens
133
  labels = torch.full_like(masked_inputs, -100)
134
  labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
 
135
 
136
  # Get model predictions and calculate loss
137
  with torch.no_grad():
 
166
 
167
  # Apply top-k sampling
168
  top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
169
+ probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
170
+ predicted_indices = Categorical(probabilities).sample()
 
 
 
 
171
  predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
 
172
  generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
 
173
  # Compute PPL for the generated binder
174
  ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
175
 
 
202
  results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'Pseudo Perplexity'])
203
  print(results_df)
204
 
205
+ #combine target protein and predicted peptide with 20 G amino acids.
206
  separator = 'G' * 20
207
  peptide_lp = results_df['Binder'][results_df['Pseudo Perplexity'].idxmin()] #Choosing the one with the lowest perplexity
208
  print("lowest perplesity:", peptide_lp)