wangjin2000 committed on
Commit
ff7b8e3
·
verified ·
1 Parent(s): e296b93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -11
app.py CHANGED
@@ -41,10 +41,10 @@ class ProteinDataset(Dataset):
41
  def __init__(self, file, tokenizer, peptide_length):
42
  data = pd.read_csv(file)
43
  self.tokenizer = tokenizer
44
- self.proteins = data["Receptor Sequence"].tolist()
45
- self.peptides = data["Binder"].tolist()
46
- #self.proteins = data["P_Sequence"].tolist() #header defined by Lin Qiao
47
- #self.peptides = data["p_Sequence"].tolist()
48
  self.max_length_pm = 500 + 2 + peptide_length #assume the maz length of protein is 500
49
 
50
  def __len__(self):
@@ -80,9 +80,11 @@ def finetune(base_model_path, peptide_length): #, train_dataset, test_dataset)
80
  # Tokenization
81
  tokenizer = AutoTokenizer.from_pretrained(base_model_path) #("facebook/esm2_t12_35M_UR50D")
82
 
83
- train_dataset = ProteinDataset("./datasets/pepnn_train.csv", tokenizer, peptide_length)
84
- test_dataset = ProteinDataset("./datasets/pepnn_test.csv", tokenizer, peptide_length)
85
-
 
 
86
  model_name_base = base_model_path.split("/")[1]
87
  timestamp = datetime.now().strftime('%Y-%m-%d_%H')
88
  lr = 0.0007984276816171436
@@ -269,10 +271,10 @@ def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, m
269
  results = []
270
 
271
  for i, row in input.iterrows():
272
- protein_seq = row['Receptor Sequence']
273
- peptide_seq = row['Peptide Sequence']
274
- #protein_seq = row['P_Sequence']
275
- #peptide_seq = row['p_Sequence']
276
  peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
277
 
278
  #get metrics for ground truth peptide
 
41
  def __init__(self, file, tokenizer, peptide_length):
42
  data = pd.read_csv(file)
43
  self.tokenizer = tokenizer
44
+ #self.proteins = data["Receptor Sequence"].tolist()
45
+ #self.peptides = data["Binder"].tolist()
46
+ self.proteins = data["P_Sequence"].tolist() #header defined by Lin Qiao
47
+ self.peptides = data["p_Sequence"].tolist()
48
  self.max_length_pm = 500 + 2 + peptide_length #assume the maz length of protein is 500
49
 
50
  def __len__(self):
 
80
  # Tokenization
81
  tokenizer = AutoTokenizer.from_pretrained(base_model_path) #("facebook/esm2_t12_35M_UR50D")
82
 
83
+ #train_dataset = ProteinDataset("./datasets/pepnn_train.csv", tokenizer, peptide_length)
84
+ #test_dataset = ProteinDataset("./datasets/pepnn_test.csv", tokenizer, peptide_length)
85
+ train_dataset = ProteinDataset("./datasets/peptide-protein-train.csv", tokenizer, peptide_length)
86
+ test_dataset = ProteinDataset("./datasets/peptide-protein-test.csv", tokenizer, peptide_length)
87
+
88
  model_name_base = base_model_path.split("/")[1]
89
  timestamp = datetime.now().strftime('%Y-%m-%d_%H')
90
  lr = 0.0007984276816171436
 
271
  results = []
272
 
273
  for i, row in input.iterrows():
274
+ #protein_seq = row['Receptor Sequence']
275
+ #peptide_seq = row['Peptide Sequence']
276
+ protein_seq = row['P_Sequence']
277
+ peptide_seq = row['p_Sequence']
278
  peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
279
 
280
  #get metrics for ground truth peptide