Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -41,10 +41,10 @@ class ProteinDataset(Dataset):
|
|
| 41 |
def __init__(self, file, tokenizer, peptide_length):
|
| 42 |
data = pd.read_csv(file)
|
| 43 |
self.tokenizer = tokenizer
|
| 44 |
-
self.proteins = data["Receptor Sequence"].tolist()
|
| 45 |
-
self.peptides = data["Binder"].tolist()
|
| 46 |
-
|
| 47 |
-
|
| 48 |
self.max_length_pm = 500 + 2 + peptide_length #assume the max length of protein is 500
|
| 49 |
|
| 50 |
def __len__(self):
|
|
@@ -80,9 +80,11 @@ def finetune(base_model_path, peptide_length): #, train_dataset, test_dataset)
|
|
| 80 |
# Tokenization
|
| 81 |
tokenizer = AutoTokenizer.from_pretrained(base_model_path) #("facebook/esm2_t12_35M_UR50D")
|
| 82 |
|
| 83 |
-
train_dataset = ProteinDataset("./datasets/pepnn_train.csv", tokenizer, peptide_length)
|
| 84 |
-
test_dataset = ProteinDataset("./datasets/pepnn_test.csv", tokenizer, peptide_length)
|
| 85 |
-
|
|
|
|
|
|
|
| 86 |
model_name_base = base_model_path.split("/")[1]
|
| 87 |
timestamp = datetime.now().strftime('%Y-%m-%d_%H')
|
| 88 |
lr = 0.0007984276816171436
|
|
@@ -269,10 +271,10 @@ def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, m
|
|
| 269 |
results = []
|
| 270 |
|
| 271 |
for i, row in input.iterrows():
|
| 272 |
-
protein_seq = row['Receptor Sequence']
|
| 273 |
-
peptide_seq = row['Peptide Sequence']
|
| 274 |
-
|
| 275 |
-
|
| 276 |
peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
|
| 277 |
|
| 278 |
#get metrics for ground truth peptide
|
|
|
|
| 41 |
def __init__(self, file, tokenizer, peptide_length):
|
| 42 |
data = pd.read_csv(file)
|
| 43 |
self.tokenizer = tokenizer
|
| 44 |
+
#self.proteins = data["Receptor Sequence"].tolist()
|
| 45 |
+
#self.peptides = data["Binder"].tolist()
|
| 46 |
+
self.proteins = data["P_Sequence"].tolist() #header defined by Lin Qiao
|
| 47 |
+
self.peptides = data["p_Sequence"].tolist()
|
| 48 |
self.max_length_pm = 500 + 2 + peptide_length #assume the max length of protein is 500
|
| 49 |
|
| 50 |
def __len__(self):
|
|
|
|
| 80 |
# Tokenization
|
| 81 |
tokenizer = AutoTokenizer.from_pretrained(base_model_path) #("facebook/esm2_t12_35M_UR50D")
|
| 82 |
|
| 83 |
+
#train_dataset = ProteinDataset("./datasets/pepnn_train.csv", tokenizer, peptide_length)
|
| 84 |
+
#test_dataset = ProteinDataset("./datasets/pepnn_test.csv", tokenizer, peptide_length)
|
| 85 |
+
train_dataset = ProteinDataset("./datasets/peptide-protein-train.csv", tokenizer, peptide_length)
|
| 86 |
+
test_dataset = ProteinDataset("./datasets/peptide-protein-test.csv", tokenizer, peptide_length)
|
| 87 |
+
|
| 88 |
model_name_base = base_model_path.split("/")[1]
|
| 89 |
timestamp = datetime.now().strftime('%Y-%m-%d_%H')
|
| 90 |
lr = 0.0007984276816171436
|
|
|
|
| 271 |
results = []
|
| 272 |
|
| 273 |
for i, row in input.iterrows():
|
| 274 |
+
#protein_seq = row['Receptor Sequence']
|
| 275 |
+
#peptide_seq = row['Peptide Sequence']
|
| 276 |
+
protein_seq = row['P_Sequence']
|
| 277 |
+
peptide_seq = row['p_Sequence']
|
| 278 |
peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
|
| 279 |
|
| 280 |
#get metrics for ground truth peptide
|