Spaces:
Sleeping
Sleeping
File size: 3,586 Bytes
c925c32 5d3ceee afe1d29 a34c6e6 5d3ceee afe1d29 bbd8ca0 5d3ceee a34c6e6 c925c32 a34c6e6 afe1d29 a34c6e6 c925c32 a34c6e6 afe1d29 5d3ceee afe1d29 bbd8ca0 afe1d29 5d3ceee 884e12d c925c32 884e12d c925c32 884e12d c925c32 884e12d c925c32 5d3ceee 26e916a 5d3ceee 26e916a 5d3ceee afe1d29 26e916a afe1d29 da1f78f 26e916a a34c6e6 d29037f a34c6e6 26e916a afe1d29 5d3ceee 26e916a a34c6e6 26e916a c925c32 26e916a c925c32 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import re
import tempfile
from io import StringIO

import gradio as gr
import pandas as pd
import torch
from transformers import EsmForSequenceClassification, EsmTokenizer
# --- Load tokenizer & model ---
# Both objects are created once at import time and shared by every request.
# The tokenizer is loaded from the "facebook/esm2_t6_8M_UR50D" checkpoint with
# lower-casing disabled; the classifier is loaded from "model/best_model5"
# (presumably a directory shipped next to this file -- confirm).
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", do_lower_case=False)
# Module.eval() returns the module itself, so the model can be bound and put
# into evaluation mode in a single statement.
model = EsmForSequenceClassification.from_pretrained("model/best_model5").eval()
# --- FASTA Reader ---
# One or more of the 20 standard amino-acid letters, uppercase only.
_AMINO_ACID_LINE = re.compile(r"[ACDEFGHIKLMNPQRSTVWY]+")


def read_fasta(fasta_string, max_length=30):
    """Parse FASTA text into parallel lists of header lines and sequences.

    Blank lines are skipped and surrounding whitespace is stripped from every
    line. A line starting with ">" opens a new record; the non-header lines
    that follow it are concatenated into that record's sequence.

    Args:
        fasta_string: FASTA-formatted text.
        max_length: Maximum number of residues allowed in one sequence,
            counted over all lines belonging to the record.

    Returns:
        A ``(headers, sequences)`` tuple of equally long lists, where
        ``sequences[i]`` belongs to ``headers[i]``. Headers keep their
        leading ">". Both lists are empty when the input has no records.

    Raises:
        ValueError: If a sequence line contains anything other than the 20
            uppercase amino-acid letters, if a sequence is longer than
            ``max_length``, if sequence data appears before the first
            header, or if a header is not followed by a sequence.
    """
    headers = []
    sequences = []
    parts = []  # sequence lines of the record currently being read
    residue_count = 0  # total residues collected in ``parts``

    for raw_line in StringIO(fasta_string):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(">"):
            if headers:
                # Close the previous record. Checking it here (instead of
                # comparing list lengths at the end) guarantees that every
                # sequence stays paired with its own header.
                if not parts:
                    raise ValueError(
                        f"FASTA parsing error: Header '{headers[-1]}' has no sequence. "
                        "Each header must be followed by a sequence."
                    )
                sequences.append("".join(parts))
                parts = []
                residue_count = 0
            headers.append(line)
            continue

        if not _AMINO_ACID_LINE.fullmatch(line):
            raise ValueError(
                "Invalid FASTA format: Only natural amino acids (ACDEFGHIKLMNPQRSTVWY) allowed."
            )
        if not headers:
            raise ValueError(
                "FASTA parsing error: Sequence data found before the first header. "
                "Each sequence must be preceded by a '>' header line."
            )

        parts.append(line)
        residue_count += len(line)
        # The limit applies to the whole record, so a sequence wrapped over
        # several lines cannot slip past it.
        if residue_count > max_length:
            sequence_so_far = "".join(parts)
            raise ValueError(
                f"Sequence too long: '{sequence_so_far}' "
                f"({residue_count} > {max_length} characters)."
            )

    if headers:
        if not parts:
            raise ValueError(
                f"FASTA parsing error: Header '{headers[-1]}' has no sequence. "
                "Each header must be followed by a sequence."
            )
        sequences.append("".join(parts))

    return headers, sequences
def predict_peptide_class(sequences):
    """Score peptide sequences with the module-level tokenizer and model.

    Args:
        sequences: Iterable of peptide sequences; each item is converted
            with ``str()`` before tokenization.

    Returns:
        A ``(probs, classes)`` tuple. ``probs`` is a NumPy array holding the
        sigmoid of logit column 1 for every sequence; ``classes`` is a list
        with "B3PP" where that value is above 0.5 and "Non-B3PP" otherwise.
    """
    peptide_strings = list(map(str, sequences))

    # NOTE(review): max_length counts tokens; if the tokenizer adds special
    # tokens, inputs near 30 residues may be truncated -- confirm.
    encoded = tokenizer(
        peptide_strings,
        max_length=30,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(**encoded)
        # NOTE(review): column 1 is presumably the B3PP class and the model
        # presumably expects a per-logit sigmoid -- verify against training.
        positive_scores = torch.sigmoid(output.logits)[:, 1]
        probs = positive_scores.cpu().numpy()

    classes = []
    for score in probs:
        if score > 0.5:
            classes.append("B3PP")
        else:
            classes.append("Non-B3PP")
    return probs, classes
def predict_from_fasta(fasta_input):
    """Process FASTA input and return predictions as a table plus a CSV file.

    Args:
        fasta_input: FASTA-formatted text as entered in the UI.

    Returns:
        A ``(df, csv_path)`` tuple. On success ``df`` has the columns
        "Header", "Sequence", "Probability" (formatted to two decimals) and
        "Predicted Class", and ``csv_path`` is the path of a CSV file holding
        the same table. On any failure ``df`` has a single "Error" column
        with the message and ``csv_path`` is ``None``. The function itself
        never raises, so the UI always receives something to display.
    """
    try:
        headers, sequences = read_fasta(fasta_input)
        if not sequences:
            return pd.DataFrame({"Error": ["No valid sequences found."]}), None

        probs, classes = predict_peptide_class(sequences)
        df = pd.DataFrame({
            "Header": headers,
            "Sequence": sequences,
            "Probability": [f"{p:.2f}" for p in probs],
            "Predicted Class": classes,
        })

        # Write every result to its own file. A single fixed "predictions.csv"
        # would be shared by all requests, so concurrent users could overwrite
        # (and then download) each other's results.
        # delete=False keeps the file on disk after the handle is closed so it
        # can still be served; the file is not removed by this function.
        with tempfile.NamedTemporaryFile(
            prefix="predictions_", suffix=".csv", delete=False
        ) as tmp_file:
            csv_path = tmp_file.name
        df.to_csv(csv_path, index=False)
        return df, csv_path
    except ValueError as e:
        # Validation problems from FASTA parsing are shown verbatim.
        return pd.DataFrame({"Error": [str(e)]}), None
    except Exception as e:
        # UI boundary: report unexpected failures in the table instead of
        # letting the request crash.
        return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]}), None
# Web UI: one multi-line text box for FASTA input, wired to
# predict_from_fasta, whose two return values fill the results table and the
# CSV download slot.
iface = gr.Interface(
    title="B3PP Predictor",
    description=(
        "Submit peptide sequences in FASTA format to determine their "
        "potential as blood-brain barrier penetration peptides. "
        "Sequences must consist exclusively of natural amino acids in "
        "uppercase letters, with a maximum length of 30 characters."
    ),
    fn=predict_from_fasta,
    inputs=gr.Textbox(
        placeholder="Paste your peptide sequences in FASTA format here",
        lines=10,
    ),
    outputs=[
        gr.Dataframe(label="Predictions"),
        gr.File(label="Download CSV"),
    ],
)

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|