File size: 3,586 Bytes
c925c32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d3ceee
 
 
afe1d29
 
 
 
a34c6e6
 
5d3ceee
afe1d29
 
 
bbd8ca0
5d3ceee
 
a34c6e6
c925c32
a34c6e6
afe1d29
a34c6e6
c925c32
a34c6e6
afe1d29
5d3ceee
afe1d29
 
 
bbd8ca0
 
 
 
 
 
afe1d29
5d3ceee
884e12d
c925c32
884e12d
c925c32
884e12d
c925c32
 
 
 
 
 
 
884e12d
c925c32
 
 
5d3ceee
26e916a
5d3ceee
26e916a
5d3ceee
afe1d29
 
26e916a
 
afe1d29
 
da1f78f
26e916a
a34c6e6
 
d29037f
a34c6e6
 
26e916a
 
 
 
afe1d29
5d3ceee
26e916a
 
a34c6e6
26e916a
 
 
c925c32
 
 
 
 
 
 
26e916a
 
 
 
c925c32
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import re
import tempfile
from io import StringIO

import gradio as gr
import pandas as pd
import torch
from transformers import EsmForSequenceClassification, EsmTokenizer

# --- Load tokenizer & model ---
tokenizer = EsmTokenizer.from_pretrained(
    "facebook/esm2_t6_8M_UR50D",
    do_lower_case=False
)

model = EsmForSequenceClassification.from_pretrained("model/best_model5")
model.eval()

# --- FASTA Reader ---
def read_fasta(fasta_string):
    sequences = []
    headers = []
    seq_buffer = []

    for line in StringIO(fasta_string):
        line = line.strip()
        if not line:
            continue
        if line.startswith(">"):
            if seq_buffer:
                sequences.append("".join(seq_buffer))
                seq_buffer.clear()
            headers.append(line)
        else:
            if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
                raise ValueError(
                    "Invalid FASTA format: Only natural amino acids (ACDEFGHIKLMNPQRSTVWY) allowed."
                )
            if len(line) > 30:
                raise ValueError(
                    f"Sequence too long: '{line}' ({len(line)} > 30 characters)."
                )
            seq_buffer.append(line)

    if seq_buffer:
        sequences.append("".join(seq_buffer))

    if len(headers) != len(sequences):
        raise ValueError(
            f"FASTA parsing error: Found {len(headers)} headers but {len(sequences)} sequences. "
            "Each header must be followed by a sequence."
        )

    return headers, sequences


def predict_peptide_class(sequences):
    sequences = [str(s) for s in sequences]
    inputs = tokenizer(
        sequences,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=30
    )
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
    classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
    return probs, classes


def predict_from_fasta(fasta_input):
    """Processes FASTA input and returns predictions in table + CSV download."""
    try:
        headers, sequences = read_fasta(fasta_input)
        if not sequences:
            df = pd.DataFrame({"Error": ["No valid sequences found."]})
            return df, None

        probs, classes = predict_peptide_class(sequences)
        probs_rounded = [f"{p:.2f}" for p in probs]
        df = pd.DataFrame({
            "Header": headers,
            "Sequence": sequences,
            "Probability": probs_rounded,
            "Predicted Class": classes
        })
        # Save as CSV file
        csv_path = "predictions.csv"
        df.to_csv(csv_path, index=False)
        return df, csv_path

    except ValueError as e:
        df = pd.DataFrame({"Error": [str(e)]})
        return df, None
    except Exception as e:
        df = pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})
        return df, None


iface = gr.Interface(
    fn=predict_from_fasta,
    inputs=gr.Textbox(
        lines=10,
        placeholder="Paste your peptide sequences in FASTA format here"
    ),
    outputs=[
        gr.Dataframe(label="Predictions"),
        gr.File(label="Download CSV")
    ],
    title="B3PP Predictor",
    description=(
        "Submit peptide sequences in FASTA format to determine their potential as "
        "blood-brain barrier penetration peptides. Sequences must consist exclusively "
        "of natural amino acids in uppercase letters, with a maximum length of 30 characters."
    )
)

if __name__ == "__main__":
    iface.launch()