Spaces:

isyslab
/

LocPred-Prok

Running

App Files Files Community

wangleiofficial committed on Sep 2, 2025

Commit

4c7c3fa

verified ·

1 Parent(s): 662dd77

Create app.py

Browse files

Files changed (1) hide show

app.py +157 -0

app.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import gradio as gr
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+import json
+import os
+import re
+# --- 1. Model Definition (Must be identical to the one used during training) ---
class AttentionPooling(nn.Module):
    """Pool a sequence of token vectors into one vector using learned attention.

    A single linear layer scores every position; the scores are softmax-normalised
    over the sequence axis (dim 1) and used as weights for a weighted sum of the
    token vectors.

    Args:
        d_model: Size of the last dimension of the token vectors.
    """

    def __init__(self, d_model):
        super().__init__()
        # One scalar score per token position.
        self.attention_net = nn.Linear(d_model, 1)

    def forward(self, x, mask):
        """Return the attention-weighted sum of ``x`` over the sequence axis.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).
            mask: Tensor of shape (batch, seq_len); entries equal to 0 mark
                positions that must receive no attention weight.

        Returns:
            Tensor of shape (batch, d_model). A row whose mask is entirely zero
            yields a zero vector instead of NaN.
        """
        padded = mask == 0
        attn_logits = self.attention_net(x).squeeze(2)
        # Out-of-place fill: do not mutate the tensor produced by the linear layer.
        attn_logits = attn_logits.masked_fill(padded, -float('inf'))
        attn_weights = F.softmax(attn_logits, dim=1)
        # Masked positions already carry weight 0 after the softmax, except when
        # every position in a row is masked: then all logits are -inf and the
        # softmax is NaN. Forcing masked weights to 0 leaves normal rows unchanged
        # and turns the degenerate row into an all-zero weighting.
        attn_weights = attn_weights.masked_fill(padded, 0.0)
        return torch.bmm(attn_weights.unsqueeze(1), x).squeeze(1)
class ProtDualBranchEnhancedClassifier(nn.Module):
    """Gated two-branch classification head.

    One branch projects a single summary embedding; the other refines the
    per-token embeddings with a 1-D convolution, pools them with
    ``AttentionPooling`` and projects the result. Both projections are
    concatenated, scaled element-wise by a learned sigmoid gate and passed to
    an MLP that emits one logit per class.

    Args:
        d_model: Feature size of the incoming embeddings.
        projection_dim: Output size of each branch projection.
        num_classes: Number of output logits.
        dropout: Dropout probability inside the classification MLP.
        kernel_size: Kernel width of the token-refining convolution.
    """

    def __init__(self, d_model, projection_dim, num_classes, dropout, kernel_size):
        super().__init__()
        fused_dim = 2 * projection_dim
        hidden_dim = 2 * fused_dim

        # Summary-embedding branch.
        self.cls_projector = nn.Linear(d_model, projection_dim)

        # Token branch: convolution + ReLU, attention pooling, projection.
        refiner_layers = [
            nn.Conv1d(d_model, d_model, kernel_size, padding='same'),
            nn.ReLU(),
        ]
        self.token_refiner = nn.Sequential(*refiner_layers)
        self.attention_pooling = AttentionPooling(d_model)
        self.tok_projector = nn.Linear(d_model, projection_dim)

        # Element-wise gate over the concatenated branches.
        self.gate = nn.Sequential(nn.Linear(fused_dim, fused_dim), nn.Sigmoid())

        # Final MLP producing the class logits.
        self.classifier_head = nn.Sequential(
            nn.LayerNorm(fused_dim),
            nn.Linear(fused_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, cls_embedding, token_embeddings, mask):
        """Compute class logits.

        Args:
            cls_embedding: Tensor of shape (batch, d_model).
            token_embeddings: Tensor of shape (batch, seq_len, d_model).
            mask: Tensor of shape (batch, seq_len) forwarded to the attention
                pooling layer.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        cls_branch = self.cls_projector(cls_embedding)

        # Conv1d expects channels on dim 1, so swap the last two axes around it.
        channels_first = token_embeddings.permute(0, 2, 1)
        refined = self.token_refiner(channels_first).permute(0, 2, 1)
        pooled = self.attention_pooling(refined, mask)
        tok_branch = self.tok_projector(pooled)

        fused = torch.cat((cls_branch, tok_branch), dim=1)
        gated = fused * self.gate(fused)
        return self.classifier_head(gated)
# --- 2. Load Models and Auxiliary Files ---
# Everything below runs once at import time so the prediction function can
# reuse the loaded models on every request.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PLM_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"
CLASSIFIER_PATH = "best_model_esm2_t30_150M_UR50D.pth"
LABEL_MAP_PATH = "label_map.json"

# Load the label map file (label -> index) and build the inverse lookup.
# The encoding is explicit so label names decode the same way on every platform.
try:
    with open(LABEL_MAP_PATH, 'r', encoding='utf-8') as f:
        label_to_idx = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError(f"Error: Could not find '{LABEL_MAP_PATH}'. Please make sure this file is uploaded to the Space.") from err
idx_to_label = {v: k for k, v in label_to_idx.items()}
NUM_CLASSES = len(idx_to_label)
D_MODEL = 640 # Dimension for esm2_t30_150M_UR50D

# Load Protein Language Model (PLM) and tokenizer
print("Loading Protein Language Model...")
tokenizer = AutoTokenizer.from_pretrained(PLM_MODEL_NAME)
plm_model = AutoModel.from_pretrained(PLM_MODEL_NAME).to(DEVICE)
plm_model.eval()
print("PLM loaded successfully.")

# Load your trained downstream classifier. The hyper-parameters here have to
# match the checkpoint, otherwise load_state_dict raises on shape mismatch.
print("Loading downstream classifier...")
classifier = ProtDualBranchEnhancedClassifier(
    d_model=D_MODEL,
    projection_dim=32,
    num_classes=NUM_CLASSES,
    dropout=0.3,
    kernel_size=3
).to(DEVICE)
if not os.path.exists(CLASSIFIER_PATH):
    raise FileNotFoundError(f"Error: Could not find the trained model file '{CLASSIFIER_PATH}'. Please make sure the correct .pth file is uploaded.")
# NOTE(review): torch.load unpickles the checkpoint, so only trusted files should
# be placed at CLASSIFIER_PATH; consider passing weights_only=True if the
# installed torch version supports it.
classifier.load_state_dict(torch.load(CLASSIFIER_PATH, map_location=DEVICE))
classifier.eval()
print("Classifier loaded. Application is ready!")
+# --- 3. Prediction Function ---
def _extract_sequence(sequence_input):
    """Return the upper-cased A-Z letters of the first sequence record in the text."""
    residue_lines = []
    for raw_line in sequence_input.splitlines():
        line = raw_line.strip()
        if line.startswith('>'):
            if residue_lines:
                # A second FASTA record starts here; only the first one is used.
                break
            continue  # header line: never part of the sequence
        if line:
            residue_lines.append(line)
    return re.sub(r'[^A-Z]', '', "".join(residue_lines).upper())


def predict(sequence_input):
    """
    Receives a protein sequence and returns a dictionary of class probabilities.

    The input may be a raw sequence or FASTA text. Header lines (starting with
    '>') are dropped even when preceded by blank lines or spaces, and when
    several FASTA records are pasted only the first one is used. Every
    character other than A-Z (after upper-casing) is removed. The tokenizer
    truncates inputs to at most 1024 tokens.

    Args:
        sequence_input: Text entered by the user.

    Returns:
        A dict mapping each label name to its softmax probability, or a
        single-entry dict with the key "Error" and a message when the input
        contains no usable sequence.
    """
    # NOTE(review): the error dicts carry a string where the probability dicts
    # carry floats; confirm the output component renders them as intended
    # (raising gr.Error may be the more suitable mechanism).
    if not sequence_input or sequence_input.isspace():
        return {"Error": "Please enter a protein sequence."}

    # Clean the input, support FASTA format
    sequence = _extract_sequence(sequence_input)
    if not sequence:
        return {"Error": "Sequence is empty after cleaning. Please enter a valid amino acid sequence."}

    with torch.no_grad():
        # Feature extraction with PLM
        inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
        hidden_states = plm_model(**inputs).last_hidden_state
        # First position feeds the summary branch; the first and last positions
        # (presumably the tokenizer's special start/end tokens) are excluded
        # from the token branch.
        cls_embedding = hidden_states[:, 0, :]
        token_embeddings = hidden_states[:, 1:-1, :]
        # Dropping two mask entries keeps the mask the same length as token_embeddings.
        token_mask = inputs['attention_mask'][:, 2:]

        # Prediction with the downstream classifier
        logits = classifier(cls_embedding, token_embeddings, token_mask)
        probabilities = F.softmax(logits, dim=1)[0]

    # Format the output
    return {idx_to_label[i]: float(prob) for i, prob in enumerate(probabilities)}
# --- 4. Create Gradio Interface ---
# Page texts shown above the input box.
title = "Predicting the subcellular location of prokaryotic proteins with LocPred-Prok"
description = """
This is a prediction tool based on the **ESM-2 (150M)** Protein Language Model and a custom **`dual_branch_enhanced`** classifier.
Simply paste a protein's amino acid sequence (FASTA format or raw sequence are both supported) into the text box below, and the model will predict its localization within the cell.
"""

# Clickable example inputs: one FASTA record and one raw sequence.
examples = [
    [">sp|P27361|PBP2_ECOLI Penicillin-binding protein 2 OS=Escherichia coli (strain K12) OX=83333 GN=mrdA PE=1 SV=2\nMKFKLTAGCLAVAGVLLASSFGADAEIVVNAIYDQVARTEDGVYTQGQLTGRRIELLNKLGIEPEDSLASTVIHEFVARVGDDHGIETIIDEFYRQHPSASL"],
    ["MSKLVKTLTISEISKAQNNGGKPAWCWYTLAMCGAGYDSGTCDYMYSHCFGIKHHSSGSSSYHC"],
]

# Input and output widgets, built separately for readability.
sequence_box = gr.Textbox(
    lines=10,
    label="Protein Sequence",
    placeholder="Paste your amino acid sequence here...",
)
# Show every class in the result panel, not only the top few.
result_label = gr.Label(num_top_classes=NUM_CLASSES, label="Prediction Results")

demo = gr.Interface(
    fn=predict,
    inputs=sequence_box,
    outputs=result_label,
    title=title,
    description=description,
    examples=examples,
    allow_flagging="never",
    theme=gr.themes.Soft(),
)
demo.launch()