Spaces:

isyslab
/

LocPred-Prok

Running

App Files Files Community

wangleiofficial committed on Oct 13, 2025

Commit

4782d51

verified ·

1 Parent(s): 7b163dc

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -44

app.py CHANGED Viewed

@@ -1,13 +1,25 @@
-import gradio as gr
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModel
-import json
-import os
-import re
-# --- 1. Model Definition (Must be identical to the one used during training) ---
 class AttentionPooling(nn.Module):
     """Attention Pooling Layer"""
     def __init__(self, d_model):
@@ -27,17 +39,23 @@ class ProtDualBranchEnhancedClassifier(nn.Module):
         self.cls_projector = nn.Linear(d_model, projection_dim)
         self.token_refiner = nn.Sequential(
             nn.Conv1d(d_model, d_model, kernel_size, padding='same'),
-            nn.ReLU())
         self.attention_pooling = AttentionPooling(d_model)
         self.tok_projector = nn.Linear(d_model, projection_dim)
         fused_dim = projection_dim * 2
-        self.gate = nn.Sequential(nn.Linear(fused_dim, fused_dim), nn.Sigmoid())
         self.classifier_head = nn.Sequential(
             nn.LayerNorm(fused_dim),
             nn.Linear(fused_dim, fused_dim * 2),
             nn.ReLU(),
             nn.Dropout(dropout),
-            nn.Linear(fused_dim * 2, num_classes))
     def forward(self, cls_embedding, token_embeddings, mask):
         z_cls = self.cls_projector(cls_embedding)
         tok_emb_permuted = token_embeddings.permute(0, 2, 1)
@@ -49,76 +67,81 @@ class ProtDualBranchEnhancedClassifier(nn.Module):
         z_fused_gated = z_fused_concat * gate_values
         return self.classifier_head(z_fused_gated)
-# --- 2. Load Models and Auxiliary Files ---
 # Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-PLM_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"
 # Files read from the working directory: trained classifier weights and the label map.
 CLASSIFIER_PATH = "best_model_esm2_t30_150M_UR50D.pth"
 LABEL_MAP_PATH = "label_map.json"
-try:
-    with open(LABEL_MAP_PATH, 'r') as f:
-        label_to_idx = json.load(f)
-        idx_to_label = {v: k for k, v in label_to_idx.items()}
-except FileNotFoundError:
-    raise FileNotFoundError(f"Error: Could not find '{LABEL_MAP_PATH}'. Please make sure this file is uploaded to the Space.")
 # idx_to_label is the inverted label map, so its size is the number of output classes.
 NUM_CLASSES = len(idx_to_label)
 # NOTE(review): presumably must equal the hidden size of PLM_MODEL_NAME - confirm before swapping models.
 D_MODEL = 640
-print("Loading Protein Language Model...")
 # Load the tokenizer and language model once at import time; eval() switches the model to inference mode.
 tokenizer = AutoTokenizer.from_pretrained(PLM_MODEL_NAME)
 plm_model = AutoModel.from_pretrained(PLM_MODEL_NAME).to(DEVICE)
 plm_model.eval()
-print("PLM loaded successfully.")
-print("Loading downstream classifier...")
 # Build the classifier with the hyperparameters given here, then restore its trained weights.
 classifier = ProtDualBranchEnhancedClassifier(
     d_model=D_MODEL, projection_dim=32, num_classes=NUM_CLASSES,
     dropout=0.3, kernel_size=3
 ).to(DEVICE)
 # Fail with an explicit message when the checkpoint file is missing.
 if not os.path.exists(CLASSIFIER_PATH):
-    raise FileNotFoundError(f"Error: Could not find the trained model file '{CLASSIFIER_PATH}'. Please make sure the correct .pth file is uploaded.")
 # NOTE(review): torch.load unpickles the file; consider weights_only=True since only a state_dict is loaded.
 classifier.load_state_dict(torch.load(CLASSIFIER_PATH, map_location=DEVICE))
 classifier.eval()
-print("Classifier loaded. Application is ready!")
-# --- 3. Prediction Function ---
 def predict(sequence_input):
     """Return per-class probabilities for one protein sequence.

     sequence_input is the pasted text: raw residues, or FASTA text whose
     first line starts with '>'. Returns a dict mapping each label name to
     its softmax probability, or a dict with a single "Error" key when the
     input is missing or contains no letters after cleaning.
     """
     # Reject missing or whitespace-only input before touching the models.
     if not sequence_input or sequence_input.isspace():
         return {"Error": "Please enter a protein sequence."}
     # When the text starts with '>', drop the first line (the FASTA header) and join the rest.
     # NOTE(review): a header preceded by whitespace, or a second '>' record, is not recognised here.
     sequence = "".join(sequence_input.split('\n')[1:]) if sequence_input.startswith('>') else sequence_input
     # Upper-case, then remove every character outside A-Z (whitespace, digits, symbols).
     sequence = re.sub(r'[^A-Z]', '', sequence.upper())
     if not sequence:
         return {"Error": "Sequence is empty after cleaning. Please enter a valid amino acid sequence."}
     with torch.no_grad():
         # Tokenize (truncated to at most 1024 tokens) and run the language model without gradients.
         inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
         outputs = plm_model(**inputs)
         hidden_states = outputs.last_hidden_state
         # Position 0 feeds the CLS branch; the slice 1:-1 feeds the token branch.
         # NOTE(review): presumably 1:-1 trims the tokenizer's start/end special tokens - confirm.
         cls_embedding = hidden_states[:, 0, :]
         token_embeddings = hidden_states[:, 1:-1, :]
         token_mask = inputs['attention_mask'][:, 1:-1]
-    with torch.no_grad():
         logits = classifier(cls_embedding, token_embeddings, token_mask)
         # Softmax over the class dimension; [0] selects the single sequence in the batch.
         probabilities = F.softmax(logits, dim=1)[0]
     # Map each class index to its label name and a plain Python float.
     confidences = {idx_to_label[i]: float(prob) for i, prob in enumerate(probabilities)}
     return confidences
-# --- 4. Create Beautified Gradio Interface using Blocks ---
 with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 800px; margin: auto;}") as app:
     gr.Markdown(
         """
-        # Protein Subcellular Localization Prediction
-        An online prediction tool based on the **ESM-2 (150M)** Protein Language Model and a custom **`dual_branch_enhanced`** classifier.
-        Just paste the amino acid sequence of a protein (FASTA format or raw sequence are supported), and the model will predict its location within the cell.
         """
     )
@@ -129,7 +152,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 800px;
                 label="Protein Sequence",
                 placeholder="Paste your amino acid sequence here..."
             )
             with gr.Row():
                 clear_btn = gr.ClearButton()
                 submit_btn = gr.Button("🚀 Predict", variant="primary")
@@ -145,25 +168,19 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 800px;
         with gr.Column(scale=1):
             output_label = gr.Label(num_top_classes=NUM_CLASSES, label="Prediction Results")
             with gr.Accordion("Model Information", open=False):
                 gr.Markdown(
                     """
-                    * **Protein Language Model (PLM)**: `facebook/esm2_t30_150M_UR50D`
-                    * **Downstream Classifier**: `ProtDualBranchEnhancedClassifier`
-                    * **GitHub Repository**: github.com/isyslab-hust
                     """
                 )
-    gr.Markdown(
-        """
-        ---
-        *Built by isyslab*
-        """
-    )
     submit_btn.click(fn=predict, inputs=sequence_input, outputs=output_label, api_name="predict")
     clear_btn.click(lambda: [None, None], outputs=[sequence_input, output_label])
-app.launch()

+import os, shutil, json, re
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import gradio as gr
 from transformers import AutoTokenizer, AutoModel
+# ==========================
+# 🚧 0. Keep the Hugging Face cache from overflowing. NOTE(review): these variables are set after gradio/transformers were imported above - confirm they still take effect, otherwise move this section ahead of those imports.
+# ==========================
+os.environ["HF_HOME"] = "/tmp/hf_cache"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
+os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+# Clear old caches on every startup so disk usage stays under the 50G limit
+for path in ["/tmp/hf_cache", os.path.expanduser("~/.cache/huggingface")]:
+    shutil.rmtree(path, ignore_errors=True)
+    os.makedirs(path, exist_ok=True)
+# ==========================
+# 1. Model Definition
+# ==========================
 class AttentionPooling(nn.Module):
     """Attention Pooling Layer"""
     def __init__(self, d_model):
         self.cls_projector = nn.Linear(d_model, projection_dim)
         self.token_refiner = nn.Sequential(
             nn.Conv1d(d_model, d_model, kernel_size, padding='same'),
+            nn.ReLU()
+        )
         self.attention_pooling = AttentionPooling(d_model)
         self.tok_projector = nn.Linear(d_model, projection_dim)
         fused_dim = projection_dim * 2
+        self.gate = nn.Sequential(
+            nn.Linear(fused_dim, fused_dim),
+            nn.Sigmoid()
+        )
         self.classifier_head = nn.Sequential(
             nn.LayerNorm(fused_dim),
             nn.Linear(fused_dim, fused_dim * 2),
             nn.ReLU(),
             nn.Dropout(dropout),
+            nn.Linear(fused_dim * 2, num_classes)
+        )
     def forward(self, cls_embedding, token_embeddings, mask):
         z_cls = self.cls_projector(cls_embedding)
         tok_emb_permuted = token_embeddings.permute(0, 2, 1)
         z_fused_gated = z_fused_concat * gate_values
         return self.classifier_head(z_fused_gated)
+# ==========================
+# 2. Load Models and Files
+# ==========================
 # Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+PLM_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"  # can be switched to esm2_t12_35M_UR50D to reduce size
 # Files read from the working directory: trained classifier weights and the label map.
 CLASSIFIER_PATH = "best_model_esm2_t30_150M_UR50D.pth"
 LABEL_MAP_PATH = "label_map.json"
+# --- Load the label map ---
+if not os.path.exists(LABEL_MAP_PATH):
+    raise FileNotFoundError(f"Error: Missing '{LABEL_MAP_PATH}'. Please upload it to your Space.")
+with open(LABEL_MAP_PATH, 'r') as f:
+    label_to_idx = json.load(f)
+    idx_to_label = {v: k for k, v in label_to_idx.items()}
 # idx_to_label is the inverted label map, so its size is the number of output classes.
 NUM_CLASSES = len(idx_to_label)
 # NOTE(review): presumably must equal the hidden size of PLM_MODEL_NAME - confirm before swapping models.
 D_MODEL = 640
+# --- Load the pretrained protein language model ---
+print("🔹 Loading Protein Language Model...")
 tokenizer = AutoTokenizer.from_pretrained(PLM_MODEL_NAME)
 plm_model = AutoModel.from_pretrained(PLM_MODEL_NAME).to(DEVICE)
 plm_model.eval()
+print("✅ PLM loaded successfully.")
+# --- Load the downstream classifier ---
+print("🔹 Loading downstream classifier...")
 classifier = ProtDualBranchEnhancedClassifier(
     d_model=D_MODEL, projection_dim=32, num_classes=NUM_CLASSES,
     dropout=0.3, kernel_size=3
 ).to(DEVICE)
 # Fail with an explicit message when the checkpoint file is missing.
 if not os.path.exists(CLASSIFIER_PATH):
+    raise FileNotFoundError(f"Error: Could not find '{CLASSIFIER_PATH}'. Please upload your trained .pth file.")
 # NOTE(review): torch.load unpickles the file; consider weights_only=True since only a state_dict is loaded.
 classifier.load_state_dict(torch.load(CLASSIFIER_PATH, map_location=DEVICE))
 classifier.eval()
+print("✅ Classifier loaded. Application is ready!")
+# ==========================
+# 3. Prediction Function
+# ==========================
 def predict(sequence_input):
     """Return per-class probabilities for one protein sequence.

     sequence_input is the pasted text: raw residues or FASTA text. Lines
     starting with '>' are treated as FASTA headers and never contribute
     residues; when several records are pasted, only the first one is used.
     Returns a dict mapping each label name to its softmax probability, or
     a dict with a single "Error" key when the input is missing or contains
     no letters after cleaning.
     """
     # Reject missing or whitespace-only input before touching the models.
     if not sequence_input or sequence_input.isspace():
         return {"Error": "Please enter a protein sequence."}
     # Collect the residue lines of the first record. Stripping first means a
     # header preceded by blank lines or spaces is still recognised.
     residue_lines = []
     for line in sequence_input.strip().splitlines():
         line = line.strip()
         if line.startswith('>'):
             if residue_lines:
                 # A new header after residues marks the next record: stop here.
                 break
             continue
         residue_lines.append(line)
     # Upper-case, then remove every character outside A-Z (whitespace, digits, symbols).
     sequence = re.sub(r'[^A-Z]', '', "".join(residue_lines).upper())
     if not sequence:
         return {"Error": "Sequence is empty after cleaning. Please enter a valid amino acid sequence."}
     with torch.no_grad():
         # Tokenize (truncated to at most 1024 tokens) and run the language model without gradients.
         inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
         outputs = plm_model(**inputs)
         hidden_states = outputs.last_hidden_state
         # Position 0 feeds the CLS branch; the slice 1:-1 feeds the token branch.
         # NOTE(review): presumably 1:-1 trims the tokenizer's start/end special tokens - confirm.
         cls_embedding = hidden_states[:, 0, :]
         token_embeddings = hidden_states[:, 1:-1, :]
         token_mask = inputs['attention_mask'][:, 1:-1]
         logits = classifier(cls_embedding, token_embeddings, token_mask)
         # Softmax over the class dimension; [0] selects the single sequence in the batch.
         probabilities = F.softmax(logits, dim=1)[0]
     # Map each class index to its label name and a plain Python float.
     confidences = {idx_to_label[i]: float(prob) for i, prob in enumerate(probabilities)}
     return confidences
+# ==========================
+# 4. Gradio Interface
+# ==========================
 with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 800px; margin: auto;}") as app:
     gr.Markdown(
         """
+        # 🧬 Protein Subcellular Localization Prediction
+        A prediction tool based on **ESM-2 (150M)** and a custom **dual-branch enhanced classifier**.
         """
     )
                 label="Protein Sequence",
                 placeholder="Paste your amino acid sequence here..."
             )
             with gr.Row():
                 clear_btn = gr.ClearButton()
                 submit_btn = gr.Button("🚀 Predict", variant="primary")
         with gr.Column(scale=1):
             output_label = gr.Label(num_top_classes=NUM_CLASSES, label="Prediction Results")
             with gr.Accordion("Model Information", open=False):
                 gr.Markdown(
                     """
+                    * **Protein Language Model (PLM)**: `facebook/esm2_t30_150M_UR50D`
+                    * **Downstream Classifier**: `ProtDualBranchEnhancedClassifier`
+                    * **GitHub**: github.com/isyslab-hust
                     """
                 )
+    gr.Markdown("---\n*Built by isyslab*")
     submit_btn.click(fn=predict, inputs=sequence_input, outputs=output_label, api_name="predict")
     clear_btn.click(lambda: [None, None], outputs=[sequence_input, output_label])
+app.launch()