Spaces:

isyslab
/

LocPred-Prok

Running

App Files Files Community

wangleiofficial committed on Dec 9, 2025

Commit

b3298fd

verified ·

1 Parent(s): 4782d51

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -45

app.py CHANGED Viewed

@@ -6,22 +6,20 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModel
 # ==========================
-# 🚧 0. 防止 Hugging Face 缓存溢出
 # ==========================
 os.environ["HF_HOME"] = "/tmp/hf_cache"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
-# 每次启动时清理旧缓存，防止超过 50G 限制
 for path in ["/tmp/hf_cache", os.path.expanduser("~/.cache/huggingface")]:
     shutil.rmtree(path, ignore_errors=True)
     os.makedirs(path, exist_ok=True)
 # ==========================
-# 1. Model Definition
 # ==========================
 class AttentionPooling(nn.Module):
-    """Attention Pooling Layer"""
     def __init__(self, d_model):
         super().__init__()
         self.attention_net = nn.Linear(d_model, 1)
@@ -33,7 +31,6 @@ class AttentionPooling(nn.Module):
         return torch.bmm(attn_weights.unsqueeze(1), x).squeeze(1)
 class ProtDualBranchEnhancedClassifier(nn.Module):
-    """Enhanced dual-branch model"""
     def __init__(self, d_model, projection_dim, num_classes, dropout, kernel_size):
         super().__init__()
         self.cls_projector = nn.Linear(d_model, projection_dim)
@@ -68,16 +65,15 @@ class ProtDualBranchEnhancedClassifier(nn.Module):
         return self.classifier_head(z_fused_gated)
 # ==========================
-# 2. Load Models and Files
 # ==========================
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-PLM_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"  # 可改为 esm2_t12_35M_UR50D 减少体积
 CLASSIFIER_PATH = "best_model_esm2_t30_150M_UR50D.pth"
 LABEL_MAP_PATH = "label_map.json"
-# --- 加载标签映射 ---
 if not os.path.exists(LABEL_MAP_PATH):
-    raise FileNotFoundError(f"Error: Missing '{LABEL_MAP_PATH}'. Please upload it to your Space.")
 with open(LABEL_MAP_PATH, 'r') as f:
     label_to_idx = json.load(f)
     idx_to_label = {v: k for k, v in label_to_idx.items()}
@@ -85,40 +81,38 @@ with open(LABEL_MAP_PATH, 'r') as f:
 NUM_CLASSES = len(idx_to_label)
 D_MODEL = 640
-# --- 加载预训练蛋白模型 ---
 print("🔹 Loading Protein Language Model...")
 tokenizer = AutoTokenizer.from_pretrained(PLM_MODEL_NAME)
 plm_model = AutoModel.from_pretrained(PLM_MODEL_NAME).to(DEVICE)
 plm_model.eval()
-print("✅ PLM loaded successfully.")
-# --- 加载下游分类器 ---
-print("🔹 Loading downstream classifier...")
 classifier = ProtDualBranchEnhancedClassifier(
     d_model=D_MODEL, projection_dim=32, num_classes=NUM_CLASSES,
     dropout=0.3, kernel_size=3
 ).to(DEVICE)
 if not os.path.exists(CLASSIFIER_PATH):
-    raise FileNotFoundError(f"Error: Could not find '{CLASSIFIER_PATH}'. Please upload your trained .pth file.")
 classifier.load_state_dict(torch.load(CLASSIFIER_PATH, map_location=DEVICE))
 classifier.eval()
-print("✅ Classifier loaded. Application is ready!")
 # ==========================
-# 3. Prediction Function
 # ==========================
 def predict(sequence_input):
     if not sequence_input or sequence_input.isspace():
-        return {"Error": "Please enter a protein sequence."}
-    # Clean FASTA header if present
     sequence = "".join(sequence_input.split('\n')[1:]) if sequence_input.startswith('>') else sequence_input
     sequence = re.sub(r'[^A-Z]', '', sequence.upper())
     if not sequence:
-        return {"Error": "Sequence is empty after cleaning. Please enter a valid amino acid sequence."}
     with torch.no_grad():
         inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
@@ -135,52 +129,129 @@ def predict(sequence_input):
     return confidences
 # ==========================
-# 4. Gradio Interface
 # ==========================
-with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 800px; margin: auto;}") as app:
-    gr.Markdown(
-        """
-        # 🧬 Protein Subcellular Localization Prediction
-        A prediction tool based on **ESM-2 (150M)** and a custom **dual-branch enhanced classifier**.
-        """
-    )
-    with gr.Row():
-        with gr.Column(scale=1):
             sequence_input = gr.Textbox(
-                lines=10,
-                label="Protein Sequence",
-                placeholder="Paste your amino acid sequence here..."
             )
             with gr.Row():
-                clear_btn = gr.ClearButton()
-                submit_btn = gr.Button("🚀 Predict", variant="primary")
             gr.Examples(
                 examples=[
-                    [">sp|P27361|PBP2_ECOLI Penicillin-binding protein 2 OS=Escherichia coli (strain K12) OX=83333 GN=mrdA PE=1 SV=2\nMKFKLTAGCLAVAGVLLASSFGADAEIVVNAIYDQVARTEDGVYTQGQLTGRRIELLNKLGIEPEDSLASTVIHEFVARVGDDHGIETIIDEFYRQHPSASL"],
                     ["MSKLVKTLTISEISKAQNNGGKPAWCWYTLAMCGAGYDSGTCDYMYSHCFGIKHHSSGSSSYHC"],
                 ],
                 inputs=sequence_input,
-                label="Examples"
             )
-        with gr.Column(scale=1):
-            output_label = gr.Label(num_top_classes=NUM_CLASSES, label="Prediction Results")
-            with gr.Accordion("Model Information", open=False):
                 gr.Markdown(
                     """
-                    * **Protein Language Model (PLM)**: `facebook/esm2_t30_150M_UR50D`
-                    * **Downstream Classifier**: `ProtDualBranchEnhancedClassifier`
-                    * **GitHub**: github.com/isyslab-hust
                     """
                 )
-    gr.Markdown("---\n*Built by isyslab*")
-    submit_btn.click(fn=predict, inputs=sequence_input, outputs=output_label, api_name="predict")
-    clear_btn.click(lambda: [None, None], outputs=[sequence_input, output_label])
 app.launch()

 from transformers import AutoTokenizer, AutoModel
 # ==========================
+# 🚧 0. 防止 Hugging Face 缓存溢出 (保持不变)
 # ==========================
 os.environ["HF_HOME"] = "/tmp/hf_cache"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
 for path in ["/tmp/hf_cache", os.path.expanduser("~/.cache/huggingface")]:
     shutil.rmtree(path, ignore_errors=True)
     os.makedirs(path, exist_ok=True)
 # ==========================
+# 1. Model Definition (保持不变)
 # ==========================
 class AttentionPooling(nn.Module):
     def __init__(self, d_model):
         super().__init__()
         self.attention_net = nn.Linear(d_model, 1)
         return torch.bmm(attn_weights.unsqueeze(1), x).squeeze(1)
 class ProtDualBranchEnhancedClassifier(nn.Module):
     def __init__(self, d_model, projection_dim, num_classes, dropout, kernel_size):
         super().__init__()
         self.cls_projector = nn.Linear(d_model, projection_dim)
         return self.classifier_head(z_fused_gated)
 # ==========================
+# 2. Load Models and Files (保持不变)
 # ==========================
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+PLM_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"
 CLASSIFIER_PATH = "best_model_esm2_t30_150M_UR50D.pth"
 LABEL_MAP_PATH = "label_map.json"
 if not os.path.exists(LABEL_MAP_PATH):
+    raise FileNotFoundError(f"Error: Missing '{LABEL_MAP_PATH}'.")
 with open(LABEL_MAP_PATH, 'r') as f:
     label_to_idx = json.load(f)
     idx_to_label = {v: k for k, v in label_to_idx.items()}
 NUM_CLASSES = len(idx_to_label)
 D_MODEL = 640
 print("🔹 Loading Protein Language Model...")
 tokenizer = AutoTokenizer.from_pretrained(PLM_MODEL_NAME)
 plm_model = AutoModel.from_pretrained(PLM_MODEL_NAME).to(DEVICE)
 plm_model.eval()
+print("✅ PLM loaded.")
+print("🔹 Loading classifier...")
 classifier = ProtDualBranchEnhancedClassifier(
     d_model=D_MODEL, projection_dim=32, num_classes=NUM_CLASSES,
     dropout=0.3, kernel_size=3
 ).to(DEVICE)
 if not os.path.exists(CLASSIFIER_PATH):
+    raise FileNotFoundError(f"Error: Could not find '{CLASSIFIER_PATH}'.")
 classifier.load_state_dict(torch.load(CLASSIFIER_PATH, map_location=DEVICE))
 classifier.eval()
+print("✅ System Ready.")
 # ==========================
+# 3. Prediction Function (微调)
 # ==========================
 def predict(sequence_input):
     if not sequence_input or sequence_input.isspace():
+        # 抛出 gr.Error 而不是返回字典，让 Label 组件显示更干净
+        raise gr.Error("Sequence cannot be empty.")
     sequence = "".join(sequence_input.split('\n')[1:]) if sequence_input.startswith('>') else sequence_input
     sequence = re.sub(r'[^A-Z]', '', sequence.upper())
     if not sequence:
+        raise gr.Error("Invalid sequence format.")
     with torch.no_grad():
         inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
     return confidences
 # ==========================
+# 4. Modernized Gradio Interface
 # ==========================
+# 自定义 CSS：增加渐变标题、阴影、圆角
+custom_css = """
+.gradio-container {
+    font-family: 'IBM Plex Sans', sans-serif;
+}
+.main-header {
+    text-align: center;
+    background: linear-gradient(135deg, #3b82f6 0%, #06b6d4 100%);
+    color: white;
+    padding: 2rem;
+    border-radius: 12px;
+    margin-bottom: 1.5rem;
+    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
+}
+.main-header h1 {
+    color: white;
+    margin-bottom: 0.5rem;
+    font-size: 2.2rem;
+}
+.main-header p {
+    color: #e0f2fe;
+    font-size: 1.1rem;
+}
+.input-card, .output-card {
+    border: 1px solid #e5e7eb;
+    border-radius: 12px;
+    padding: 1.5rem;
+    background: white;
+    box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1);
+}
+"""
+# 使用更清爽的 Teal (青色) 主题，符合生物信息学特征
+theme = gr.themes.Soft(
+    primary_hue="teal",
+    secondary_hue="blue",
+    neutral_hue="slate",
+    font=[gr.themes.GoogleFont("IBM Plex Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
+).set(
+    button_primary_background_fill="*primary_600",
+    button_primary_background_fill_hover="*primary_700",
+    block_shadow="*shadow_drop_lg"
+)
+with gr.Blocks(theme=theme, css=custom_css, title="LocPred-Prok") as app:
+    # --- 顶部 Header ---
+    with gr.Column(elem_classes="main-header"):
+        gr.Markdown(
+            """
+            # 🧬 Prokaryotic Subcellular Localization
+            ### Dual-Branch Architecture with Protein Language Models
+            Identify where your protein functions using State-of-the-Art Deep Learning.
+            """
+        )
+    # --- 主体内容 ---
+    with gr.Row(equal_height=False):
+        # 左侧：输入区
+        with gr.Column(scale=5, elem_classes="input-card"):
+            gr.Markdown("### 📥 Input Sequence")
+            gr.Markdown("Paste your amino acid sequence (FASTA format supported).")
             sequence_input = gr.Textbox(
+                lines=8,
+                label="",
+                placeholder=">Example Header\nMKFKLTAGCLAVAGVLLASSFGADAEIVV...",
+                show_label=False
             )
             with gr.Row():
+                clear_btn = gr.ClearButton(components=[sequence_input], value="Clear")
+                submit_btn = gr.Button("✨ Run Prediction", variant="primary", scale=2)
+            gr.Markdown("#### 💡 Example Sequences")
             gr.Examples(
                 examples=[
+                    [">sp|P27361|PBP2_ECOLI Penicillin-binding protein 2\nMKFKLTAGCLAVAGVLLASSFGADAEIVVNAIYDQVARTEDGVYTQGQLTGRRIELLNKLGIEPEDSLASTVIHEFVARVGDDHGIETIIDEFYRQHPSASL"],
                     ["MSKLVKTLTISEISKAQNNGGKPAWCWYTLAMCGAGYDSGTCDYMYSHCFGIKHHSSGSSSYHC"],
                 ],
                 inputs=sequence_input,
+                label=None
             )
+        # 右侧：输出区
+        with gr.Column(scale=4, elem_classes="output-card"):
+            gr.Markdown("### 📊 Prediction Results")
+            output_label = gr.Label(
+                num_top_classes=NUM_CLASSES,
+                label="Probability Distribution",
+                show_label=False
+            )
+            # 信息折叠面板
+            with gr.Accordion("📘 Model Architecture & Details", open=False):
                 gr.Markdown(
                     """
+                    This model utilizes a **Dual-Branch Architecture**:
+                    1.  **Semantic Branch**: Extracts global features using `ESM-2 (150M)` CLS token.
+                    2.  **Structural Branch**: Refines residue-level embeddings via CNN and Attention Pooling.
+                    **Citation:**
+                    *LocPred-Prok: Prokaryotic protein subcellular localization prediction with a dual-branch architecture.*
                     """
                 )
+    # --- 底部 Footer ---
+    gr.Markdown(
+        """
+        <div style="text-align: center; margin-top: 2rem; color: #64748b; font-size: 0.9rem;">
+        © 2025 iSysLab HUST | Powered by ESM-2 & PyTorch
+        </div>
+        """
+    )
+    # --- 绑定事件 ---
+    submit_btn.click(fn=predict, inputs=sequence_input, outputs=output_label)
+    clear_btn.click(lambda: None, outputs=[output_label])
+# 启动
 app.launch()