Spaces:

isyslab
/

LocPred-Prok

Running

App Files Files Community

wangleiofficial commited on Dec 9, 2025

Commit

886c88b

verified ·

1 Parent(s): b3298fd

Update app.py

Browse files

Files changed (1) hide show

app.py +162 -222

app.py CHANGED Viewed

@@ -6,252 +6,192 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModel
 # ==========================
-# 🚧 0. 防止 Hugging Face 缓存溢出 (保持不变)
 # ==========================
-os.environ["HF_HOME"] = "/tmp/hf_cache"
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
-os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
-for path in ["/tmp/hf_cache", os.path.expanduser("~/.cache/huggingface")]:
-    shutil.rmtree(path, ignore_errors=True)
-    os.makedirs(path, exist_ok=True)
-# ==========================
-# 1. Model Definition (保持不变)
-# ==========================
-class AttentionPooling(nn.Module):
-    def __init__(self, d_model):
-        super().__init__()
-        self.attention_net = nn.Linear(d_model, 1)
-    def forward(self, x, mask):
-        attn_logits = self.attention_net(x).squeeze(2)
-        attn_logits.masked_fill_(mask == 0, -float('inf'))
-        attn_weights = F.softmax(attn_logits, dim=1)
-        return torch.bmm(attn_weights.unsqueeze(1), x).squeeze(1)
-class ProtDualBranchEnhancedClassifier(nn.Module):
-    def __init__(self, d_model, projection_dim, num_classes, dropout, kernel_size):
-        super().__init__()
-        self.cls_projector = nn.Linear(d_model, projection_dim)
-        self.token_refiner = nn.Sequential(
-            nn.Conv1d(d_model, d_model, kernel_size, padding='same'),
-            nn.ReLU()
-        )
-        self.attention_pooling = AttentionPooling(d_model)
-        self.tok_projector = nn.Linear(d_model, projection_dim)
-        fused_dim = projection_dim * 2
-        self.gate = nn.Sequential(
-            nn.Linear(fused_dim, fused_dim),
-            nn.Sigmoid()
-        )
-        self.classifier_head = nn.Sequential(
-            nn.LayerNorm(fused_dim),
-            nn.Linear(fused_dim, fused_dim * 2),
-            nn.ReLU(),
-            nn.Dropout(dropout),
-            nn.Linear(fused_dim * 2, num_classes)
-        )
-    def forward(self, cls_embedding, token_embeddings, mask):
-        z_cls = self.cls_projector(cls_embedding)
-        tok_emb_permuted = token_embeddings.permute(0, 2, 1)
-        refined_tok_emb = self.token_refiner(tok_emb_permuted).permute(0, 2, 1)
-        z_tok_pooled = self.attention_pooling(refined_tok_emb, mask)
-        z_tok = self.tok_projector(z_tok_pooled)
-        z_fused_concat = torch.cat([z_cls, z_tok], dim=1)
-        gate_values = self.gate(z_fused_concat)
-        z_fused_gated = z_fused_concat * gate_values
-        return self.classifier_head(z_fused_gated)
-# ==========================
-# 2. Load Models and Files (保持不变)
-# ==========================
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-PLM_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"
-CLASSIFIER_PATH = "best_model_esm2_t30_150M_UR50D.pth"
-LABEL_MAP_PATH = "label_map.json"
-if not os.path.exists(LABEL_MAP_PATH):
-    raise FileNotFoundError(f"Error: Missing '{LABEL_MAP_PATH}'.")
-with open(LABEL_MAP_PATH, 'r') as f:
-    label_to_idx = json.load(f)
-    idx_to_label = {v: k for k, v in label_to_idx.items()}
-NUM_CLASSES = len(idx_to_label)
-D_MODEL = 640
-print("🔹 Loading Protein Language Model...")
-tokenizer = AutoTokenizer.from_pretrained(PLM_MODEL_NAME)
-plm_model = AutoModel.from_pretrained(PLM_MODEL_NAME).to(DEVICE)
-plm_model.eval()
-print("✅ PLM loaded.")
-print("🔹 Loading classifier...")
-classifier = ProtDualBranchEnhancedClassifier(
-    d_model=D_MODEL, projection_dim=32, num_classes=NUM_CLASSES,
-    dropout=0.3, kernel_size=3
-).to(DEVICE)
-if not os.path.exists(CLASSIFIER_PATH):
-    raise FileNotFoundError(f"Error: Could not find '{CLASSIFIER_PATH}'.")
-classifier.load_state_dict(torch.load(CLASSIFIER_PATH, map_location=DEVICE))
-classifier.eval()
-print("✅ System Ready.")
-# ==========================
-# 3. Prediction Function (微调)
-# ==========================
-def predict(sequence_input):
-    if not sequence_input or sequence_input.isspace():
-        # 返回 None 而不是字典，让 Label 组件显示更干净
-        raise gr.Error("Sequence cannot be empty.")
-    sequence = "".join(sequence_input.split('\n')[1:]) if sequence_input.startswith('>') else sequence_input
-    sequence = re.sub(r'[^A-Z]', '', sequence.upper())
-    if not sequence:
-        raise gr.Error("Invalid sequence format.")
-    with torch.no_grad():
-        inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
-        outputs = plm_model(**inputs)
-        hidden_states = outputs.last_hidden_state
-        cls_embedding = hidden_states[:, 0, :]
-        token_embeddings = hidden_states[:, 1:-1, :]
-        token_mask = inputs['attention_mask'][:, 1:-1]
-        logits = classifier(cls_embedding, token_embeddings, token_mask)
-        probabilities = F.softmax(logits, dim=1)[0]
-    confidences = {idx_to_label[i]: float(prob) for i, prob in enumerate(probabilities)}
-    return confidences
 # ==========================
-# 4. Modernized Gradio Interface
 # ==========================
-# 自定义 CSS：增加渐变标题、阴影、圆角
-custom_css = """
-.gradio-container {
-    font-family: 'IBM Plex Sans', sans-serif;
-}
-.main-header {
-    text-align: center;
-    background: linear-gradient(135deg, #3b82f6 0%, #06b6d4 100%);
-    color: white;
-    padding: 2rem;
-    border-radius: 12px;
-    margin-bottom: 1.5rem;
-    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
-}
-.main-header h1 {
     color: white;
-    margin-bottom: 0.5rem;
-    font-size: 2.2rem;
 }
-.main-header p {
-    color: #e0f2fe;
-    font-size: 1.1rem;
 }
-.input-card, .output-card {
-    border: 1px solid #e5e7eb;
-    border-radius: 12px;
-    padding: 1.5rem;
-    background: white;
-    box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1);
 }
 """
-# 使用更清爽的 Teal (青色) 主题，符合生物信息学特征
-theme = gr.themes.Soft(
-    primary_hue="teal",
-    secondary_hue="blue",
     neutral_hue="slate",
-    font=[gr.themes.GoogleFont("IBM Plex Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
-).set(
-    button_primary_background_fill="*primary_600",
-    button_primary_background_fill_hover="*primary_700",
-    block_shadow="*shadow_drop_lg"
 )
-with gr.Blocks(theme=theme, css=custom_css, title="LocPred-Prok") as app:
-    # --- 顶部 Header ---
-    with gr.Column(elem_classes="main-header"):
-        gr.Markdown(
-            """
-            # 🧬 Prokaryotic Subcellular Localization
-            ### Dual-Branch Architecture with Protein Language Models
-            Identify where your protein functions using State-of-the-Art Deep Learning.
-            """
-        )
-    # --- 主体内容 ---
-    with gr.Row(equal_height=False):
-        # 左侧：输入区
-        with gr.Column(scale=5, elem_classes="input-card"):
-            gr.Markdown("### 📥 Input Sequence")
-            gr.Markdown("Paste your amino acid sequence (FASTA format supported).")
-            sequence_input = gr.Textbox(
-                lines=8,
-                label="",
-                placeholder=">Example Header\nMKFKLTAGCLAVAGVLLASSFGADAEIVV...",
-                show_label=False
-            )
             with gr.Row():
-                clear_btn = gr.ClearButton(components=[sequence_input], value="Clear")
-                submit_btn = gr.Button("✨ Run Prediction", variant="primary", scale=2)
-            gr.Markdown("#### 💡 Example Sequences")
-            gr.Examples(
-                examples=[
-                    [">sp|P27361|PBP2_ECOLI Penicillin-binding protein 2\nMKFKLTAGCLAVAGVLLASSFGADAEIVVNAIYDQVARTEDGVYTQGQLTGRRIELLNKLGIEPEDSLASTVIHEFVARVGDDHGIETIIDEFYRQHPSASL"],
-                    ["MSKLVKTLTISEISKAQNNGGKPAWCWYTLAMCGAGYDSGTCDYMYSHCFGIKHHSSGSSSYHC"],
-                ],
-                inputs=sequence_input,
-                label=None
             )
-        # 右侧：输出区
-        with gr.Column(scale=4, elem_classes="output-card"):
-            gr.Markdown("### 📊 Prediction Results")
-            output_label = gr.Label(
-                num_top_classes=NUM_CLASSES,
-                label="Probability Distribution",
-                show_label=False
             )
-            # 信息折叠面板
-            with gr.Accordion("📘 Model Architecture & Details", open=False):
-                gr.Markdown(
-                    """
-                    This model utilizes a **Dual-Branch Architecture**:
-                    1.  **Semantic Branch**: Extracts global features using `ESM-2 (150M)` CLS token.
-                    2.  **Structural Branch**: Refines residue-level embeddings via CNN and Attention Pooling.
-                    **Citation:**
-                    *LocPred-Prok: Prokaryotic protein subcellular localization prediction with a dual-branch architecture.*
-                    """
-                )
-    # --- 底部 Footer ---
-    gr.Markdown(
-        """
-        <div style="text-align: center; margin-top: 2rem; color: #64748b; font-size: 0.9rem;">
-        © 2025 iSysLab HUST | Powered by ESM-2 & PyTorch
         </div>
-        """
-    )
-    # --- 绑定事件 ---
     submit_btn.click(fn=predict, inputs=sequence_input, outputs=output_label)
     clear_btn.click(lambda: None, outputs=[output_label])
-# 启动
 app.launch()

 from transformers import AutoTokenizer, AutoModel
 # ==========================
+# 0-3 部分：保持你的底层逻辑完全不变
 # ==========================
+# ... (请保持之前的 Imports, Model Definition, Load Models, Predict Function 代码完全一致) ...
+# 为了节省篇幅，这里假设你已经保留了之前代码的第0到第3部分 (直到 def predict 为止)
+# 务必确保运行前包含之前的 Model 类定义和加载逻辑！
 # ==========================
+# 4. Academic Research Interface
 # ==========================
+# 学术风格 CSS
+academic_css = """
+body { font-family: 'Roboto', 'Helvetica Neue', Arial, sans-serif; }
+.header-container {
+    background: linear-gradient(to right, #1e3a8a, #3b82f6); /* 深蓝学术风 */
     color: white;
+    padding: 2.5rem;
+    border-radius: 10px;
+    margin-bottom: 20px;
+    text-align: center;
 }
+.header-title { font-size: 2.5rem; font-weight: 700; margin-bottom: 0.5rem; }
+.header-subtitle { font-size: 1.2rem; opacity: 0.9; font-weight: 300; }
+.badge-container { display: flex; justify-content: center; gap: 15px; margin-top: 15px; }
+.badge {
+    background: rgba(255,255,255,0.2);
+    padding: 5px 15px;
+    border-radius: 20px;
+    font-size: 0.9rem;
+    border: 1px solid rgba(255,255,255,0.4);
 }
+.highlight-box {
+    background: #f8fafc;
+    border-left: 5px solid #3b82f6;
+    padding: 15px;
+    margin: 20px 0;
+    color: #334155;
 }
+.performance-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
+.performance-table th { background: #e2e8f0; padding: 8px; text-align: left; }
+.performance-table td { border-bottom: 1px solid #e2e8f0; padding: 8px; }
+.footer { text-align: center; color: #94a3b8; margin-top: 30px; font-size: 0.85rem; }
 """
+# 定义主题
+theme = gr.themes.Default(
+    primary_hue="blue",
+    secondary_hue="slate",
     neutral_hue="slate",
+    font=[gr.themes.GoogleFont("Roboto"), "ui-sans-serif", "system-ui"]
 )
+with gr.Blocks(theme=theme, css=academic_css, title="LocPred-Prok Web Server") as app:
+    # --- 1. 学术 Header ---
+    with gr.Column(elem_classes="header-container"):
+        gr.HTML("""
+            <div class="header-title">LocPred-Prok</div>
+            <div class="header-subtitle">
+                Prokaryotic Protein Subcellular Localization Prediction with Dual-Branch Architecture
+            </div>
+            <div class="badge-container">
+                <span class="badge">🧬 ESM-2 150M Backbone</span>
+                <span class="badge">🏆 91.2% Accuracy</span>
+                <span class="badge">🎯 MCC 0.889</span>
+            </div>
+        """)
+    # --- 2. 核心功能区 (Tab结构) ---
+    with gr.Tabs():
+        # === Tab 1: Web Server (预测工具) ===
+        with gr.TabItem("🚀 Prediction Server"):
             with gr.Row():
+                # 左侧输入
+                with gr.Column(scale=5):
+                    gr.Markdown("### 📥 Input Sequence (FASTA)")
+                    sequence_input = gr.Textbox(
+                        lines=8,
+                        placeholder=">Example_Protein\nMKFKLTAGCLAVAGVLLASSFGADAEIVVNAIYDQVARTEDGVYTQGQLTGRRIELLNKLGIEPEDSLASTVIHEFVARVGDDHGIETIIDEFYRQHPSASL...",
+                        show_label=False,
+                        elem_id="seq-input"
+                    )
+                    with gr.Row():
+                        clear_btn = gr.ClearButton(components=[sequence_input], value="Clear")
+                        submit_btn = gr.Button("Run Prediction", variant="primary", scale=2)
+                    gr.Markdown("#### Example Sequences")
+                    gr.Examples(
+                        examples=[
+                            [">Gram-negative Outer Membrane Protein\nMSKLVKTLTISEISKAQNNGGKPAWCWYTLAMCGAGYDSGTCDYMYSHCFGIKHHSSGSSSYHC"],
+                            [">Gram-positive Cell Wall Protein\nMKFKLTAGCLAVAGVLLASSFGADAEIVVNAIYDQVARTEDGVYTQGQLTGRRIELLNKLGIEPEDSLASTVIHEFVARVGDDHGIETIIDEFYRQHPSASL"],
+                        ],
+                        inputs=sequence_input,
+                        label=None
+                    )
+                # 右侧输出
+                with gr.Column(scale=4):
+                    gr.Markdown("### 📊 Prediction Results")
+                    output_label = gr.Label(num_top_classes=NUM_CLASSES, label="Probabilities")
+                    # 解释性文字
+                    gr.Markdown(
+                        """
+                        <div style="font-size: 0.9rem; color: #64748b; margin-top: 10px;">
+                        <b>Note:</b> This model is optimized for challenging classes including
+                        <i>Gram-positive cell wall</i> and <i>Gram-negative outer membrane</i> proteins.
+                        </div>
+                        """
+                    )
+        # === Tab 2: About & Abstract (论文展示) ===
+        with gr.TabItem("📖 About & Abstract"):
+            gr.Markdown("### Abstract")
+            gr.Markdown(
+                """
+                The precise localization of proteins within prokaryotic cells is fundamental to understanding their function.
+                **LocPred-Prok** is a novel deep learning framework that employs a purpose-built **dual-branch architecture**,
+                synergistically integrating global and local sequence features extracted from **ESM-2 (150M)** embeddings.
+                """
             )
+            # 高亮核心发现
+            gr.HTML("""
+            <div class="highlight-box">
+                <b>💡 Key Findings:</b><br>
+                1. <b>Bigger ≠ Better:</b> Peak performance is achieved by the mid-sized ESM-2-150M, not the largest models.<br>
+                2. <b>Hard Classes Solved:</b> Exceptional performance on Gram-positive cell wall (MCC=0.84) and Gram-negative outer membrane (MCC=0.91).
+            </div>
+            """)
+            gr.Markdown("### 📈 Performance Metrics (Homology-Partitioned Benchmark)")
+            gr.HTML("""
+            <table class="performance-table">
+                <tr>
+                    <th>Metric</th>
+                    <th>LocPred-Prok Score</th>
+                    <th>Improvement</th>
+                </tr>
+                <tr>
+                    <td><b>Accuracy</b></td>
+                    <td><b>91.2%</b></td>
+                    <td>State-of-the-Art</td>
+                </tr>
+                <tr>
+                    <td><b>MCC (Overall)</b></td>
+                    <td><b>0.889</b></td>
+                    <td>Significant Leap</td>
+                </tr>
+                 <tr>
+                    <td>MCC (Outer Membrane)</td>
+                    <td>0.91</td>
+                    <td>High Precision</td>
+                </tr>
+            </table>
+            """)
+            # 这里可以放架构图，如果你有图片链接的话
+            # gr.Image("https://your-image-url.com/architecture.png", label="Model Architecture")
+        # === Tab 3: Citation (引用) ===
+        with gr.TabItem("📝 Citation"):
+            gr.Markdown("If you use LocPred-Prok in your research, please cite our paper:")
+            gr.Code(
+                """
+@article{LocPredProk2025,
+  title={LocPred-Prok: Prokaryotic protein subcellular localization prediction with a dual-branch architecture and protein language model},
+  author={Your Name and Co-authors},
+  journal={Submission Journal},
+  year={2025}
+}
+                """,
+                language="bibtex",
+                label="BibTeX"
             )
+    # --- Footer ---
+    gr.HTML("""
+        <div class="footer">
+            Developed by iSysLab | <a href="https://github.com/isyslab-hust" target="_blank">GitHub</a> | Based on ESM-2 & PyTorch
         </div>
+    """)
+    # 绑定事件
     submit_btn.click(fn=predict, inputs=sequence_input, outputs=output_label)
     clear_btn.click(lambda: None, outputs=[output_label])
 app.launch()