yisen888 commited on
Commit
bf19137
·
verified ·
1 Parent(s): f973fbb

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +454 -0
app.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import pandas as pd
5
+ from torch import nn
6
+ from transformers import AutoTokenizer, AutoModel
7
+ from peft import get_peft_model, LoraConfig, TaskType
8
+ import os
9
+
10
# ================= Configuration (unchanged) =================
MODEL_DIR = "."  # directory holding the fine-tuned weight file v14_weights.bin
BASE_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"  # ESM-2 150M protein language model backbone
# Multi-label classifier outputs; order must match the trained head's output order.
LABELS = ['anti_acne', 'anti_aging', 'anti_inflammatory', 'anti_oxidant', 'repair', 'whitening', 'delivery', 'negative']
14
+
15
# ================= Core components =================
# Per-residue hydropathy index (single-letter amino-acid codes).
AA_PROPS = {'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5, 'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2}
# Nominal side-chain charge at physiological pH; residues not listed count as 0.
AA_CHARGE = {'R': 1, 'K': 1, 'H': 0.1, 'D': -1, 'E': -1}

def compute_biophysics(seq):
    """Compute five simple biophysical descriptors for a peptide sequence.

    Args:
        seq: peptide as a string of single-letter amino-acid codes.

    Returns:
        float32 numpy array of shape (5,):
        [mean hydropathy, net charge, estimated molecular weight (kDa),
         N-terminal hydropathy, C-terminal charge].

    Fix: the empty-sequence branch previously returned a plain Python list
    ([0]*5) while every other path returned a float32 ndarray; it now
    returns np.zeros so the return type is consistent for all inputs.
    """
    length = len(seq)
    if length == 0:
        return np.zeros(5, dtype=np.float32)
    hydro = sum(AA_PROPS.get(aa, 0) for aa in seq) / length
    charge = sum(AA_CHARGE.get(aa, 0) for aa in seq)
    # Rough mass estimate: ~110 Da average residue mass, reported in kDa.
    weight = length * 110 / 1000.0
    n_term = AA_PROPS.get(seq[0], 0)
    c_term = AA_CHARGE.get(seq[-1], 0)
    return np.array([hydro, charge, weight, n_term, c_term], dtype=np.float32)
28
+
29
class AdaptiveFusionModel(nn.Module):
    """ESM-2 + biophysics fusion classifier.

    Two heads are blended by a learned scalar gate: one over the ESM [CLS]
    embedding and one small MLP over hand-crafted biophysical descriptors.
    Attribute names (esm, esm_classifier, feature_classifier, gate_weight)
    mirror the checkpoint's state_dict keys and must not be renamed.
    """

    def __init__(self, base_model, num_labels, feature_dim=5):
        super().__init__()
        self.esm = base_model
        self.num_labels = num_labels
        width = base_model.config.hidden_size
        # Head 1: classifier over the [CLS] token embedding.
        self.esm_classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(width, num_labels),
        )
        # Head 2: small MLP over the biophysical feature vector.
        self.feature_classifier = nn.Sequential(
            nn.Linear(feature_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, num_labels),
        )
        # Learnable fusion gate; sigmoid(1.38) ~= 0.80, i.e. ESM-dominant at init.
        self.gate_weight = nn.Parameter(torch.tensor([1.38]))

    def forward(self, input_ids, attention_mask=None, extra_features=None, **kwargs):
        """Return (logits, alpha); alpha is None when extra_features is absent."""
        encoded = self.esm(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        pooled = encoded.last_hidden_state[:, 0, :]
        semantic_logits = self.esm_classifier(pooled)
        if extra_features is None:
            return semantic_logits, None
        alpha = torch.sigmoid(self.gate_weight)
        physics_logits = self.feature_classifier(extra_features)
        fused = alpha * semantic_logits + (1 - alpha) * physics_logits
        return fused, alpha
53
+
54
# ================= Model loading (unchanged) =================
print("🚀 正在加载 BioOracle V14...")
# CPU-only inference — the Space has no GPU.
device = torch.device('cpu')

# Load the tokenizer matching the ESM-2 backbone.
print("📥 加载 Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Load the ESM-2 150M backbone (first run downloads ~600 MB).
print("🧠 加载 ESM-2 150M 模型(首次约 600MB,请等待)...")
base_model = AutoModel.from_pretrained(BASE_MODEL_NAME)

# Wrap the backbone with LoRA adapters so the fine-tuned weights can attach.
# NOTE(review): r / lora_alpha / target_modules presumably match the training
# configuration — verify, otherwise checkpoint keys will not line up.
print("🔧 应用 LoRA 配置...")
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=32, lora_alpha=64, lora_dropout=0.1,
    target_modules=["query", "key", "value", "dense"]
)
base_model = get_peft_model(base_model, peft_config)

# Build the gated fusion model on top of the adapted backbone.
print("⚙️ 构建融合架构...")
model = AdaptiveFusionModel(base_model, num_labels=len(LABELS))

# Locate the fine-tuned checkpoint; fail fast if it is missing.
weights_path = os.path.join(MODEL_DIR, "v14_weights.bin")
if not os.path.exists(weights_path):
    raise FileNotFoundError(f"❌ 找不到权重文件: {weights_path}")

print("💾 加载 V14 权重(638MB)...")
# SECURITY NOTE: weights_only=False unpickles arbitrary objects; this is only
# acceptable because the checkpoint ships with the app itself — never load
# untrusted files this way.
state_dict = torch.load(weights_path, map_location=torch.device('cpu'), weights_only=False)
87
# 🔥 Smart state-dict key matching (fixes PEFT prefix mismatches).
# Depending on whether the checkpoint was saved from a PEFT-wrapped model,
# its keys may or may not carry the "base_model.model." prefix; detect the
# mismatch and rewrite the keys so they line up with the live model.
model_keys = set(model.state_dict().keys())
weight_keys = set(state_dict.keys())

# Probe each side once instead of repeating the any(...) scans per branch.
model_has_peft_prefix = any('base_model.model' in k for k in model_keys)
weights_have_peft_prefix = any('base_model.model' in k for k in weight_keys)

if model_has_peft_prefix and not weights_have_peft_prefix:
    # Case 1: model is PEFT-wrapped but the weights are not → add the prefix.
    print("⚙️ 调整权重键名以匹配 PEFT 模型结构...")
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith('esm.'):
            # esm.xxx → esm.base_model.model.xxx
            new_state_dict[key.replace('esm.', 'esm.base_model.model.', 1)] = value
        else:
            new_state_dict[key] = value
    state_dict = new_state_dict
elif not model_has_peft_prefix and weights_have_peft_prefix:
    # Case 2: weights carry the PEFT prefix but the model does not → strip it.
    print("⚙️ 移除 PEFT 前缀以匹配标准模型结构...")
    state_dict = {k.replace('base_model.model.', ''): v for k, v in state_dict.items()}

# strict=False tolerates partial mismatches (e.g. an untrained pooler layer);
# report both directions so silent drift still shows up in the logs.
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

if missing_keys:
    # FIX: the original log string contained mojibake ("参���") — restored to "参数".
    print(f"⚠️ 缺失 {len(missing_keys)} 个键(可能是新增的参数,如 pooler 层)")
    print(f" 示例: {list(missing_keys)[:3]}")
if unexpected_keys:
    print(f"⚠️ 忽略 {len(unexpected_keys)} 个多余的键")
    print(f" 示例: {list(unexpected_keys)[:3]}")

# Use the shared device handle for consistency (device is CPU, same effect).
model.to(device)
model.eval()

print("✅ 模型加载完成!")

# Effective fusion gate after training: sigmoid(gate_weight) = ESM share.
gate_val = torch.sigmoid(model.gate_weight).item()
esm_weight = gate_val
feat_weight = 1 - gate_val
132
+
133
# ================= Prediction functions =================
# The Chinese and English predictors previously duplicated the entire
# validation / tokenization / inference pipeline; the shared core now lives
# in _run_inference, and only the user-facing text differs per language.

# The 20 standard amino acids (single-letter codes) accepted as input.
VALID_AA = set("ACDEFGHIKLMNPQRSTVWY")


def _run_inference(seq):
    """Tokenize *seq*, run the fusion model, return (probs, raw_feats).

    probs: per-label sigmoid probabilities (numpy array aligned with LABELS).
    raw_feats: the 5-dim biophysical descriptor vector fed to the model.
    """
    inputs = tokenizer(seq, return_tensors="pt", padding="max_length", max_length=128).to(device)
    raw_feats = compute_biophysics(seq)
    feats_tensor = torch.tensor([raw_feats], dtype=torch.float).to(device)
    with torch.no_grad():
        logits, _ = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], extra_features=feats_tensor)
        probs = torch.sigmoid(logits).cpu().numpy()[0]
    return probs, raw_feats


def predict_peptide(sequence):
    """预测肽序列的生物活性 (中文输出).

    Returns (conclusion_markdown, biophysics_markdown, dataframe);
    on invalid input the last two elements are None.
    """
    # Robustness fix: tolerate None (e.g. direct API calls), not just "".
    seq = (sequence or "").strip().upper()

    if not seq:
        return "❌ 请输入序列", None, None

    if not set(seq).issubset(VALID_AA):
        return "❌ 请输入有效的氨基酸序列(仅限20种标准氨基酸单字母缩写)", None, None

    probs, raw_feats = _run_inference(seq)

    df_res = pd.DataFrame({"功效标签": LABELS, "置信度": probs})
    df_res = df_res.sort_values(by="置信度", ascending=False).reset_index(drop=True)

    top_label = df_res.iloc[0]['功效标签']
    top_score = df_res.iloc[0]['置信度']

    # Tiered conclusion based on the best label's confidence.
    if top_score > 0.8:
        conclusion = f"""
### ✅ 高潜力活性肽

**主要预测功效**: {top_label}
**置信度**: {top_score:.2%}

模型强烈建议将此序列纳入后续湿实验验证流程。
"""
    elif top_score > 0.3:
        conclusion = f"""
### ⚠️ 中等潜力 / 需进一步改造

**主要预测功效**: {top_label}
**置信度**: {top_score:.2%}

该序列可能具有一定活性,或是已知活性肽的突变体。建议结合结构生物学分析。
"""
    else:
        conclusion = f"""
### ❌ 疑似无效序列(负样本)

**最高置信度**: {top_score:.2%}

模型判断该序列主要表现为负样本特征,建议剔除。
"""

    biophysics_text = f"""
**生物物理特征分析**:
- 平均疏水性: {raw_feats[0]:.2f}
- 净电荷: {raw_feats[1]:.2f}
- 估算分子量: {raw_feats[2]:.3f} kDa
- N端疏水性: {raw_feats[3]:.2f}
- C端电荷: {raw_feats[4]:.2f}
"""

    # Format the full result table for display.
    df_formatted = df_res.copy()
    df_formatted['置信度'] = df_formatted['置信度'].apply(lambda x: f"{x:.4%}")

    return conclusion, biophysics_text, df_formatted


def predict_peptide_en(sequence):
    """Predict peptide bioactivity (English output). Mirrors predict_peptide."""
    seq = (sequence or "").strip().upper()

    if not seq:
        return "❌ Please enter a sequence", None, None

    if not set(seq).issubset(VALID_AA):
        return "❌ Invalid sequence. Please use standard 1-letter amino acid codes.", None, None

    probs, raw_feats = _run_inference(seq)

    df_res = pd.DataFrame({"Efficacy Label": LABELS, "Confidence": probs})
    df_res = df_res.sort_values(by="Confidence", ascending=False).reset_index(drop=True)

    top_label = df_res.iloc[0]['Efficacy Label']
    top_score = df_res.iloc[0]['Confidence']

    if top_score > 0.8:
        conclusion = f"""
### ✅ High Potential Peptide

**Predicted Efficacy**: {top_label}
**Confidence**: {top_score:.2%}

Strongly recommended for wet-lab validation.
"""
    elif top_score > 0.3:
        conclusion = f"""
### ⚠️ Moderate Potential / Optimization Needed

**Predicted Efficacy**: {top_label}
**Confidence**: {top_score:.2%}

May have some activity or be a mutant of a known peptide. Structural analysis suggested.
"""
    else:
        conclusion = f"""
### ❌ Likely Negative / Inactive

**Max Confidence**: {top_score:.2%}

Predicted as a negative sample. Suggested to discard.
"""

    biophysics_text = f"""
**Biophysical Properties**:
- Avg Hydrophobicity: {raw_feats[0]:.2f}
- Net Charge: {raw_feats[1]:.2f}
- Est. Molecular Weight: {raw_feats[2]:.3f} kDa
- N-term Hydrophobicity: {raw_feats[3]:.2f}
- C-term Charge: {raw_feats[4]:.2f}
"""

    df_formatted = df_res.copy()
    df_formatted['Confidence'] = df_formatted['Confidence'].apply(lambda x: f"{x:.4%}")

    return conclusion, biophysics_text, df_formatted
284
+
285
# ================= Gradio UI =================
# Custom CSS for a cleaner "med-tech" look.
custom_css = """
.gradio-container {
    font-family: 'Helvetica Neue', Arial, sans-serif;
    background-color: #f9fbfd;
}
.header-area {
    text-align: center;
    margin-bottom: 20px;
    padding: 20px;
    background: linear-gradient(135deg, #eef2f3 0%, #8e9eab 100%);
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.header-area h1 {
    color: #2c3e50;
    font-size: 2.5em;
    margin-bottom: 5px;
}
.header-area h3 {
    color: #546e7a;
    font-weight: 300;
}
.stat-box {
    background: white;
    padding: 15px;
    border-radius: 8px;
    border-left: 5px solid #3498db;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}
.primary-btn {
    background-color: #2980b9 !important;
}
"""

# Soft theme as the base, tinted blue to match the CSS.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    button_primary_background_fill="#2980b9",
    button_primary_background_fill_hover="#3498db",
)

# Build the interface.
with gr.Blocks(css=custom_css, theme=theme, title="BioOracle V14") as demo:

    # Top header banner.
    with gr.Row():
        gr.HTML(
            """
            <div class="header-area">
                <h1>🧬 BioOracle V14</h1>
                <h3>Giant Biogene AI Screening System | 巨子智筛 AI 活性肽发现系统</h3>
                <p>Powered by ESM-2 150M & Biophysics Guided Learning</p>
            </div>
            """
        )

    # Collapsible model-status panel (bilingual); shows the learned gate split.
    with gr.Accordion("🧠 Model Internal Status / 模型大脑状态", open=False):
        with gr.Row():
            gr.Markdown(
                f"""
                <div class="stat-box">
                <b>自适应融合权重 (Adaptive Fusion Weights)</b>:<br>
                <ul>
                    <li>ESM-2 Deep Semantics (AI Intuition): <b>{esm_weight:.1%}</b></li>
                    <li>Biophysics Rules (Physical Laws): <b>{feat_weight:.1%}</b></li>
                </ul>
                <p style="color: grey; font-size: 0.9em;">
                The model automatically balances between deep learning features and physical rules.<br>
                模型自动学会了主要依赖 ESM-2 大模型的深度理解,同时使用物理化学规则作为辅助校验。
                </p>
                </div>
                """
            )

    # Language tabs.
    with gr.Tabs():

        # ============ Tab 1: Chinese ============
        with gr.TabItem("🇨🇳 中文版 (Chinese)"):
            with gr.Row():
                with gr.Column(scale=2):
                    sequence_input_zh = gr.Textbox(
                        label="输入待筛选的肽序列",
                        placeholder="例如: GHK",
                        info="输入氨基酸序列(单字母缩写),模型将评估其潜在生物活性",
                        lines=2
                    )
                    predict_btn_zh = gr.Button("🚀 开始演算", variant="primary", size="lg")

                with gr.Column(scale=3):
                    conclusion_output_zh = gr.Markdown(label="活性评估结论")

            with gr.Row():
                biophysics_output_zh = gr.Markdown(label="生物物理特征")
                results_table_zh = gr.Dataframe(
                    label="完整预测数据表",
                    headers=["功效标签", "置信度"],
                    datatype=["str", "str"],
                    row_count=8
                )

            gr.Examples(
                examples=[["GHK"], ["KTTKS"], ["HGK"], ["AECKVQVR"]],
                inputs=sequence_input_zh,
                label="示例序列"
            )

            # Chinese-tab event binding.
            predict_btn_zh.click(
                fn=predict_peptide,
                inputs=sequence_input_zh,
                outputs=[conclusion_output_zh, biophysics_output_zh, results_table_zh]
            )

        # ============ Tab 2: English ============
        with gr.TabItem("🇺🇸 English Version"):
            with gr.Row():
                with gr.Column(scale=2):
                    sequence_input_en = gr.Textbox(
                        label="Input Peptide Sequence",
                        placeholder="e.g., GHK",
                        info="Enter amino acid sequence (single letter codes) for bioactivity assessment",
                        lines=2
                    )
                    predict_btn_en = gr.Button("🚀 Analyze Sequence", variant="primary", size="lg")

                with gr.Column(scale=3):
                    conclusion_output_en = gr.Markdown(label="Assessment Conclusion")

            with gr.Row():
                biophysics_output_en = gr.Markdown(label="Biophysical Properties")
                results_table_en = gr.Dataframe(
                    label="Full Prediction Data",
                    headers=["Efficacy Label", "Confidence"],
                    datatype=["str", "str"],
                    row_count=8
                )

            gr.Examples(
                examples=[["GHK"], ["KTTKS"], ["HGK"], ["AECKVQVR"]],
                inputs=sequence_input_en,
                label="Example Sequences"
            )

            # English-tab event binding.
            predict_btn_en.click(
                fn=predict_peptide_en,
                inputs=sequence_input_en,
                outputs=[conclusion_output_en, biophysics_output_en, results_table_en]
            )

    # Footer / disclaimer.
    gr.Markdown(
        """
        ---
        <div style="text-align: center; color: #7f8c8d; font-size: 0.9em;">
        <b>BioOracle V14</b> | Design for Giant Biogene Internship Project<br>
        <i>Disclaimer: Predictions are for research reference only. Wet-lab validation is required.</i>
        </div>
        """
    )

# Launch the app when run as a script.
if __name__ == "__main__":
    demo.launch()