| import gradio as gr |
| import torch |
| import numpy as np |
| import pandas as pd |
| from torch import nn |
| from transformers import AutoTokenizer, AutoModel |
| from peft import get_peft_model, LoraConfig, TaskType |
| import os |
|
|
| |
# Directory that holds the fine-tuned checkpoint file ("v14_weights.bin").
MODEL_DIR = "."
# Hugging Face hub id of the frozen protein-language-model backbone.
BASE_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"
# Multi-label output heads. NOTE(review): order is assumed to match the label
# order used at training time — verify against the training script.
LABELS = ['anti_acne', 'anti_aging', 'anti_inflammatory', 'anti_oxidant', 'repair', 'whitening', 'delivery', 'negative']
|
|
| |
# Per-residue hydropathy index (Kyte–Doolittle values; positive = hydrophobic).
AA_PROPS = {'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5, 'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2}
# Approximate side-chain charge at physiological pH (His partially protonated).
AA_CHARGE = {'R': 1, 'K': 1, 'H': 0.1, 'D': -1, 'E': -1}


def compute_biophysics(seq):
    """Compute five crude biophysical descriptors for a peptide sequence.

    Parameters
    ----------
    seq : str
        Peptide in 1-letter amino-acid codes; may be empty. Unknown
        characters contribute 0 to every descriptor.

    Returns
    -------
    numpy.ndarray
        Shape (5,), dtype float32: [mean hydropathy, net charge,
        estimated mass in kDa, N-terminal hydropathy, C-terminal charge].
    """
    length = len(seq)
    if length == 0:
        # Bug fix: this branch used to return a plain Python list ([0]*5)
        # while the normal path returns a float32 ndarray — keep the return
        # type consistent for downstream tensor construction.
        return np.zeros(5, dtype=np.float32)
    hydro = sum(AA_PROPS.get(aa, 0) for aa in seq) / length
    charge = sum(AA_CHARGE.get(aa, 0) for aa in seq)
    weight = length * 110 / 1000.0  # ~110 Da mean residue mass -> kDa
    n_term = AA_PROPS.get(seq[0], 0)
    c_term = AA_CHARGE.get(seq[-1], 0)
    return np.array([hydro, charge, weight, n_term, c_term], dtype=np.float32)
|
|
class AdaptiveFusionModel(nn.Module):
    """Gated late-fusion classifier over two branches.

    Branch A scores the transformer's first-token embedding; branch B scores
    a small vector of hand-crafted biophysical descriptors. A single learned
    scalar gate (squashed through a sigmoid) blends the two logit vectors.
    """

    def __init__(self, base_model, num_labels, feature_dim=5):
        super().__init__()
        self.esm = base_model
        self.num_labels = num_labels
        dim = base_model.config.hidden_size

        # Branch A: dropout + linear head on the backbone embedding.
        self.esm_classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(dim, num_labels),
        )
        # Branch B: tiny MLP over the biophysical descriptor vector.
        self.feature_classifier = nn.Sequential(
            nn.Linear(feature_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, num_labels),
        )
        # Learnable fusion gate; sigmoid(1.38) ≈ 0.80, so the initial blend
        # leans toward the transformer branch.
        self.gate_weight = nn.Parameter(torch.tensor([1.38]))

    def forward(self, input_ids, attention_mask=None, extra_features=None, **kwargs):
        """Return ``(logits, alpha)``.

        ``alpha`` is the sigmoid of the gate (weight of the ESM branch), or
        ``None`` when ``extra_features`` is not supplied and only the
        transformer branch is used.
        """
        encoder_out = self.esm(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        pooled = encoder_out.last_hidden_state[:, 0, :]  # first-token embedding
        backbone_logits = self.esm_classifier(pooled)

        if extra_features is None:
            return backbone_logits, None

        feature_logits = self.feature_classifier(extra_features)
        alpha = torch.sigmoid(self.gate_weight)
        fused = alpha * backbone_logits + (1 - alpha) * feature_logits
        return fused, alpha
|
|
| |
# ---------------------------------------------------------------------------
# Model bootstrap (runs once at import time):
#   tokenizer -> ESM-2 backbone -> LoRA wrap -> fusion head -> load weights.
# Inference is pinned to CPU throughout.
# ---------------------------------------------------------------------------
print("🚀 正在加载 BioOracle V14...")
device = torch.device('cpu')

# Tokenizer matching the ESM-2 backbone.
print("📥 加载 Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Frozen 150M-parameter backbone (first run downloads the checkpoint).
print("🧠 加载 ESM-2 150M 模型(首次约 600MB,请等待)...")
base_model = AutoModel.from_pretrained(BASE_MODEL_NAME)

# Wrap the backbone with LoRA adapters so the module tree matches the one the
# fine-tuned checkpoint was saved from.
print("🔧 应用 LoRA 配置...")
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=32, lora_alpha=64, lora_dropout=0.1,
    target_modules=["query", "key", "value", "dense"]
)
base_model = get_peft_model(base_model, peft_config)

# Fusion head: gated blend of ESM embeddings and biophysical descriptors.
print("⚙️ 构建融合架构...")
model = AdaptiveFusionModel(base_model, num_labels=len(LABELS))

# The fine-tuned checkpoint must sit next to this script; fail fast otherwise.
weights_path = os.path.join(MODEL_DIR, "v14_weights.bin")
if not os.path.exists(weights_path):
    raise FileNotFoundError(f"❌ 找不到权重文件: {weights_path}")

print("💾 加载 V14 权重(638MB)...")
# NOTE(review): weights_only=False unpickles arbitrary objects — acceptable
# only because the checkpoint ships with this app; never point this at an
# untrusted file.
state_dict = torch.load(weights_path, map_location=torch.device('cpu'), weights_only=False)

# Key-name reconciliation: a checkpoint saved before/after PEFT wrapping
# differs by a 'base_model.model.' prefix on the backbone parameters.
model_keys = set(model.state_dict().keys())
weight_keys = set(state_dict.keys())

# Case 1: model is PEFT-wrapped but the checkpoint keys are not — insert the
# wrapper prefix after the 'esm.' attribute name.
if any('base_model.model' in k for k in model_keys) and not any('base_model.model' in k for k in weight_keys):
    print("⚙️ 调整权重键名以匹配 PEFT 模型结构...")
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith('esm.'):
            # count=1 so only the leading 'esm.' attribute prefix is rewritten.
            new_key = key.replace('esm.', 'esm.base_model.model.', 1)
            new_state_dict[new_key] = value
        else:
            new_state_dict[key] = value
    state_dict = new_state_dict

# Case 2: checkpoint keys carry the PEFT prefix but the model does not —
# strip the prefix everywhere.
elif not any('base_model.model' in k for k in model_keys) and any('base_model.model' in k for k in weight_keys):
    print("⚙️ 移除 PEFT 前缀以匹配标准模型结构...")
    new_state_dict = {}
    for key, value in state_dict.items():
        new_key = key.replace('base_model.model.', '')
        new_state_dict[new_key] = value
    state_dict = new_state_dict

# strict=False tolerates missing/extra keys (e.g. pooler layers) but the
# discrepancies are still surfaced below for debugging.
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

if missing_keys:
    print(f"⚠️ 缺失 {len(missing_keys)} 个键(可能是新增的参数,如 pooler 层)")
    print(f" 示例: {list(missing_keys)[:3]}")
if unexpected_keys:
    print(f"⚠️ 忽略 {len(unexpected_keys)} 个多余的键")
    print(f" 示例: {list(unexpected_keys)[:3]}")

model.to('cpu')
model.eval()  # disable dropout; BatchNorm uses running stats at inference

print("✅ 模型加载完成!")

# Learned fusion gate, read once for display in the UI status panel.
gate_val = torch.sigmoid(model.gate_weight).item()
esm_weight = gate_val        # share attributed to the ESM-2 branch
feat_weight = 1 - gate_val   # share attributed to the biophysics branch
|
| |
def predict_peptide(sequence):
    """Score a peptide and assemble the Chinese-language report.

    Returns a (conclusion markdown, biophysics markdown, formatted dataframe)
    triple for the Gradio outputs; the last two are None when the input fails
    validation.
    """
    seq = sequence.strip().upper()

    # Guard clauses: reject empty input, then non-standard residues.
    if not seq:
        return "❌ 请输入序列", None, None
    if set(seq) - set("ACDEFGHIKLMNPQRSTVWY"):
        return "❌ 请输入有效的氨基酸序列(仅限20种标准氨基酸单字母缩写)", None, None

    # Tokenize for ESM-2 and compute the hand-crafted descriptor vector.
    encoded = tokenizer(seq, return_tensors="pt", padding="max_length", max_length=128).to(device)
    raw_feats = compute_biophysics(seq)
    feats_tensor = torch.tensor([raw_feats], dtype=torch.float).to(device)

    # Multi-label inference: independent sigmoid per efficacy label.
    with torch.no_grad():
        logits, _ = model(input_ids=encoded['input_ids'], attention_mask=encoded['attention_mask'], extra_features=feats_tensor)
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    # Rank all labels by confidence, highest first.
    df_res = (
        pd.DataFrame({"功效标签": LABELS, "置信度": probs})
        .sort_values(by="置信度", ascending=False)
        .reset_index(drop=True)
    )
    top_label = df_res.iloc[0]['功效标签']
    top_score = df_res.iloc[0]['置信度']

    # Verdict copy depends on the confidence band of the best label.
    if top_score > 0.8:
        conclusion = f"""
### ✅ 高潜力活性肽

**主要预测功效**: {top_label}
**置信度**: {top_score:.2%}

模型强烈建议将此序列纳入后续湿实验验证流程。
"""
    elif top_score > 0.3:
        conclusion = f"""
### ⚠️ 中等潜力 / 需进一步改造

**主要预测功效**: {top_label}
**置信度**: {top_score:.2%}

该序列可能具有一定活性,或是已知活性肽的突变体。建议结合结构生物学分析。
"""
    else:
        conclusion = f"""
### ❌ 疑似无效序列(负样本)

**最高置信度**: {top_score:.2%}

模型判断该序列主要表现为负样本特征,建议剔除。
"""

    # Human-readable dump of the raw biophysical descriptors.
    biophysics_text = f"""
**生物物理特征分析**:
- 平均疏水性: {raw_feats[0]:.2f}
- 净电荷: {raw_feats[1]:.2f}
- 估算分子量: {raw_feats[2]:.3f} kDa
- N端疏水性: {raw_feats[3]:.2f}
- C端电荷: {raw_feats[4]:.2f}
"""

    # Percent-formatted copy of the table, for display only.
    table = df_res.copy()
    table['置信度'] = table['置信度'].map(lambda p: f"{p:.4%}")

    return conclusion, biophysics_text, table
|
|
| |
def predict_peptide_en(sequence):
    """Score a peptide and assemble the English-language report.

    Returns (conclusion markdown, biophysics markdown, formatted dataframe);
    the last two are None when validation fails.
    """
    clean = sequence.strip().upper()
    standard_residues = frozenset("ACDEFGHIKLMNPQRSTVWY")

    if not clean:
        return "❌ Please enter a sequence", None, None
    if not standard_residues.issuperset(clean):
        return "❌ Invalid sequence. Please use standard 1-letter amino acid codes.", None, None

    # Prepare model inputs: token ids plus the 5 biophysical descriptors.
    tok_batch = tokenizer(clean, return_tensors="pt", padding="max_length", max_length=128).to(device)
    descriptors = compute_biophysics(clean)
    descriptor_tensor = torch.tensor([descriptors], dtype=torch.float).to(device)

    # One forward pass; sigmoid gives an independent probability per label.
    with torch.no_grad():
        logits, _ = model(input_ids=tok_batch['input_ids'], attention_mask=tok_batch['attention_mask'], extra_features=descriptor_tensor)
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    # Sort labels from most to least confident.
    ranked = pd.DataFrame({"Efficacy Label": LABELS, "Confidence": probs})
    ranked = ranked.sort_values(by="Confidence", ascending=False).reset_index(drop=True)

    best = ranked.iloc[0]
    top_label = best['Efficacy Label']
    top_score = best['Confidence']

    # Conclusion text is chosen by confidence band of the best label.
    if top_score > 0.8:
        conclusion = f"""
### ✅ High Potential Peptide

**Predicted Efficacy**: {top_label}
**Confidence**: {top_score:.2%}

Strongly recommended for wet-lab validation.
"""
    elif top_score > 0.3:
        conclusion = f"""
### ⚠️ Moderate Potential / Optimization Needed

**Predicted Efficacy**: {top_label}
**Confidence**: {top_score:.2%}

May have some activity or be a mutant of a known peptide. Structural analysis suggested.
"""
    else:
        conclusion = f"""
### ❌ Likely Negative / Inactive

**Max Confidence**: {top_score:.2%}

Predicted as a negative sample. Suggested to discard.
"""

    # Raw descriptor values, rendered for the side panel.
    biophysics_text = f"""
**Biophysical Properties**:
- Avg Hydrophobicity: {descriptors[0]:.2f}
- Net Charge: {descriptors[1]:.2f}
- Est. Molecular Weight: {descriptors[2]:.3f} kDa
- N-term Hydrophobicity: {descriptors[3]:.2f}
- C-term Charge: {descriptors[4]:.2f}
"""

    # Display copy with percent-formatted confidences.
    display = ranked.copy()
    display['Confidence'] = display['Confidence'].apply(lambda p: f"{p:.4%}")

    return conclusion, biophysics_text, display
|
|
| |
| |
# Custom CSS injected into the Gradio page: overall font/background, the
# gradient header card (.header-area), the bordered stat boxes (.stat-box),
# and a primary-button color override.
custom_css = """
.gradio-container {
    font-family: 'Helvetica Neue', Arial, sans-serif;
    background-color: #f9fbfd;
}
.header-area {
    text-align: center;
    margin-bottom: 20px;
    padding: 20px;
    background: linear-gradient(135deg, #eef2f3 0%, #8e9eab 100%);
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.header-area h1 {
    color: #2c3e50;
    font-size: 2.5em;
    margin-bottom: 5px;
}
.header-area h3 {
    color: #546e7a;
    font-weight: 300;
}
.stat-box {
    background: white;
    padding: 15px;
    border-radius: 8px;
    border-left: 5px solid #3498db;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}
.primary-btn {
    background-color: #2980b9 !important;
}
"""
|
|
| |
# Gradio theme: soft blue/slate palette; primary-button fills match the
# color scheme used in custom_css.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    button_primary_background_fill="#2980b9",
    button_primary_background_fill_hover="#3498db",
)
|
|
| |
# ---------------------------------------------------------------------------
# Gradio UI: header banner, model-status accordion, two mirrored prediction
# tabs (Chinese / English), and a disclaimer footer.
# ---------------------------------------------------------------------------
with gr.Blocks(css=custom_css, theme=theme, title="BioOracle V14") as demo:

    # Header banner (styled by the .header-area CSS rule).
    with gr.Row():
        gr.HTML(
            """
<div class="header-area">
<h1>🧬 BioOracle V14</h1>
<h3>Giant Biogene AI Screening System | 巨子智筛 AI 活性肽发现系统</h3>
<p>Powered by ESM-2 150M & Biophysics Guided Learning</p>
</div>
"""
        )

    # Collapsed status panel showing the learned fusion-gate split
    # (esm_weight / feat_weight were computed once at load time).
    with gr.Accordion("🧠 Model Internal Status / 模型大脑状态", open=False):
        with gr.Row():
            gr.Markdown(
                f"""
<div class="stat-box">
<b>自适应融合权重 (Adaptive Fusion Weights)</b>:<br>
<ul>
<li>ESM-2 Deep Semantics (AI Intuition): <b>{esm_weight:.1%}</b></li>
<li>Biophysics Rules (Physical Laws): <b>{feat_weight:.1%}</b></li>
</ul>
<p style="color: grey; font-size: 0.9em;">
The model automatically balances between deep learning features and physical rules.<br>
模型自动学会了主要依赖 ESM-2 大模型的深度理解,同时使用物理化学规则作为辅助校验。
</p>
</div>
"""
            )

    # Two tabs with identical layout; only the output language differs.
    with gr.Tabs():

        # --- Chinese tab ---------------------------------------------------
        with gr.TabItem("🇨🇳 中文版 (Chinese)"):
            with gr.Row():
                with gr.Column(scale=2):
                    sequence_input_zh = gr.Textbox(
                        label="输入待筛选的肽序列",
                        placeholder="例如: GHK",
                        info="输入氨基酸序列(单字母缩写),模型将评估其潜在生物活性",
                        lines=2
                    )
                    predict_btn_zh = gr.Button("🚀 开始演算", variant="primary", size="lg")

                with gr.Column(scale=3):
                    conclusion_output_zh = gr.Markdown(label="活性评估结论")

            with gr.Row():
                biophysics_output_zh = gr.Markdown(label="生物物理特征")
                results_table_zh = gr.Dataframe(
                    label="完整预测数据表",
                    headers=["功效标签", "置信度"],
                    datatype=["str", "str"],
                    row_count=8
                )

            gr.Examples(
                examples=[["GHK"], ["KTTKS"], ["HGK"], ["AECKVQVR"]],
                inputs=sequence_input_zh,
                label="示例序列"
            )

            # Wire the button to the Chinese-output predictor.
            predict_btn_zh.click(
                fn=predict_peptide,
                inputs=sequence_input_zh,
                outputs=[conclusion_output_zh, biophysics_output_zh, results_table_zh]
            )

        # --- English tab ---------------------------------------------------
        with gr.TabItem("🇺🇸 English Version"):
            with gr.Row():
                with gr.Column(scale=2):
                    sequence_input_en = gr.Textbox(
                        label="Input Peptide Sequence",
                        placeholder="e.g., GHK",
                        info="Enter amino acid sequence (single letter codes) for bioactivity assessment",
                        lines=2
                    )
                    predict_btn_en = gr.Button("🚀 Analyze Sequence", variant="primary", size="lg")

                with gr.Column(scale=3):
                    conclusion_output_en = gr.Markdown(label="Assessment Conclusion")

            with gr.Row():
                biophysics_output_en = gr.Markdown(label="Biophysical Properties")
                results_table_en = gr.Dataframe(
                    label="Full Prediction Data",
                    headers=["Efficacy Label", "Confidence"],
                    datatype=["str", "str"],
                    row_count=8
                )

            gr.Examples(
                examples=[["GHK"], ["KTTKS"], ["HGK"], ["AECKVQVR"]],
                inputs=sequence_input_en,
                label="Example Sequences"
            )

            # Wire the button to the English-output predictor.
            predict_btn_en.click(
                fn=predict_peptide_en,
                inputs=sequence_input_en,
                outputs=[conclusion_output_en, biophysics_output_en, results_table_en]
            )

    # Footer with attribution and research-use disclaimer.
    gr.Markdown(
        """
---
<div style="text-align: center; color: #7f8c8d; font-size: 0.9em;">
<b>BioOracle V14</b> | Design for Giant Biogene Internship Project<br>
<i>Disclaimer: Predictions are for research reference only. Wet-lab validation is required.</i>
</div>
"""
    )
|
|
| |
# Start the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()