yisen888 commited on
Commit
bf19137
·
verified ·
1 Parent(s): f973fbb

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +454 -0
app.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import pandas as pd
5
+ from torch import nn
6
+ from transformers import AutoTokenizer, AutoModel
7
+ from peft import get_peft_model, LoraConfig, TaskType
8
+ import os
9
+
10
# ================= Configuration (unchanged) =================
MODEL_DIR = "."  # directory holding the fine-tuned weight file v14_weights.bin
BASE_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"  # ESM-2 150M protein language model backbone
# Multi-label classifier outputs; order must match the trained head's output order.
LABELS = ['anti_acne', 'anti_aging', 'anti_inflammatory', 'anti_oxidant', 'repair', 'whitening', 'delivery', 'negative']
14
+
15
# ================= Core components =================
# Per-residue hydropathy index (single-letter amino-acid codes).
AA_PROPS = {'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5, 'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2}
# Nominal side-chain charge at physiological pH; residues not listed count as 0.
AA_CHARGE = {'R': 1, 'K': 1, 'H': 0.1, 'D': -1, 'E': -1}

def compute_biophysics(seq):
    """Compute five simple biophysical descriptors for a peptide sequence.

    Args:
        seq: peptide as a string of single-letter amino-acid codes.

    Returns:
        float32 numpy array of shape (5,):
        [mean hydropathy, net charge, estimated molecular weight (kDa),
         N-terminal hydropathy, C-terminal charge].

    Fix: the empty-sequence branch previously returned a plain Python list
    ([0]*5) while every other path returned a float32 ndarray; it now
    returns np.zeros so the return type is consistent for all inputs.
    """
    length = len(seq)
    if length == 0:
        return np.zeros(5, dtype=np.float32)
    hydro = sum(AA_PROPS.get(aa, 0) for aa in seq) / length
    charge = sum(AA_CHARGE.get(aa, 0) for aa in seq)
    # Rough mass estimate: ~110 Da average residue mass, reported in kDa.
    weight = length * 110 / 1000.0
    n_term = AA_PROPS.get(seq[0], 0)
    c_term = AA_CHARGE.get(seq[-1], 0)
    return np.array([hydro, charge, weight, n_term, c_term], dtype=np.float32)
28
+
29
class AdaptiveFusionModel(nn.Module):
    """ESM-2 + biophysics fusion classifier.

    Two heads are blended by a learned scalar gate: one over the ESM [CLS]
    embedding and one small MLP over hand-crafted biophysical descriptors.
    Attribute names (esm, esm_classifier, feature_classifier, gate_weight)
    mirror the checkpoint's state_dict keys and must not be renamed.
    """

    def __init__(self, base_model, num_labels, feature_dim=5):
        super().__init__()
        self.esm = base_model
        self.num_labels = num_labels
        width = base_model.config.hidden_size
        # Head 1: classifier over the [CLS] token embedding.
        self.esm_classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(width, num_labels),
        )
        # Head 2: small MLP over the biophysical feature vector.
        self.feature_classifier = nn.Sequential(
            nn.Linear(feature_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, num_labels),
        )
        # Learnable fusion gate; sigmoid(1.38) ~= 0.80, i.e. ESM-dominant at init.
        self.gate_weight = nn.Parameter(torch.tensor([1.38]))

    def forward(self, input_ids, attention_mask=None, extra_features=None, **kwargs):
        """Return (logits, alpha); alpha is None when extra_features is absent."""
        encoded = self.esm(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        pooled = encoded.last_hidden_state[:, 0, :]
        semantic_logits = self.esm_classifier(pooled)
        if extra_features is None:
            return semantic_logits, None
        alpha = torch.sigmoid(self.gate_weight)
        physics_logits = self.feature_classifier(extra_features)
        fused = alpha * semantic_logits + (1 - alpha) * physics_logits
        return fused, alpha
53
+
54
# ================= Model loading (unchanged) =================
print("🚀 正在加载 BioOracle V14...")
# CPU-only inference — the Space has no GPU.
device = torch.device('cpu')

# Load the tokenizer matching the ESM-2 backbone.
print("📥 加载 Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Load the ESM-2 150M backbone (first run downloads ~600 MB).
print("🧠 加载 ESM-2 150M 模型(首次约 600MB,请等待)...")
base_model = AutoModel.from_pretrained(BASE_MODEL_NAME)

# Wrap the backbone with LoRA adapters so the fine-tuned weights can attach.
# NOTE(review): r / lora_alpha / target_modules presumably match the training
# configuration — verify, otherwise checkpoint keys will not line up.
print("🔧 应用 LoRA 配置...")
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=32, lora_alpha=64, lora_dropout=0.1,
    target_modules=["query", "key", "value", "dense"]
)
base_model = get_peft_model(base_model, peft_config)

# Build the gated fusion model on top of the adapted backbone.
print("⚙️ 构建融合架构...")
model = AdaptiveFusionModel(base_model, num_labels=len(LABELS))

# Locate the fine-tuned checkpoint; fail fast if it is missing.
weights_path = os.path.join(MODEL_DIR, "v14_weights.bin")
if not os.path.exists(weights_path):
    raise FileNotFoundError(f"❌ 找不到权重文件: {weights_path}")

print("💾 加载 V14 权重(638MB)...")
# SECURITY NOTE: weights_only=False unpickles arbitrary objects; this is only
# acceptable because the checkpoint ships with the app itself — never load
# untrusted files this way.
state_dict = torch.load(weights_path, map_location=torch.device('cpu'), weights_only=False)
87
# 🔥 Smart state-dict key matching (fixes PEFT prefix mismatches).
# Depending on whether the checkpoint was saved from a PEFT-wrapped model,
# its keys may or may not carry the "base_model.model." prefix; detect the
# mismatch and rewrite the keys so they line up with the live model.
model_keys = set(model.state_dict().keys())
weight_keys = set(state_dict.keys())

# Probe each side once instead of repeating the any(...) scans per branch.
model_has_peft_prefix = any('base_model.model' in k for k in model_keys)
weights_have_peft_prefix = any('base_model.model' in k for k in weight_keys)

if model_has_peft_prefix and not weights_have_peft_prefix:
    # Case 1: model is PEFT-wrapped but the weights are not → add the prefix.
    print("⚙️ 调整权重键名以匹配 PEFT 模型结构...")
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith('esm.'):
            # esm.xxx → esm.base_model.model.xxx
            new_state_dict[key.replace('esm.', 'esm.base_model.model.', 1)] = value
        else:
            new_state_dict[key] = value
    state_dict = new_state_dict
elif not model_has_peft_prefix and weights_have_peft_prefix:
    # Case 2: weights carry the PEFT prefix but the model does not → strip it.
    print("⚙️ 移除 PEFT 前缀以匹配标准模型结构...")
    state_dict = {k.replace('base_model.model.', ''): v for k, v in state_dict.items()}

# strict=False tolerates partial mismatches (e.g. an untrained pooler layer);
# report both directions so silent drift still shows up in the logs.
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

if missing_keys:
    # FIX: the original log string contained mojibake ("参���") — restored to "参数".
    print(f"⚠️ 缺失 {len(missing_keys)} 个键(可能是新增的参数,如 pooler 层)")
    print(f" 示例: {list(missing_keys)[:3]}")
if unexpected_keys:
    print(f"⚠️ 忽略 {len(unexpected_keys)} 个多余的键")
    print(f" 示例: {list(unexpected_keys)[:3]}")

# Use the shared device handle for consistency (device is CPU, same effect).
model.to(device)
model.eval()

print("✅ 模型加载完成!")

# Effective fusion gate after training: sigmoid(gate_weight) = ESM share.
gate_val = torch.sigmoid(model.gate_weight).item()
esm_weight = gate_val
feat_weight = 1 - gate_val
132
+
133
# ================= Prediction functions =================
# The Chinese and English predictors previously duplicated the entire
# validation / tokenization / inference pipeline; the shared core now lives
# in _run_inference, and only the user-facing text differs per language.

# The 20 standard amino acids (single-letter codes) accepted as input.
VALID_AA = set("ACDEFGHIKLMNPQRSTVWY")


def _run_inference(seq):
    """Tokenize *seq*, run the fusion model, return (probs, raw_feats).

    probs: per-label sigmoid probabilities (numpy array aligned with LABELS).
    raw_feats: the 5-dim biophysical descriptor vector fed to the model.
    """
    inputs = tokenizer(seq, return_tensors="pt", padding="max_length", max_length=128).to(device)
    raw_feats = compute_biophysics(seq)
    feats_tensor = torch.tensor([raw_feats], dtype=torch.float).to(device)
    with torch.no_grad():
        logits, _ = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], extra_features=feats_tensor)
        probs = torch.sigmoid(logits).cpu().numpy()[0]
    return probs, raw_feats


def predict_peptide(sequence):
    """预测肽序列的生物活性 (中文输出).

    Returns (conclusion_markdown, biophysics_markdown, dataframe);
    on invalid input the last two elements are None.
    """
    # Robustness fix: tolerate None (e.g. direct API calls), not just "".
    seq = (sequence or "").strip().upper()

    if not seq:
        return "❌ 请输入序列", None, None

    if not set(seq).issubset(VALID_AA):
        return "❌ 请输入有效的氨基酸序列(仅限20种标准氨基酸单字母缩写)", None, None

    probs, raw_feats = _run_inference(seq)

    df_res = pd.DataFrame({"功效标签": LABELS, "置信度": probs})
    df_res = df_res.sort_values(by="置信度", ascending=False).reset_index(drop=True)

    top_label = df_res.iloc[0]['功效标签']
    top_score = df_res.iloc[0]['置信度']

    # Tiered conclusion based on the best label's confidence.
    if top_score > 0.8:
        conclusion = f"""
### ✅ 高潜力活性肽

**主要预测功效**: {top_label}
**置信度**: {top_score:.2%}

模型强烈建议将此序列纳入后续湿实验验证流程。
"""
    elif top_score > 0.3:
        conclusion = f"""
### ⚠️ 中等潜力 / 需进一步改造

**主要预测功效**: {top_label}
**置信度**: {top_score:.2%}

该序列可能具有一定活性,或是已知活性肽的突变体。建议结合结构生物学分析。
"""
    else:
        conclusion = f"""
### ❌ 疑似无效序列(负样本)

**最高置信度**: {top_score:.2%}

模型判断该序列主要表现为负样本特征,建议剔除。
"""

    biophysics_text = f"""
**生物物理特征分析**:
- 平均疏水性: {raw_feats[0]:.2f}
- 净电荷: {raw_feats[1]:.2f}
- 估算分子量: {raw_feats[2]:.3f} kDa
- N端疏水性: {raw_feats[3]:.2f}
- C端电荷: {raw_feats[4]:.2f}
"""

    # Format the full result table for display.
    df_formatted = df_res.copy()
    df_formatted['置信度'] = df_formatted['置信度'].apply(lambda x: f"{x:.4%}")

    return conclusion, biophysics_text, df_formatted


def predict_peptide_en(sequence):
    """Predict peptide bioactivity (English output). Mirrors predict_peptide."""
    seq = (sequence or "").strip().upper()

    if not seq:
        return "❌ Please enter a sequence", None, None

    if not set(seq).issubset(VALID_AA):
        return "❌ Invalid sequence. Please use standard 1-letter amino acid codes.", None, None

    probs, raw_feats = _run_inference(seq)

    df_res = pd.DataFrame({"Efficacy Label": LABELS, "Confidence": probs})
    df_res = df_res.sort_values(by="Confidence", ascending=False).reset_index(drop=True)

    top_label = df_res.iloc[0]['Efficacy Label']
    top_score = df_res.iloc[0]['Confidence']

    if top_score > 0.8:
        conclusion = f"""
### ✅ High Potential Peptide

**Predicted Efficacy**: {top_label}
**Confidence**: {top_score:.2%}

Strongly recommended for wet-lab validation.
"""
    elif top_score > 0.3:
        conclusion = f"""
### ⚠️ Moderate Potential / Optimization Needed

**Predicted Efficacy**: {top_label}
**Confidence**: {top_score:.2%}

May have some activity or be a mutant of a known peptide. Structural analysis suggested.
"""
    else:
        conclusion = f"""
### ❌ Likely Negative / Inactive

**Max Confidence**: {top_score:.2%}

Predicted as a negative sample. Suggested to discard.
"""

    biophysics_text = f"""
**Biophysical Properties**:
- Avg Hydrophobicity: {raw_feats[0]:.2f}
- Net Charge: {raw_feats[1]:.2f}
- Est. Molecular Weight: {raw_feats[2]:.3f} kDa
- N-term Hydrophobicity: {raw_feats[3]:.2f}
- C-term Charge: {raw_feats[4]:.2f}
"""

    df_formatted = df_res.copy()
    df_formatted['Confidence'] = df_formatted['Confidence'].apply(lambda x: f"{x:.4%}")

    return conclusion, biophysics_text, df_formatted
284
+
285
# ================= Gradio UI =================
# Custom CSS for a cleaner "med-tech" look.
custom_css = """
.gradio-container {
    font-family: 'Helvetica Neue', Arial, sans-serif;
    background-color: #f9fbfd;
}
.header-area {
    text-align: center;
    margin-bottom: 20px;
    padding: 20px;
    background: linear-gradient(135deg, #eef2f3 0%, #8e9eab 100%);
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.header-area h1 {
    color: #2c3e50;
    font-size: 2.5em;
    margin-bottom: 5px;
}
.header-area h3 {
    color: #546e7a;
    font-weight: 300;
}
.stat-box {
    background: white;
    padding: 15px;
    border-radius: 8px;
    border-left: 5px solid #3498db;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}
.primary-btn {
    background-color: #2980b9 !important;
}
"""

# Soft theme as the base, tinted blue to match the CSS.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    button_primary_background_fill="#2980b9",
    button_primary_background_fill_hover="#3498db",
)

# Build the interface.
with gr.Blocks(css=custom_css, theme=theme, title="BioOracle V14") as demo:

    # Top header banner.
    with gr.Row():
        gr.HTML(
            """
            <div class="header-area">
                <h1>🧬 BioOracle V14</h1>
                <h3>Giant Biogene AI Screening System | 巨子智筛 AI 活性肽发现系统</h3>
                <p>Powered by ESM-2 150M & Biophysics Guided Learning</p>
            </div>
            """
        )

    # Collapsible model-status panel (bilingual); shows the learned gate split.
    with gr.Accordion("🧠 Model Internal Status / 模型大脑状态", open=False):
        with gr.Row():
            gr.Markdown(
                f"""
                <div class="stat-box">
                <b>自适应融合权重 (Adaptive Fusion Weights)</b>:<br>
                <ul>
                    <li>ESM-2 Deep Semantics (AI Intuition): <b>{esm_weight:.1%}</b></li>
                    <li>Biophysics Rules (Physical Laws): <b>{feat_weight:.1%}</b></li>
                </ul>
                <p style="color: grey; font-size: 0.9em;">
                The model automatically balances between deep learning features and physical rules.<br>
                模型自动学会了主要依赖 ESM-2 大模型的深度理解,同时使用物理化学规则作为辅助校验。
                </p>
                </div>
                """
            )

    # Language tabs.
    with gr.Tabs():

        # ============ Tab 1: Chinese ============
        with gr.TabItem("🇨🇳 中文版 (Chinese)"):
            with gr.Row():
                with gr.Column(scale=2):
                    sequence_input_zh = gr.Textbox(
                        label="输入待筛选的肽序列",
                        placeholder="例如: GHK",
                        info="输入氨基酸序列(单字母缩写),模型将评估其潜在生物活性",
                        lines=2
                    )
                    predict_btn_zh = gr.Button("🚀 开始演算", variant="primary", size="lg")

                with gr.Column(scale=3):
                    conclusion_output_zh = gr.Markdown(label="活性评估结论")

            with gr.Row():
                biophysics_output_zh = gr.Markdown(label="生物物理特征")
                results_table_zh = gr.Dataframe(
                    label="完整预测数据表",
                    headers=["功效标签", "置信度"],
                    datatype=["str", "str"],
                    row_count=8
                )

            gr.Examples(
                examples=[["GHK"], ["KTTKS"], ["HGK"], ["AECKVQVR"]],
                inputs=sequence_input_zh,
                label="示例序列"
            )

            # Chinese-tab event binding.
            predict_btn_zh.click(
                fn=predict_peptide,
                inputs=sequence_input_zh,
                outputs=[conclusion_output_zh, biophysics_output_zh, results_table_zh]
            )

        # ============ Tab 2: English ============
        with gr.TabItem("🇺🇸 English Version"):
            with gr.Row():
                with gr.Column(scale=2):
                    sequence_input_en = gr.Textbox(
                        label="Input Peptide Sequence",
                        placeholder="e.g., GHK",
                        info="Enter amino acid sequence (single letter codes) for bioactivity assessment",
                        lines=2
                    )
                    predict_btn_en = gr.Button("🚀 Analyze Sequence", variant="primary", size="lg")

                with gr.Column(scale=3):
                    conclusion_output_en = gr.Markdown(label="Assessment Conclusion")

            with gr.Row():
                biophysics_output_en = gr.Markdown(label="Biophysical Properties")
                results_table_en = gr.Dataframe(
                    label="Full Prediction Data",
                    headers=["Efficacy Label", "Confidence"],
                    datatype=["str", "str"],
                    row_count=8
                )

            gr.Examples(
                examples=[["GHK"], ["KTTKS"], ["HGK"], ["AECKVQVR"]],
                inputs=sequence_input_en,
                label="Example Sequences"
            )

            # English-tab event binding.
            predict_btn_en.click(
                fn=predict_peptide_en,
                inputs=sequence_input_en,
                outputs=[conclusion_output_en, biophysics_output_en, results_table_en]
            )

    # Footer / disclaimer.
    gr.Markdown(
        """
        ---
        <div style="text-align: center; color: #7f8c8d; font-size: 0.9em;">
        <b>BioOracle V14</b> | Design for Giant Biogene Internship Project<br>
        <i>Disclaimer: Predictions are for research reference only. Wet-lab validation is required.</i>
        </div>
        """
    )

# Launch the app when run as a script.
if __name__ == "__main__":
    demo.launch()