yisen888 committed on
Commit
d483033
·
verified ·
1 Parent(s): 930bc9d

Delete BioOracle_V14

Browse files
BioOracle_V14/app.py DELETED
@@ -1,247 +0,0 @@
1
- import streamlit as st
2
- import torch
3
- import numpy as np
4
- import pandas as pd
5
- from torch import nn
6
- from transformers import AutoTokenizer, AutoModel
7
- from peft import get_peft_model, LoraConfig, TaskType
8
- import os
9
- import plotly.express as px
10
- from sklearn.preprocessing import MinMaxScaler
11
-
12
- # ================= 配置区 =================
13
- # 页面设置
14
# ---- Page setup -------------------------------------------------------------
st.set_page_config(
    page_title="巨子智筛 BioOracle V14", page_icon="🧬",
    layout="wide", initial_sidebar_state="expanded",
)

# ---- Paths and model identifiers ---------------------------------------------
# The fine-tuned weight file is looked up in the current directory by default.
MODEL_DIR = "."
BASE_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"

# Output labels. The order must be exactly the one used during training.
LABELS = [
    'anti_acne',
    'anti_aging',
    'anti_inflammatory',
    'anti_oxidant',
    'repair',
    'whitening',
    'delivery',
    'negative',
]
26
-
27
- # ================= 1. 核心组件复刻 (必须与训练代码一致) =================
28
# Per-residue hydrophobicity values used for the hand-crafted features.
# These must stay identical to the values used by the training code.
AA_PROPS = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
    'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
}
# Per-residue charge contributions; residues not listed count as 0.
AA_CHARGE = {'R': 1, 'K': 1, 'H': 0.1, 'D': -1, 'E': -1}


def compute_biophysics(seq):
    """Return the five hand-crafted biophysical features of a peptide.

    The feature order is:
    ``[mean hydrophobicity, net charge, scaled weight,
    N-terminal hydrophobicity, C-terminal charge]``.

    Args:
        seq: Peptide sequence as one-letter residue codes. Letters missing
            from the lookup tables contribute 0.

    Returns:
        A ``float32`` NumPy array of shape ``(5,)``. An empty sequence yields
        an all-zero array, so callers always receive the same type.
    """
    length = len(seq)
    if length == 0:
        # Previously a plain list was returned here, which broke callers that
        # rely on ndarray methods such as ``.reshape``.
        return np.zeros(5, dtype=np.float32)

    hydro = sum(AA_PROPS.get(aa, 0) for aa in seq) / length
    charge = sum(AA_CHARGE.get(aa, 0) for aa in seq)
    # Simplified, scaled molecular weight: 110 per residue, divided by 1000.
    weight = length * 110 / 1000.0
    n_term = AA_PROPS.get(seq[0], 0)
    c_term = AA_CHARGE.get(seq[-1], 0)
    return np.array([hydro, charge, weight, n_term, c_term], dtype=np.float32)
40
-
41
- # V14 模型架构定义
42
class AdaptiveFusionModel(nn.Module):
    """Blend encoder-based logits with feature-based logits via one learned gate.

    Two heads produce per-label logits: one reads the first-token embedding of
    the wrapped encoder, the other reads a small vector of extra features.
    A single scalar parameter, squashed by a sigmoid, sets the mixing ratio.
    """

    def __init__(self, base_model, num_labels, feature_dim=5):
        """Create both heads and the gate.

        Args:
            base_model: Encoder exposing ``config.hidden_size`` and returning
                an object with ``last_hidden_state`` when called.
            num_labels: Number of output logits per sample.
            feature_dim: Length of the extra feature vector.
        """
        super().__init__()
        self.esm = base_model
        self.num_labels = num_labels

        embed_dim = base_model.config.hidden_size

        # Head applied to the encoder's first-token embedding.
        self.esm_classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(embed_dim, num_labels),
        )

        # Small MLP head applied to the extra features.
        feature_layers = [
            nn.Linear(feature_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, num_labels),
        ]
        self.feature_classifier = nn.Sequential(*feature_layers)

        # Raw gate value; forward() passes it through a sigmoid.
        self.gate_weight = nn.Parameter(torch.tensor([1.38]))

    def forward(self, input_ids, attention_mask=None, extra_features=None, **kwargs):
        """Return ``(logits, alpha)``.

        ``alpha`` is the sigmoid of the gate and weights the encoder logits;
        ``1 - alpha`` weights the feature logits. When ``extra_features`` is
        ``None`` only the encoder head is used and ``alpha`` is ``None``.
        """
        encoded = self.esm(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        first_token = encoded.last_hidden_state[:, 0, :]
        logits_esm = self.esm_classifier(first_token)

        if extra_features is None:
            # Encoder-only path; not expected during inference in this app.
            return logits_esm, None

        logits_feat = self.feature_classifier(extra_features)
        alpha = torch.sigmoid(self.gate_weight)
        return alpha * logits_esm + (1 - alpha) * logits_feat, alpha
66
-
67
- # ================= 2. 加载引擎 (带缓存) =================
68
@st.cache_resource
def load_engine():
    """Build the V14 model, load its fine-tuned weights and return the engine.

    Steps: load the tokenizer and base encoder named by ``BASE_MODEL_NAME``,
    wrap the encoder with the LoRA configuration below, build
    ``AdaptiveFusionModel`` around it, then load ``v14_weights.bin`` from
    ``MODEL_DIR``.

    Returns:
        ``(model, tokenizer, device)`` with the model moved to ``device`` and
        switched to eval mode.

    On a missing weight file or any loading error, an error is shown in the
    page and ``st.stop()`` is called.
    """
    # Prefer the GPU when one is available, otherwise fall back to the CPU.
    device_str = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_str)
    print(f"正在使用设备: {device_str} 加载模型...")

    try:
        # A. Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

        # B. Base encoder wrapped with LoRA adapters.
        # NOTE: the first run downloads the base ESM-2 weights, so network
        # access is required.
        base_model = AutoModel.from_pretrained(BASE_MODEL_NAME)
        peft_config = LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,
            r=32, lora_alpha=64, lora_dropout=0.1,
            target_modules=["query", "key", "value", "dense"]
        )
        base_model = get_peft_model(base_model, peft_config)

        # C. V14 fusion architecture
        model = AdaptiveFusionModel(base_model, num_labels=len(LABELS))

        # D. Fine-tuned V14 weights
        weights_path = os.path.join(MODEL_DIR, "v14_weights.bin")
        if not os.path.exists(weights_path):
            st.error("❌ 严重错误:找不到权重文件 `v14_weights.bin`!\n请确认你已解压 zip 文件,并且 `app.py` 和 `.bin` 文件在同一个文件夹内。")
            st.stop()

        # map_location lets weights saved on a GPU load on a CPU-only machine.
        # weights_only=True restricts unpickling to tensors and plain
        # containers, so a tampered checkpoint cannot run arbitrary code.
        # NOTE(review): assumes the file holds a plain state dict — confirm.
        state_dict = torch.load(weights_path, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()
        print("✅ 模型加载完成!")
        return model, tokenizer, device

    except Exception as e:
        st.error(f"模型加载失败: {e}")
        st.stop()
109
-
110
- # ================= 3. 可视化辅助函数 =================
111
def plot_radar_chart(seq, feats):
    """Draw a radar chart of the five biophysical features of a peptide.

    Each feature is mapped linearly from an empirical display range onto
    ``[0, 1]`` and clipped to that interval, so out-of-range values (for
    example the weight of a peptide longer than 9 residues) sit on the chart
    rim instead of falling outside the fixed radial axis.

    This scaling is for display only and does not affect model predictions.

    Args:
        seq: Peptide sequence, used in the chart title.
        feats: Five feature values in the order produced by
            ``compute_biophysics``.

    Returns:
        The Plotly figure.
    """
    categories = ['平均疏水性', '净电荷', '估算分子量', 'N端疏水性', 'C端电荷']

    # Empirical lower/upper display bounds for short peptides, one per feature.
    lower = np.array([-4.5, -3, 0.1, -4.5, -1], dtype=float)
    upper = np.array([4.5, 3, 1.0, 4.5, 1], dtype=float)

    # Same linear map a min-max scaler fitted on these bounds would apply,
    # followed by clipping so the plotted radius never leaves [0, 1].
    values = np.asarray(feats, dtype=float).reshape(-1)
    normalized_feats = np.clip((values - lower) / (upper - lower), 0.0, 1.0)

    df = pd.DataFrame(dict(
        r=normalized_feats,
        theta=categories
    ))
    fig = px.line_polar(df, r='r', theta='theta', line_close=True,
                        title=f"肽 {seq} 的生物物理指纹 (Biophysical Fingerprint)",
                        range_r=[0, 1], markers=True)
    fig.update_traces(fill='toself', line_color='#00CC96')
    fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), showlegend=False)
    return fig
134
-
135
- # ================= 4. 网页 UI 主体 =================
136
-
137
- # 加载模型 (显示加载转圈)
138
# Start the engine behind a spinner.
with st.spinner('正在启动 V14 生物计算引擎 (首次加载可能需要1分钟)...'):
    model, tokenizer, device = load_engine()

# Sidebar: branding plus the current mixing ratio of the two model heads.
with st.sidebar:
    st.image("https://img.icons8.com/fluency/96/dna-helix.png", width=60)
    st.title("BioOracle V14")
    st.markdown("---")
    if model:
        # Sigmoid of the gate is the share given to the ESM head;
        # the remainder goes to the biophysical-feature head.
        esm_share = torch.sigmoid(model.gate_weight).item()
        head_shares = (
            ("ESM-2 深度语义 (AI直觉): **{:.1%}**", esm_share),
            ("生物物理特征 (理化规则): **{:.1%}**", 1 - esm_share),
        )

        st.header("🧠 模型大脑状态")
        # One caption and one progress bar per head, for a visual comparison.
        for caption_template, share in head_shares:
            st.write(caption_template.format(share))
            st.progress(share)

        st.info(
            "\n"
            "**自适应融合机制 (Adaptive Fusion)**:\n"
            "模型自动学会了主要依赖 ESM-2 大模型的深度理解,同时使用物理化学规则作为辅助校验,以防止死记硬背。\n"
        )
    st.markdown("---")
    st.markdown("Design for Giant Biogene Internship Project.")
165
-
166
- # 主页面
167
# Main page heading.
st.write("# 🧬 巨子智筛:AI活性肽发现系统")
st.caption("Powered by ESM-2 150M Parameter Model & Biophysics Guided Learning")

# Input row: a wide sequence field next to a narrow run button.
sequence_col, action_col = st.columns([3, 1])
with sequence_col:
    seq_input = st.text_input(
        "请输入待筛选的肽序列 (支持单字母缩写,如 GHK)",
        value="GHK",
        help="输入氨基酸序列,模型将评估其潜在生物活性。",
    )
with action_col:
    # Two empty writes act as vertical spacers above the button.
    for _ in range(2):
        st.write("")
    run_button = st.button("🚀 开始演算 (Analyze)", type="primary", use_container_width=True)
178
-
179
if run_button and seq_input:
    seq = seq_input.strip().upper()
    # Basic validation: only the 20 standard one-letter residue codes pass.
    valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
    if not seq or not set(seq).issubset(valid_aa):
        st.error("请输入有效的氨基酸序列 (仅限20种标准氨基酸单字母缩写)。")
    else:
        # --- Prediction ---
        with st.spinner(f'正在分析序列 {seq} 的构象与活性...'):
            # 1. Prepare the tokenized sequence and the feature vector.
            inputs = tokenizer(seq, return_tensors="pt", padding="max_length", max_length=128).to(device)
            raw_feats = compute_biophysics(seq)
            # Build the (1, 5) tensor straight from the array; wrapping an
            # ndarray in a list for torch.tensor is slow and emits a warning.
            feats_tensor = torch.from_numpy(
                np.asarray(raw_feats, dtype=np.float32)
            ).unsqueeze(0).to(device)

            # 2. Inference; sigmoid gives an independent score per label.
            with torch.no_grad():
                logits, _ = model(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    extra_features=feats_tensor,
                )
                probs = torch.sigmoid(logits).cpu().numpy()[0]

        # --- Results ---
        st.divider()

        # Layout: radar chart on the left, verdict on the right.
        res_col1, res_col2 = st.columns([2, 3])

        with res_col1:
            st.subheader("🔬 生物物理指纹可视化")
            radar_fig = plot_radar_chart(seq, raw_feats)
            st.plotly_chart(radar_fig, use_container_width=True)
            st.caption("展示了该肽段在5个关键物理化学维度上的特征分布。不同功能的肽通常具有不同的指纹形状。")

        with res_col2:
            st.subheader("🎯 活性评估结论")

            # Rank the labels by score, highest first.
            df_res = pd.DataFrame({"功效标签": LABELS, "置信度": probs})
            df_res = df_res.sort_values(by="置信度", ascending=False).reset_index(drop=True)

            top_label = df_res.iloc[0]['功效标签']
            top_score = df_res.iloc[0]['置信度']

            # Verdict card. 'negative' is itself one of the labels, so a
            # top-ranked negative must never be shown as an active peptide,
            # however confident the score is.
            if top_label == 'negative':
                st.error("### ❌ 疑似无效序列 (负样本)")
                st.metric(label="主要预测功效", value=top_label, delta=f"置信度: {top_score:.2%}", delta_color="inverse")
                st.write("模型判断该序列主要表现为负样本特征,建议剔除。")
            elif top_score > 0.8:
                st.success("### ✅ 高潜力活性肽")
                st.metric(label="主要预测功效", value=top_label, delta=f"置信度: {top_score:.2%}")
                st.write("模型强烈建议将此序列纳入后续湿实验验证流程。")
            elif top_score > 0.3:
                # Heading text repaired: the original literal was mis-encoded.
                st.warning("### ⚠️ 中等潜力 / 需进一步改造")
                st.metric(label="主要预测功效", value=top_label, delta=f"置信度: {top_score:.2%}", delta_color="off")
                st.write("该序列可能具有一定活性,或是已知活性肽的突变体。建议结合结构生物学分析。")
            else:
                st.error("### ❌ 疑似无效序列 (负样本)")
                st.metric(label="最高置信度", value=f"{top_score:.2%}", delta="未达到活性阈值", delta_color="inverse")
                st.write("模型判断该序列主要表现为负样本特征,建议剔除。")

            st.divider()
            st.write("**Top 3 可能性分布:**")
            for _, row in df_res.head(3).iterrows():
                st.write(f"{row['功效标签']}")
                st.progress(float(row['置信度']))

        # Collapsible table with every label's score.
        with st.expander("查看完整预测数据表"):
            st.dataframe(df_res.style.format({"置信度": "{:.4%}"}), use_container_width=True)

# Footer
st.divider()
st.caption("注:本系统基于 ESM-2 150M 大模型微调,预测结果仅供科研参考,实际功效需经实验验证。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
BioOracle_V14/requirements.txt DELETED
@@ -1,8 +0,0 @@
1
- torch
2
- transformers
3
- peft
4
- streamlit
5
- pandas
6
- numpy
7
- plotly
8
- scikit-learn
 
 
 
 
 
 
 
 
 
BioOracle_V14/special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "cls_token": "<cls>",
3
- "eos_token": "<eos>",
4
- "mask_token": "<mask>",
5
- "pad_token": "<pad>",
6
- "unk_token": "<unk>"
7
- }
 
 
 
 
 
 
 
 
BioOracle_V14/tokenizer_config.json DELETED
@@ -1,53 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<cls>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<pad>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "<eos>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "32": {
36
- "content": "<mask>",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "clean_up_tokenization_spaces": false,
45
- "cls_token": "<cls>",
46
- "eos_token": "<eos>",
47
- "extra_special_tokens": {},
48
- "mask_token": "<mask>",
49
- "model_max_length": 1000000000000000019884624838656,
50
- "pad_token": "<pad>",
51
- "tokenizer_class": "EsmTokenizer",
52
- "unk_token": "<unk>"
53
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
BioOracle_V14/v14_weights.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c81b5a33d66a61483c771a635f88e586ff534d9380cb4c3eb52487151fc2cd2b
3
- size 637522573
 
 
 
 
BioOracle_V14/vocab.txt DELETED
@@ -1,33 +0,0 @@
1
- <cls>
2
- <pad>
3
- <eos>
4
- <unk>
5
- L
6
- A
7
- G
8
- V
9
- S
10
- E
11
- R
12
- T
13
- I
14
- D
15
- P
16
- K
17
- Q
18
- N
19
- F
20
- Y
21
- M
22
- H
23
- W
24
- C
25
- X
26
- B
27
- U
28
- Z
29
- O
30
- .
31
- -
32
- <null_1>
33
- <mask>