# app.py — uploaded to a Hugging Face Space by "yisen888" (commit bf19137).
import gradio as gr
import torch
import numpy as np
import pandas as pd
from torch import nn
from transformers import AutoTokenizer, AutoModel
from peft import get_peft_model, LoraConfig, TaskType
import os
# ================= Configuration =================
# Directory holding the fine-tuned weight file (v14_weights.bin).
MODEL_DIR = "."
# ESM-2 150M backbone used both for tokenization and as the LoRA base model.
BASE_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"
# Multi-label efficacy classes; order must match the trained classifier head.
LABELS = ['anti_acne', 'anti_aging', 'anti_inflammatory', 'anti_oxidant', 'repair', 'whitening', 'delivery', 'negative']
# ================= Core components =================
# Per-residue hydropathy values (Kyte–Doolittle scale; positive = hydrophobic).
AA_PROPS = {'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5, 'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2}
# Approximate side-chain charge at physiological pH; residues not listed are 0.
AA_CHARGE = {'R': 1, 'K': 1, 'H': 0.1, 'D': -1, 'E': -1}


def compute_biophysics(seq):
    """Compute a 5-dim biophysical feature vector for a peptide sequence.

    Args:
        seq: peptide as single-letter amino-acid codes; unknown letters
            contribute 0 to hydropathy/charge.

    Returns:
        float32 ndarray of shape (5,):
        [mean hydropathy, net charge, est. molecular weight in kDa,
         N-terminal hydropathy, C-terminal charge].
    """
    length = len(seq)
    if length == 0:
        # Bug fix: previously returned a plain list ([0]*5) here while the
        # normal path returns a float32 ndarray — keep the type consistent.
        return np.zeros(5, dtype=np.float32)
    hydro = sum(AA_PROPS.get(aa, 0) for aa in seq) / length
    charge = sum(AA_CHARGE.get(aa, 0) for aa in seq)
    # ~110 Da average residue mass; reported in kDa.
    weight = length * 110 / 1000.0
    n_term = AA_PROPS.get(seq[0], 0)
    c_term = AA_CHARGE.get(seq[-1], 0)
    return np.array([hydro, charge, weight, n_term, c_term], dtype=np.float32)
class AdaptiveFusionModel(nn.Module):
    """Fuse ESM-2 sequence logits with biophysical-feature logits.

    A single learnable gate (passed through a sigmoid) balances the two
    classifier heads; with ``extra_features=None`` only the ESM head is used.
    """

    def __init__(self, base_model, num_labels, feature_dim=5):
        super().__init__()
        self.esm = base_model
        self.num_labels = num_labels
        hidden_size = base_model.config.hidden_size
        # Head over the first-token ([CLS]-position) embedding of the backbone.
        self.esm_classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_labels),
        )
        # Small MLP over the hand-crafted biophysical descriptors.
        self.feature_classifier = nn.Sequential(
            nn.Linear(feature_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, num_labels),
        )
        # Learnable gate; sigmoid(1.38) ≈ 0.80 initial weight on the ESM head.
        self.gate_weight = nn.Parameter(torch.tensor([1.38]))

    def forward(self, input_ids, attention_mask=None, extra_features=None, **kwargs):
        """Return ``(logits, alpha)``; ``alpha`` is None without extra features."""
        backbone_out = self.esm(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        pooled = backbone_out.last_hidden_state[:, 0, :]
        seq_logits = self.esm_classifier(pooled)
        if extra_features is None:
            return seq_logits, None
        feat_logits = self.feature_classifier(extra_features)
        alpha = torch.sigmoid(self.gate_weight)
        fused = alpha * seq_logits + (1 - alpha) * feat_logits
        return fused, alpha
# ================= Model loading =================
print("🚀 正在加载 BioOracle V14...")
# CPU-only inference (Spaces free tier).
device = torch.device('cpu')
# Load the tokenizer for the ESM-2 backbone.
print("📥 加载 Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
# Load the base ESM-2 150M model.
print("🧠 加载 ESM-2 150M 模型(首次约 600MB,请等待)...")
base_model = AutoModel.from_pretrained(BASE_MODEL_NAME)
# Apply LoRA adapters — configuration must match what was used at training time.
print("🔧 应用 LoRA 配置...")
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=32, lora_alpha=64, lora_dropout=0.1,
    target_modules=["query", "key", "value", "dense"]
)
base_model = get_peft_model(base_model, peft_config)
# Build the fusion model around the PEFT-wrapped backbone.
print("⚙️ 构建融合架构...")
model = AdaptiveFusionModel(base_model, num_labels=len(LABELS))
# Locate the fine-tuned checkpoint.
weights_path = os.path.join(MODEL_DIR, "v14_weights.bin")
if not os.path.exists(weights_path):
    raise FileNotFoundError(f"❌ 找不到权重文件: {weights_path}")
print("💾 加载 V14 权重(638MB)...")
# NOTE(review): weights_only=False unpickles arbitrary objects; only safe
# because the checkpoint ships with this app — never point at untrusted files.
state_dict = torch.load(weights_path, map_location=torch.device('cpu'), weights_only=False)
# 🔥 Smart key remapping (fixes PEFT prefix mismatch between checkpoint and model).
model_keys = set(model.state_dict().keys())
weight_keys = set(state_dict.keys())
# Case 1: checkpoint lacks the 'base_model.model' prefix, but the model has it
# (prefix must be inserted).
if any('base_model.model' in k for k in model_keys) and not any('base_model.model' in k for k in weight_keys):
    print("⚙️ 调整权重键名以匹配 PEFT 模型结构...")
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith('esm.'):
            # esm.xxx → esm.base_model.model.xxx
            new_key = key.replace('esm.', 'esm.base_model.model.', 1)
            new_state_dict[new_key] = value
        else:
            new_state_dict[key] = value
    state_dict = new_state_dict
# Case 2: checkpoint has the PEFT prefix but the model does not
# (prefix must be removed).
elif not any('base_model.model' in k for k in model_keys) and any('base_model.model' in k for k in weight_keys):
    print("⚙️ 移除 PEFT 前缀以匹配标准模型结构...")
    new_state_dict = {}
    for key, value in state_dict.items():
        new_key = key.replace('base_model.model.', '')
        new_state_dict[new_key] = value
    state_dict = new_state_dict
# Load weights (strict=False tolerates partial mismatches, e.g. pooler layers).
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
if missing_keys:
    print(f"⚠️ 缺失 {len(missing_keys)} 个键(可能是新增的参数,如 pooler 层)")
    print(f" 示例: {list(missing_keys)[:3]}")
if unexpected_keys:
    print(f"⚠️ 忽略 {len(unexpected_keys)} 个多余的键")
    print(f" 示例: {list(unexpected_keys)[:3]}")
model.to('cpu')
model.eval()
print("✅ 模型加载完成!")
# Learned gate value: fraction of the final logits contributed by ESM-2
# (displayed in the UI's "model status" panel).
gate_val = torch.sigmoid(model.gate_weight).item()
esm_weight = gate_val
feat_weight = 1 - gate_val
# ================= Prediction (Chinese-language output) =================
def predict_peptide(sequence):
    """Score a peptide sequence against all efficacy labels (Chinese UI).

    Returns a tuple ``(conclusion_md, biophysics_md, result_table)``; on
    invalid input the last two elements are None.
    """
    # Normalize and validate the input sequence.
    seq = sequence.strip().upper()
    valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
    if not seq:
        return "❌ 请输入序列", None, None
    if set(seq) - valid_aa:
        return "❌ 请输入有效的氨基酸序列(仅限20种标准氨基酸单字母缩写)", None, None
    # Tokenize and build the auxiliary biophysical feature vector.
    encoded = tokenizer(seq, return_tensors="pt", padding="max_length", max_length=128).to(device)
    raw_feats = compute_biophysics(seq)
    feat_batch = torch.tensor([raw_feats], dtype=torch.float).to(device)
    # Inference: multi-label → independent per-label sigmoid probabilities.
    with torch.no_grad():
        logits, _ = model(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            extra_features=feat_batch,
        )
        probs = torch.sigmoid(logits).cpu().numpy()[0]
    # Rank labels by confidence, best first.
    df_res = (
        pd.DataFrame({"功效标签": LABELS, "置信度": probs})
        .sort_values(by="置信度", ascending=False)
        .reset_index(drop=True)
    )
    top_label = df_res.iloc[0]['功效标签']
    top_score = df_res.iloc[0]['置信度']
    # Verdict text keyed on the best confidence.
    if top_score > 0.8:
        conclusion = f"""
### ✅ 高潜力活性肽
**主要预测功效**: {top_label}
**置信度**: {top_score:.2%}
模型强烈建议将此序列纳入后续湿实验验证流程。
"""
    elif top_score > 0.3:
        conclusion = f"""
### ⚠️ 中等潜力 / 需进一步改造
**主要预测功效**: {top_label}
**置信度**: {top_score:.2%}
该序列可能具有一定活性,或是已知活性肽的突变体。建议结合结构生物学分析。
"""
    else:
        conclusion = f"""
### ❌ 疑似无效序列(负样本)
**最高置信度**: {top_score:.2%}
模型判断该序列主要表现为负样本特征,建议剔除。
"""
    # Biophysical feature summary for the side panel.
    biophysics_text = f"""
**生物物理特征分析**:
- 平均疏水性: {raw_feats[0]:.2f}
- 净电荷: {raw_feats[1]:.2f}
- 估算分子量: {raw_feats[2]:.3f} kDa
- N端疏水性: {raw_feats[3]:.2f}
- C端电荷: {raw_feats[4]:.2f}
"""
    # Render confidences as percentage strings for the results table.
    df_formatted = df_res.copy()
    df_formatted['置信度'] = df_formatted['置信度'].map(lambda p: f"{p:.4%}")
    return conclusion, biophysics_text, df_formatted
# ================= Prediction (English-language output) =================
def predict_peptide_en(sequence):
    """Score a peptide sequence against all efficacy labels (English UI).

    Returns a tuple ``(conclusion_md, biophysics_md, result_table)``; on
    invalid input the last two elements are None.
    """
    # Normalize and validate the input sequence.
    seq = sequence.strip().upper()
    valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
    if not seq:
        return "❌ Please enter a sequence", None, None
    if set(seq) - valid_aa:
        return "❌ Invalid sequence. Please use standard 1-letter amino acid codes.", None, None
    # Tokenize and build the auxiliary biophysical feature vector
    # (same pipeline as the Chinese-language variant).
    encoded = tokenizer(seq, return_tensors="pt", padding="max_length", max_length=128).to(device)
    raw_feats = compute_biophysics(seq)
    feat_batch = torch.tensor([raw_feats], dtype=torch.float).to(device)
    # Inference: multi-label → independent per-label sigmoid probabilities.
    with torch.no_grad():
        logits, _ = model(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            extra_features=feat_batch,
        )
        probs = torch.sigmoid(logits).cpu().numpy()[0]
    # Rank labels by confidence, best first.
    df_res = (
        pd.DataFrame({"Efficacy Label": LABELS, "Confidence": probs})
        .sort_values(by="Confidence", ascending=False)
        .reset_index(drop=True)
    )
    top_label = df_res.iloc[0]['Efficacy Label']
    top_score = df_res.iloc[0]['Confidence']
    # Verdict text keyed on the best confidence.
    if top_score > 0.8:
        conclusion = f"""
### ✅ High Potential Peptide
**Predicted Efficacy**: {top_label}
**Confidence**: {top_score:.2%}
Strongly recommended for wet-lab validation.
"""
    elif top_score > 0.3:
        conclusion = f"""
### ⚠️ Moderate Potential / Optimization Needed
**Predicted Efficacy**: {top_label}
**Confidence**: {top_score:.2%}
May have some activity or be a mutant of a known peptide. Structural analysis suggested.
"""
    else:
        conclusion = f"""
### ❌ Likely Negative / Inactive
**Max Confidence**: {top_score:.2%}
Predicted as a negative sample. Suggested to discard.
"""
    # Biophysical feature summary for the side panel.
    biophysics_text = f"""
**Biophysical Properties**:
- Avg Hydrophobicity: {raw_feats[0]:.2f}
- Net Charge: {raw_feats[1]:.2f}
- Est. Molecular Weight: {raw_feats[2]:.3f} kDa
- N-term Hydrophobicity: {raw_feats[3]:.2f}
- C-term Charge: {raw_feats[4]:.2f}
"""
    # Render confidences as percentage strings for the results table.
    df_formatted = df_res.copy()
    df_formatted['Confidence'] = df_formatted['Confidence'].map(lambda p: f"{p:.4%}")
    return conclusion, biophysics_text, df_formatted
# ================= Gradio UI (frontend styling) =================
# Custom CSS — light "medical tech" look for the page, header and stat boxes.
custom_css = """
.gradio-container {
font-family: 'Helvetica Neue', Arial, sans-serif;
background-color: #f9fbfd;
}
.header-area {
text-align: center;
margin-bottom: 20px;
padding: 20px;
background: linear-gradient(135deg, #eef2f3 0%, #8e9eab 100%);
border-radius: 12px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.header-area h1 {
color: #2c3e50;
font-size: 2.5em;
margin-bottom: 5px;
}
.header-area h3 {
color: #546e7a;
font-weight: 300;
}
.stat-box {
background: white;
padding: 15px;
border-radius: 8px;
border-left: 5px solid #3498db;
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}
.primary-btn {
background-color: #2980b9 !important;
}
"""
# Soft theme as the base, with the primary button recolored to match the CSS.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    button_primary_background_fill="#2980b9",
    button_primary_background_fill_hover="#3498db",
)
# Build the interface.
with gr.Blocks(css=custom_css, theme=theme, title="BioOracle V14") as demo:
    # Top header banner.
    with gr.Row():
        gr.HTML(
            """
<div class="header-area">
<h1>🧬 BioOracle V14</h1>
<h3>Giant Biogene AI Screening System | 巨子智筛 AI 活性肽发现系统</h3>
<p>Powered by ESM-2 150M & Biophysics Guided Learning</p>
</div>
"""
        )
    # Collapsible model-status panel (bilingual), showing the learned gate split.
    with gr.Accordion("🧠 Model Internal Status / 模型大脑状态", open=False):
        with gr.Row():
            gr.Markdown(
                f"""
<div class="stat-box">
<b>自适应融合权重 (Adaptive Fusion Weights)</b>:<br>
<ul>
<li>ESM-2 Deep Semantics (AI Intuition): <b>{esm_weight:.1%}</b></li>
<li>Biophysics Rules (Physical Laws): <b>{feat_weight:.1%}</b></li>
</ul>
<p style="color: grey; font-size: 0.9em;">
The model automatically balances between deep learning features and physical rules.<br>
模型自动学会了主要依赖 ESM-2 大模型的深度理解,同时使用物理化学规则作为辅助校验。
</p>
</div>
"""
            )
    # Language tabs.
    with gr.Tabs():
        # ============ Tab 1: Chinese ============
        with gr.TabItem("🇨🇳 中文版 (Chinese)"):
            with gr.Row():
                with gr.Column(scale=2):
                    sequence_input_zh = gr.Textbox(
                        label="输入待筛选的肽序列",
                        placeholder="例如: GHK",
                        info="输入氨基酸序列(单字母缩写),模型将评估其潜在生物活性",
                        lines=2
                    )
                    predict_btn_zh = gr.Button("🚀 开始演算", variant="primary", size="lg")
                with gr.Column(scale=3):
                    conclusion_output_zh = gr.Markdown(label="活性评估结论")
                    with gr.Row():
                        biophysics_output_zh = gr.Markdown(label="生物物理特征")
                        results_table_zh = gr.Dataframe(
                            label="完整预测数据表",
                            headers=["功效标签", "置信度"],
                            datatype=["str", "str"],
                            row_count=8
                        )
            gr.Examples(
                examples=[["GHK"], ["KTTKS"], ["HGK"], ["AECKVQVR"]],
                inputs=sequence_input_zh,
                label="示例序列"
            )
            # Chinese-tab event binding.
            predict_btn_zh.click(
                fn=predict_peptide,
                inputs=sequence_input_zh,
                outputs=[conclusion_output_zh, biophysics_output_zh, results_table_zh]
            )
        # ============ Tab 2: English ============
        with gr.TabItem("🇺🇸 English Version"):
            with gr.Row():
                with gr.Column(scale=2):
                    sequence_input_en = gr.Textbox(
                        label="Input Peptide Sequence",
                        placeholder="e.g., GHK",
                        info="Enter amino acid sequence (single letter codes) for bioactivity assessment",
                        lines=2
                    )
                    predict_btn_en = gr.Button("🚀 Analyze Sequence", variant="primary", size="lg")
                with gr.Column(scale=3):
                    conclusion_output_en = gr.Markdown(label="Assessment Conclusion")
                    with gr.Row():
                        biophysics_output_en = gr.Markdown(label="Biophysical Properties")
                        results_table_en = gr.Dataframe(
                            label="Full Prediction Data",
                            headers=["Efficacy Label", "Confidence"],
                            datatype=["str", "str"],
                            row_count=8
                        )
            gr.Examples(
                examples=[["GHK"], ["KTTKS"], ["HGK"], ["AECKVQVR"]],
                inputs=sequence_input_en,
                label="Example Sequences"
            )
            # English-tab event binding.
            predict_btn_en.click(
                fn=predict_peptide_en,
                inputs=sequence_input_en,
                outputs=[conclusion_output_en, biophysics_output_en, results_table_en]
            )
    # Footer / disclaimer.
    gr.Markdown(
        """
---
<div style="text-align: center; color: #7f8c8d; font-size: 0.9em;">
<b>BioOracle V14</b> | Design for Giant Biogene Internship Project<br>
<i>Disclaimer: Predictions are for research reference only. Wet-lab validation is required.</i>
</div>
"""
    )
# Script entry point: launch the Gradio app.
if __name__ == "__main__":
    demo.launch()