Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cheminformatics 多功能平台 - 基礎版
|
| 2 |
+
# 主要涵蓋:分子資料導入、指紋/描述子生成、資料探索、分群、建模、特徵解釋、批量預測、可視化
|
| 3 |
+
# Author: 2025
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
from rdkit import Chem
|
| 9 |
+
from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
|
| 10 |
+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
| 11 |
+
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
|
| 12 |
+
from sklearn.decomposition import PCA
|
| 13 |
+
from sklearn.cluster import KMeans
|
| 14 |
+
import matplotlib.pyplot as plt
|
| 15 |
+
import seaborn as sns
|
| 16 |
+
import io
|
| 17 |
+
from PIL import Image
|
| 18 |
+
|
| 19 |
+
# =========== 功能1: 分子資料導入/轉換 =============
|
| 20 |
+
def load_csv(file):
    """Load a molecule dataset from a CSV file.

    Args:
        file: A filesystem path, or any object that exposes the path through
            a ``name`` attribute.

    Returns:
        pandas.DataFrame containing at least the ``smiles`` and ``label``
        columns. Rows without a SMILES value are dropped and the remaining
        ``smiles`` values are coerced to ``str``.

    Raises:
        ValueError: If ``file`` is ``None`` or a required column is missing.
    """
    if file is None:
        # Fail with a readable message instead of an opaque pandas error.
        raise ValueError("請先上傳CSV檔案")
    path = file.name if hasattr(file, "name") else file
    df = pd.read_csv(path)
    if not {'smiles', 'label'}.issubset(df.columns):
        raise ValueError("CSV需包含'smiles','label'欄位")
    # Drop missing SMILES before the str cast; astype(str) would otherwise
    # turn NaN into the literal text "nan".
    df = df.dropna(subset=['smiles']).reset_index(drop=True)
    df['smiles'] = df['smiles'].astype(str)
    return df
|
| 28 |
+
|
| 29 |
+
def mol_img(smiles, size=(160,160)):
    """Draw the molecule described by ``smiles`` as an image of ``size``.

    When RDKit cannot parse the SMILES, a plain RGB image of the same size
    filled with (250, 250, 250) is returned instead.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return Draw.MolToImage(parsed, size=size)
    # Unparseable input: hand back a blank placeholder rather than failing.
    return Image.new("RGB", size, (250,250,250))
|
| 34 |
+
|
| 35 |
+
# =========== 功能2: 分子指紋/描述子生成 =============
|
| 36 |
+
def ecfp4_fp(smiles, nbits=2048):
    """Return the radius-2 Morgan bit fingerprint of ``smiles`` as an array.

    The array has ``nbits`` entries. If the SMILES cannot be parsed, an
    all-zero array of length ``nbits`` is returned.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return np.zeros(nbits)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(parsed, 2, nBits=nbits)
    return np.array(bit_vector)
|
| 39 |
+
|
| 40 |
+
def maccs_fp(smiles):
    """Return the MACCS keys fingerprint of ``smiles`` as an array.

    If the SMILES cannot be parsed, an all-zero array of length 167 is
    returned.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return np.zeros(167)
    keys = MACCSkeys.GenMACCSKeys(parsed)
    return np.array(keys)
|
| 43 |
+
|
| 44 |
+
def calc_rdkit_desc(smiles):
    """Evaluate every entry of ``Descriptors.descList`` for ``smiles``.

    Returns a dict mapping descriptor name to its computed value, or an
    empty dict when the SMILES cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return {}
    values = {}
    for name, func in Descriptors.descList:
        values[name] = func(parsed)
    return values
|
| 48 |
+
|
| 49 |
+
def add_fps_and_desc(df):
    """Add fingerprint and descriptor columns to ``df`` and return it.

    The frame is modified in place. Columns written:
      * ``ecfp4`` - result of ``ecfp4_fp`` for each SMILES (default 2048 bits)
      * ``maccs`` - result of ``maccs_fp`` for each SMILES
      * ``MolWt`` / ``TPSA`` - the matching entries of ``calc_rdkit_desc``,
        or NaN when the descriptor dict lacks them (e.g. unparseable SMILES)

    Args:
        df: DataFrame with a ``smiles`` column.

    Returns:
        The same DataFrame object, with the extra columns attached.
    """
    df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
    df['maccs'] = df['smiles'].apply(maccs_fp)
    # Compute the descriptor table once per molecule and read both values
    # from it, instead of recomputing every descriptor for each column.
    descriptors = [calc_rdkit_desc(s) for s in df['smiles']]
    df['MolWt'] = [d.get('MolWt', np.nan) for d in descriptors]
    df['TPSA'] = [d.get('TPSA', np.nan) for d in descriptors]
    return df
|
| 58 |
+
|
| 59 |
+
# =========== 功能3: 資料集探索分析 (EDA) ============
|
| 60 |
+
def plot_desc_dist(df, desc='MolWt'):
    """Plot a histogram (with KDE overlay) of one descriptor column.

    Args:
        df: DataFrame holding the descriptor column.
        desc: Name of the column to plot; missing values are dropped first.

    Returns:
        A PIL image of the rendered PNG.
    """
    fig, ax = plt.subplots(figsize=(5,3))
    try:
        sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
        ax.set_title(f"{desc} Distribution")
        # Work on this figure explicitly rather than pyplot's global
        # "current figure", which callbacks running in parallel could change.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if plotting raised.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
|
| 71 |
+
|
| 72 |
+
# =========== 功能4: 分群/降維可視化 ============
|
| 73 |
+
def pca_2d(df, use='ecfp4'):
    """Project a fingerprint column onto two principal components and plot it.

    Args:
        df: DataFrame with a ``label`` column and a column ``use`` whose cells
            are equal-length arrays.
        use: Name of the fingerprint column to decompose.

    Returns:
        A PIL image of the scatter plot, coloured by ``label``.
    """
    X = np.stack(df[use].to_numpy())
    pc = PCA(n_components=2).fit_transform(X)

    colours = df['label']
    if not pd.api.types.is_numeric_dtype(colours):
        # Matplotlib needs numeric values to map through a colormap, so
        # encode non-numeric labels as integer codes.
        colours = pd.factorize(colours)[0]

    fig, ax = plt.subplots(figsize=(5,4))
    try:
        scatter = ax.scatter(pc[:,0], pc[:,1], c=colours, cmap='Set1', alpha=0.7)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.set_title(f"PCA 2D ({use})")
        # Use the figure/axes API instead of pyplot's global current figure.
        fig.colorbar(scatter, ax=ax)
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if plotting raised.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
|
| 87 |
+
|
| 88 |
+
def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
    """Cluster a fingerprint column with KMeans and plot the result in PCA space.

    Args:
        df: DataFrame with a column ``use`` whose cells are equal-length arrays.
        n_clusters: Number of clusters; coerced to ``int`` because UI sliders
            may deliver the value as a float.
        use: Name of the fingerprint column to cluster.

    Returns:
        A PIL image of the 2-component PCA scatter, coloured by cluster index.
    """
    n_clusters = int(n_clusters)
    X = np.stack(df[use].to_numpy())
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(X)
    pc = PCA(n_components=2).fit_transform(X)

    fig, ax = plt.subplots(figsize=(5,4))
    try:
        scatter = ax.scatter(pc[:,0], pc[:,1], c=labels, cmap='tab10', alpha=0.7)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.set_title(f"KMeans Clusters ({n_clusters})")
        # Use the figure/axes API instead of pyplot's global current figure.
        fig.colorbar(scatter, ax=ax)
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if plotting raised.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
|
| 104 |
+
|
| 105 |
+
# =========== 功能5: 機器學習建模與預測 ============
|
| 106 |
+
def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
    """Cross-validate and fit a random forest on a fingerprint column.

    Args:
        df: DataFrame with a ``label`` column and a column ``fp_type`` whose
            cells are equal-length arrays.
        fp_type: Name of the fingerprint column used as features.
        model_type: Model family; only ``'rf'`` is implemented.
        task: ``'regression'``, ``'classification'`` or ``'auto'``. With
            ``'auto'`` a floating-point label dtype selects regression and
            anything else selects classification.

    Returns:
        Tuple ``(model, scores)``: the estimator fitted on all rows, and the
        5-fold cross-validation scores (negative MSE for regression, accuracy
        for classification).

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    # Reject unknown model types up front; otherwise ``model`` would be
    # unbound further down and fail with an UnboundLocalError.
    if model_type != 'rf':
        raise ValueError(f"不支援的 model_type: {model_type!r} (目前僅支援 'rf')")

    X = np.stack(df[fp_type].to_numpy())
    y = df['label'].values
    if task == 'auto':
        task = 'regression' if np.issubdtype(y.dtype, np.floating) else 'classification'

    if task == 'regression':
        model = RandomForestRegressor(n_estimators=100)
        scoring = 'neg_mean_squared_error'
    else:
        model = RandomForestClassifier(n_estimators=100)
        scoring = 'accuracy'

    scores = cross_val_score(model, X, y, cv=5, scoring=scoring)
    model.fit(X, y)
    return model, scores
|
| 117 |
+
|
| 118 |
+
def predict_single(model, smiles, fp_type='ecfp4'):
    """Predict the label of a single SMILES with a fitted model.

    ``fp_type == 'ecfp4'`` featurizes with ``ecfp4_fp``; any other value
    falls back to ``maccs_fp``. Returns the first (only) element of
    ``model.predict``.
    """
    if fp_type == 'ecfp4':
        features = ecfp4_fp(smiles)
    else:
        features = maccs_fp(smiles)
    predictions = model.predict([features])
    return predictions[0]
|
| 122 |
+
|
| 123 |
+
# =========== Gradio主UI ============
|
| 124 |
+
|
| 125 |
+
with gr.Blocks(title="Cheminformatics Platform") as demo:
    gr.Markdown("# 🧪 Cheminformatics 多功能分析平台")

    def _featurized(f):
        """Load an uploaded CSV and attach fingerprint/descriptor columns."""
        return add_fps_and_desc(load_csv(f))

    # --- Data import / structure preview ---
    with gr.Tab("1️⃣ 資料導入/結構圖"):
        # The original label contained mis-encoded bytes; restored to match
        # the other upload widgets.
        file = gr.File(label="上傳CSV", file_types=[".csv"])
        df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
        smiles_input = gr.Textbox(label="分子SMILES預覽")
        # No `shape=` argument: mol_img already renders at 160x160, and the
        # argument is not accepted by every Gradio release.
        mol_image = gr.Image(label="分子結構圖")
        file.upload(lambda f: load_csv(f).head(10), file, df_preview)
        smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)

    # --- Feature generation ---
    with gr.Tab("2️⃣ 特徵計算/描述子"):
        file2 = gr.File(label="再次選擇CSV")
        feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
        file2.upload(lambda f: _featurized(f).head(5), file2, feat_preview)

    # --- Exploratory analysis ---
    with gr.Tab("3️⃣ 資料集分析 (EDA)"):
        desc_type = gr.Dropdown(['MolWt', 'TPSA'], value='MolWt', label="選擇描述子")
        eda_plot = gr.Image(label="分布圖")
        file3 = gr.File(label="選擇CSV")

        def _plot_descriptor(f, d):
            """Render the descriptor histogram once both a file and a column are chosen."""
            if f is None or not d:
                return None
            return plot_desc_dist(_featurized(f), d)

        # The uploaded file must be wired in as an event input: a component's
        # `.value` attribute only holds its construction-time default, never
        # what the user uploaded.
        file3.upload(_plot_descriptor, [file3, desc_type], eda_plot)
        desc_type.change(_plot_descriptor, [file3, desc_type], eda_plot)

    # --- Clustering and PCA ---
    with gr.Tab("4️⃣ 分群/降維可視化"):
        file4 = gr.File(label="上傳CSV")
        # Keyword arguments avoid depending on the positional order of `step`.
        nclus = gr.Slider(minimum=2, maximum=8, value=3, step=1, label="分群數")
        pca_plot = gr.Image(label="PCA分佈")
        km_plot = gr.Image(label="KMeans分群")

        def _plot_pca(f):
            """PCA scatter for the uploaded file; empty when the file is cleared."""
            return None if f is None else pca_2d(_featurized(f))

        def _plot_kmeans(f, n):
            """KMeans scatter for the uploaded file with ``n`` clusters."""
            return None if f is None else kmeans_clusters(_featurized(f), n)

        # `change` already covers uploads, so the former no-output `upload`
        # handler (which computed features and discarded them) is dropped.
        file4.change(_plot_pca, file4, pca_plot)
        nclus.change(_plot_kmeans, [file4, nclus], km_plot)

    # --- Modelling / prediction ---
    with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
        file5 = gr.File(label="上傳CSV")
        model_status = gr.Markdown("模型狀態")
        smiles_pred = gr.Textbox(label="預測SMILES")
        y_pred = gr.Textbox(label="預測值/類別")
        # Per-session slot for the fitted model, so typing a SMILES does not
        # retrain the forest on every change event.
        trained_model = gr.State(None)

        def train_and_predict(f, s):
            """Train on CSV ``f`` and predict SMILES ``s`` in one shot.

            Kept for callers that use it directly; the UI below trains once
            on upload instead, because this retrains on every call.
            """
            df = _featurized(f)
            model, scores = train_model(df)
            pred = predict_single(model, s)
            return f"模型交叉驗證: {np.round(scores,3)}", str(pred)

        def _train_on_upload(f):
            """Fit the model once per upload and report cross-validation scores."""
            model, scores = train_model(_featurized(f))
            status = f"已載入, 請輸入SMILES進行預測\n\n模型交叉驗證: {np.round(scores,3)}"
            return model, status

        def _predict_with_model(model, s):
            """Predict with the stored model; empty until a model and SMILES exist."""
            if model is None or not s:
                return ""
            return str(predict_single(model, s))

        file5.upload(_train_on_upload, file5, [trained_model, model_status])
        smiles_pred.change(_predict_with_model, [trained_model, smiles_pred], y_pred)

    gr.Markdown("---\n> 建議工作流:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")

demo.launch(share=True)
|