Kung-Hsun committed on
Commit
45e3039
·
verified ·
1 Parent(s): 2f5494f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -0
app.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cheminformatics 多功能平台 - 基礎版
2
+ # 主要涵蓋:分子資料導入、指紋/描述子生成、資料探索、分群、建模、特徵解釋、批量預測、可視化
3
+ # Author: 2025
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+ import numpy as np
8
+ from rdkit import Chem
9
+ from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
10
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
11
+ from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
12
+ from sklearn.decomposition import PCA
13
+ from sklearn.cluster import KMeans
14
+ import matplotlib.pyplot as plt
15
+ import seaborn as sns
16
+ import io
17
+ from PIL import Image
18
+
19
+ # =========== 功能1: 分子資料導入/轉換 =============
20
def load_csv(file):
    """Read a molecule CSV and return it as a DataFrame.

    Args:
        file: Anything ``pandas.read_csv`` accepts (path or file-like
            object), or an object exposing the path through a ``name``
            attribute (e.g. an uploaded temp-file wrapper).

    Returns:
        pandas.DataFrame containing at least the ``smiles`` column (cast to
        ``str``) and the ``label`` column.

    Raises:
        ValueError: If ``file`` is ``None`` or a required column is missing.
    """
    # A cleared/never-filled upload widget hands over None; fail with a clear
    # message instead of an opaque pandas error.
    if file is None:
        raise ValueError("未提供CSV檔案")
    source = file.name if hasattr(file, "name") else file
    df = pd.read_csv(source)
    if not {'smiles', 'label'}.issubset(df.columns):
        raise ValueError("CSV需包含'smiles','label'欄位")
    # Normalise the SMILES column to plain strings.
    df['smiles'] = df['smiles'].astype(str)
    return df
28
+
29
def mol_img(smiles, size=(160,160)):
    """Render a SMILES string as a PIL image of the given size.

    When RDKit cannot parse the SMILES, a plain light-grey RGB canvas of the
    same size is returned instead.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return Draw.MolToImage(parsed, size=size)
    # Unparseable input: placeholder image.
    return Image.new("RGB", size, (250, 250, 250))
34
+
35
+ # =========== 功能2: 分子指紋/描述子生成 =============
36
def ecfp4_fp(smiles, nbits=2048):
    """Return the radius-2 Morgan bit fingerprint of a SMILES as a NumPy array.

    A zero vector of length ``nbits`` is returned when the SMILES does not
    yield a usable molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return np.zeros(nbits)
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(parsed, 2, nBits=nbits)
    return np.array(bit_vect)
39
+
40
def maccs_fp(smiles):
    """Return the MACCS keys fingerprint of a SMILES as a NumPy array.

    A zero vector of length 167 is returned when the SMILES does not yield a
    usable molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return np.zeros(167)
    keys = MACCSkeys.GenMACCSKeys(parsed)
    return np.array(keys)
43
+
44
def calc_rdkit_desc(smiles):
    """Compute every descriptor in ``Descriptors.descList`` for a SMILES.

    Returns:
        dict mapping descriptor name to its value, or an empty dict when the
        SMILES cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return {}
    values = {}
    for name, func in Descriptors.descList:
        values[name] = func(parsed)
    return values
48
+
49
def add_fps_and_desc(df):
    """Add fingerprint and descriptor columns to ``df`` and return it.

    The frame is modified in place: ``ecfp4`` and ``maccs`` hold one NumPy
    fingerprint array per row, ``MolWt`` and ``TPSA`` hold the corresponding
    RDKit descriptor (NaN when the SMILES cannot be parsed).

    Args:
        df: DataFrame with a ``smiles`` column.

    Returns:
        The same DataFrame, with the four columns added.
    """
    # ECFP4 fingerprint (2048 bits by default) and MACCS keys.
    df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
    df['maccs'] = df['smiles'].apply(maccs_fp)

    # Parse each molecule once and evaluate only the two descriptors that are
    # needed, instead of computing the full descriptor list twice per row.
    mol_wts = []
    tpsas = []
    for smi in df['smiles']:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            mol_wts.append(np.nan)
            tpsas.append(np.nan)
        else:
            mol_wts.append(Descriptors.MolWt(mol))
            tpsas.append(Descriptors.TPSA(mol))
    df['MolWt'] = pd.Series(mol_wts, index=df.index, dtype=float)
    df['TPSA'] = pd.Series(tpsas, index=df.index, dtype=float)
    return df
58
+
59
+ # =========== 功能3: 資料集探索分析 (EDA) ============
60
def plot_desc_dist(df, desc='MolWt'):
    """Plot the distribution of one descriptor column as a histogram with KDE.

    Args:
        df: DataFrame containing the column named by ``desc``.
        desc: Column to plot; missing values are dropped first.

    Returns:
        PIL image of the rendered PNG.
    """
    fig, ax = plt.subplots(figsize=(5,3))
    try:
        sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
        ax.set_title(f"{desc} Distribution")
        # Work on this figure explicitly rather than pyplot's "current
        # figure", which is shared global state across concurrent requests.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even when plotting fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
71
+
72
+ # =========== 功能4: 分群/降維可視化 ============
73
def pca_2d(df, use='ecfp4'):
    """Project a fingerprint column onto two principal components and plot it.

    Args:
        df: DataFrame with a ``label`` column and a column ``use`` holding one
            equal-length array per row.
        use: Name of the fingerprint column.

    Returns:
        PIL image of the scatter plot, coloured by ``label``.
    """
    X = np.stack(df[use].to_numpy())
    pc = PCA(n_components=2).fit_transform(X)

    # Matplotlib needs numeric values for colormapped points; encode string
    # class labels as integer codes.
    colors = df['label']
    if not pd.api.types.is_numeric_dtype(colors):
        colors = pd.factorize(colors)[0]

    fig, ax = plt.subplots(figsize=(5,4))
    try:
        scatter = ax.scatter(pc[:,0], pc[:,1], c=colors, cmap='Set1', alpha=0.7)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.set_title(f"PCA 2D ({use})")
        fig.colorbar(scatter, ax=ax)
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even when plotting fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
87
+
88
def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
    """Cluster a fingerprint column with KMeans and plot the clusters in PCA space.

    Args:
        df: DataFrame with a column ``use`` holding one equal-length array
            per row.
        n_clusters: Number of clusters; coerced to ``int`` because UI sliders
            may deliver a float.
        use: Name of the fingerprint column.

    Returns:
        PIL image of the 2-D PCA scatter plot coloured by cluster id.
    """
    n_clusters = int(n_clusters)
    X = np.stack(df[use].to_numpy())
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(X)
    pc = PCA(n_components=2).fit_transform(X)

    fig, ax = plt.subplots(figsize=(5,4))
    try:
        scatter = ax.scatter(pc[:,0], pc[:,1], c=labels, cmap='tab10', alpha=0.7)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.set_title(f"KMeans Clusters ({n_clusters})")
        fig.colorbar(scatter, ax=ax)
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even when plotting fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
104
+
105
+ # =========== 功能5: 機器學習建模與預測 ============
106
def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
    """Cross-validate and fit a random-forest model on a fingerprint column.

    Args:
        df: DataFrame with a ``label`` column and a column ``fp_type`` holding
            one equal-length array per row.
        fp_type: Name of the fingerprint column used as features.
        model_type: Model family; only ``'rf'`` is implemented.
        task: ``'regression'``, ``'classification'`` or ``'auto'``. With
            ``'auto'`` a floating-point label dtype selects regression and
            anything else selects classification.

    Returns:
        Tuple ``(model, scores)``: the estimator fitted on all rows and the
        5-fold cross-validation scores (negative MSE for regression, accuracy
        otherwise).

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    # Reject unknown model types up front instead of failing later on an
    # unbound ``model`` variable.
    if model_type != 'rf':
        raise ValueError(
            f"Unsupported model_type {model_type!r}; only 'rf' is available"
        )

    X = np.stack(df[fp_type].to_numpy())
    y = df['label'].values
    if task == 'auto':
        task = 'regression' if np.issubdtype(y.dtype, np.floating) else 'classification'

    if task == 'regression':
        model = RandomForestRegressor(n_estimators=100)
        scoring = 'neg_mean_squared_error'
    else:
        model = RandomForestClassifier(n_estimators=100)
        scoring = 'accuracy'

    scores = cross_val_score(model, X, y, cv=5, scoring=scoring)
    model.fit(X, y)
    return model, scores
117
+
118
def predict_single(model, smiles, fp_type='ecfp4'):
    """Predict the label of a single SMILES with a fitted model.

    The SMILES is featurised with ``ecfp4_fp`` when ``fp_type`` is
    ``'ecfp4'`` and with ``maccs_fp`` for any other value.
    """
    if fp_type == 'ecfp4':
        features = ecfp4_fp(smiles)
    else:
        features = maccs_fp(smiles)
    # The model expects a 2-D batch; wrap the single sample and unwrap the result.
    predictions = model.predict([features])
    return predictions[0]
122
+
123
+ # =========== Gradio主UI ============
124
+
125
with gr.Blocks(title="Cheminformatics Platform") as demo:
    gr.Markdown("# 🧪 Cheminformatics 多功能分析平台")

    def _featurize(f):
        """Load an uploaded CSV and add fingerprint/descriptor columns (None if no file)."""
        if f is None:
            return None
        return add_fps_and_desc(load_csv(f))

    # --- Molecule data import ---
    with gr.Tab("1️⃣ 資料導入/結構圖"):
        file = gr.File(label="上傳CSV", file_types=[".csv"])
        df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
        smiles_input = gr.Textbox(label="分子SMILES預覽")
        # No `shape=` argument: mol_img already renders at 160x160, and the
        # keyword is not accepted by newer Gradio releases.
        mol_image = gr.Image(label="分子結構圖")
        file.upload(lambda f: load_csv(f).head(10), file, df_preview)
        smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)

    # --- Feature generation ---
    with gr.Tab("2️⃣ 特徵計算/描述子"):
        file2 = gr.File(label="再次選擇CSV")
        feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
        file2.upload(lambda f: add_fps_and_desc(load_csv(f)).head(5), file2, feat_preview)

    # --- Exploratory data analysis ---
    with gr.Tab("3️⃣ 資料集分析 (EDA)"):
        desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子")
        eda_plot = gr.Image(label="分布圖")
        file3 = gr.File(label="選擇CSV")

        def _eda(f, d):
            """Plot the chosen descriptor once both a file and a descriptor are set."""
            if f is None or not d:
                return None
            return plot_desc_dist(_featurize(f), d)

        # The uploaded file is passed as an event input; a component's
        # `.value` only holds its construction-time value, not the upload.
        file3.upload(_eda, [file3, desc_type], eda_plot)
        desc_type.change(_eda, [file3, desc_type], eda_plot)

    # --- Clustering and PCA ---
    with gr.Tab("4️⃣ 分群/降維可視化"):
        file4 = gr.File(label="上傳CSV")
        # `step` is passed by keyword: it is keyword-only in Slider.
        nclus = gr.Slider(minimum=2, maximum=8, value=3, step=1, label="分群數")
        pca_plot = gr.Image(label="PCA分佈")
        km_plot = gr.Image(label="KMeans分群")

        def _pca(f):
            """PCA scatter plot for the uploaded file (None if no file)."""
            if f is None:
                return None
            return pca_2d(_featurize(f))

        def _kmeans(f, n):
            """KMeans cluster plot for the uploaded file (None if no file)."""
            if f is None:
                return None
            return kmeans_clusters(_featurize(f), n)

        file4.change(_pca, file4, pca_plot)
        file4.change(_kmeans, [file4, nclus], km_plot)
        nclus.change(_kmeans, [file4, nclus], km_plot)

    # --- Modelling / prediction ---
    with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
        file5 = gr.File(label="上傳CSV")
        model_status = gr.Markdown("模型狀態")
        smiles_pred = gr.Textbox(label="預測SMILES")
        y_pred = gr.Textbox(label="預測值/類別")
        # Per-session slot for the fitted model, so it is trained once per
        # upload instead of on every keystroke in the SMILES box.
        model_state = gr.State(None)

        def train_on_upload(f):
            """Train on the uploaded CSV; return the model and a status message."""
            if f is None:
                return None, "模型狀態"
            model, scores = train_model(_featurize(f))
            status = (
                f"模型交叉驗證: {np.round(scores,3)}\n\n"
                "已載入, 請輸入SMILES進行預測"
            )
            return model, status

        def predict_with_model(model, s):
            """Predict for the typed SMILES; empty output until a model and a SMILES exist."""
            if model is None or not s or not s.strip():
                return ""
            return str(predict_single(model, s))

        file5.upload(train_on_upload, file5, [model_state, model_status])
        smiles_pred.change(predict_with_model, [model_state, smiles_pred], y_pred)

    gr.Markdown("---\n> 建議工作流:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")

demo.launch(share=True)