Kung-Hsun committed on
Commit
8260ba5
·
verified ·
1 Parent(s): 6bed393

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -75
app.py CHANGED
@@ -7,15 +7,18 @@ import pandas as pd
7
  import numpy as np
8
  from rdkit import Chem
9
  from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
10
- from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
11
- from sklearn.model_selection import cross_val_score
12
  from sklearn.decomposition import PCA
13
- from sklearn.cluster import KMeans
 
 
14
  import matplotlib.pyplot as plt
15
  import seaborn as sns
16
  import io
17
  from PIL import Image
18
  import chardet
 
19
 
20
  # =========== 功能1: 分子資料導入/轉換 ===========
21
  def robust_read_csv(file):
@@ -48,6 +51,121 @@ def mol_img(smiles, size=(160,160)):
48
  return Image.new("RGB", size, (250,250,250))
49
  return Draw.MolToImage(mol, size=size)
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # =========== 功能2: 分子指紋/描述子生成 ===========
52
  def ecfp4_fp(smiles, nbits=2048):
53
  mol = Chem.MolFromSmiles(smiles)
@@ -150,78 +268,88 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
150
  feat_state = gr.State()
151
  model_state = gr.State()
152
 
153
- # --- 分子資料導入 ---
154
  with gr.Tab("1️⃣ 資料導入/結構圖"):
155
- file = gr.File(label="上傳CSV", file_types=[".csv"])
156
- df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
157
- smiles_input = gr.Textbox(label="分子SMILES預覽")
158
- mol_image = gr.Image(label="分子結構圖")
159
-
160
def on_upload(f):
    """Load the uploaded CSV and return ``(first 10 rows, full table)``.

    The first element feeds the preview grid; the second is kept as state
    for the later steps.
    """
    table = load_csv(f)
    preview = table.head(10)
    return preview, table
163
-
164
- file.upload(on_upload, file, [df_preview, data_state])
165
- smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
166
-
167
- # --- 分子特徵生成 ---
168
- with gr.Tab("2️⃣ 特徵計算/描述子"):
169
- feat_btn = gr.Button("生成特徵/描述子")
170
- feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
171
-
172
def on_feat(state_df):
    """Compute features for the stored table.

    Returns ``(first 5 rows of the feature table, full feature table)``, or
    ``(empty DataFrame, None)`` when no table has been loaded yet.
    """
    if state_df is not None:
        # Work on a copy so the stored input table is left untouched.
        features = add_fps_and_desc(state_df.copy())
        return features.head(5), features
    return pd.DataFrame(), None
177
-
178
- feat_btn.click(on_feat, data_state, [feat_preview, feat_state])
179
-
180
- # --- 資料探索 ---
181
- with gr.Tab("3️⃣ 資料集分析 (EDA)"):
182
- desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子", value="MolWt")
183
- eda_btn = gr.Button("生成描述子分布圖")
184
- eda_plot = gr.Image(label="分布圖")
185
- eda_btn.click(lambda d, feat_df: plot_desc_dist(feat_df, d) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)),
186
- [desc_type, feat_state], eda_plot)
187
-
188
- # ---群與PCA ---
189
- with gr.Tab("4️⃣ 群/降維可視化"):
190
- pca_btn = gr.Button("PCA 分布圖")
191
- pca_plot = gr.Image(label="PCA分佈")
192
- nclus = gr.Slider(2, 8, 3, 1, label="分群數")
193
- km_btn = gr.Button("KMeans 分群圖")
194
- km_plot = gr.Image(label="KMeans分群")
195
- pca_btn.click(lambda feat_df: pca_2d(feat_df) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)), feat_state, pca_plot)
196
- km_btn.click(lambda n, feat_df: kmeans_clusters(feat_df, n) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)),
197
- [nclus, feat_state], km_plot)
198
-
199
- # --- 建模/預測 ---
200
- with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
201
- train_btn = gr.Button("訓練模型 (RF, 5-fold)")
202
- model_status = gr.Markdown("模型狀態")
203
- smiles_pred = gr.Textbox(label="預測SMILES")
204
- y_pred = gr.Textbox(label="預測值/類別")
205
-
206
def handle_train(feat_df):
    """Train a model on the feature table.

    Returns ``(status message, model)``; the model is ``None`` and the
    message asks for feature generation when no feature table exists yet.
    """
    if feat_df is not None:
        fitted, cv_scores = train_model(feat_df)
        rounded = np.round(cv_scores, 3)
        return f"模型交叉驗證: {rounded}", fitted
    return "請先進行特徵生成", None
211
-
212
- train_btn.click(handle_train, feat_state, [model_status, model_state])
213
-
214
def handle_predict(s, model):
    """Predict a value/class for one SMILES string with the trained model.

    Returns ``(status message, prediction text)``. The prediction text is
    empty when no model is available or when prediction fails; in the
    failure case the status message carries the exception text.
    """
    if model is not None:
        try:
            outcome = str(predict_single(model, s))
        except Exception as err:
            return f"預測失敗: {err}", ""
        return "已預測", outcome
    return "請先訓練模型", ""
222
-
223
- smiles_pred.change(handle_predict, [smiles_pred, model_state], [model_status, y_pred])
224
-
225
- gr.Markdown("---\n> 建議完整流程:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
 
 
 
 
 
 
 
 
 
 
226
 
227
  demo.launch(share=True)
 
7
  import numpy as np
8
  from rdkit import Chem
9
  from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
10
+ from rdkit.Chem import PandasTools
11
+ from rdkit.Chem import rdMolDescriptors
12
  from sklearn.decomposition import PCA
13
+ from sklearn.manifold import TSNE
14
+ from umap import UMAP
15
+ from sklearn.cluster import KMeans, DBSCAN
16
  import matplotlib.pyplot as plt
17
  import seaborn as sns
18
  import io
19
  from PIL import Image
20
  import chardet
21
+ from ydata_profiling import ProfileReport
22
 
23
  # =========== 功能1: 分子資料導入/轉換 ===========
24
  def robust_read_csv(file):
 
51
  return Image.new("RGB", size, (250,250,250))
52
  return Draw.MolToImage(mol, size=size)
53
 
54
+ ### 支援多格式匯入
55
def load_table(file):
    """Load an uploaded molecule table (CSV, Excel or SDF) into a DataFrame.

    Args:
        file: ``None``, a filesystem path (``str`` or ``os.PathLike``), or a
            file-like object exposing ``name`` / ``read`` / ``seek``.

    Returns:
        pandas.DataFrame: the parsed table; an empty frame when ``file`` is
        ``None``. For SDF input a ``smiles`` column is generated so the
        result has the column the rest of this file reads.
    """
    if file is None:
        return pd.DataFrame()

    import os  # local import: only this function needs path handling

    is_path = isinstance(file, (str, os.PathLike))
    name = os.fspath(file) if is_path else getattr(file, "name", "")
    # Lower-case the suffix so "DATA.XLSX" / "mols.SDF" are recognised too.
    suffix = os.path.splitext(str(name))[1].lower()

    if suffix == ".xlsx":
        return pd.read_excel(file, engine="openpyxl")
    if suffix == ".xls":
        # openpyxl cannot read legacy .xls files; let pandas pick the engine.
        return pd.read_excel(file)
    if suffix == ".sdf":
        # Without smilesName LoadSDF returns no SMILES column at all.
        return PandasTools.LoadSDF(str(name), smilesName="smiles")

    # Anything else is treated as CSV; sniff the encoding from the first 4 KiB.
    if is_path:
        with open(file, "rb") as handle:
            raw = handle.read(4096)
        enc = chardet.detect(raw)["encoding"] or "utf-8"
        return pd.read_csv(file, encoding=enc)

    pos = file.tell() if hasattr(file, "tell") else 0
    raw = file.read(4096)
    enc = chardet.detect(raw)["encoding"] or "utf-8"
    file.seek(pos)  # rewind so pandas parses from where the stream started
    return pd.read_csv(file, encoding=enc)
69
+
70
def smiles_to_mol(smiles):
    """Parse a SMILES string into an RDKit molecule.

    Args:
        smiles: The SMILES text. Non-string values (``None``, NaN cells from
            a DataFrame, numbers) are rejected without calling RDKit.

    Returns:
        Whatever ``Chem.MolFromSmiles`` returns for the string, or ``None``
        when the input is not a string or parsing raises.
    """
    if not isinstance(smiles, str):
        return None
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # Best effort: a failed parse is reported as "no molecule". Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return None
75
+
76
+ ### 批量分子圖
77
def batch_mol_imgs(smiles_list):
    """Render the parseable molecules among the first 25 SMILES as one grid.

    Args:
        smiles_list: Iterable of SMILES strings. Only the first 25 entries
            are considered; entries that do not parse are left out.

    Returns:
        PIL.Image.Image: a PNG-backed grid image with 5 molecules per row and
        160x160 cells, or a blank 160x160 placeholder when nothing parses.
    """
    mols = []
    for smi in list(smiles_list)[:25]:
        mol = smiles_to_mol(smi)  # parse once per entry
        if mol is not None:
            mols.append(mol)

    if not mols:
        # Nothing drawable: return a placeholder rather than asking RDKit to
        # lay out an empty grid (behaviour there not verified — TODO confirm).
        return Image.new("RGB", (160, 160), (250, 250, 250))

    grid = Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(160, 160))
    # Round-trip through PNG bytes so the caller gets a PNG-backed PIL image.
    buf = io.BytesIO()
    grid.save(buf, format='PNG')
    buf.seek(0)
    return Image.open(buf)
89
+
90
+ ### 特徵、描述子與官能基計數
91
def calc_features(df, fp_types, desc_types, func_groups, smarts_dict=None):
    """Add fingerprint, descriptor and SMARTS match-count columns to ``df``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``smiles`` column.
        fp_types: Fingerprint names to compute; recognised values are
            ``'ecfp4'``, ``'maccs'`` and ``'rdkitfp'``. ``None`` means none.
        desc_types: Descriptor names looked up on ``rdkit.Chem.Descriptors``.
            ``'LogP'`` is accepted as an alias for ``'MolLogP'``; unknown
            names are skipped. ``None`` means none.
        func_groups: Unused; kept so existing positional calls keep working.
        smarts_dict: Mapping ``name -> SMARTS``; each entry adds a
            ``<name>_count`` column. ``None`` selects the built-in NO2 / OH /
            NH2 patterns; an empty mapping adds no count columns.

    Returns:
        The same ``df`` object with the new columns. Rows whose SMILES is not
        a string or does not parse get an all-zero fingerprint, ``NaN``
        descriptors and zero counts.
    """
    fp_types = fp_types or ()
    desc_types = desc_types or ()

    def _parse(smi):
        # NaN / None cells are not SMILES; treat them like unparseable input.
        return Chem.MolFromSmiles(smi) if isinstance(smi, str) else None

    # Parse every SMILES once and reuse the molecules for all columns.
    mols = df['smiles'].apply(_parse)

    def _fingerprint(make_fp, n_bits):
        return mols.apply(
            lambda m: np.array(make_fp(m)) if m is not None else np.zeros(n_bits)
        )

    if 'ecfp4' in fp_types:
        df['ecfp4'] = _fingerprint(
            lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048), 2048
        )
    if 'maccs' in fp_types:
        df['maccs'] = _fingerprint(lambda m: MACCSkeys.GenMACCSKeys(m), 167)
    if 'rdkitfp' in fp_types:
        # The RDKit topological fingerprint is Chem.RDKFingerprint.
        df['rdkitfp'] = _fingerprint(
            lambda m: Chem.RDKFingerprint(m, maxPath=5, fpSize=2048), 2048
        )

    # Descriptors: the column keeps the requested name even when an alias is
    # used to find the RDKit function.
    aliases = {'LogP': 'MolLogP'}
    for desc in desc_types:
        try:
            desc_fn = getattr(Descriptors, aliases.get(desc, desc), None)
            if desc_fn is None:
                continue
            df[desc] = mols.apply(
                lambda m, fn=desc_fn: fn(m) if m is not None else np.nan
            )
        except Exception:
            # Deliberate best effort: one failing descriptor must not abort
            # the remaining feature columns.
            continue

    # Functional groups / SMARTS match counts.
    if smarts_dict is None:
        smarts_dict = {'NO2': '[N+](=O)[O-]', 'OH': '[OX2H]', 'NH2': '[NX3;H2]'}
    for name, patt in smarts_dict.items():
        patt_obj = Chem.MolFromSmarts(patt)  # None when the SMARTS is invalid
        df[name+'_count'] = mols.apply(
            lambda m, query=patt_obj: len(m.GetSubstructMatches(query))
            if m is not None and query is not None else 0
        )
    return df
115
+
116
+
117
+ ### 降維/分群/群代表分子
118
def apply_dim_red(df, use, method='PCA'):
    """Project the per-row vectors stored in column ``use`` to two dimensions.

    Args:
        df: DataFrame whose ``use`` column holds one vector per row.
        use: Name of the column to stack into the input matrix.
        method: ``'PCA'``, ``'UMAP'`` or ``'tSNE'``.

    Returns:
        The output of the chosen reducer's ``fit_transform`` with two
        components requested.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    matrix = np.stack(df[use].to_numpy())

    if method == 'PCA':
        reducer = PCA(n_components=2)
    elif method == 'UMAP':
        reducer = UMAP(n_components=2, random_state=42)
    elif method == 'tSNE':
        reducer = TSNE(n_components=2, random_state=42)
    else:
        raise ValueError('Unknown method')

    return reducer.fit_transform(matrix)
129
+
130
def plot_scatter(pc, labels, title):
    """Draw a 2-D scatter of ``pc`` coloured by ``labels`` and return it as an image.

    Args:
        pc: Array indexed as ``pc[:, 0]`` / ``pc[:, 1]`` for the two axes.
        labels: Per-point values used for colouring (``tab10`` colormap).
        title: Plot title.

    Returns:
        PIL.Image.Image opened from the rendered PNG bytes.
    """
    figure, axes = plt.subplots(figsize=(5, 4))
    points = axes.scatter(pc[:, 0], pc[:, 1], c=labels, cmap='tab10', alpha=0.7)
    axes.set_xlabel("Dim1")
    axes.set_ylabel("Dim2")
    axes.set_title(title)
    figure.colorbar(points, ax=axes)
    figure.tight_layout()

    png = io.BytesIO()
    figure.savefig(png, format='png')
    plt.close(figure)  # release the figure so repeated calls do not pile up
    png.seek(0)
    return Image.open(png)
141
+
142
def clustering(df, use, method='KMeans', n_clusters=3):
    """Cluster the per-row vectors stored in column ``use``.

    Args:
        df: DataFrame whose ``use`` column holds one vector per row.
        use: Name of the column to stack into the input matrix.
        method: ``'KMeans'`` or ``'DBSCAN'``.
        n_clusters: Number of clusters; only used by KMeans.

    Returns:
        The label array produced by the estimator's ``fit_predict``.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    features = np.stack(df[use].to_numpy())

    if method not in ('KMeans', 'DBSCAN'):
        raise ValueError('Unknown clustering')

    if method == 'KMeans':
        estimator = KMeans(n_clusters=n_clusters, random_state=42)
    else:
        estimator = DBSCAN(eps=3, min_samples=2)
    return estimator.fit_predict(features)
151
+
152
def cluster_reps(df, cluster_labels, use):
    """Pick one randomly chosen SMILES per cluster label.

    Args:
        df: DataFrame with a ``smiles`` column, row-aligned with
            ``cluster_labels``.
        cluster_labels: Array of one label per row of ``df``.
        use: Not used by this function.

    Returns:
        list: one ``smiles`` value per distinct label, in the order given by
        ``np.unique(cluster_labels)``.
    """
    def _pick(label):
        members = df[cluster_labels == label]
        row_id = np.random.choice(members.index, 1)[0]
        return members.loc[row_id]['smiles']

    return [_pick(label) for label in np.unique(cluster_labels)]
159
+
160
+ ### EDA報表
161
def eda_report(df):
    """Write a minimal profiling report for ``df`` to a temporary HTML file.

    The report is written to disk because ``ProfileReport.to_file`` takes an
    output path, and the result of this function is wired to a file-download
    output. The temporary file is not deleted here.

    Args:
        df: DataFrame to profile.

    Returns:
        str: path of the generated ``.html`` report.
    """
    import os
    import tempfile

    profile = ProfileReport(df, title="EDA報告", minimal=True)
    fd, path = tempfile.mkstemp(prefix="eda_report_", suffix=".html")
    os.close(fd)  # only the path is needed; the report writer opens it itself
    profile.to_file(path)
    return path
167
+
168
+
169
  # =========== 功能2: 分子指紋/描述子生成 ===========
170
  def ecfp4_fp(smiles, nbits=2048):
171
  mol = Chem.MolFromSmiles(smiles)
 
268
  feat_state = gr.State()
269
  model_state = gr.State()
270
 
271
+ ## 1. 資料導入與批次結構圖
272
  with gr.Tab("1️⃣ 資料導入/結構圖"):
273
+ up = gr.File(label="上傳分子檔 (csv/xlsx/sdf)", file_types=[".csv", ".xlsx", ".sdf"])
274
+ df_view = gr.Dataframe(label="資料預覽 (前15筆)")
275
+ mol_grid = gr.Image(label="分子結構圖(前25筆)")
276
+ up.upload(lambda f: load_table(f).head(15) if f else pd.DataFrame(), up, df_view)
277
+ up.upload(lambda f: batch_mol_imgs(load_table(f)['smiles'].values[:25]) if f else None, up, mol_grid)
278
+
279
+ ## 2. 特徵與官能基
280
+ with gr.Tab("2️⃣ 特徵/描述子/官能基計算"):
281
+ fp_types = gr.CheckboxGroup(['ecfp4','maccs','rdkitfp'], label="指紋選擇", value=["ecfp4"])
282
+ desc_types = gr.CheckboxGroup(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="描述子")
283
+ func_smart = gr.Textbox(label="官能基SMARTS, 逗號分隔 (如 [N+](=O)[O-], [OX2H], [NX3;H2] )")
284
+ file2 = gr.File(label="再次選擇分子檔")
285
+ feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前10筆)")
286
+
287
def calc_all_feats(file, fp, desc, smartbox):
    """Load the uploaded table, compute the selected features, return a preview.

    Args:
        file: Uploaded file as accepted by ``load_table``.
        fp: Selected fingerprint names.
        desc: Selected descriptor names.
        smartbox: Comma-separated SMARTS patterns typed by the user. Commas
            inside ``[...]`` or ``(...)`` belong to the pattern and do not
            split it. The i-th pattern yields a ``custom_<i>_count`` column.

    Returns:
        pandas.DataFrame: the first 10 rows of the feature table. When there
        is no table or it has no ``smiles`` column, the first 10 rows of
        whatever was loaded are returned unchanged.
    """
    def _split_smarts(text):
        # Split on top-level commas only, so "[N,O]" stays one pattern.
        parts, current, depth = [], [], 0
        for ch in text:
            if ch in "[(":
                depth += 1
            elif ch in "])" and depth:
                depth -= 1
            if ch == "," and depth == 0:
                parts.append("".join(current))
                current = []
            else:
                current.append(ch)
        parts.append("".join(current))
        return [p.strip() for p in parts if p.strip()]

    df = load_table(file)
    if df.empty or 'smiles' not in df:
        return df.head(10)

    smarts_dict = {}
    if smartbox:
        for idx, smt in enumerate(_split_smarts(smartbox)):
            smarts_dict[f"custom_{idx}"] = smt

    # smarts_dict must be passed by keyword: the fourth positional parameter
    # of calc_features is func_groups, not the SMARTS mapping.
    df = calc_features(df, fp or [], desc or [], None,
                       smarts_dict=smarts_dict if smarts_dict else None)
    return df.head(10)
297
+ file2.upload(lambda f: load_table(f).head(10) if f else pd.DataFrame(), file2, feat_preview)
298
+ gr.Button("特徵計算", variant="primary").click(
299
+ calc_all_feats, [file2, fp_types, desc_types, func_smart], feat_preview
300
+ )
301
+
302
+ ## 3. 資料探索/EDA
303
+ with gr.Tab("3️⃣ EDA分析/自動報表"):
304
+ file3 = gr.File(label="選擇分子檔")
305
+ col_sel = gr.Dropdown(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="描述子欄位")
306
+ eda_img = gr.Image(label="布圖")
307
+ eda_btn = gr.Button("產生描述子")
308
+ eda_sum = gr.File(label="下載EDA報表")
309
+
310
def eda_plot(file, col):
    """Plot the distribution of one column of the uploaded table.

    Args:
        file: Uploaded file as accepted by ``load_table``.
        col: Name of the column to plot.

    Returns:
        PIL.Image.Image with a 30-bin histogram plus KDE of the non-null
        values, or ``None`` when the table has no such column.
    """
    table = load_table(file)
    if col in table:
        figure, axes = plt.subplots(figsize=(5, 3))
        sns.histplot(table[col].dropna(), ax=axes, bins=30, kde=True)
        figure.tight_layout()
        png = io.BytesIO()
        figure.savefig(png, format='png')
        png.seek(0)
        plt.close(figure)
        return Image.open(png)
    return None
321
+ eda_btn.click(eda_plot, [file3, col_sel], eda_img)
322
+ gr.Button("生成EDA報表", variant="primary").click(
323
+ lambda f: eda_report(load_table(f)) if f else None, file3, eda_sum
324
+ )
325
+
326
+ ## 4. 降維/分群/群代表分子圖
327
+ with gr.Tab("4️⃣ 降維/分群/結構探索"):
328
+ file4 = gr.File(label="分子檔")
329
+ use_fp = gr.Dropdown(['ecfp4','maccs','rdkitfp'], label="降維用指紋")
330
+ dr_method = gr.Radio(['PCA','UMAP','tSNE'], label="降維方法", value="PCA")
331
+ cl_method = gr.Radio(['KMeans','DBSCAN'], label="分群方法", value="KMeans")
332
+ nclus = gr.Slider(2, 8, 3, 1, label="KMeans分群數")
333
+ dr_img = gr.Image(label="降維視覺化")
334
+ rep_imgs = gr.Image(label="群代表分子(自動選取,每群1個)")
335
+
336
def dimred_and_cluster(file, fp, dr, cl, nclu):
    """Project the molecules to 2-D, cluster the projection and pick examples.

    Args:
        file: Uploaded file as accepted by ``load_table``.
        fp: Name of the fingerprint used as input for the projection.
        dr: Dimensionality-reduction method name passed to ``apply_dim_red``.
        cl: ``'KMeans'`` for KMeans; any other value selects DBSCAN.
        nclu: Requested number of KMeans clusters.

    Returns:
        tuple: ``(scatter image, grid image of one molecule per cluster)``,
        or ``(None, None)`` when no fingerprint is selected or the table is
        empty / has no ``smiles`` column.
    """
    df = load_table(file)
    # The fingerprint selector has no default, and the file may be missing.
    if not fp or df.empty or 'smiles' not in df:
        return None, None

    # Only the fingerprint column is needed here; an empty SMARTS mapping
    # skips the functional-group counts.
    df = calc_features(df, [fp], [], None, smarts_dict={})
    pc = apply_dim_red(df, fp, dr)

    # Clustering runs on the 2-D projection, not on the raw fingerprint.
    if cl == 'KMeans':
        # KMeans cannot form more clusters than there are samples.
        k = max(1, min(int(nclu), len(df)))
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(pc)
    else:
        labels = DBSCAN(eps=3, min_samples=2).fit_predict(pc)

    plotimg = plot_scatter(pc, labels, f"{dr}-{cl}")
    # One representative molecule per cluster label.
    reps = cluster_reps(df, labels, fp)
    rep_img = batch_mol_imgs(reps)
    return plotimg, rep_img
349
+ gr.Button("降維+分群分析", variant="primary").click(
350
+ dimred_and_cluster, [file4, use_fp, dr_method, cl_method, nclus], [dr_img, rep_imgs]
351
+ )
352
+
353
+ gr.Markdown("---\n> 完整工作流:1️⃣資料導入 → 2️⃣特徵/描述子/官能基 → 3️⃣EDA分析 → 4️⃣降維/分群/結構探索")
354
 
355
  demo.launch(share=True)