Kung-Hsun committed on
Commit
235e68e
·
verified ·
1 Parent(s): 1d19db7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -247
app.py CHANGED
@@ -20,80 +20,30 @@ from PIL import Image
20
  import chardet
21
  from ydata_profiling import ProfileReport
22
 
23
- # =========== 功能1: 分子資料導入/轉換 ===========
24
- def robust_read_csv(file):
25
- if file is None:
26
- return pd.DataFrame()
27
- if hasattr(file, "read"):
28
- pos = file.tell() if hasattr(file, "tell") else 0
29
- raw = file.read(4096)
30
- enc = chardet.detect(raw)["encoding"] or "utf-8"
31
- file.seek(pos)
32
- return pd.read_csv(file, encoding=enc)
33
- elif hasattr(file, "name"):
34
- with open(file.name, "rb") as f:
35
- raw = f.read(4096)
36
- enc = chardet.detect(raw)["encoding"] or "utf-8"
37
- return pd.read_csv(file.name, encoding=enc)
38
- else:
39
- raise RuntimeError("未知 file 類型")
40
-
41
- def load_csv(file):
42
- df = robust_read_csv(file)
43
- if not {'smiles','label'}.issubset(df.columns):
44
- raise ValueError("CSV需包含'smiles','label'欄位")
45
- df['smiles'] = df['smiles'].astype(str)
46
- return df
47
-
48
- def mol_img(smiles, size=(160,160)):
49
- mol = Chem.MolFromSmiles(smiles)
50
- if mol is None:
51
- return Image.new("RGB", size, (250,250,250))
52
- return Draw.MolToImage(mol, size=size)
53
-
54
- ### 支援多格式匯入
55
  def load_table(file):
56
- # 允許 file 為 None, gradio.NamedString, gradio.TempFile, file-like, 或字串路徑
57
  if file is None:
58
  return pd.DataFrame()
59
- # 若是字串(Gradio新版直接給路徑字串)
60
- if isinstance(file, str):
61
- # 根據副檔名選擇讀取方式
62
- if file.endswith('.csv'):
63
- return pd.read_csv(file)
64
- elif file.endswith(('.xls', '.xlsx')):
65
- return pd.read_excel(file, engine="openpyxl")
66
- elif file.endswith('.sdf'):
67
- return PandasTools.LoadSDF(file)
68
- else:
69
- raise RuntimeError(f"不支援的檔案格式: {file}")
70
- # 若是有 .name 屬性(TempFile, NamedString)
71
- elif hasattr(file, "name"):
72
- fname = file.name
73
  if fname.endswith('.csv'):
74
- return pd.read_csv(fname)
75
- elif fname.endswith(('.xls', '.xlsx')):
76
- return pd.read_excel(fname, engine="openpyxl")
 
 
 
77
  elif fname.endswith('.sdf'):
78
  return PandasTools.LoadSDF(fname)
79
  else:
80
  raise RuntimeError(f"不支援的檔案格式: {fname}")
81
- # 若是有 read 方法的 file-like(極少見)
82
- elif hasattr(file, "read"):
83
- return pd.read_csv(file)
84
- else:
85
- raise RuntimeError("未知檔案型態")
86
 
87
- def smiles_to_mol(smiles):
88
- try:
89
- return Chem.MolFromSmiles(smiles)
90
- except:
91
- return None
92
-
93
- ### 批量分子圖
94
  def batch_mol_imgs(smiles_list):
95
  mols = [Chem.MolFromSmiles(s) for s in smiles_list[:25] if Chem.MolFromSmiles(s)]
96
- if len(mols)==0:
97
  return Image.new("RGB", (800, 160), (255,255,255))
98
  grid = Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(160,160))
99
  buf = io.BytesIO()
@@ -101,117 +51,39 @@ def batch_mol_imgs(smiles_list):
101
  buf.seek(0)
102
  return Image.open(buf)
103
 
104
- ### 特徵、描述子官能基計數
105
- def calc_features(df, fp_types, desc_types, func_groups, smarts_dict=None):
106
- # ECFP4, MACCS, RDKitFP
107
  if 'ecfp4' in fp_types:
108
  df['ecfp4'] = df['smiles'].apply(lambda s: np.array(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2, nBits=2048)) if Chem.MolFromSmiles(s) else np.zeros(2048))
109
  if 'maccs' in fp_types:
110
  df['maccs'] = df['smiles'].apply(lambda s: np.array(MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(s))) if Chem.MolFromSmiles(s) else np.zeros(167))
111
  if 'rdkitfp' in fp_types:
112
  df['rdkitfp'] = df['smiles'].apply(lambda s: np.array(rdMolDescriptors.GetRDKitFingerprintAsBitVect(Chem.MolFromSmiles(s), maxPath=5)) if Chem.MolFromSmiles(s) else np.zeros(2048))
113
-
114
- # 部分描述子
115
  for desc in desc_types:
116
  try:
117
  if hasattr(Descriptors, desc):
118
  df[desc] = df['smiles'].apply(lambda s: getattr(Descriptors, desc)(Chem.MolFromSmiles(s)) if Chem.MolFromSmiles(s) else np.nan)
119
- except Exception: continue
120
-
121
- # 官能基/SMARTS
122
- if smarts_dict is None:
123
- smarts_dict = {'NO2': '[N+](=O)[O-]', 'OH': '[OX2H]', 'NH2': '[NX3;H2]'}
124
- for name, patt in smarts_dict.items():
125
- patt_obj = Chem.MolFromSmarts(patt)
126
- df[name+'_count'] = df['smiles'].apply(lambda s: Chem.MolFromSmiles(s).GetSubstructMatches(patt_obj) if Chem.MolFromSmiles(s) and patt_obj else [])
127
- df[name+'_count'] = df[name+'_count'].apply(lambda l: len(l) if isinstance(l, (list, tuple)) else 0)
128
  return df
129
 
130
-
131
- ### 降維/分群/群代表分子
132
- def apply_dim_red(df, use, method='PCA'):
133
- X = np.stack(df[use].to_numpy())
134
- if method == 'PCA':
135
- pc = PCA(n_components=2).fit_transform(X)
136
- elif method == 'UMAP':
137
- pc = UMAP(n_components=2, random_state=42).fit_transform(X)
138
- elif method == 'tSNE':
139
- pc = TSNE(n_components=2, random_state=42).fit_transform(X)
140
- else:
141
- raise ValueError('Unknown method')
142
- return pc
143
-
144
- def plot_scatter(pc, labels, title):
145
- fig, ax = plt.subplots(figsize=(5,4))
146
- scatter = ax.scatter(pc[:,0], pc[:,1], c=labels, cmap='tab10', alpha=0.7)
147
- plt.xlabel("Dim1"); plt.ylabel("Dim2"); plt.title(title)
148
- plt.colorbar(scatter)
149
- buf = io.BytesIO()
150
- plt.tight_layout()
151
- plt.savefig(buf, format='png')
152
- buf.seek(0)
153
- plt.close(fig)
154
- return Image.open(buf)
155
-
156
- def clustering(df, use, method='KMeans', n_clusters=3):
157
- X = np.stack(df[use].to_numpy())
158
- if method == 'KMeans':
159
- labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(X)
160
- elif method == 'DBSCAN':
161
- labels = DBSCAN(eps=3, min_samples=2).fit_predict(X)
162
- else:
163
- raise ValueError('Unknown clustering')
164
- return labels
165
-
166
- def cluster_reps(df, cluster_labels, use):
167
- reps = []
168
- for cl in np.unique(cluster_labels):
169
- cluster_df = df[cluster_labels==cl]
170
- idx = np.random.choice(cluster_df.index, 1)[0]
171
- reps.append(cluster_df.loc[idx]['smiles'])
172
- return reps
173
-
174
- ### EDA報表
175
  def eda_report(df):
176
  profile = ProfileReport(df, title="EDA報告", minimal=True)
177
- buf = io.BytesIO()
178
- profile.to_file(buf)
179
- buf.seek(0)
180
- return buf
181
-
182
-
183
- # =========== 功能2: 分子指紋/描述子生成 ===========
184
- def ecfp4_fp(smiles, nbits=2048):
185
- mol = Chem.MolFromSmiles(smiles)
186
- return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)) if mol else np.zeros(nbits)
187
-
188
- def maccs_fp(smiles):
189
- mol = Chem.MolFromSmiles(smiles)
190
- return np.array(MACCSkeys.GenMACCSKeys(mol)) if mol else np.zeros(167)
191
 
192
- def calc_rdkit_desc(smiles):
193
- mol = Chem.MolFromSmiles(smiles)
194
- if mol is None: return {}
195
- return {n: f(mol) for n, f in Descriptors.descList}
196
-
197
- def add_fps_and_desc(df):
198
- if 'ecfp4' not in df.columns:
199
- df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
200
- if 'maccs' not in df.columns:
201
- df['maccs'] = df['smiles'].apply(maccs_fp)
202
- if 'MolWt' not in df.columns:
203
- df['MolWt'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('MolWt', np.nan))
204
- if 'TPSA' not in df.columns:
205
- df['TPSA'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('TPSA', np.nan))
206
- return df
207
-
208
- # =========== 功能3: 資料集探索分析 (EDA) ===========
209
  def plot_desc_dist(df, desc='MolWt'):
210
  if df is None or desc not in df.columns:
211
  return Image.new("RGB", (400,200), (255,255,255))
212
  fig, ax = plt.subplots(figsize=(5,3))
213
  sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
214
- ax.set_title(f"{desc} Distribution")
215
  buf = io.BytesIO()
216
  plt.tight_layout()
217
  plt.savefig(buf, format='png')
@@ -219,35 +91,23 @@ def plot_desc_dist(df, desc='MolWt'):
219
  plt.close(fig)
220
  return Image.open(buf)
221
 
222
- # =========== 功能4: 分群/降維可視化 ===========
223
- def pca_2d(df, use='ecfp4'):
224
- if df is None or use not in df.columns:
225
- return Image.new("RGB", (400,200), (255,255,255))
226
  X = np.stack(df[use].to_numpy())
227
- pca = PCA(n_components=2)
228
- pc = pca.fit_transform(X)
229
- fig, ax = plt.subplots(figsize=(5,4))
230
- scatter = ax.scatter(pc[:,0], pc[:,1], c=df['label'], cmap='Set1', alpha=0.7)
231
- plt.xlabel("PC1"); plt.ylabel("PC2"); plt.title(f"PCA 2D ({use})")
232
- plt.colorbar(scatter)
233
- buf = io.BytesIO()
234
- plt.tight_layout()
235
- plt.savefig(buf, format='png')
236
- buf.seek(0)
237
- plt.close(fig)
238
- return Image.open(buf)
239
 
240
- def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
241
- if df is None or use not in df.columns:
242
- return Image.new("RGB", (400,200), (255,255,255))
243
- X = np.stack(df[use].to_numpy())
244
- km = KMeans(n_clusters=n_clusters, random_state=42)
245
- labels = km.fit_predict(X)
246
- pca = PCA(n_components=2)
247
- pc = pca.fit_transform(X)
248
  fig, ax = plt.subplots(figsize=(5,4))
249
  scatter = ax.scatter(pc[:,0], pc[:,1], c=labels, cmap='tab10', alpha=0.7)
250
- plt.xlabel("PC1"); plt.ylabel("PC2"); plt.title(f"KMeans Clusters ({n_clusters})")
251
  plt.colorbar(scatter)
252
  buf = io.BytesIO()
253
  plt.tight_layout()
@@ -256,33 +116,21 @@ def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
256
  plt.close(fig)
257
  return Image.open(buf)
258
 
259
- # =========== 功能5: 機器學習建模與預測 ===========
260
- def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
261
- X = np.stack(df[fp_type].to_numpy())
262
- y = df['label'].values
263
- if task == 'auto':
264
- task = 'regression' if np.issubdtype(y.dtype, np.floating) else 'classification'
265
- if model_type == 'rf':
266
- model = RandomForestRegressor(n_estimators=100) if task == 'regression' else RandomForestClassifier(n_estimators=100)
267
- scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error' if task=='regression' else 'accuracy')
268
- model.fit(X, y)
269
- return model, scores
270
-
271
- def predict_single(model, smiles, fp_type='ecfp4'):
272
- fp = ecfp4_fp(smiles) if fp_type=='ecfp4' else maccs_fp(smiles)
273
- y_pred = model.predict([fp])[0]
274
- return y_pred
275
 
276
- # =========== Gradio主UI ===========
277
  with gr.Blocks(title="Cheminformatics Platform") as demo:
278
- gr.Markdown("# 🧪 Cheminformatics 多功能分析平台")
279
-
280
- # 全域狀態:原始資料、特徵後資料、模型
281
- data_state = gr.State()
282
- feat_state = gr.State()
283
- model_state = gr.State()
284
 
285
- ## 1. 資料導入與批次結構圖
286
  with gr.Tab("1️⃣ 資料導入/結構圖"):
287
  up = gr.File(label="上傳分子檔 (csv/xlsx/sdf)", file_types=[".csv", ".xlsx", ".sdf"])
288
  df_view = gr.Dataframe(label="資料預覽 (前15筆)")
@@ -290,73 +138,51 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
290
  up.upload(lambda f: load_table(f).head(15) if f else pd.DataFrame(), up, df_view)
291
  up.upload(lambda f: batch_mol_imgs(load_table(f)['smiles'].values[:25]) if f else None, up, mol_grid)
292
 
293
- ## 2. 特徵官能基
294
  with gr.Tab("2️⃣ 特徵/描述子/官能基計算"):
295
- fp_types = gr.CheckboxGroup(['ecfp4','maccs','rdkitfp'], label="指紋選擇", value=["ecfp4"])
 
296
  desc_types = gr.CheckboxGroup(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="描述子")
297
- func_smart = gr.Textbox(label="官能基SMARTS, 逗號分隔 (如 [N+](=O)[O-], [OX2H], [NX3;H2] )")
298
- file2 = gr.File(label="再次選擇分")
299
- feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前10筆)")
300
-
301
  def calc_all_feats(file, fp, desc, smartbox):
302
  df = load_table(file)
303
- # smartbox 格式處理
304
- smarts_dict = {}
305
- if smartbox:
306
- items = [i.strip() for i in smartbox.split(",") if i.strip()]
307
- for idx, smt in enumerate(items):
308
- smarts_dict[f"custom_{idx}"] = smt
309
- df = calc_features(df, fp, desc, smarts_dict if smarts_dict else None)
310
  return df.head(10)
311
- file2.upload(lambda f: load_table(f).head(10) if f else pd.DataFrame(), file2, feat_preview)
312
- gr.Button("特徵計算", variant="primary").click(
313
  calc_all_feats, [file2, fp_types, desc_types, func_smart], feat_preview
314
  )
315
 
316
- ## 3. 資料探索/EDA
317
  with gr.Tab("3️⃣ EDA分析/自動報表"):
318
- file3 = gr.File(label="選擇分子檔")
319
- col_sel = gr.Dropdown(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="描述子欄位")
320
- eda_img = gr.Image(label="分布圖")
321
- eda_btn = gr.Button("產生描述子分布")
 
 
 
 
322
  eda_sum = gr.File(label="下載EDA報表")
323
-
324
- def eda_plot(file, col):
325
- df = load_table(file)
326
- if col not in df: return None
327
- fig, ax = plt.subplots(figsize=(5,3))
328
- sns.histplot(df[col].dropna(), ax=ax, bins=30, kde=True)
329
- buf = io.BytesIO()
330
- plt.tight_layout()
331
- plt.savefig(buf, format='png')
332
- buf.seek(0)
333
- plt.close(fig)
334
- return Image.open(buf)
335
- eda_btn.click(eda_plot, [file3, col_sel], eda_img)
336
  gr.Button("生成EDA報表", variant="primary").click(
337
  lambda f: eda_report(load_table(f)) if f else None, file3, eda_sum
338
  )
339
 
340
- ## 4. 降維/分群/群代表分子圖
341
  with gr.Tab("4️⃣ 降維/分群/結構探索"):
342
  file4 = gr.File(label="分子檔")
343
- use_fp = gr.Dropdown(['ecfp4','maccs','rdkitfp'], label="降維指紋")
344
  dr_method = gr.Radio(['PCA','UMAP','tSNE'], label="降維方法", value="PCA")
345
  cl_method = gr.Radio(['KMeans','DBSCAN'], label="分群方法", value="KMeans")
346
  nclus = gr.Slider(2, 8, 3, 1, label="KMeans分群數")
347
- dr_img = gr.Image(label="降維視覺化")
348
- rep_imgs = gr.Image(label="群代表分子(自動選取,每群1個)")
349
-
350
  def dimred_and_cluster(file, fp, dr, cl, nclu):
351
  df = load_table(file)
352
- df = calc_features(df, [fp], [], {})
353
  pc = apply_dim_red(df, fp, dr)
354
- if cl == 'KMeans':
355
- labels = KMeans(n_clusters=int(nclu), random_state=42).fit_predict(pc)
356
- else:
357
- labels = DBSCAN(eps=3, min_samples=2).fit_predict(pc)
358
  plotimg = plot_scatter(pc, labels, f"{dr}-{cl}")
359
- # 每群代表分子
360
  reps = cluster_reps(df, labels, fp)
361
  rep_img = batch_mol_imgs(reps)
362
  return plotimg, rep_img
 
20
  import chardet
21
  from ydata_profiling import ProfileReport
22
 
23
# =========== Robust multi-format table loading ===========
def load_table(file):
    """Read an uploaded molecule table (CSV, Excel or SDF) into a DataFrame.

    Args:
        file: ``None``, a filesystem path (``str`` or ``os.PathLike``), or any
            object that exposes its path through a ``name`` attribute.

    Returns:
        An empty ``DataFrame`` when ``file`` is ``None``; otherwise the table
        parsed by the reader matching the file extension.

    Raises:
        RuntimeError: if no string path can be derived from ``file`` or the
            extension is not one of ``.csv``, ``.xlsx``, ``.xls``, ``.sdf``.
    """
    import os  # local import so this block does not depend on file-level imports

    if file is None:
        return pd.DataFrame()

    # Resolve real paths first: pathlib.Path also has a ``name`` attribute,
    # but it holds only the basename and would point at the wrong file.
    if isinstance(file, (str, os.PathLike)):
        fname = os.fspath(file)
    else:
        fname = getattr(file, "name", None)
    if not isinstance(fname, str):
        raise RuntimeError("不支援的 file 類型")

    # Match extensions case-insensitively so "DATA.CSV" / "mols.SDF" load too.
    lowered = fname.lower()
    if lowered.endswith('.csv'):
        # Guess the encoding from the first 4 KB of raw bytes.
        with open(fname, 'rb') as handle:
            sample = handle.read(4096)
        enc = chardet.detect(sample)['encoding'] or 'utf-8'
        # A pure-ASCII sample says nothing about the rest of the file; UTF-8
        # decodes ASCII identically and survives later non-ASCII bytes.
        if enc.lower() == 'ascii':
            enc = 'utf-8'
        return pd.read_csv(fname, encoding=enc, engine='python')
    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(fname)
    if lowered.endswith('.sdf'):
        return PandasTools.LoadSDF(fname)
    raise RuntimeError(f"不支援的檔案格式: {fname}")
 
 
 
 
42
 
43
+ # =========== 批量分子圖 (前25) ===========
 
 
 
 
 
 
44
  def batch_mol_imgs(smiles_list):
45
  mols = [Chem.MolFromSmiles(s) for s in smiles_list[:25] if Chem.MolFromSmiles(s)]
46
+ if not mols:
47
  return Image.new("RGB", (800, 160), (255,255,255))
48
  grid = Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(160,160))
49
  buf = io.BytesIO()
 
51
  buf.seek(0)
52
  return Image.open(buf)
53
 
54
# =========== Fingerprints / descriptors / functional groups ===========
def calc_features(df, fp_types, desc_types, smartbox):
    """Add fingerprint, descriptor and functional-group columns to ``df``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``smiles`` column.
        fp_types: iterable of fingerprint names to compute; recognised values
            are ``'ecfp4'``, ``'maccs'`` and ``'rdkitfp'``. ``None`` means none.
        desc_types: iterable of attribute names looked up on
            ``rdkit.Chem.Descriptors``; unknown names are skipped. ``'LogP'``
            is accepted as an alias for ``MolLogP``. ``None`` means none.
        smartbox: comma-separated SMARTS patterns, or a falsy value for none.
            Pattern ``k`` (1-based) yields an ``FG{k}_count`` column.
            NOTE(review): a SMARTS that itself contains a comma is split
            apart by this format.

    Returns:
        The same ``df`` object with the new columns added. Rows whose SMILES
        cannot be parsed get a zero vector for fingerprints, ``NaN`` for
        descriptors and ``0`` for functional-group counts.
    """
    fp_types = fp_types or []
    desc_types = desc_types or []

    # Each distinct SMILES is parsed once and reused by every feature below.
    mol_cache = {}

    def to_mol(smiles):
        # Non-string cells (e.g. NaN from an empty CSV field) are unparsable.
        if not isinstance(smiles, str):
            return None
        if smiles not in mol_cache:
            mol_cache[smiles] = Chem.MolFromSmiles(smiles)
        return mol_cache[smiles]

    def per_mol(compute, fallback):
        """Wrap ``compute(mol)`` so unparsable SMILES yield ``fallback()``."""
        def run(smiles):
            mol = to_mol(smiles)
            return compute(mol) if mol is not None else fallback()
        return run

    if 'ecfp4' in fp_types:
        df['ecfp4'] = df['smiles'].apply(per_mol(
            lambda mol: np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)),
            lambda: np.zeros(2048),
        ))
    if 'maccs' in fp_types:
        df['maccs'] = df['smiles'].apply(per_mol(
            lambda mol: np.array(MACCSkeys.GenMACCSKeys(mol)),
            lambda: np.zeros(167),
        ))
    if 'rdkitfp' in fp_types:
        df['rdkitfp'] = df['smiles'].apply(per_mol(
            lambda mol: np.array(rdMolDescriptors.GetRDKitFingerprintAsBitVect(mol, maxPath=5)),
            lambda: np.zeros(2048),
        ))

    # The UI offers 'LogP', but the RDKit function is named MolLogP; without
    # the alias the option would be silently ignored.
    aliases = {'LogP': 'MolLogP'}
    for desc in desc_types:
        func = getattr(Descriptors, aliases.get(desc, desc), None)
        if func is None:
            continue
        try:
            df[desc] = df['smiles'].apply(per_mol(func, lambda: np.nan))
        except Exception:
            # Best effort: one failing descriptor must not abort the others.
            continue

    # SMARTS functional-group counts
    if smartbox:
        patterns = [p.strip() for p in smartbox.split(",") if p.strip()]
        for idx, smt in enumerate(patterns, start=1):
            column = f"FG{idx}_count"
            patt = Chem.MolFromSmarts(smt)
            if patt is None:
                # Invalid SMARTS: report zero matches for every row.
                df[column] = 0
                continue
            df[column] = df['smiles'].apply(per_mol(
                lambda mol, p=patt: len(mol.GetSubstructMatches(p)),
                lambda: 0,
            ))
    return df
74
 
75
# =========== EDA report & single-column distribution ===========
def eda_report(df):
    """Write a minimal profiling report for ``df`` to an HTML file.

    Each call writes to its own freshly created temporary file, so
    concurrent requests cannot overwrite one another's report and the
    location does not depend on a hard-coded ``/tmp`` directory.

    Args:
        df: DataFrame to profile.

    Returns:
        Path (``str``) of the generated ``.html`` report.
    """
    import os
    import tempfile

    profile = ProfileReport(df, title="EDA報告", minimal=True)
    fd, out = tempfile.mkstemp(prefix="eda_report_", suffix=".html")
    # Only the unique name is needed; the report is written via the path.
    os.close(fd)
    profile.to_file(out)
    return out
 
 
 
 
 
 
 
 
 
 
 
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def plot_desc_dist(df, desc='MolWt'):
83
  if df is None or desc not in df.columns:
84
  return Image.new("RGB", (400,200), (255,255,255))
85
  fig, ax = plt.subplots(figsize=(5,3))
86
  sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
 
87
  buf = io.BytesIO()
88
  plt.tight_layout()
89
  plt.savefig(buf, format='png')
 
91
  plt.close(fig)
92
  return Image.open(buf)
93
 
94
# =========== Dimensionality reduction / clustering & cluster representatives ===========
def apply_dim_red(df, use, method='PCA'):
    """Project the per-row feature vectors in ``df[use]`` down to 2 dimensions.

    Args:
        df: DataFrame whose ``use`` column holds one vector per row.
        use: name of the column to stack into the feature matrix.
        method: ``'PCA'``, ``'UMAP'`` or ``'tSNE'``.

    Returns:
        The array returned by the chosen reducer's ``fit_transform``,
        configured with ``n_components=2``.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    # Fail fast on a bad method name before building the feature matrix.
    if method not in ('PCA', 'UMAP', 'tSNE'):
        raise ValueError('Unknown method')

    X = np.stack(df[use].to_numpy())

    if method == 'PCA':
        return PCA(n_components=2).fit_transform(X)
    if method == 'UMAP':
        return UMAP(n_components=2, random_state=42).fit_transform(X)

    # t-SNE requires perplexity < n_samples; the library default of 30 would
    # reject small uploads, so cap it. Larger datasets keep the default.
    n_samples = X.shape[0]
    perplexity = min(30.0, float(max(n_samples - 1, 1)))
    return TSNE(n_components=2, random_state=42, perplexity=perplexity).fit_transform(X)
 
 
 
106
 
107
+ def plot_scatter(pc, labels, title):
 
 
 
 
 
 
 
108
  fig, ax = plt.subplots(figsize=(5,4))
109
  scatter = ax.scatter(pc[:,0], pc[:,1], c=labels, cmap='tab10', alpha=0.7)
110
+ plt.xlabel("Dim1"); plt.ylabel("Dim2"); plt.title(title)
111
  plt.colorbar(scatter)
112
  buf = io.BytesIO()
113
  plt.tight_layout()
 
116
  plt.close(fig)
117
  return Image.open(buf)
118
 
119
def cluster_reps(df, labels, use):
    """Pick one randomly chosen representative SMILES for every cluster.

    Rows are selected by position, so the result is a plain SMILES value per
    cluster even when ``df`` has a non-default or duplicated index.

    Args:
        df: DataFrame with a ``smiles`` column.
        labels: cluster label per row of ``df`` (list or array).
        use: unused; kept so existing callers keep working.

    Returns:
        List with one SMILES per distinct label, in ascending label order.

    Raises:
        ValueError: if ``labels`` does not have one entry per row of ``df``.
    """
    labels = np.asarray(labels)
    if len(labels) != len(df):
        raise ValueError("labels must have one entry per row of df")

    smiles = df['smiles']
    reps = []
    for cl in np.unique(labels):
        positions = np.flatnonzero(labels == cl)
        if positions.size:
            reps.append(smiles.iloc[np.random.choice(positions)])
    return reps
 
 
 
 
 
 
 
128
 
129
+ # =========== Gradio UI ===========
130
  with gr.Blocks(title="Cheminformatics Platform") as demo:
131
+ gr.Markdown("# 🧪 Cheminformatics 多功能平台")
 
 
 
 
 
132
 
133
+ # 1. 資料導入與批次結構圖
134
  with gr.Tab("1️⃣ 資料導入/結構圖"):
135
  up = gr.File(label="上傳分子檔 (csv/xlsx/sdf)", file_types=[".csv", ".xlsx", ".sdf"])
136
  df_view = gr.Dataframe(label="資料預覽 (前15筆)")
 
138
  up.upload(lambda f: load_table(f).head(15) if f else pd.DataFrame(), up, df_view)
139
  up.upload(lambda f: batch_mol_imgs(load_table(f)['smiles'].values[:25]) if f else None, up, mol_grid)
140
 
141
+ # 2. 特徵/描述子/官能基計算
142
  with gr.Tab("2️⃣ 特徵/描述子/官能基計算"):
143
+ file2 = gr.File(label="選擇分子檔")
144
+ fp_types = gr.CheckboxGroup(['ecfp4','maccs','rdkitfp'], label="指紋", value=['ecfp4'])
145
  desc_types = gr.CheckboxGroup(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="描述子")
146
+ func_smart = gr.Textbox(label="官能基SMARTS(逗號分隔)", placeholder="[N+](=O)[O-], [OX2H]")
147
+ feat_preview = gr.Dataframe(label="特徵/描述預覽(前10筆)")
 
 
148
  def calc_all_feats(file, fp, desc, smartbox):
149
  df = load_table(file)
150
+ df = calc_features(df, fp, desc, smartbox)
 
 
 
 
 
 
151
  return df.head(10)
152
+ gr.Button("特徵/官能基計算", variant="primary").click(
 
153
  calc_all_feats, [file2, fp_types, desc_types, func_smart], feat_preview
154
  )
155
 
156
+ # 3. EDA分析/自動報表
157
  with gr.Tab("3️⃣ EDA分析/自動報表"):
158
+ file3 = gr.File(label="分子檔")
159
+ col_sel = gr.Dropdown(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="欄位")
160
+ eda_img = gr.Image(label="描述子分布圖")
161
+ eda_btn = gr.Button("產生分布")
162
+ eda_btn.click(
163
+ lambda f, c: plot_desc_dist(calc_features(load_table(f), ['ecfp4'], [c], None), c) if f else None,
164
+ [file3, col_sel], eda_img
165
+ )
166
  eda_sum = gr.File(label="下載EDA報表")
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  gr.Button("生成EDA報表", variant="primary").click(
168
  lambda f: eda_report(load_table(f)) if f else None, file3, eda_sum
169
  )
170
 
171
+ # 4. 降維/分群/群代表分子圖
172
  with gr.Tab("4️⃣ 降維/分群/結構探索"):
173
  file4 = gr.File(label="分子檔")
174
+ use_fp = gr.Dropdown(['ecfp4','maccs','rdkitfp'], label="降維指紋", value="ecfp4")
175
  dr_method = gr.Radio(['PCA','UMAP','tSNE'], label="降維方法", value="PCA")
176
  cl_method = gr.Radio(['KMeans','DBSCAN'], label="分群方法", value="KMeans")
177
  nclus = gr.Slider(2, 8, 3, 1, label="KMeans分群數")
178
+ dr_img = gr.Image(label="降維/分群視覺化")
179
+ rep_imgs = gr.Image(label="群代表分子(每群1個)")
 
180
  def dimred_and_cluster(file, fp, dr, cl, nclu):
181
  df = load_table(file)
182
+ df = calc_features(df, [fp], [], None)
183
  pc = apply_dim_red(df, fp, dr)
184
+ labels = KMeans(n_clusters=int(nclu), random_state=42).fit_predict(pc) if cl == 'KMeans' else DBSCAN(eps=3, min_samples=2).fit_predict(pc)
 
 
 
185
  plotimg = plot_scatter(pc, labels, f"{dr}-{cl}")
 
186
  reps = cluster_reps(df, labels, fp)
187
  rep_img = batch_mol_imgs(reps)
188
  return plotimg, rep_img