Kung-Hsun committed on
Commit
76b74c3
·
verified ·
1 Parent(s): 0463d63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -14
app.py CHANGED
@@ -8,18 +8,37 @@ import numpy as np
8
  from rdkit import Chem
9
  from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
10
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
11
- from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
12
  from sklearn.decomposition import PCA
13
  from sklearn.cluster import KMeans
14
  import matplotlib.pyplot as plt
15
  import seaborn as sns
16
  import io
17
  from PIL import Image
 
18
 
19
  # =========== 功能1: 分子資料導入/轉換 =============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def load_csv(file):
21
  # 讀取CSV,要求有 smiles 與 label 欄位
22
- df = pd.read_csv(file.name if hasattr(file, "name") else file)
23
  if not {'smiles','label'}.issubset(df.columns):
24
  raise ValueError("CSV需包含'smiles','label'欄位")
25
  # 統一SMILES格式
@@ -47,18 +66,17 @@ def calc_rdkit_desc(smiles):
47
  return {n: f(mol) for n, f in Descriptors.descList}
48
 
49
  def add_fps_and_desc(df):
50
- # 產生 ECFP4 指紋 (預設2048 bits)
51
  df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
52
- # 產生 MACCS 指紋
53
  df['maccs'] = df['smiles'].apply(maccs_fp)
54
- # 計算部分常見描述子(如需更多用mordred/rdkit)
55
  df['MolWt'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('MolWt', np.nan))
56
  df['TPSA'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('TPSA', np.nan))
57
  return df
58
 
 
59
  # =========== 功能3: 資料集探索分析 (EDA) ============
60
  def plot_desc_dist(df, desc='MolWt'):
61
- # 柱狀圖:分子量等物化性質分布
 
62
  fig, ax = plt.subplots(figsize=(5,3))
63
  sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
64
  ax.set_title(f"{desc} Distribution")
@@ -71,6 +89,8 @@ def plot_desc_dist(df, desc='MolWt'):
71
 
72
  # =========== 功能4: 分群/降維可視化 ============
73
  def pca_2d(df, use='ecfp4'):
 
 
74
  X = np.stack(df[use].to_numpy())
75
  pca = PCA(n_components=2)
76
  pc = pca.fit_transform(X)
@@ -86,6 +106,8 @@ def pca_2d(df, use='ecfp4'):
86
  return Image.open(buf)
87
 
88
  def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
 
 
89
  X = np.stack(df[use].to_numpy())
90
  km = KMeans(n_clusters=n_clusters, random_state=42)
91
  labels = km.fit_predict(X)
@@ -110,7 +132,6 @@ def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
110
  task = 'regression' if np.issubdtype(y.dtype, np.floating) else 'classification'
111
  if model_type == 'rf':
112
  model = RandomForestRegressor(n_estimators=100) if task == 'regression' else RandomForestClassifier(n_estimators=100)
113
- # 可拓展支援更多模型
114
  scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error' if task=='regression' else 'accuracy')
115
  model.fit(X, y)
116
  return model, scores
@@ -131,22 +152,23 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
131
  df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
132
  smiles_input = gr.Textbox(label="分子SMILES預覽")
133
  mol_image = gr.Image(label="分子結構圖")
134
- file.upload(lambda f: load_csv(f).head(10), file, df_preview)
135
  smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
136
 
137
  # --- 分子特徵生成 ---
138
  with gr.Tab("2️⃣ 特徵計算/描述子"):
139
  file2 = gr.File(label="再次選擇CSV")
140
  feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
141
- file2.upload(lambda f: add_fps_and_desc(load_csv(f)).head(5), file2, feat_preview)
142
 
143
  # --- 資料探索 ---
144
  with gr.Tab("3️⃣ 資料集分析 (EDA)"):
145
- desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子")
146
  eda_plot = gr.Image(label="分布圖")
147
  file3 = gr.File(label="選擇CSV")
148
  file3.upload(lambda f: add_fps_and_desc(load_csv(f)), file3, None)
149
- desc_type.change(lambda d: plot_desc_dist(add_fps_and_desc(load_csv(file3.value)), d), desc_type, eda_plot)
 
150
 
151
  # --- 分群與PCA ---
152
  with gr.Tab("4️⃣ 分群/降維可視化"):
@@ -155,8 +177,8 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
155
  pca_plot = gr.Image(label="PCA分佈")
156
  km_plot = gr.Image(label="KMeans分群")
157
  file4.upload(lambda f: add_fps_and_desc(load_csv(f)), file4, None)
158
- file4.change(lambda f: pca_2d(add_fps_and_desc(load_csv(f))), file4, pca_plot)
159
- nclus.change(lambda n: km_plot.update(value=kmeans_clusters(add_fps_and_desc(load_csv(file4.value)), n)), nclus, km_plot)
160
 
161
  # --- 建模/預測 ---
162
  with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
@@ -165,13 +187,15 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
165
  smiles_pred = gr.Textbox(label="預測SMILES")
166
  y_pred = gr.Textbox(label="預測值/類別")
167
  def train_and_predict(f, s):
 
168
  df = add_fps_and_desc(load_csv(f))
169
  model, scores = train_model(df)
170
  pred = predict_single(model, s)
171
  return f"模型交叉驗證: {np.round(scores,3)}", str(pred)
172
- file5.upload(lambda f: model_status.update(value="已載入, 請輸入SMILES進行預測"), file5, model_status)
173
  smiles_pred.change(lambda s: train_and_predict(file5.value, s), smiles_pred, [model_status, y_pred])
174
 
175
  gr.Markdown("---\n> 建議工作流:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
176
 
 
177
  demo.launch(share=True)
 
8
  from rdkit import Chem
9
  from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
10
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
11
+ from sklearn.model_selection import cross_val_score
12
  from sklearn.decomposition import PCA
13
  from sklearn.cluster import KMeans
14
  import matplotlib.pyplot as plt
15
  import seaborn as sns
16
  import io
17
  from PIL import Image
18
+ import chardet
19
 
20
  # =========== 功能1: 分子資料導入/轉換 =============
def robust_read_csv(file):
    """Read an uploaded CSV into a DataFrame, sniffing its text encoding.

    Args:
        file: One of
            * ``None`` -- nothing uploaded yet;
            * a ``str`` / ``os.PathLike`` path;
            * a file-like object exposing ``read`` (binary or text mode);
            * any object exposing the path as ``.name``.

    Returns:
        The parsed ``pandas.DataFrame``; an empty DataFrame when ``file`` is
        ``None``.

    Raises:
        RuntimeError: If ``file`` is none of the supported kinds.
    """
    import os  # local import: only needed for path handling in this helper

    if file is None:
        return pd.DataFrame()

    def _guess_encoding(sample):
        # A text-mode handle hands back ``str`` (already decoded); chardet
        # only accepts bytes, so let pandas use its default in that case.
        if not isinstance(sample, (bytes, bytearray)):
            return None
        guess = chardet.detect(bytes(sample))["encoding"] or "utf-8"
        # A pure-ASCII sample says nothing about the rest of the file.
        # UTF-8 is a superset of ASCII, so widening avoids a decode error
        # when non-ASCII text first appears after the sampled prefix.
        return "utf-8" if guess.lower() == "ascii" else guess

    if isinstance(file, (str, os.PathLike)):
        # Plain paths were accepted by the previous loader; keep them working.
        path = os.fspath(file)
    elif hasattr(file, "read"):
        pos = file.tell() if hasattr(file, "tell") else 0
        enc = _guess_encoding(file.read(4096))
        file.seek(pos)  # rewind so pandas parses from where the caller was
        return pd.read_csv(file, encoding=enc)
    elif hasattr(file, "name"):
        path = file.name
    else:
        raise RuntimeError("未知 file 類型")

    with open(path, "rb") as f:
        enc = _guess_encoding(f.read(4096))
    return pd.read_csv(path, encoding=enc)
37
+
38
+
39
  def load_csv(file):
40
  # 讀取CSV,要求有 smiles 與 label 欄位
41
+ df = robust_read_csv(file)
42
  if not {'smiles','label'}.issubset(df.columns):
43
  raise ValueError("CSV需包含'smiles','label'欄位")
44
  # 統一SMILES格式
 
66
  return {n: f(mol) for n, f in Descriptors.descList}
67
 
def add_fps_and_desc(df):
    """Add fingerprint and descriptor columns to ``df`` and return it.

    The frame is modified in place: columns ``ecfp4``, ``maccs``, ``MolWt``
    and ``TPSA`` are written from the ``smiles`` column, and the same object
    is returned.

    Args:
        df: DataFrame with a ``smiles`` column.

    Returns:
        ``df`` with the four columns added.
    """
    df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
    df['maccs'] = df['smiles'].apply(maccs_fp)
    # Build the descriptor table once per molecule and pick both values from
    # it, instead of recomputing the whole table separately for each column.
    desc_tables = [calc_rdkit_desc(smi) for smi in df['smiles']]
    df['MolWt'] = [table.get('MolWt', np.nan) for table in desc_tables]
    df['TPSA'] = [table.get('TPSA', np.nan) for table in desc_tables]
    return df
74
 
75
+
76
  # =========== 功能3: 資料集探索分析 (EDA) ============
77
  def plot_desc_dist(df, desc='MolWt'):
78
+ if df is None or desc not in df.columns:
79
+ return Image.new("RGB", (400,200), (255,255,255))
80
  fig, ax = plt.subplots(figsize=(5,3))
81
  sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
82
  ax.set_title(f"{desc} Distribution")
 
89
 
90
  # =========== 功能4: 分群/降維可視化 ============
91
  def pca_2d(df, use='ecfp4'):
92
+ if df is None or use not in df.columns:
93
+ return Image.new("RGB", (400,200), (255,255,255))
94
  X = np.stack(df[use].to_numpy())
95
  pca = PCA(n_components=2)
96
  pc = pca.fit_transform(X)
 
106
  return Image.open(buf)
107
 
108
  def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
109
+ if df is None or use not in df.columns:
110
+ return Image.new("RGB", (400,200), (255,255,255))
111
  X = np.stack(df[use].to_numpy())
112
  km = KMeans(n_clusters=n_clusters, random_state=42)
113
  labels = km.fit_predict(X)
 
132
  task = 'regression' if np.issubdtype(y.dtype, np.floating) else 'classification'
133
  if model_type == 'rf':
134
  model = RandomForestRegressor(n_estimators=100) if task == 'regression' else RandomForestClassifier(n_estimators=100)
 
135
  scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error' if task=='regression' else 'accuracy')
136
  model.fit(X, y)
137
  return model, scores
 
152
  df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
153
  smiles_input = gr.Textbox(label="分子SMILES預覽")
154
  mol_image = gr.Image(label="分子結構圖")
155
+ file.upload(lambda f: load_csv(f).head(10) if f else pd.DataFrame(), file, df_preview)
156
  smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
157
 
158
  # --- 分子特徵生成 ---
with gr.Tab("2️⃣ 特徵計算/描述子"):
    file2 = gr.File(label="再次選擇CSV")
    feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")

    def _preview_features(f):
        # Upload handler: show the first five rows with the fingerprint and
        # descriptor columns added, or an empty table when nothing was sent.
        if not f:
            return pd.DataFrame()
        return add_fps_and_desc(load_csv(f)).head(5)

    file2.upload(_preview_features, file2, feat_preview)
163
 
164
  # --- 資料探索 ---
with gr.Tab("3️⃣ 資料集分析 (EDA)"):
    desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子", value="MolWt")
    eda_plot = gr.Image(label="分布圖")
    file3 = gr.File(label="選擇CSV")

    def _render_desc_dist(f, d):
        # Shared handler for both events. The uploaded file arrives as an
        # argument because it is listed in ``inputs``; reading ``file3.value``
        # here would only give the value the component was constructed with.
        if not f:
            # Nothing uploaded yet: return a blank white placeholder image.
            return Image.new("RGB", (400, 200), (255, 255, 255))
        return plot_desc_dist(add_fps_and_desc(load_csv(f)), d)

    # Draw the plot as soon as a CSV is uploaded, and redraw it whenever the
    # selected descriptor changes.
    file3.upload(_render_desc_dist, [file3, desc_type], eda_plot)
    desc_type.change(_render_desc_dist, [file3, desc_type], eda_plot)
171
+
172
 
173
  # --- 分群與PCA ---
174
  with gr.Tab("4️⃣ 分群/降維可視化"):
 
177
  pca_plot = gr.Image(label="PCA分佈")
178
  km_plot = gr.Image(label="KMeans分群")
179
  file4.upload(lambda f: add_fps_and_desc(load_csv(f)), file4, None)
180
+ file4.change(lambda f: pca_2d(add_fps_and_desc(load_csv(f))) if f else Image.new("RGB", (400,200), (255,255,255)), file4, pca_plot)
181
+ nclus.change(lambda n: kmeans_clusters(add_fps_and_desc(load_csv(file4.value)), n) if file4.value else Image.new("RGB", (400,200), (255,255,255)), nclus, km_plot)
182
 
183
  # --- 建模/預測 ---
184
  with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
 
187
  smiles_pred = gr.Textbox(label="預測SMILES")
188
  y_pred = gr.Textbox(label="預測值/類別")
def train_and_predict(f, s):
    """Train a model on the uploaded CSV and predict for one SMILES string.

    Args:
        f: The uploaded CSV, in whatever form ``load_csv`` accepts; falsy
            when nothing has been uploaded.
        s: SMILES string to predict for.

    Returns:
        A ``(status, prediction)`` pair of strings. When the CSV or the
        SMILES is missing, ``status`` is a prompt and ``prediction`` is "".
    """
    if not f:
        return "請先上傳CSV", ""
    # The textbox change event also fires when the field is cleared; skip the
    # full train + cross-validation run when there is nothing to predict.
    if not s or not s.strip():
        return "請輸入SMILES", ""
    df = add_fps_and_desc(load_csv(f))
    model, scores = train_model(df)
    pred = predict_single(model, s)
    return f"模型交叉驗證: {np.round(scores,3)}", str(pred)
195
+ file5.upload(lambda f: "已載入, 請輸入SMILES進行預測" if f else "請上傳資料", file5, model_status)
196
  smiles_pred.change(lambda s: train_and_predict(file5.value, s), smiles_pred, [model_status, y_pred])
197
 
198
  gr.Markdown("---\n> 建議工作流:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
199
 
200
+
201
  demo.launch(share=True)