Kung-Hsun committed on
Commit
b052cf9
·
verified ·
1 Parent(s): 7b00b7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -44
app.py CHANGED
@@ -17,7 +17,7 @@ import io
17
  from PIL import Image
18
  import chardet
19
 
20
- # =========== 功能1: 分子資料導入/轉換 =============
21
  def robust_read_csv(file):
22
  if file is None:
23
  return pd.DataFrame()
@@ -35,13 +35,10 @@ def robust_read_csv(file):
35
  else:
36
  raise RuntimeError("未知 file 類型")
37
 
38
-
39
  def load_csv(file):
40
- # 讀取CSV,要求有 smiles 與 label 欄位
41
  df = robust_read_csv(file)
42
  if not {'smiles','label'}.issubset(df.columns):
43
  raise ValueError("CSV需包含'smiles','label'欄位")
44
- # 統一SMILES格式
45
  df['smiles'] = df['smiles'].astype(str)
46
  return df
47
 
@@ -51,7 +48,7 @@ def mol_img(smiles, size=(160,160)):
51
  return Image.new("RGB", size, (250,250,250))
52
  return Draw.MolToImage(mol, size=size)
53
 
54
- # =========== 功能2: 分子指紋/描述子生成 =============
55
  def ecfp4_fp(smiles, nbits=2048):
56
  mol = Chem.MolFromSmiles(smiles)
57
  return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)) if mol else np.zeros(nbits)
@@ -66,14 +63,17 @@ def calc_rdkit_desc(smiles):
66
  return {n: f(mol) for n, f in Descriptors.descList}
67
 
68
  def add_fps_and_desc(df):
69
- df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
70
- df['maccs'] = df['smiles'].apply(maccs_fp)
71
- df['MolWt'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('MolWt', np.nan))
72
- df['TPSA'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('TPSA', np.nan))
 
 
 
 
73
  return df
74
 
75
-
76
- # =========== 功能3: 資料集探索分析 (EDA) ============
77
  def plot_desc_dist(df, desc='MolWt'):
78
  if df is None or desc not in df.columns:
79
  return Image.new("RGB", (400,200), (255,255,255))
@@ -87,7 +87,7 @@ def plot_desc_dist(df, desc='MolWt'):
87
  plt.close(fig)
88
  return Image.open(buf)
89
 
90
- # =========== 功能4: 分群/降維可視化 ============
91
  def pca_2d(df, use='ecfp4'):
92
  if df is None or use not in df.columns:
93
  return Image.new("RGB", (400,200), (255,255,255))
@@ -124,7 +124,7 @@ def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
124
  plt.close(fig)
125
  return Image.open(buf)
126
 
127
- # =========== 功能5: 機器學習建模與預測 ============
128
  def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
129
  X = np.stack(df[fp_type].to_numpy())
130
  y = df['label'].values
@@ -141,60 +141,76 @@ def predict_single(model, smiles, fp_type='ecfp4'):
141
  y_pred = model.predict([fp])[0]
142
  return y_pred
143
 
144
- # =========== Gradio主UI ============
145
-
146
  with gr.Blocks(title="Cheminformatics Platform") as demo:
147
  gr.Markdown("# 🧪 Cheminformatics 多功能分析平台")
148
-
 
 
 
 
 
149
  # --- 分子資料導入 ---
150
  with gr.Tab("1️⃣ 資料導入/結構圖"):
151
  file = gr.File(label="上傳CSV", file_types=[".csv"])
152
  df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
153
  smiles_input = gr.Textbox(label="分子SMILES預覽")
154
  mol_image = gr.Image(label="分子結構圖")
155
- file.upload(lambda f: load_csv(f).head(10) if f else pd.DataFrame(), file, df_preview)
 
 
 
 
 
156
  smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
157
-
158
  # --- 分子特徵生成 ---
159
  with gr.Tab("2️⃣ 特徵計算/描述子"):
160
- file2 = gr.File(label="再次選擇CSV")
161
  feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
162
- file2.upload(lambda f: add_fps_and_desc(load_csv(f)).head(5) if f else pd.DataFrame(), file2, feat_preview)
163
-
 
 
 
 
 
 
 
164
  # --- 資料探索 ---
165
  with gr.Tab("3️⃣ 資料集分析 (EDA)"):
166
  desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子", value="MolWt")
 
167
  eda_plot = gr.Image(label="分布圖")
168
- file3 = gr.File(label="選擇CSV")
169
- file3.upload(lambda f: add_fps_and_desc(load_csv(f)), file3, None)
170
- desc_type.change(lambda d: plot_desc_dist(add_fps_and_desc(load_csv(file3.value)), d) if file3.value else Image.new("RGB", (400,200), (255,255,255)), desc_type, eda_plot)
171
 
172
-
173
  # --- 分群與PCA ---
174
  with gr.Tab("4️⃣ 分群/降維可視化"):
175
- file4 = gr.File(label="上傳CSV")
176
- nclus = gr.Slider(2,8,3,1,label="分群數")
177
  pca_plot = gr.Image(label="PCA分佈")
 
 
178
  km_plot = gr.Image(label="KMeans分群")
179
- file4.upload(lambda f: add_fps_and_desc(load_csv(f)), file4, None)
180
- file4.change(lambda f: pca_2d(add_fps_and_desc(load_csv(f))) if f else Image.new("RGB", (400,200), (255,255,255)), file4, pca_plot)
181
- nclus.change(lambda n: kmeans_clusters(add_fps_and_desc(load_csv(file4.value)), n) if file4.value else Image.new("RGB", (400,200), (255,255,255)), nclus, km_plot)
182
-
183
  # --- 建模/預測 ---
184
  with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
185
- file5 = gr.File(label="上傳CSV")
186
  model_status = gr.Markdown("模型狀態")
187
  smiles_pred = gr.Textbox(label="預測SMILES")
188
  y_pred = gr.Textbox(label="預測值/類別")
189
- model_state = gr.State(None) # 新增 State 儲存模型
190
-
191
- def handle_train(f):
192
- if not f:
193
- return "請上傳資料", None
194
- df = add_fps_and_desc(load_csv(f))
195
- model, scores = train_model(df)
196
  return f"模型交叉驗證: {np.round(scores,3)}", model
197
-
 
 
198
  def handle_predict(s, model):
199
  if model is None:
200
  return "請先訓練模型", ""
@@ -203,11 +219,9 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
203
  return "已預測", str(pred)
204
  except Exception as e:
205
  return f"預測失敗: {e}", ""
206
-
207
- file5.upload(handle_train, file5, [model_status, model_state])
208
  smiles_pred.change(handle_predict, [smiles_pred, model_state], [model_status, y_pred])
209
-
210
- gr.Markdown("---\n> 建議工作流:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
211
 
 
212
 
213
  demo.launch(share=True)
 
17
  from PIL import Image
18
  import chardet
19
 
20
+ # =========== 功能1: 分子資料導入/轉換 ===========
21
  def robust_read_csv(file):
22
  if file is None:
23
  return pd.DataFrame()
 
35
  else:
36
  raise RuntimeError("未知 file 類型")
37
 
 
38
  def load_csv(file):
 
39
  df = robust_read_csv(file)
40
  if not {'smiles','label'}.issubset(df.columns):
41
  raise ValueError("CSV需包含'smiles','label'欄位")
 
42
  df['smiles'] = df['smiles'].astype(str)
43
  return df
44
 
 
48
  return Image.new("RGB", size, (250,250,250))
49
  return Draw.MolToImage(mol, size=size)
50
 
51
+ # =========== 功能2: 分子指紋/描述子生成 ===========
52
  def ecfp4_fp(smiles, nbits=2048):
53
  mol = Chem.MolFromSmiles(smiles)
54
  return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)) if mol else np.zeros(nbits)
 
63
  return {n: f(mol) for n, f in Descriptors.descList}
64
 
65
  def add_fps_and_desc(df):
66
+ if 'ecfp4' not in df.columns:
67
+ df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
68
+ if 'maccs' not in df.columns:
69
+ df['maccs'] = df['smiles'].apply(maccs_fp)
70
+ if 'MolWt' not in df.columns:
71
+ df['MolWt'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('MolWt', np.nan))
72
+ if 'TPSA' not in df.columns:
73
+ df['TPSA'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('TPSA', np.nan))
74
  return df
75
 
76
+ # =========== 功能3: 資料集探索分析 (EDA) ===========
 
77
  def plot_desc_dist(df, desc='MolWt'):
78
  if df is None or desc not in df.columns:
79
  return Image.new("RGB", (400,200), (255,255,255))
 
87
  plt.close(fig)
88
  return Image.open(buf)
89
 
90
+ # =========== 功能4: 分群/降維可視化 ===========
91
  def pca_2d(df, use='ecfp4'):
92
  if df is None or use not in df.columns:
93
  return Image.new("RGB", (400,200), (255,255,255))
 
124
  plt.close(fig)
125
  return Image.open(buf)
126
 
127
+ # =========== 功能5: 機器學習建模與預測 ===========
128
  def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
129
  X = np.stack(df[fp_type].to_numpy())
130
  y = df['label'].values
 
141
  y_pred = model.predict([fp])[0]
142
  return y_pred
143
 
144
+ # =========== Gradio主UI ===========
 
145
  with gr.Blocks(title="Cheminformatics Platform") as demo:
146
  gr.Markdown("# 🧪 Cheminformatics 多功能分析平台")
147
+
148
+ # 全域狀態:原始資料、特徵後資料、模型
149
+ data_state = gr.State()
150
+ feat_state = gr.State()
151
+ model_state = gr.State()
152
+
153
  # --- 分子資料導入 ---
154
  with gr.Tab("1️⃣ 資料導入/結構圖"):
155
  file = gr.File(label="上傳CSV", file_types=[".csv"])
156
  df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
157
  smiles_input = gr.Textbox(label="分子SMILES預覽")
158
  mol_image = gr.Image(label="分子結構圖")
159
+
160
+ def on_upload(f):
161
+ df = load_csv(f)
162
+ return df.head(10), df
163
+
164
+ file.upload(on_upload, file, [df_preview, data_state])
165
  smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
166
+
167
  # --- 分子特徵生成 ---
168
  with gr.Tab("2️⃣ 特徵計算/描述子"):
169
+ feat_btn = gr.Button("生成特徵/描述子")
170
  feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
171
+
172
+ def on_feat(state_df):
173
+ if state_df is None:
174
+ return pd.DataFrame(), None
175
+ feat_df = add_fps_and_desc(state_df.copy())
176
+ return feat_df.head(5), feat_df
177
+
178
+ feat_btn.click(on_feat, data_state, [feat_preview, feat_state])
179
+
180
  # --- 資料探索 ---
181
  with gr.Tab("3️⃣ 資料集分析 (EDA)"):
182
  desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子", value="MolWt")
183
+ eda_btn = gr.Button("生成描述子分布圖")
184
  eda_plot = gr.Image(label="分布圖")
185
+ eda_btn.click(lambda d, feat_df: plot_desc_dist(feat_df, d) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)),
186
+ [desc_type, feat_state], eda_plot)
 
187
 
 
188
  # --- 分群與PCA ---
189
  with gr.Tab("4️⃣ 分群/降維可視化"):
190
+ pca_btn = gr.Button("PCA 分布圖")
 
191
  pca_plot = gr.Image(label="PCA分佈")
192
+ nclus = gr.Slider(2, 8, 3, 1, label="分群數")
193
+ km_btn = gr.Button("KMeans 分群圖")
194
  km_plot = gr.Image(label="KMeans分群")
195
+ pca_btn.click(lambda feat_df: pca_2d(feat_df) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)), feat_state, pca_plot)
196
+ km_btn.click(lambda n, feat_df: kmeans_clusters(feat_df, n) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)),
197
+ [nclus, feat_state], km_plot)
198
+
199
  # --- 建模/預測 ---
200
  with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
201
+ train_btn = gr.Button("訓練模型 (RF, 5-fold)")
202
  model_status = gr.Markdown("模型狀態")
203
  smiles_pred = gr.Textbox(label="預測SMILES")
204
  y_pred = gr.Textbox(label="預測值/類別")
205
+
206
+ def handle_train(feat_df):
207
+ if feat_df is None:
208
+ return "請先進行特徵生成", None
209
+ model, scores = train_model(feat_df)
 
 
210
  return f"模型交叉驗證: {np.round(scores,3)}", model
211
+
212
+ train_btn.click(handle_train, feat_state, [model_status, model_state])
213
+
214
  def handle_predict(s, model):
215
  if model is None:
216
  return "請先訓練模型", ""
 
219
  return "已預測", str(pred)
220
  except Exception as e:
221
  return f"預測失敗: {e}", ""
222
+
 
223
  smiles_pred.change(handle_predict, [smiles_pred, model_state], [model_status, y_pred])
 
 
224
 
225
+ gr.Markdown("---\n> 建議完整流程:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
226
 
227
  demo.launch(share=True)