Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,18 +8,37 @@ import numpy as np
|
|
| 8 |
from rdkit import Chem
|
| 9 |
from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
|
| 10 |
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
| 11 |
-
from sklearn.model_selection import
|
| 12 |
from sklearn.decomposition import PCA
|
| 13 |
from sklearn.cluster import KMeans
|
| 14 |
import matplotlib.pyplot as plt
|
| 15 |
import seaborn as sns
|
| 16 |
import io
|
| 17 |
from PIL import Image
|
|
|
|
| 18 |
|
| 19 |
# =========== 功能1: 分子資料導入/轉換 =============
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
def load_csv(file):
|
| 21 |
# 讀取CSV,要求有 smiles 與 label 欄位
|
| 22 |
-
df =
|
| 23 |
if not {'smiles','label'}.issubset(df.columns):
|
| 24 |
raise ValueError("CSV需包含'smiles','label'欄位")
|
| 25 |
# 統一SMILES格式
|
|
@@ -47,18 +66,17 @@ def calc_rdkit_desc(smiles):
|
|
| 47 |
return {n: f(mol) for n, f in Descriptors.descList}
|
| 48 |
|
| 49 |
def add_fps_and_desc(df):
|
| 50 |
-
# 產生 ECFP4 指紋 (預設2048 bits)
|
| 51 |
df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
|
| 52 |
-
# 產生 MACCS 指紋
|
| 53 |
df['maccs'] = df['smiles'].apply(maccs_fp)
|
| 54 |
-
# 計算部分常見描述子(如需更多用mordred/rdkit)
|
| 55 |
df['MolWt'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('MolWt', np.nan))
|
| 56 |
df['TPSA'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('TPSA', np.nan))
|
| 57 |
return df
|
| 58 |
|
|
|
|
| 59 |
# =========== 功能3: 資料集探索分析 (EDA) ============
|
| 60 |
def plot_desc_dist(df, desc='MolWt'):
|
| 61 |
-
|
|
|
|
| 62 |
fig, ax = plt.subplots(figsize=(5,3))
|
| 63 |
sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
|
| 64 |
ax.set_title(f"{desc} Distribution")
|
|
@@ -71,6 +89,8 @@ def plot_desc_dist(df, desc='MolWt'):
|
|
| 71 |
|
| 72 |
# =========== 功能4: 分群/降維可視化 ============
|
| 73 |
def pca_2d(df, use='ecfp4'):
|
|
|
|
|
|
|
| 74 |
X = np.stack(df[use].to_numpy())
|
| 75 |
pca = PCA(n_components=2)
|
| 76 |
pc = pca.fit_transform(X)
|
|
@@ -86,6 +106,8 @@ def pca_2d(df, use='ecfp4'):
|
|
| 86 |
return Image.open(buf)
|
| 87 |
|
| 88 |
def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
|
|
|
|
|
|
|
| 89 |
X = np.stack(df[use].to_numpy())
|
| 90 |
km = KMeans(n_clusters=n_clusters, random_state=42)
|
| 91 |
labels = km.fit_predict(X)
|
|
@@ -110,7 +132,6 @@ def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
|
|
| 110 |
task = 'regression' if np.issubdtype(y.dtype, np.floating) else 'classification'
|
| 111 |
if model_type == 'rf':
|
| 112 |
model = RandomForestRegressor(n_estimators=100) if task == 'regression' else RandomForestClassifier(n_estimators=100)
|
| 113 |
-
# 可拓展支援更多模型
|
| 114 |
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error' if task=='regression' else 'accuracy')
|
| 115 |
model.fit(X, y)
|
| 116 |
return model, scores
|
|
@@ -131,22 +152,23 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
|
|
| 131 |
df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
|
| 132 |
smiles_input = gr.Textbox(label="分子SMILES預覽")
|
| 133 |
mol_image = gr.Image(label="分子結構圖")
|
| 134 |
-
file.upload(lambda f: load_csv(f).head(10), file, df_preview)
|
| 135 |
smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
|
| 136 |
|
| 137 |
# --- 分子特徵生成 ---
|
| 138 |
with gr.Tab("2️⃣ 特徵計算/描述子"):
|
| 139 |
file2 = gr.File(label="再次選擇CSV")
|
| 140 |
feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
|
| 141 |
-
file2.upload(lambda f: add_fps_and_desc(load_csv(f)).head(5), file2, feat_preview)
|
| 142 |
|
| 143 |
# --- 資料探索 ---
|
| 144 |
with gr.Tab("3️⃣ 資料集分析 (EDA)"):
|
| 145 |
-
desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子")
|
| 146 |
eda_plot = gr.Image(label="分布圖")
|
| 147 |
file3 = gr.File(label="選擇CSV")
|
| 148 |
file3.upload(lambda f: add_fps_and_desc(load_csv(f)), file3, None)
|
| 149 |
-
desc_type.change(lambda d: plot_desc_dist(add_fps_and_desc(load_csv(file3.value)), d), desc_type, eda_plot)
|
|
|
|
| 150 |
|
| 151 |
# --- 分群與PCA ---
|
| 152 |
with gr.Tab("4️⃣ 分群/降維可視化"):
|
|
@@ -155,8 +177,8 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
|
|
| 155 |
pca_plot = gr.Image(label="PCA分佈")
|
| 156 |
km_plot = gr.Image(label="KMeans分群")
|
| 157 |
file4.upload(lambda f: add_fps_and_desc(load_csv(f)), file4, None)
|
| 158 |
-
file4.change(lambda f: pca_2d(add_fps_and_desc(load_csv(f))), file4, pca_plot)
|
| 159 |
-
nclus.change(lambda n:
|
| 160 |
|
| 161 |
# --- 建模/預測 ---
|
| 162 |
with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
|
|
@@ -165,13 +187,15 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
|
|
| 165 |
smiles_pred = gr.Textbox(label="預測SMILES")
|
| 166 |
y_pred = gr.Textbox(label="預測值/類別")
|
| 167 |
def train_and_predict(f, s):
|
|
|
|
| 168 |
df = add_fps_and_desc(load_csv(f))
|
| 169 |
model, scores = train_model(df)
|
| 170 |
pred = predict_single(model, s)
|
| 171 |
return f"模型交叉驗證: {np.round(scores,3)}", str(pred)
|
| 172 |
-
file5.upload(lambda f:
|
| 173 |
smiles_pred.change(lambda s: train_and_predict(file5.value, s), smiles_pred, [model_status, y_pred])
|
| 174 |
|
| 175 |
gr.Markdown("---\n> 建議工作流:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
|
| 176 |
|
|
|
|
| 177 |
demo.launch(share=True)
|
|
|
|
| 8 |
from rdkit import Chem
|
| 9 |
from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
|
| 10 |
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
| 11 |
+
from sklearn.model_selection import cross_val_score
|
| 12 |
from sklearn.decomposition import PCA
|
| 13 |
from sklearn.cluster import KMeans
|
| 14 |
import matplotlib.pyplot as plt
|
| 15 |
import seaborn as sns
|
| 16 |
import io
|
| 17 |
from PIL import Image
|
| 18 |
+
import chardet
|
| 19 |
|
| 20 |
# =========== 功能1: 分子資料導入/轉換 =============
|
| 21 |
+
def robust_read_csv(file):
    """Read an uploaded CSV into a DataFrame, guessing its text encoding.

    Args:
        file: One of
            * ``None`` - nothing uploaded yet;
            * a file-like object exposing ``read`` (and ``seek``);
            * a filesystem path given as ``str`` / ``os.PathLike``;
            * an object whose ``name`` attribute is a filesystem path.

    Returns:
        The parsed ``pandas.DataFrame``; an empty DataFrame when ``file`` is
        ``None``.

    Raises:
        RuntimeError: If ``file`` is none of the supported kinds.
    """
    import os  # local import: only needed for path handling in this helper

    if file is None:
        return pd.DataFrame()

    def _guess_encoding(sample):
        # chardet may report None for an undecidable sample; fall back to UTF-8.
        guessed = chardet.detect(sample)["encoding"] or "utf-8"
        # Only the first 4 KiB is sampled.  An "ascii" verdict says nothing
        # about later bytes, and UTF-8 is a strict superset of ASCII, so
        # prefer it to avoid a decode error on non-ASCII rows further down.
        return "utf-8" if guessed.lower() == "ascii" else guessed

    if hasattr(file, "read"):
        pos = file.tell() if hasattr(file, "tell") else 0
        raw = file.read(4096)
        file.seek(pos)  # rewind so pandas parses from the original position
        if isinstance(raw, str):
            # Text-mode handle: content is already decoded, and chardet only
            # accepts bytes, so skip detection.
            return pd.read_csv(file)
        return pd.read_csv(file, encoding=_guess_encoding(raw))

    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    elif hasattr(file, "name"):
        path = file.name
    else:
        raise RuntimeError("未知 file 類型")

    with open(path, "rb") as f:
        raw = f.read(4096)
    return pd.read_csv(path, encoding=_guess_encoding(raw))
|
| 37 |
+
|
| 38 |
+
|
| 39 |
def load_csv(file):
|
| 40 |
# 讀取CSV,要求有 smiles 與 label 欄位
|
| 41 |
+
df = robust_read_csv(file)
|
| 42 |
if not {'smiles','label'}.issubset(df.columns):
|
| 43 |
raise ValueError("CSV需包含'smiles','label'欄位")
|
| 44 |
# 統一SMILES格式
|
|
|
|
| 66 |
return {n: f(mol) for n, f in Descriptors.descList}
|
| 67 |
|
| 68 |
def add_fps_and_desc(df):
    """Add fingerprint and descriptor columns to ``df`` in place.

    Adds the columns ``ecfp4``, ``maccs``, ``MolWt`` and ``TPSA``, each derived
    from the ``smiles`` column.

    Args:
        df: DataFrame with a ``smiles`` column.

    Returns:
        The same DataFrame object, with the four columns added.
    """
    smiles = df['smiles']
    df['ecfp4'] = smiles.apply(ecfp4_fp)
    df['maccs'] = smiles.apply(maccs_fp)
    # calc_rdkit_desc evaluates the full descriptor list for a molecule, so
    # run it once per SMILES and pick both values from the same result
    # instead of recomputing everything for each column.
    descs = smiles.apply(calc_rdkit_desc)
    df['MolWt'] = descs.apply(lambda d: d.get('MolWt', np.nan))
    df['TPSA'] = descs.apply(lambda d: d.get('TPSA', np.nan))
    return df
|
| 74 |
|
| 75 |
+
|
| 76 |
# =========== 功能3: 資料集探索分析 (EDA) ============
|
| 77 |
def plot_desc_dist(df, desc='MolWt'):
|
| 78 |
+
if df is None or desc not in df.columns:
|
| 79 |
+
return Image.new("RGB", (400,200), (255,255,255))
|
| 80 |
fig, ax = plt.subplots(figsize=(5,3))
|
| 81 |
sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
|
| 82 |
ax.set_title(f"{desc} Distribution")
|
|
|
|
| 89 |
|
| 90 |
# =========== 功能4: 分群/降維可視化 ============
|
| 91 |
def pca_2d(df, use='ecfp4'):
|
| 92 |
+
if df is None or use not in df.columns:
|
| 93 |
+
return Image.new("RGB", (400,200), (255,255,255))
|
| 94 |
X = np.stack(df[use].to_numpy())
|
| 95 |
pca = PCA(n_components=2)
|
| 96 |
pc = pca.fit_transform(X)
|
|
|
|
| 106 |
return Image.open(buf)
|
| 107 |
|
| 108 |
def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
|
| 109 |
+
if df is None or use not in df.columns:
|
| 110 |
+
return Image.new("RGB", (400,200), (255,255,255))
|
| 111 |
X = np.stack(df[use].to_numpy())
|
| 112 |
km = KMeans(n_clusters=n_clusters, random_state=42)
|
| 113 |
labels = km.fit_predict(X)
|
|
|
|
| 132 |
task = 'regression' if np.issubdtype(y.dtype, np.floating) else 'classification'
|
| 133 |
if model_type == 'rf':
|
| 134 |
model = RandomForestRegressor(n_estimators=100) if task == 'regression' else RandomForestClassifier(n_estimators=100)
|
|
|
|
| 135 |
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error' if task=='regression' else 'accuracy')
|
| 136 |
model.fit(X, y)
|
| 137 |
return model, scores
|
|
|
|
| 152 |
df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
|
| 153 |
smiles_input = gr.Textbox(label="分子SMILES預覽")
|
| 154 |
mol_image = gr.Image(label="分子結構圖")
|
| 155 |
+
file.upload(lambda f: load_csv(f).head(10) if f else pd.DataFrame(), file, df_preview)
|
| 156 |
smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
|
| 157 |
|
| 158 |
# --- 分子特徵生成 ---
|
| 159 |
with gr.Tab("2️⃣ 特徵計算/描述子"):
    # Upload widget plus a table showing the first five rows after the
    # fingerprint/descriptor columns have been added.
    file2 = gr.File(label="再次選擇CSV")
    feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
    file2.upload(
        fn=lambda f: pd.DataFrame() if not f else add_fps_and_desc(load_csv(f)).head(5),
        inputs=file2,
        outputs=feat_preview,
    )
|
| 163 |
|
| 164 |
# --- 資料探索 ---
|
| 165 |
with gr.Tab("3️⃣ 資料集分析 (EDA)"):
    desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子", value="MolWt")
    eda_plot = gr.Image(label="分布圖")
    file3 = gr.File(label="選擇CSV")

    def render_desc_dist(f, d):
        """Plot the distribution of descriptor ``d`` for the uploaded CSV ``f``.

        Returns a blank white image while no file has been uploaded.
        """
        if not f:
            return Image.new("RGB", (400,200), (255,255,255))
        return plot_desc_dist(add_fps_and_desc(load_csv(f)), d)

    # A component's ``.value`` attribute is its construction-time value, not
    # what the user uploaded, so the live file must be passed as an input.
    # Both events share one handler: uploading a file or changing the
    # descriptor redraws the plot from the current state of both widgets.
    file3.upload(render_desc_dist, [file3, desc_type], eda_plot)
    desc_type.change(render_desc_dist, [file3, desc_type], eda_plot)
|
| 171 |
+
|
| 172 |
|
| 173 |
# --- 分群與PCA ---
|
| 174 |
with gr.Tab("4️⃣ 分群/降維可視化"):
|
|
|
|
| 177 |
pca_plot = gr.Image(label="PCA分佈")
|
| 178 |
km_plot = gr.Image(label="KMeans分群")
|
| 179 |
def render_pca(f):
    """Return the PCA scatter image for CSV ``f``, or a blank image if none."""
    if not f:
        return Image.new("RGB", (400,200), (255,255,255))
    return pca_2d(add_fps_and_desc(load_csv(f)))

def render_kmeans(f, n):
    """Return the KMeans cluster image for CSV ``f`` using ``n`` clusters."""
    if not f:
        return Image.new("RGB", (400,200), (255,255,255))
    # NOTE(review): nclus is defined outside this view; numeric Gradio inputs
    # can deliver floats, and KMeans needs an integer cluster count.
    return kmeans_clusters(add_fps_and_desc(load_csv(f)), int(n))

# The former ``file4.upload`` listener computed the features and discarded
# them (outputs=None); the ``change`` listener below already covers uploads.
file4.change(render_pca, file4, pca_plot)
# ``file4.value`` is only the construction-time value, so the uploaded file
# has to be passed to the handler as an input.
nclus.change(render_kmeans, [file4, nclus], km_plot)
|
| 182 |
|
| 183 |
# --- 建模/預測 ---
|
| 184 |
with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
|
|
|
|
| 187 |
smiles_pred = gr.Textbox(label="預測SMILES")
|
| 188 |
y_pred = gr.Textbox(label="預測值/類別")
|
| 189 |
def train_and_predict(f, s):
    """Train on the uploaded CSV and predict for a single SMILES string.

    Args:
        f: The uploaded CSV as delivered by the file component (falsy when
            nothing has been uploaded).
        s: SMILES string to predict for.

    Returns:
        A ``(status_message, prediction_text)`` tuple for the two textboxes.
    """
    if not f:
        return "請先上傳CSV", ""
    if not s or not s.strip():
        # Nothing to predict yet: skip the expensive training and
        # cross-validation and show the same prompt as after an upload.
        return "已載入, 請輸入SMILES進行預測", ""
    df = add_fps_and_desc(load_csv(f))
    model, scores = train_model(df)
    pred = predict_single(model, s)
    return f"模型交叉驗證: {np.round(scores,3)}", str(pred)

file5.upload(lambda f: "已載入, 請輸入SMILES進行預測" if f else "請上傳資料", file5, model_status)
# ``file5.value`` is the component's construction-time value, not the uploaded
# file, so the file component is passed as an input instead.
# NOTE(review): the model is retrained on every change of the SMILES box;
# consider caching the trained model per upload.
smiles_pred.change(train_and_predict, [file5, smiles_pred], [model_status, y_pred])
|
| 197 |
|
| 198 |
gr.Markdown("---\n> 建議工作流:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
|
| 199 |
|
| 200 |
+
|
| 201 |
demo.launch(share=True)
|