Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,7 +17,7 @@ import io
|
|
| 17 |
from PIL import Image
|
| 18 |
import chardet
|
| 19 |
|
| 20 |
-
# =========== 功能1: 分子資料導入/轉換 ===========
|
| 21 |
def robust_read_csv(file):
|
| 22 |
if file is None:
|
| 23 |
return pd.DataFrame()
|
|
@@ -35,13 +35,10 @@ def robust_read_csv(file):
|
|
| 35 |
else:
|
| 36 |
raise RuntimeError("未知 file 類型")
|
| 37 |
|
| 38 |
-
|
| 39 |
def load_csv(file):
|
| 40 |
-
# 讀取CSV,要求有 smiles 與 label 欄位
|
| 41 |
df = robust_read_csv(file)
|
| 42 |
if not {'smiles','label'}.issubset(df.columns):
|
| 43 |
raise ValueError("CSV需包含'smiles','label'欄位")
|
| 44 |
-
# 統一SMILES格式
|
| 45 |
df['smiles'] = df['smiles'].astype(str)
|
| 46 |
return df
|
| 47 |
|
|
@@ -51,7 +48,7 @@ def mol_img(smiles, size=(160,160)):
|
|
| 51 |
return Image.new("RGB", size, (250,250,250))
|
| 52 |
return Draw.MolToImage(mol, size=size)
|
| 53 |
|
| 54 |
-
# =========== 功能2: 分子指紋/描述子生成 ===========
|
| 55 |
def ecfp4_fp(smiles, nbits=2048):
|
| 56 |
mol = Chem.MolFromSmiles(smiles)
|
| 57 |
return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)) if mol else np.zeros(nbits)
|
|
@@ -66,14 +63,17 @@ def calc_rdkit_desc(smiles):
|
|
| 66 |
return {n: f(mol) for n, f in Descriptors.descList}
|
| 67 |
|
| 68 |
def add_fps_and_desc(df):
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
return df
|
| 74 |
|
| 75 |
-
|
| 76 |
-
# =========== 功能3: 資料集探索分析 (EDA) ============
|
| 77 |
def plot_desc_dist(df, desc='MolWt'):
|
| 78 |
if df is None or desc not in df.columns:
|
| 79 |
return Image.new("RGB", (400,200), (255,255,255))
|
|
@@ -87,7 +87,7 @@ def plot_desc_dist(df, desc='MolWt'):
|
|
| 87 |
plt.close(fig)
|
| 88 |
return Image.open(buf)
|
| 89 |
|
| 90 |
-
# =========== 功能4: 分群/降維可視化 ===========
|
| 91 |
def pca_2d(df, use='ecfp4'):
|
| 92 |
if df is None or use not in df.columns:
|
| 93 |
return Image.new("RGB", (400,200), (255,255,255))
|
|
@@ -124,7 +124,7 @@ def kmeans_clusters(df, n_clusters=3, use='ecfp4'):
|
|
| 124 |
plt.close(fig)
|
| 125 |
return Image.open(buf)
|
| 126 |
|
| 127 |
-
# =========== 功能5: 機器學習建模與預測 ===========
|
| 128 |
def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
|
| 129 |
X = np.stack(df[fp_type].to_numpy())
|
| 130 |
y = df['label'].values
|
|
@@ -141,60 +141,76 @@ def predict_single(model, smiles, fp_type='ecfp4'):
|
|
| 141 |
y_pred = model.predict([fp])[0]
|
| 142 |
return y_pred
|
| 143 |
|
| 144 |
-
# =========== Gradio主UI ===========
|
| 145 |
-
|
| 146 |
with gr.Blocks(title="Cheminformatics Platform") as demo:
|
| 147 |
gr.Markdown("# 🧪 Cheminformatics 多功能分析平台")
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
# --- 分子資料導入 ---
|
| 150 |
with gr.Tab("1️⃣ 資料導入/結構圖"):
|
| 151 |
file = gr.File(label="上傳CSV", file_types=[".csv"])
|
| 152 |
df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
|
| 153 |
smiles_input = gr.Textbox(label="分子SMILES預覽")
|
| 154 |
mol_image = gr.Image(label="分子結構圖")
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
|
| 157 |
-
|
| 158 |
# --- 分子特徵生成 ---
|
| 159 |
with gr.Tab("2️⃣ 特徵計算/描述子"):
|
| 160 |
-
|
| 161 |
feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
# --- 資料探索 ---
|
| 165 |
with gr.Tab("3️⃣ 資料集分析 (EDA)"):
|
| 166 |
desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子", value="MolWt")
|
|
|
|
| 167 |
eda_plot = gr.Image(label="分布圖")
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
desc_type.change(lambda d: plot_desc_dist(add_fps_and_desc(load_csv(file3.value)), d) if file3.value else Image.new("RGB", (400,200), (255,255,255)), desc_type, eda_plot)
|
| 171 |
|
| 172 |
-
|
| 173 |
# --- 分群與PCA ---
|
| 174 |
with gr.Tab("4️⃣ 分群/降維可視化"):
|
| 175 |
-
|
| 176 |
-
nclus = gr.Slider(2,8,3,1,label="分群數")
|
| 177 |
pca_plot = gr.Image(label="PCA分佈")
|
|
|
|
|
|
|
| 178 |
km_plot = gr.Image(label="KMeans分群")
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
# --- 建模/預測 ---
|
| 184 |
with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
|
| 185 |
-
|
| 186 |
model_status = gr.Markdown("模型狀態")
|
| 187 |
smiles_pred = gr.Textbox(label="預測SMILES")
|
| 188 |
y_pred = gr.Textbox(label="預測值/類別")
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
df = add_fps_and_desc(load_csv(f))
|
| 195 |
-
model, scores = train_model(df)
|
| 196 |
return f"模型交叉驗證: {np.round(scores,3)}", model
|
| 197 |
-
|
|
|
|
|
|
|
| 198 |
def handle_predict(s, model):
|
| 199 |
if model is None:
|
| 200 |
return "請先訓練模型", ""
|
|
@@ -203,11 +219,9 @@ with gr.Blocks(title="Cheminformatics Platform") as demo:
|
|
| 203 |
return "已預測", str(pred)
|
| 204 |
except Exception as e:
|
| 205 |
return f"預測失敗: {e}", ""
|
| 206 |
-
|
| 207 |
-
file5.upload(handle_train, file5, [model_status, model_state])
|
| 208 |
smiles_pred.change(handle_predict, [smiles_pred, model_state], [model_status, y_pred])
|
| 209 |
-
|
| 210 |
-
gr.Markdown("---\n> 建議工作流:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
|
| 211 |
|
|
|
|
| 212 |
|
| 213 |
demo.launch(share=True)
|
|
|
|
| 17 |
from PIL import Image
|
| 18 |
import chardet
|
| 19 |
|
| 20 |
+
# =========== 功能1: 分子資料導入/轉換 ===========
|
| 21 |
def robust_read_csv(file):
|
| 22 |
if file is None:
|
| 23 |
return pd.DataFrame()
|
|
|
|
| 35 |
else:
|
| 36 |
raise RuntimeError("未知 file 類型")
|
| 37 |
|
|
|
|
| 38 |
def load_csv(file):
|
|
|
|
| 39 |
df = robust_read_csv(file)
|
| 40 |
if not {'smiles','label'}.issubset(df.columns):
|
| 41 |
raise ValueError("CSV需包含'smiles','label'欄位")
|
|
|
|
| 42 |
df['smiles'] = df['smiles'].astype(str)
|
| 43 |
return df
|
| 44 |
|
|
|
|
| 48 |
return Image.new("RGB", size, (250,250,250))
|
| 49 |
return Draw.MolToImage(mol, size=size)
|
| 50 |
|
| 51 |
+
# =========== 功能2: 分子指紋/描述子生成 ===========
|
| 52 |
def ecfp4_fp(smiles, nbits=2048):
|
| 53 |
mol = Chem.MolFromSmiles(smiles)
|
| 54 |
return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)) if mol else np.zeros(nbits)
|
|
|
|
| 63 |
return {n: f(mol) for n, f in Descriptors.descList}
|
| 64 |
|
| 65 |
def add_fps_and_desc(df):
|
| 66 |
+
if 'ecfp4' not in df.columns:
|
| 67 |
+
df['ecfp4'] = df['smiles'].apply(ecfp4_fp)
|
| 68 |
+
if 'maccs' not in df.columns:
|
| 69 |
+
df['maccs'] = df['smiles'].apply(maccs_fp)
|
| 70 |
+
if 'MolWt' not in df.columns:
|
| 71 |
+
df['MolWt'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('MolWt', np.nan))
|
| 72 |
+
if 'TPSA' not in df.columns:
|
| 73 |
+
df['TPSA'] = df['smiles'].apply(lambda s: calc_rdkit_desc(s).get('TPSA', np.nan))
|
| 74 |
return df
|
| 75 |
|
| 76 |
+
# =========== 功能3: 資料集探索分析 (EDA) ===========
|
|
|
|
| 77 |
def plot_desc_dist(df, desc='MolWt'):
|
| 78 |
if df is None or desc not in df.columns:
|
| 79 |
return Image.new("RGB", (400,200), (255,255,255))
|
|
|
|
| 87 |
plt.close(fig)
|
| 88 |
return Image.open(buf)
|
| 89 |
|
| 90 |
+
# =========== 功能4: 分群/降維可視化 ===========
|
| 91 |
def pca_2d(df, use='ecfp4'):
|
| 92 |
if df is None or use not in df.columns:
|
| 93 |
return Image.new("RGB", (400,200), (255,255,255))
|
|
|
|
| 124 |
plt.close(fig)
|
| 125 |
return Image.open(buf)
|
| 126 |
|
| 127 |
+
# =========== 功能5: 機器學習建模與預測 ===========
|
| 128 |
def train_model(df, fp_type='ecfp4', model_type='rf', task='auto'):
|
| 129 |
X = np.stack(df[fp_type].to_numpy())
|
| 130 |
y = df['label'].values
|
|
|
|
| 141 |
y_pred = model.predict([fp])[0]
|
| 142 |
return y_pred
|
| 143 |
|
| 144 |
+
# =========== Gradio主UI ===========
|
|
|
|
| 145 |
with gr.Blocks(title="Cheminformatics Platform") as demo:
|
| 146 |
gr.Markdown("# 🧪 Cheminformatics 多功能分析平台")
|
| 147 |
+
|
| 148 |
+
# 全域狀態:原始資料、特徵後資料、模型
|
| 149 |
+
data_state = gr.State()
|
| 150 |
+
feat_state = gr.State()
|
| 151 |
+
model_state = gr.State()
|
| 152 |
+
|
| 153 |
# --- 分子資料導入 ---
|
| 154 |
with gr.Tab("1️⃣ 資料導入/結構圖"):
|
| 155 |
file = gr.File(label="上傳CSV", file_types=[".csv"])
|
| 156 |
df_preview = gr.Dataframe(label="資料預覽 (前10筆)")
|
| 157 |
smiles_input = gr.Textbox(label="分子SMILES預覽")
|
| 158 |
mol_image = gr.Image(label="分子結構圖")
|
| 159 |
+
|
| 160 |
+
def on_upload(f):
|
| 161 |
+
df = load_csv(f)
|
| 162 |
+
return df.head(10), df
|
| 163 |
+
|
| 164 |
+
file.upload(on_upload, file, [df_preview, data_state])
|
| 165 |
smiles_input.change(lambda s: mol_img(s), smiles_input, mol_image)
|
| 166 |
+
|
| 167 |
# --- 分子特徵生成 ---
|
| 168 |
with gr.Tab("2️⃣ 特徵計算/描述子"):
|
| 169 |
+
feat_btn = gr.Button("生成特徵/描述子")
|
| 170 |
feat_preview = gr.Dataframe(label="特徵/描述子預覽 (前5筆)")
|
| 171 |
+
|
| 172 |
+
def on_feat(state_df):
|
| 173 |
+
if state_df is None:
|
| 174 |
+
return pd.DataFrame(), None
|
| 175 |
+
feat_df = add_fps_and_desc(state_df.copy())
|
| 176 |
+
return feat_df.head(5), feat_df
|
| 177 |
+
|
| 178 |
+
feat_btn.click(on_feat, data_state, [feat_preview, feat_state])
|
| 179 |
+
|
| 180 |
# --- 資料探索 ---
|
| 181 |
with gr.Tab("3️⃣ 資料集分析 (EDA)"):
|
| 182 |
desc_type = gr.Dropdown(['MolWt', 'TPSA'], label="選擇描述子", value="MolWt")
|
| 183 |
+
eda_btn = gr.Button("生成描述子分布圖")
|
| 184 |
eda_plot = gr.Image(label="分布圖")
|
| 185 |
+
eda_btn.click(lambda d, feat_df: plot_desc_dist(feat_df, d) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)),
|
| 186 |
+
[desc_type, feat_state], eda_plot)
|
|
|
|
| 187 |
|
|
|
|
| 188 |
# --- 分群與PCA ---
|
| 189 |
with gr.Tab("4️⃣ 分群/降維可視化"):
|
| 190 |
+
pca_btn = gr.Button("PCA 分布圖")
|
|
|
|
| 191 |
pca_plot = gr.Image(label="PCA分佈")
|
| 192 |
+
nclus = gr.Slider(2, 8, 3, 1, label="分群數")
|
| 193 |
+
km_btn = gr.Button("KMeans 分群圖")
|
| 194 |
km_plot = gr.Image(label="KMeans分群")
|
| 195 |
+
pca_btn.click(lambda feat_df: pca_2d(feat_df) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)), feat_state, pca_plot)
|
| 196 |
+
km_btn.click(lambda n, feat_df: kmeans_clusters(feat_df, n) if feat_df is not None else Image.new("RGB", (400,200), (255,255,255)),
|
| 197 |
+
[nclus, feat_state], km_plot)
|
| 198 |
+
|
| 199 |
# --- 建模/預測 ---
|
| 200 |
with gr.Tab("5️⃣ 建模/交叉驗證/預測"):
|
| 201 |
+
train_btn = gr.Button("訓練模型 (RF, 5-fold)")
|
| 202 |
model_status = gr.Markdown("模型狀態")
|
| 203 |
smiles_pred = gr.Textbox(label="預測SMILES")
|
| 204 |
y_pred = gr.Textbox(label="預測值/類別")
|
| 205 |
+
|
| 206 |
+
def handle_train(feat_df):
|
| 207 |
+
if feat_df is None:
|
| 208 |
+
return "請先進行特徵生成", None
|
| 209 |
+
model, scores = train_model(feat_df)
|
|
|
|
|
|
|
| 210 |
return f"模型交叉驗證: {np.round(scores,3)}", model
|
| 211 |
+
|
| 212 |
+
train_btn.click(handle_train, feat_state, [model_status, model_state])
|
| 213 |
+
|
| 214 |
def handle_predict(s, model):
|
| 215 |
if model is None:
|
| 216 |
return "請先訓練模型", ""
|
|
|
|
| 219 |
return "已預測", str(pred)
|
| 220 |
except Exception as e:
|
| 221 |
return f"預測失敗: {e}", ""
|
| 222 |
+
|
|
|
|
| 223 |
smiles_pred.change(handle_predict, [smiles_pred, model_state], [model_status, y_pred])
|
|
|
|
|
|
|
| 224 |
|
| 225 |
+
gr.Markdown("---\n> 建議完整流程:1️⃣資料導入 → 2️⃣特徵生成 → 3️⃣EDA探索 → 4️⃣分群 → 5️⃣建模預測")
|
| 226 |
|
| 227 |
demo.launch(share=True)
|