Spaces:
Sleeping
Sleeping
| # Cheminformatics 多功能平台 - 基礎版 | |
| # 主要涵蓋:分子資料導入、指紋/描述子生成、資料探索、分群、建模、特徵解釋、批量預測、可視化 | |
| # Author: 2025 | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| from rdkit import Chem | |
| from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors | |
| from rdkit.Chem import PandasTools | |
| from rdkit.Chem import rdMolDescriptors | |
| from sklearn.decomposition import PCA | |
| from sklearn.manifold import TSNE | |
| from umap import UMAP | |
| from sklearn.cluster import KMeans, DBSCAN | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import io | |
| from PIL import Image | |
| import chardet | |
| from ydata_profiling import ProfileReport | |
| # =========== Robust 多格式自動讀取 =========== | |
def load_table(file):
    """Load a molecule table from a CSV, Excel, or SDF file.

    Args:
        file: ``None``, a path string, or any object exposing the path via a
            ``name`` attribute (NOTE(review): presumably the Gradio upload
            wrapper -- confirm against the installed Gradio version).

    Returns:
        A ``pandas.DataFrame``; an empty one when ``file`` is ``None``.

    Raises:
        RuntimeError: if no path can be derived from ``file`` or the file
            extension is not one of .csv / .xlsx / .xls / .sdf.
    """
    if file is None:
        return pd.DataFrame()

    fname = file if isinstance(file, str) else getattr(file, "name", None)
    if fname is None:
        raise RuntimeError("不支援的 file 類型")

    # Match extensions case-insensitively so "DATA.CSV" is accepted as well.
    lowered = fname.lower()
    if lowered.endswith('.csv'):
        with open(fname, 'rb') as f:
            raw = f.read(4096)
        enc = chardet.detect(raw)['encoding'] or 'utf-8'
        # Only the first 4 KiB are sniffed; an all-ASCII sample says nothing
        # about the rest of the file. UTF-8 is a superset of ASCII, so
        # widening the guess cannot break a genuinely ASCII file.
        if enc.lower() == 'ascii':
            enc = 'utf-8'
        return pd.read_csv(fname, encoding=enc, engine='python')
    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(fname)
    if lowered.endswith('.sdf'):
        # Ask RDKit to emit a 'smiles' column: the feature and plotting
        # helpers in this file all look molecules up via df['smiles'].
        return PandasTools.LoadSDF(fname, smilesName='smiles')
    raise RuntimeError(f"不支援的檔案格式: {fname}")
| # =========== 批量分子圖 (前25) =========== | |
def batch_mol_imgs(smiles_list):
    """Render the first 25 entries of ``smiles_list`` as a molecule grid.

    Entries that are not strings (e.g. NaN cells) or that RDKit cannot parse
    are skipped, so the grid may hold fewer than 25 structures.

    Args:
        smiles_list: sliceable sequence of SMILES strings.

    Returns:
        A PIL image: a 5-per-row grid of 160x160 drawings, or a blank white
        800x160 image when nothing could be parsed.
    """
    mols = []
    for smi in smiles_list[:25]:
        # Non-string cells would make MolFromSmiles raise instead of
        # returning None, so filter them out before parsing.
        if not isinstance(smi, str):
            continue
        mol = Chem.MolFromSmiles(smi)  # parse once, reuse the result
        if mol is not None:
            mols.append(mol)
    if not mols:
        return Image.new("RGB", (800, 160), (255, 255, 255))

    grid = Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(160, 160))
    # Round-trip through PNG bytes so the caller always receives an image
    # opened by PIL, whatever object MolsToGridImage handed back.
    buf = io.BytesIO()
    grid.save(buf, format='PNG')
    buf.seek(0)
    return Image.open(buf)
| # =========== 指紋/描述子/官能基 =========== | |
def _split_smarts(smartbox):
    """Split a comma-separated SMARTS list, ignoring commas nested in [] or ()."""
    parts, current, depth = [], [], 0
    for ch in smartbox:
        if ch in '[(':
            depth += 1
        elif ch in '])':
            depth = max(depth - 1, 0)
        if ch == ',' and depth == 0:
            parts.append(''.join(current))
            current = []
        else:
            current.append(ch)
    parts.append(''.join(current))
    return [p.strip() for p in parts if p.strip()]


def calc_features(df, fp_types, desc_types, smartbox):
    """Add fingerprint, descriptor and functional-group columns to ``df``.

    ``df`` is modified in place and also returned. Each SMILES is parsed a
    single time; rows whose SMILES is missing or unparsable get a zero
    vector (fingerprints), NaN (descriptors) or 0 (functional-group counts).

    Args:
        df: DataFrame with a ``smiles`` column.
        fp_types: iterable drawn from ``'ecfp4'`` (Morgan radius 2, 2048
            bits), ``'maccs'`` and ``'rdkitfp'`` (maxPath=5); ``None`` is
            treated as empty.
        desc_types: names of functions in ``rdkit.Chem.Descriptors``; the
            name ``'LogP'`` is mapped to RDKit's ``MolLogP``. Unknown names
            are skipped. ``None`` is treated as empty.
        smartbox: comma-separated SMARTS patterns, or ``None``/empty. Pattern
            number *k* yields the column ``FG{k}_count``.

    Returns:
        The same ``df`` object with the requested columns added.
    """
    fp_types = fp_types or []
    desc_types = desc_types or []
    patterns = _split_smarts(smartbox) if smartbox else []
    if not (fp_types or desc_types or patterns):
        return df

    # Parse every SMILES exactly once; non-string cells (NaN) count as invalid.
    mols = [Chem.MolFromSmiles(s) if isinstance(s, str) else None
            for s in df['smiles']]

    # name -> (fingerprint builder, length of the zero vector for bad rows)
    fingerprints = {
        'ecfp4': (lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048), 2048),
        'maccs': (MACCSkeys.GenMACCSKeys, 167),
        'rdkitfp': (lambda m: Chem.RDKFingerprint(m, maxPath=5), 2048),
    }
    for name, (builder, n_bits) in fingerprints.items():
        if name not in fp_types:
            continue
        vectors = [np.array(builder(m)) if m is not None else np.zeros(n_bits)
                   for m in mols]
        # dtype=object keeps one array per cell instead of a 2-D expansion.
        df[name] = pd.Series(vectors, index=df.index, dtype=object)

    # The UI offers 'LogP', but RDKit names that descriptor 'MolLogP'.
    aliases = {'LogP': 'MolLogP'}
    for desc in desc_types:
        if not isinstance(desc, str):
            continue
        func = getattr(Descriptors, desc, None)
        if func is None and desc in aliases:
            func = getattr(Descriptors, aliases[desc], None)
        if func is None:
            continue
        try:
            values = [func(m) if m is not None else np.nan for m in mols]
        except Exception:
            # Best effort: a descriptor that fails is left out of the table.
            continue
        df[desc] = pd.Series(values, index=df.index)

    # SMARTS functional-group counts; an invalid pattern counts 0 everywhere.
    for idx, smt in enumerate(patterns):
        patt = Chem.MolFromSmarts(smt)
        counts = [
            len(m.GetSubstructMatches(patt))
            if (m is not None and patt is not None) else 0
            for m in mols
        ]
        df[f"FG{idx+1}_count"] = pd.Series(counts, index=df.index)
    return df
| # =========== EDA報表 & 單欄分布 =========== | |
def eda_report(df):
    """Write a minimal ydata-profiling HTML report for ``df``.

    Each call writes to its own temporary file, so reports generated for
    different requests cannot overwrite one another.

    Args:
        df: DataFrame to profile.

    Returns:
        Path of the generated ``.html`` file.
    """
    import os
    import tempfile

    profile = ProfileReport(df, title="EDA報告", minimal=True)
    fd, out = tempfile.mkstemp(prefix="eda_report_", suffix=".html")
    os.close(fd)  # only the unique name is needed; to_file reopens the path
    profile.to_file(out)
    return out
def plot_desc_dist(df, desc='MolWt'):
    """Draw a histogram with KDE overlay for one column of ``df``.

    Args:
        df: DataFrame, or ``None``.
        desc: column to plot.

    Returns:
        A PIL image of the plot, or a blank white 400x200 image when ``df``
        is ``None`` or lacks the column.
    """
    if df is None or desc not in df.columns:
        return Image.new("RGB", (400, 200), (255, 255, 255))

    fig, ax = plt.subplots(figsize=(5, 3))
    try:
        sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
        # Go through the Figure object rather than pyplot's "current
        # figure", which is global state shared by every request.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Release the figure even when plotting raises.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
| # =========== 降維/分群 & 群代表分子 =========== | |
def apply_dim_red(df, use, method='PCA'):
    """Project the vectors stored in column ``use`` down to two dimensions.

    Args:
        df: DataFrame whose ``use`` column holds one equal-length vector per
            row.
        use: name of the column to stack into the feature matrix.
        method: ``'PCA'``, ``'UMAP'`` or ``'tSNE'``.

    Returns:
        The 2-D embedding returned by the chosen estimator's
        ``fit_transform``.

    Raises:
        ValueError: unknown ``method``, or fewer than two rows.
    """
    if method not in ('PCA', 'UMAP', 'tSNE'):
        raise ValueError('Unknown method')

    X = np.stack(df[use].to_numpy())
    n_samples = X.shape[0]
    if n_samples < 2:
        # Fail with a clear message instead of an estimator-specific error.
        raise ValueError('At least 2 samples are required for dimensionality reduction')

    if method == 'PCA':
        return PCA(n_components=2).fit_transform(X)
    if method == 'UMAP':
        n_neighbors = min(15, n_samples - 1)
        return UMAP(n_components=2, random_state=42, n_neighbors=n_neighbors).fit_transform(X)

    # scikit-learn requires perplexity < n_samples; the extra n_samples - 1
    # bound only changes the value for n_samples == 2.
    perplexity = min(30, max(2, (n_samples - 1) // 2), n_samples - 1)
    return TSNE(n_components=2, perplexity=perplexity, random_state=42).fit_transform(X)
def plot_scatter(pc, labels, title):
    """Scatter-plot a 2-D embedding coloured by cluster label.

    Args:
        pc: array indexed as ``pc[:, 0]`` / ``pc[:, 1]`` for the two axes.
        labels: one colour value per point.
        title: plot title.

    Returns:
        A PIL image of the rendered plot.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    try:
        scatter = ax.scatter(pc[:, 0], pc[:, 1], c=labels, cmap='tab10', alpha=0.7)
        # Use the Axes/Figure objects directly rather than pyplot's
        # "current figure", which is global state shared by every request.
        ax.set_xlabel("Dim1")
        ax.set_ylabel("Dim2")
        ax.set_title(title)
        fig.colorbar(scatter, ax=ax)
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Release the figure even when plotting raises.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def cluster_reps(df, labels, use):
    """Pick one randomly chosen SMILES per cluster label.

    Args:
        df: DataFrame with a ``smiles`` column, one row per label.
        labels: cluster label for each row of ``df``.
        use: accepted but not read by this function.

    Returns:
        List of SMILES, one per distinct label, ordered as ``np.unique``
        orders the labels.
    """
    label_arr = np.array(labels)
    picked = []
    for cluster_id in np.unique(label_arr):
        members = df[label_arr == cluster_id]
        if len(members) == 0:
            continue
        # Draw a single row index at random from this cluster.
        chosen = np.random.choice(members.index, 1)[0]
        picked.append(members.loc[chosen]['smiles'])
    return picked
| # 圖像自動說明函數 | |
| # 針對描述子分布圖的簡易解釋 | |
| def explain_desc_dist(df, desc): | |
| if df is None or desc not in df.columns: | |
| return "請先上傳檔案並選擇描述子。" | |
| n = df.shape[0] | |
| mean = df[desc].mean() | |
| std = df[desc].std() | |
| minv = df[desc].min() | |
| maxv = df[desc].max() | |
| msg = ( | |
| f"這張圖顯示目前資料集裡「{desc}」這個特徵的分布情況。\n\n" | |
| f"總共有 {n} 筆資料。數值大多集中在平均值 {mean:.2f} 左右,標準差是 {std:.2f}。\n" | |
| f"最低值 {minv:.2f},最高值 {maxv:.2f}。圖中高峰的位置就是資料最集中的範圍。\n" | |
| "這樣的分布圖可以幫助你判斷資料是不是有極端值,或是大多數分子都屬於哪個範圍。\n\n" | |
| "【應用案例】\n" | |
| "比方說,如果你正在開發一款新藥或新材料,可以藉由分布圖觀察目標分子的分子量、極性(TPSA)是否和現有產品類似。" | |
| "如果發現有極端高或低的分子量,可能要特別注意這些分子在後續應用上的行為,例如溶解度、吸收性等。\n" | |
| "這樣的分析常被用於藥物篩選(藥物發現)、高分子材料設計,甚至協助判斷是否有「資料異常」需要清理。" | |
| ) | |
| return msg | |
| # 針對降維與分群圖的簡易解釋 | |
def explain_dimred(pc, labels, dr, cl):
    """Compose the user-facing explanation for the embedding/cluster plot.

    Args:
        pc: the 2-D embedding; accepted for interface compatibility and not
            read here.
        labels: cluster label per point (array or plain sequence).
        dr: name of the dimensionality-reduction method shown in the text.
        cl: name of the clustering method shown in the text.

    Returns:
        A text reporting the number of distinct labels and the size of each
        label group.
    """
    labels = np.asarray(labels)
    unique_labels, counts = np.unique(labels, return_counts=True)
    n_cls = len(unique_labels)
    # Convert to built-in ints so the dict prints as "{0: 2, 1: 1}" and not
    # as "{0: np.int64(2), ...}" under NumPy 2.
    label_cnt = {int(lbl): int(cnt) for lbl, cnt in zip(unique_labels, counts)}
    msg = (
        f"這張圖是把分子資料用 {dr} 方法做降維,再用 {cl} 方法分群得到的。\n\n"
        f"每個點代表一個分子,不同顏色表示不同的群組。這次總共分成 {n_cls} 群。\n"
        f"每群的分子數量分別是:{label_cnt}\n"
        "如果看到有明顯分群,代表這些分子之間可能有某些共通的特徵。\n"
        "點跟點之間越近,代表它們在特徵上也越像。如果有單獨一兩個點很遠,那可能就是所謂的離群值(和其他分子差異較大)。\n\n"
        "【應用案例】\n"
        "假設你是一位藥物化學家,可以利用這張圖來判斷哪些分子屬於同一類型(例如同一類藥效分子),快速發現群內外的差異。"
        "如果你是在開發新材料,可以找出和現有材料組成最相近或最不一樣的分子,有助於找出潛力新材料。\n"
        "此外,如果發現某個群組特別小或特別大,也可以針對這些群組進行更細緻的分析,例如針對特殊族群做後續的結構設計。"
    )
    return msg
| # =========== Gradio 主 UI =========== | |
# Gradio UI: four tabs wired to the helper functions defined above.
with gr.Blocks(title="Cheminformatics Platform") as demo:
    gr.Markdown("# 🧪 Cheminformatics 多功能平台")

    # Tab 1: data import and structure grid
    with gr.Tab("1️⃣ 資料導入/結構圖"):
        up = gr.File(label="上傳分子檔 (csv/xlsx/sdf)", file_types=[".csv", ".xlsx", ".sdf"])
        df_view = gr.Dataframe(label="資料預覽 (前15筆)")
        mol_grid = gr.Image(label="分子結構圖(前25筆)")

        def preview_upload(f):
            """Return (first 15 rows, structure grid) from a single file load."""
            if not f:
                return pd.DataFrame(), None
            table = load_table(f)
            # Without a 'smiles' column there is nothing to draw; still show
            # the table preview instead of failing the whole event.
            grid = None
            if 'smiles' in table.columns:
                grid = batch_mol_imgs(table['smiles'].values[:25])
            return table.head(15), grid

        up.upload(preview_upload, up, [df_view, mol_grid])

    # Tab 2: fingerprints / descriptors / functional groups
    with gr.Tab("2️⃣ 特徵/描述子/官能基計算"):
        file2 = gr.File(label="選擇分子檔")
        fp_types = gr.CheckboxGroup(['ecfp4','maccs','rdkitfp'], label="指紋", value=['ecfp4'])
        desc_types = gr.CheckboxGroup(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="描述子")
        func_smart = gr.Textbox(label="官能基SMARTS(逗號分隔)", placeholder="[N+](=O)[O-], [OX2H]")
        feat_preview = gr.Dataframe(label="特徵/描述子預覽(前10筆)")

        def calc_all_feats(file, fp, desc, smartbox):
            """Compute the selected features and return the first 10 rows."""
            if file is None:
                # No upload yet: show an empty preview rather than crashing.
                return pd.DataFrame()
            df = load_table(file)
            df = calc_features(df, fp, desc, smartbox)
            return df.head(10)

        gr.Button("特徵/官能基計算", variant="primary").click(
            calc_all_feats, [file2, fp_types, desc_types, func_smart], feat_preview
        )

    # Tab 3: EDA plots and automatic report
    with gr.Tab("3️⃣ EDA分析/自動報表"):
        file3 = gr.File(label="分子檔")
        col_sel = gr.Dropdown(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="欄位")
        eda_img = gr.Image(label="描述子分布圖")
        eda_desc = gr.Markdown(label="圖像說明")
        eda_btn = gr.Button("產生分布圖")

        def show_eda_img_with_explain(file, col):
            """Return (histogram image, explanation text) for one descriptor."""
            df = None
            if file is not None:
                # Only the selected descriptor is needed for a histogram, so
                # no fingerprints are requested here.
                df = calc_features(load_table(file), [], [col] if col else [], None)
            # Both helpers handle df=None / a missing column themselves.
            img = plot_desc_dist(df, col)
            txt = explain_desc_dist(df, col)
            return img, txt

        eda_btn.click(
            show_eda_img_with_explain, [file3, col_sel], [eda_img, eda_desc]
        )
        eda_sum = gr.File(label="下載EDA報表")
        gr.Button("生成EDA報表", variant="primary").click(
            lambda f: eda_report(load_table(f)) if f else None, file3, eda_sum
        )

    # Tab 4: dimensionality reduction, clustering, cluster representatives
    with gr.Tab("4️⃣ 降維/分群/結構探索"):
        file4 = gr.File(label="分子檔")
        use_fp = gr.Dropdown(['ecfp4','maccs','rdkitfp'], label="降維指紋", value="ecfp4")
        dr_method = gr.Radio(['PCA','UMAP','tSNE'], label="降維方法", value="PCA")
        cl_method = gr.Radio(['KMeans','DBSCAN'], label="分群方法", value="KMeans")
        nclus = gr.Slider(2, 8, 3, 1, label="KMeans分群數")
        dr_img = gr.Image(label="降維/分群視覺化")
        dr_desc = gr.Markdown(label="圖像說明")
        rep_imgs = gr.Image(label="群代表分子圖(每群1個)")

        def dimred_and_cluster(file, fp, dr, cl, nclu):
            """Return (scatter plot, explanation, representative-molecule grid)."""
            if file is None:
                return None, "請先上傳分子檔。", None
            df = load_table(file)
            df = calc_features(df, [fp], [], None)
            pc = apply_dim_red(df, fp, dr)
            if cl == 'KMeans':
                # KMeans rejects n_clusters greater than the number of samples.
                n_clusters = min(int(nclu), len(df))
                labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(pc)
            else:
                labels = DBSCAN(eps=3, min_samples=2).fit_predict(pc)
            plotimg = plot_scatter(pc, labels, f"{dr}-{cl}")
            desc = explain_dimred(pc, labels, dr, cl)
            reps = cluster_reps(df, labels, fp)
            rep_img = batch_mol_imgs(reps)
            return plotimg, desc, rep_img

        gr.Button("降維+分群分析", variant="primary").click(
            dimred_and_cluster, [file4, use_fp, dr_method, cl_method, nclus], [dr_img, dr_desc, rep_imgs]
        )

    gr.Markdown("---\n> 完整工作流:1️⃣資料導入 → 2️⃣特徵/描述子/官能基 → 3️⃣EDA分析 → 4️⃣降維/分群/結構探索")
demo.launch(share=True)