# Cheminformatics multi-purpose platform - basic edition
# Covers: molecule data import, fingerprint/descriptor generation, data
# exploration, clustering, modelling, feature explanation, batch prediction
# and visualisation.
# Author: 2025
import io
import logging
import tempfile

import chardet
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from ydata_profiling import ProfileReport

logger = logging.getLogger(__name__)

# Choices offered by the UI; shared by several tabs so they cannot drift apart.
FINGERPRINT_CHOICES = ['ecfp4', 'maccs', 'rdkitfp']
DESCRIPTOR_CHOICES = ['MolWt', 'TPSA', 'NumHDonors', 'NumHAcceptors', 'LogP']

# UI descriptor names that differ from the attribute name on
# rdkit.Chem.Descriptors. Without this mapping 'LogP' is silently skipped.
_DESCRIPTOR_ALIASES = {'LogP': 'MolLogP'}

# name -> (fingerprint function, length of the all-zero fallback vector).
# Dict order is the order in which columns are added to the frame.
_FINGERPRINTS = {
    'ecfp4': (lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048), 2048),
    'maccs': (MACCSkeys.GenMACCSKeys, 167),
    'rdkitfp': (lambda m: Chem.RDKFingerprint(m, maxPath=5), 2048),
}


def _mol_from_smiles(smiles):
    """Parse *smiles* with RDKit; return None for non-string or unparsable input."""
    # Missing cells arrive as float NaN, which the RDKit parser rejects with an
    # exception instead of returning None.
    if not isinstance(smiles, str):
        return None
    return Chem.MolFromSmiles(smiles)


def _split_smarts(text):
    """Split *text* on commas that are outside square brackets.

    Inside an atom expression such as ``[N,O]`` the comma is part of the
    SMARTS pattern, so only commas at bracket depth zero act as separators.
    Empty items are dropped and surrounding whitespace is stripped.
    """
    parts = []
    current = []
    depth = 0
    for ch in text:
        if ch == '[':
            depth += 1
        elif ch == ']':
            depth = max(0, depth - 1)
        if ch == ',' and depth == 0:
            parts.append(''.join(current))
            current = []
        else:
            current.append(ch)
    parts.append(''.join(current))
    return [p.strip() for p in parts if p.strip()]


def _fig_to_image(fig):
    """Render *fig* to PNG in memory, close it, and return a PIL image."""
    buf = io.BytesIO()
    try:
        fig.tight_layout()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even when rendering fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)


# =========== Robust multi-format loader ===========
def load_table(file):
    """Load a molecule table from a CSV, Excel or SDF file.

    Args:
        file: ``None``, a path string, or an object exposing the path through
            a ``name`` attribute.

    Returns:
        A DataFrame; empty when *file* is ``None``. For SDF input a
        ``smiles`` column is derived from the ``ROMol`` column when the file
        does not already provide one.

    Raises:
        RuntimeError: if no path can be obtained from *file* or the extension
            is not one of .csv / .xlsx / .xls / .sdf.
    """
    if file is None:
        return pd.DataFrame()
    fname = file if isinstance(file, str) else getattr(file, "name", None)
    if fname is None:
        raise RuntimeError("不支援的 file 類型")

    lowered = fname.lower()  # accept upper-case extensions such as ".CSV"
    if lowered.endswith('.csv'):
        with open(fname, 'rb') as f:
            raw = f.read(4096)
        enc = chardet.detect(raw)['encoding'] or 'utf-8'
        # Only the first 4 KiB are sniffed; an "ascii" guess would fail on
        # non-ASCII bytes further down. UTF-8 decodes pure ASCII identically.
        if enc.lower() == 'ascii':
            enc = 'utf-8'
        return pd.read_csv(fname, encoding=enc, engine='python')
    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(fname)
    if lowered.endswith('.sdf'):
        df = PandasTools.LoadSDF(fname)
        # The rest of the app reads df['smiles'].
        if 'smiles' not in df.columns and 'ROMol' in df.columns:
            df['smiles'] = df['ROMol'].apply(Chem.MolToSmiles)
        return df
    raise RuntimeError(f"不支援的檔案格式: {fname}")


# =========== Batch structure grid (first 25) ===========
def batch_mol_imgs(smiles_list):
    """Draw a grid image of the parsable molecules among the first 25 SMILES.

    Returns a blank white 800x160 image when none of them can be parsed.
    """
    parsed = (_mol_from_smiles(s) for s in smiles_list[:25])
    mols = [m for m in parsed if m]
    if not mols:
        return Image.new("RGB", (800, 160), (255, 255, 255))
    grid = Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(160, 160))
    buf = io.BytesIO()
    grid.save(buf, format='PNG')
    buf.seek(0)
    return Image.open(buf)


# =========== Fingerprints / descriptors / functional groups ===========
def calc_features(df, fp_types, desc_types, smartbox):
    """Add fingerprint, descriptor and functional-group columns to *df*.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``smiles`` column.
        fp_types: fingerprint names among 'ecfp4', 'maccs', 'rdkitfp'; each
            becomes a column of 1-D arrays (all zeros for unparsable SMILES).
        desc_types: names of functions on ``rdkit.Chem.Descriptors`` ('LogP'
            is mapped to 'MolLogP'); unknown names are skipped, unparsable
            SMILES yield NaN.
        smartbox: comma-separated SMARTS patterns, or a falsy value. Pattern
            number k produces the match-count column ``FG{k}_count``; an
            invalid pattern yields a column of zeros.

    Raises:
        KeyError: if *df* has no ``smiles`` column.
    """
    if 'smiles' not in df.columns:
        raise KeyError("資料表缺少 'smiles' 欄位")

    # Parse every SMILES exactly once and reuse the result for all features.
    mols = df['smiles'].apply(_mol_from_smiles)

    def per_molecule(func, fallback):
        # isinstance (rather than "is not None") also rejects NaN placeholders.
        return mols.apply(
            lambda m: func(m) if isinstance(m, Chem.Mol) else fallback())

    requested_fps = fp_types or []
    for name, (fp_func, n_bits) in _FINGERPRINTS.items():
        if name in requested_fps:
            df[name] = per_molecule(
                lambda m, fp_func=fp_func: np.array(fp_func(m)),
                lambda n_bits=n_bits: np.zeros(n_bits))

    for desc in desc_types or []:
        if not isinstance(desc, str):
            continue
        func = getattr(Descriptors, _DESCRIPTOR_ALIASES.get(desc, desc), None)
        if not callable(func):
            continue
        try:
            df[desc] = per_molecule(func, lambda: np.nan)
        except Exception:
            # Best effort: one failing descriptor must not abort the others.
            logger.warning("Skipping descriptor %r", desc, exc_info=True)

    # SMARTS functional groups
    if smartbox:
        for idx, smt in enumerate(_split_smarts(smartbox)):
            column = f"FG{idx+1}_count"
            patt = Chem.MolFromSmarts(smt)
            if patt is None:
                df[column] = 0
            else:
                df[column] = per_molecule(
                    lambda m, patt=patt: len(m.GetSubstructMatches(patt)),
                    lambda: 0)
    return df


# =========== EDA report & single-column distribution ===========
def eda_report(df):
    """Write a minimal profiling report for *df* to an HTML file; return its path."""
    profile = ProfileReport(df, title="EDA報告", minimal=True)
    # A unique file per call: portable across operating systems and safe when
    # several sessions generate reports at the same time.
    with tempfile.NamedTemporaryFile(
            prefix="eda_report_", suffix=".html", delete=False) as tmp:
        out = tmp.name
    profile.to_file(out)
    return out


def plot_desc_dist(df, desc='MolWt'):
    """Return a histogram (with KDE) image of column *desc*.

    Returns a blank white 400x200 image when *df* is None or lacks *desc*.
    """
    if df is None or desc not in df.columns:
        return Image.new("RGB", (400, 200), (255, 255, 255))
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
    return _fig_to_image(fig)


# =========== Dimensionality reduction / clustering / representatives ===========
def apply_dim_red(df, use, method='PCA'):
    """Project the vectors stored in column *use* onto two dimensions.

    Args:
        df: DataFrame whose column *use* holds equal-length 1-D arrays.
        use: name of that column.
        method: 'PCA', 'UMAP' or 'tSNE'.

    Returns:
        The 2-column embedding produced by the chosen estimator.

    Raises:
        ValueError: for an unknown *method* or fewer than two rows.
    """
    if len(df) < 2:
        raise ValueError('At least 2 molecules are required for dimensionality reduction')
    X = np.stack(df[use].to_numpy())
    n_samples = X.shape[0]
    if method == 'PCA':
        return PCA(n_components=2).fit_transform(X)
    if method == 'UMAP':
        n_neighbors = min(15, n_samples - 1)
        reducer = UMAP(n_components=2, random_state=42, n_neighbors=n_neighbors)
        return reducer.fit_transform(X)
    if method == 'tSNE':
        perplexity = min(30, max(2, (n_samples - 1) // 2))
        # scikit-learn requires perplexity < n_samples (matters for n == 2).
        perplexity = min(perplexity, n_samples - 1)
        reducer = TSNE(n_components=2, perplexity=perplexity, random_state=42)
        return reducer.fit_transform(X)
    raise ValueError('Unknown method')


def plot_scatter(pc, labels, title):
    """Return a scatter-plot image of the embedding *pc* coloured by *labels*."""
    fig, ax = plt.subplots(figsize=(5, 4))
    scatter = ax.scatter(pc[:, 0], pc[:, 1], c=labels, cmap='tab10', alpha=0.7)
    ax.set_xlabel("Dim1")
    ax.set_ylabel("Dim2")
    ax.set_title(title)
    fig.colorbar(scatter, ax=ax)
    return _fig_to_image(fig)


def cluster_reps(df, labels, use):
    """Pick one random SMILES per distinct label.

    Args:
        df: DataFrame with a ``smiles`` column, row-aligned with *labels*.
        labels: one cluster label per row.
        use: unused; kept so existing callers keep working.

    Returns:
        A list of SMILES, one per distinct label in sorted label order.
    """
    labels = np.array(labels)
    smiles = df['smiles']
    reps = []
    for cl in np.unique(labels):
        positions = np.flatnonzero(labels == cl)
        if len(positions) > 0:
            # Positional access stays a scalar even with duplicate index labels.
            reps.append(smiles.iloc[np.random.choice(positions)])
    return reps


# Automatic figure explanations
# Plain-language explanation of the descriptor distribution plot
def explain_desc_dist(df, desc):
    """Return a user-facing explanation of the distribution of column *desc*.

    Returns a prompt to upload a file and choose a descriptor when *df* is
    None or lacks *desc*.
    """
    if df is None or desc not in df.columns:
        return "請先上傳檔案並選擇描述子。"
    column = df[desc]
    n = df.shape[0]
    mean = column.mean()
    std = column.std()
    minv = column.min()
    maxv = column.max()
    msg = (
        f"這張圖顯示目前資料集裡「{desc}」這個特徵的分布情況。\n\n"
        f"總共有 {n} 筆資料。數值大多集中在平均值 {mean:.2f} 左右,標準差是 {std:.2f}。\n"
        f"最低值 {minv:.2f},最高值 {maxv:.2f}。圖中高峰的位置就是資料最集中的範圍。\n"
        "這樣的分布圖可以幫助你判斷資料是不是有極端值,或是大多數分子都屬於哪個範圍。\n\n"
        "【應用案例】\n"
        "比方說,如果你正在開發一款新藥或新材料,可以藉由分布圖觀察目標分子的分子量、極性(TPSA)是否和現有產品類似。"
        "如果發現有極端高或低的分子量,可能要特別注意這些分子在後續應用上的行為,例如溶解度、吸收性等。\n"
        "這樣的分析常被用於藥物篩選(藥物發現)、高分子材料設計,甚至協助判斷是否有「資料異常」需要清理。"
    )
    return msg


# Plain-language explanation of the dimensionality-reduction / clustering plot
def explain_dimred(pc, labels, dr, cl):
    """Return a user-facing explanation of the reduction + clustering plot.

    Args:
        pc: the embedding; unused, kept for interface compatibility.
        labels: one cluster label per point.
        dr: name of the reduction method, inserted into the text.
        cl: name of the clustering method, inserted into the text.
    """
    uniq, counts = np.unique(np.asarray(labels), return_counts=True)
    n_cls = len(uniq)
    # Plain ints so the dict does not render as "np.int64(5)" under NumPy 2.
    label_cnt = {int(lbl): int(cnt) for lbl, cnt in zip(uniq, counts)}
    msg = (
        f"這張圖是把分子資料用 {dr} 方法做降維,再用 {cl} 方法分群得到的。\n\n"
        f"每個點代表一個分子,不同顏色表示不同的群組。這次總共分成 {n_cls} 群。\n"
        f"每群的分子數量分別是:{label_cnt}\n"
        "如果看到有明顯分群,代表這些分子之間可能有某些共通的特徵。\n"
        "點跟點之間越近,代表它們在特徵上也越像。如果有單獨一兩個點很遠,那可能就是所謂的離群值(和其他分子差異較大)。\n\n"
        "【應用案例】\n"
        "假設你是一位藥物化學家,可以利用這張圖來判斷哪些分子屬於同一類型(例如同一類藥效分子),快速發現群內外的差異。"
        "如果你是在開發新材料,可以找出和現有材料組成最相近或最不一樣的分子,有助於找出潛力新材料。\n"
        "此外,如果發現某個群組特別小或特別大,也可以針對這些群組進行更細緻的分析,例如針對特殊族群做後續的結構設計。"
    )
    return msg


# =========== Gradio main UI ===========
with gr.Blocks(title="Cheminformatics Platform") as demo:
    gr.Markdown("# 🧪 Cheminformatics 多功能平台")

    # 1. Data import and batch structure grid
    with gr.Tab("1️⃣ 資料導入/結構圖"):
        up = gr.File(label="上傳分子檔 (csv/xlsx/sdf)", file_types=[".csv", ".xlsx", ".sdf"])
        df_view = gr.Dataframe(label="資料預覽 (前15筆)")
        mol_grid = gr.Image(label="分子結構圖(前25筆)")

        def preview_upload(file):
            """Load the upload once; return the table preview and structure grid."""
            if not file:
                return pd.DataFrame(), None
            df = load_table(file)
            grid = None
            # Still show the table when there is no 'smiles' column.
            if 'smiles' in df.columns:
                grid = batch_mol_imgs(df['smiles'].values[:25])
            return df.head(15), grid

        up.upload(preview_upload, up, [df_view, mol_grid])

    # 2. Feature / descriptor / functional-group calculation
    with gr.Tab("2️⃣ 特徵/描述子/官能基計算"):
        file2 = gr.File(label="選擇分子檔")
        fp_types = gr.CheckboxGroup(FINGERPRINT_CHOICES, label="指紋", value=['ecfp4'])
        desc_types = gr.CheckboxGroup(DESCRIPTOR_CHOICES, label="描述子")
        func_smart = gr.Textbox(label="官能基SMARTS(逗號分隔)", placeholder="[N+](=O)[O-], [OX2H]")
        feat_preview = gr.Dataframe(label="特徵/描述子預覽(前10筆)")

        def calc_all_feats(file, fp, desc, smartbox):
            """Compute the requested features and return the first 10 rows."""
            if not file:
                return pd.DataFrame()
            df = load_table(file)
            df = calc_features(df, fp, desc, smartbox)
            return df.head(10)

        gr.Button("特徵/官能基計算", variant="primary").click(
            calc_all_feats, [file2, fp_types, desc_types, func_smart], feat_preview
        )

    # 3. EDA analysis / automatic report
    with gr.Tab("3️⃣ EDA分析/自動報表"):
        file3 = gr.File(label="分子檔")
        col_sel = gr.Dropdown(DESCRIPTOR_CHOICES, label="欄位")
        eda_img = gr.Image(label="描述子分布圖")
        eda_desc = gr.Markdown(label="圖像說明")  # explanation shown with the plot
        eda_btn = gr.Button("產生分布圖")

        def show_eda_img_with_explain(file, col):
            """Return the distribution plot of *col* and its explanation."""
            if not file:
                # Both helpers return their placeholder output for df=None.
                return plot_desc_dist(None, col), explain_desc_dist(None, col)
            # Only the selected descriptor is needed; no fingerprint is computed.
            wanted = [col] if col else []
            df = calc_features(load_table(file), [], wanted, None)
            return plot_desc_dist(df, col), explain_desc_dist(df, col)

        eda_btn.click(
            show_eda_img_with_explain, [file3, col_sel], [eda_img, eda_desc]
        )
        eda_sum = gr.File(label="下載EDA報表")
        gr.Button("生成EDA報表", variant="primary").click(
            lambda f: eda_report(load_table(f)) if f else None, file3, eda_sum
        )

    # 4. Dimensionality reduction / clustering / cluster representatives
    with gr.Tab("4️⃣ 降維/分群/結構探索"):
        file4 = gr.File(label="分子檔")
        use_fp = gr.Dropdown(FINGERPRINT_CHOICES, label="降維指紋", value="ecfp4")
        dr_method = gr.Radio(['PCA', 'UMAP', 'tSNE'], label="降維方法", value="PCA")
        cl_method = gr.Radio(['KMeans', 'DBSCAN'], label="分群方法", value="KMeans")
        # Keyword arguments: passing `step` positionally is not portable
        # across Gradio versions.
        nclus = gr.Slider(minimum=2, maximum=8, value=3, step=1, label="KMeans分群數")
        dr_img = gr.Image(label="降維/分群視覺化")
        dr_desc = gr.Markdown(label="圖像說明")  # explanation shown with the plot
        rep_imgs = gr.Image(label="群代表分子圖(每群1個)")

        def dimred_and_cluster(file, fp, dr, cl, nclu):
            """Embed, cluster, and return plot, explanation and representatives."""
            if not file:
                return None, "請先上傳分子檔。", None
            df = load_table(file)
            df = calc_features(df, [fp], [], None)
            pc = apply_dim_red(df, fp, dr)
            if cl == 'KMeans':
                # KMeans cannot form more clusters than there are samples.
                n_clusters = min(int(nclu), len(pc))
                labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(pc)
            else:
                labels = DBSCAN(eps=3, min_samples=2).fit_predict(pc)
            plotimg = plot_scatter(pc, labels, f"{dr}-{cl}")
            # Figure explanation
            desc = explain_dimred(pc, labels, dr, cl)
            reps = cluster_reps(df, labels, fp)
            rep_img = batch_mol_imgs(reps)
            return plotimg, desc, rep_img

        gr.Button("降維+分群分析", variant="primary").click(
            dimred_and_cluster,
            [file4, use_fp, dr_method, cl_method, nclus],
            [dr_img, dr_desc, rep_imgs]
        )

    gr.Markdown("---\n> 完整工作流:1️⃣資料導入 → 2️⃣特徵/描述子/官能基 → 3️⃣EDA分析 → 4️⃣降維/分群/結構探索")

# NOTE(review): share=True requests a publicly reachable link for this app;
# confirm that is intended wherever this script is run.
demo.launch(share=True)