Kung-Hsun's picture
Update app.py
22e1400 verified
raw
history blame
9.07 kB
# Cheminformatics 多功能平台 - 基礎版
# 主要涵蓋:分子資料導入、指紋/描述子生成、資料探索、分群、建模、特徵解釋、批量預測、可視化
# Author: 2025
import gradio as gr
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, MACCSkeys, Descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans, DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns
import io
from PIL import Image
import chardet
from ydata_profiling import ProfileReport
# =========== Robust 多格式自動讀取 ===========
def load_table(file):
    """Load an uploaded molecule table (CSV, Excel or SDF) into a DataFrame.

    Args:
        file: ``None``, a filesystem path string, or an object that exposes
            its path through a ``name`` attribute.

    Returns:
        An empty DataFrame when ``file`` is ``None``; otherwise the parsed
        table. SDF input is loaded with ``smilesName='smiles'`` so that the
        result carries the ``smiles`` column the rest of this file reads.

    Raises:
        RuntimeError: if the extension is not csv/xlsx/xls/sdf, or if no
            path can be derived from ``file``.
    """
    if file is None:
        return pd.DataFrame()
    # Accept either a plain path or an object carrying the path in `.name`.
    fname = file if isinstance(file, str) else getattr(file, "name", None)
    if fname is None:
        raise RuntimeError("不支援的 file 類型")

    # Match extensions case-insensitively so "DATA.CSV" is accepted too.
    lowered = fname.lower()
    if lowered.endswith('.csv'):
        with open(fname, 'rb') as f:
            raw = f.read(4096)
        enc = chardet.detect(raw)['encoding'] or 'utf-8'
        # Only the first 4 KB is sampled; a pure-ASCII sample says nothing
        # about later bytes. UTF-8 is a superset of ASCII, so it is the
        # safer choice for decoding the whole file.
        if enc.lower() == 'ascii':
            enc = 'utf-8'
        return pd.read_csv(fname, encoding=enc, engine='python')
    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(fname)
    if lowered.endswith('.sdf'):
        # Without smilesName, LoadSDF only yields a molecule column and the
        # downstream df['smiles'] lookups would fail.
        return PandasTools.LoadSDF(fname, smilesName='smiles')
    raise RuntimeError(f"不支援的檔案格式: {fname}")
# =========== 批量分子圖 (前25) ===========
def batch_mol_imgs(smiles_list):
    """Render the parseable molecules among the first 25 SMILES as one grid image.

    Args:
        smiles_list: sliceable sequence of SMILES values. Entries that are
            not strings (e.g. NaN from an empty cell) or that RDKit cannot
            parse are skipped.

    Returns:
        A PIL image with five molecules per row, or a blank white 800x160
        image when no entry could be parsed.
    """
    mols = []
    for smi in smiles_list[:25]:
        # Non-string cells would make MolFromSmiles raise instead of
        # returning None, so filter them out up front.
        if not isinstance(smi, str):
            continue
        mol = Chem.MolFromSmiles(smi)  # parse once, reuse the result
        if mol is not None:
            mols.append(mol)
    if not mols:
        return Image.new("RGB", (800, 160), (255,255,255))
    grid = Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(160,160))
    # Round-trip through PNG bytes so the caller gets a plain PIL image.
    buf = io.BytesIO()
    grid.save(buf, format='PNG')
    buf.seek(0)
    return Image.open(buf)
# =========== 指紋/描述子/官能基 ===========
def calc_features(df, fp_types, desc_types, smartbox):
    """Add fingerprint, descriptor and functional-group columns to ``df``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``smiles`` column (only read when at least one
            feature is actually requested).
        fp_types: iterable containing any of ``'ecfp4'``, ``'maccs'``,
            ``'rdkitfp'``; ``None`` is treated as empty. Each selected name
            becomes a column of NumPy arrays (all zeros for rows whose
            SMILES cannot be parsed).
        desc_types: names of functions in ``rdkit.Chem.Descriptors``;
            ``None`` is treated as empty. ``'LogP'`` is accepted as an alias
            of ``MolLogP``. Unknown names are skipped. Unparseable rows
            get NaN.
        smartbox: comma-separated SMARTS patterns, or ``None``/empty. The
            i-th pattern produces an integer column ``FG{i}_count`` holding
            the number of substructure matches (0 when the SMARTS or the
            SMILES is invalid).

    Returns:
        The same DataFrame object, with the new columns added.
    """
    import warnings

    fp_types = fp_types or []
    desc_types = desc_types or []
    smarts_list = [x.strip() for x in (smartbox or "").split(",") if x.strip()]

    # (vector length, builder) for every supported fingerprint.
    fp_builders = {
        'ecfp4': (2048, lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048)),
        'maccs': (167, lambda m: MACCSkeys.GenMACCSKeys(m)),
        'rdkitfp': (2048, lambda m: Chem.RDKFingerprint(m, maxPath=5)),
    }
    wanted_fps = [name for name in fp_builders if name in fp_types]

    # The UI offers 'LogP', but the RDKit descriptor function is MolLogP.
    desc_aliases = {'LogP': 'MolLogP'}
    desc_funcs = []
    for desc in desc_types:
        if not isinstance(desc, str):
            continue  # e.g. an unset dropdown passes None
        func = getattr(Descriptors, desc_aliases.get(desc, desc), None)
        if callable(func):
            desc_funcs.append((desc, func))

    if not (wanted_fps or desc_funcs or smarts_list):
        return df  # nothing requested: leave the frame untouched

    def to_mol(smi):
        # Non-string cells (NaN) would make MolFromSmiles raise.
        return Chem.MolFromSmiles(smi) if isinstance(smi, str) else None

    def as_bits(mol, build, n_bits):
        return np.zeros(n_bits) if mol is None else np.array(build(mol))

    # Parse every SMILES exactly once and reuse the molecules below.
    mols = df['smiles'].apply(to_mol)

    for name in wanted_fps:
        n_bits, build = fp_builders[name]
        df[name] = mols.apply(as_bits, args=(build, n_bits))

    for desc, func in desc_funcs:
        try:
            df[desc] = mols.apply(lambda m: func(m) if m is not None else np.nan)
        except Exception as exc:
            # Best effort: one failing descriptor must not abort the rest,
            # but the failure should not be invisible either.
            warnings.warn(f"Descriptor {desc!r} could not be computed: {exc}")

    # SMARTS functional-group counts
    for idx, smt in enumerate(smarts_list):
        patt = Chem.MolFromSmarts(smt)  # None when the SMARTS is invalid
        df[f"FG{idx+1}_count"] = mols.apply(
            lambda m: len(m.GetSubstructMatches(patt)) if m is not None and patt is not None else 0
        )
    return df
# =========== EDA報表 & 單欄分布 ===========
def eda_report(df):
    """Write a minimal ydata-profiling HTML report for ``df``.

    Each call writes to its own freshly created temporary file, so
    concurrent requests cannot overwrite one another's report and the code
    does not depend on a ``/tmp`` directory existing.

    Args:
        df: DataFrame to profile.

    Returns:
        Path of the generated ``.html`` file.
    """
    import os
    import tempfile

    profile = ProfileReport(df, title="EDA報告", minimal=True)
    fd, out = tempfile.mkstemp(prefix="eda_report_", suffix=".html")
    os.close(fd)  # only the reserved path is needed; to_file reopens it
    profile.to_file(out)
    return out
def plot_desc_dist(df, desc='MolWt'):
    """Plot a 30-bin histogram with KDE overlay for one column of ``df``.

    Args:
        df: DataFrame, or ``None``.
        desc: name of the column to plot; NaN values are dropped first.

    Returns:
        A PIL image of the plot, or a blank white 400x200 image when ``df``
        is ``None`` or has no column named ``desc``.
    """
    if df is None or desc not in df.columns:
        return Image.new("RGB", (400,200), (255,255,255))
    fig, ax = plt.subplots(figsize=(5,3))
    try:
        sns.histplot(df[desc].dropna(), ax=ax, bins=30, kde=True)
        # Use the figure's own methods rather than pyplot's global
        # "current figure", which another request may have changed.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
# =========== 降維/分群 & 群代表分子 ===========
def apply_dim_red(df, use, method='PCA'):
    """Project the per-row feature vectors in ``df[use]`` down to two dimensions.

    Args:
        df: DataFrame whose ``use`` column holds one equal-length vector
            per row.
        use: name of the column to stack into the feature matrix.
        method: ``'PCA'``, ``'UMAP'`` or ``'tSNE'``.

    Returns:
        The 2-D embedding produced by the selected reducer's
        ``fit_transform``.

    Raises:
        ValueError: if ``method`` is not one of the three supported names.
    """
    features = np.stack(df[use].to_numpy())
    sample_count = features.shape[0]

    if method == 'PCA':
        return PCA(n_components=2).fit_transform(features)

    if method == 'UMAP':
        # Keep the neighbourhood size below the number of samples.
        neighbors = 1 if sample_count <= 1 else min(15, sample_count - 1)
        reducer = UMAP(n_components=2, random_state=42, n_neighbors=neighbors)
        return reducer.fit_transform(features)

    if method == 'tSNE':
        # Scale perplexity with the data set size, clamped to [2, 30].
        perplexity = min(30, max(2, (sample_count - 1) // 2))
        reducer = TSNE(n_components=2, perplexity=perplexity, random_state=42)
        return reducer.fit_transform(features)

    raise ValueError('Unknown method')
def plot_scatter(pc, labels, title):
    """Draw a 2-D scatter of an embedding, coloured by cluster label.

    Args:
        pc: array indexed as ``pc[:, 0]`` / ``pc[:, 1]`` for the x and y
            coordinates.
        labels: one value per point, mapped to colours with ``tab10``.
        title: plot title.

    Returns:
        A PIL image of the rendered plot.
    """
    fig, ax = plt.subplots(figsize=(5,4))
    try:
        scatter = ax.scatter(pc[:,0], pc[:,1], c=labels, cmap='tab10', alpha=0.7)
        # Address this figure/axes explicitly instead of pyplot's global
        # "current" objects, which another request may have changed.
        ax.set_xlabel("Dim1")
        ax.set_ylabel("Dim2")
        ax.set_title(title)
        fig.colorbar(scatter, ax=ax)
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def cluster_reps(df, labels, use):
    """Pick one randomly chosen SMILES per cluster label.

    Args:
        df: DataFrame with a ``smiles`` column, one row per label.
        labels: cluster label for each row of ``df``.
        use: unused; kept so existing callers keep working.

    Returns:
        List with one SMILES value per distinct label, ordered by sorted
        label value.
    """
    labels = np.asarray(labels)
    reps = []
    for cl in np.unique(labels):
        members = df[labels == cl]
        if len(members) == 0:
            continue
        # Select by position: label-based .loc returns several rows when
        # the index contains duplicates, which would put a Series (not a
        # single SMILES) into the result.
        pos = np.random.randint(len(members))
        reps.append(members['smiles'].iloc[pos])
    return reps
# =========== Gradio 主 UI ===========
# Gradio UI: four tabs (import, feature calculation, EDA, dimensionality
# reduction + clustering). Every tab re-reads its own uploaded file.
with gr.Blocks(title="Cheminformatics Platform") as demo:
    gr.Markdown("# 🧪 Cheminformatics 多功能平台")

    # 1. Data import and batch structure image
    with gr.Tab("1️⃣ 資料導入/結構圖"):
        up = gr.File(label="上傳分子檔 (csv/xlsx/sdf)", file_types=[".csv", ".xlsx", ".sdf"])
        df_view = gr.Dataframe(label="資料預覽 (前15筆)")
        mol_grid = gr.Image(label="分子結構圖(前25筆)")

        def preview_upload(file):
            """Return (first 15 rows, structure grid) for an uploaded file."""
            if not file:
                return pd.DataFrame(), None
            # Parse the upload once and feed both outputs from it.
            table = load_table(file)
            if 'smiles' in table.columns:
                grid = batch_mol_imgs(table['smiles'].values[:25])
            else:
                # Still show the table preview when there is nothing to draw.
                grid = None
            return table.head(15), grid

        up.upload(preview_upload, up, [df_view, mol_grid])

    # 2. Fingerprint / descriptor / functional-group calculation
    with gr.Tab("2️⃣ 特徵/描述子/官能基計算"):
        file2 = gr.File(label="選擇分子檔")
        fp_types = gr.CheckboxGroup(['ecfp4','maccs','rdkitfp'], label="指紋", value=['ecfp4'])
        desc_types = gr.CheckboxGroup(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="描述子")
        func_smart = gr.Textbox(label="官能基SMARTS(逗號分隔)", placeholder="[N+](=O)[O-], [OX2H]")
        feat_preview = gr.Dataframe(label="特徵/描述子預覽(前10筆)")

        def calc_all_feats(file, fp, desc, smartbox):
            """Load the file, compute the selected features, return 10 rows."""
            df = load_table(file)
            df = calc_features(df, fp, desc, smartbox)
            return df.head(10)

        gr.Button("特徵/官能基計算", variant="primary").click(
            calc_all_feats, [file2, fp_types, desc_types, func_smart], feat_preview
        )

    # 3. EDA: single-column distribution and automatic report
    with gr.Tab("3️⃣ EDA分析/自動報表"):
        file3 = gr.File(label="分子檔")
        col_sel = gr.Dropdown(['MolWt','TPSA','NumHDonors','NumHAcceptors','LogP'], label="欄位")
        eda_img = gr.Image(label="描述子分布圖")
        eda_btn = gr.Button("產生分布圖")

        def plot_selected_descriptor(file, column):
            """Compute the chosen descriptor and return its histogram."""
            if not file:
                return None
            # A histogram of one descriptor needs no fingerprints, so none
            # are requested; an unset dropdown requests no descriptor.
            wanted = [column] if column else []
            table = calc_features(load_table(file), [], wanted, None)
            return plot_desc_dist(table, column)

        eda_btn.click(plot_selected_descriptor, [file3, col_sel], eda_img)

        eda_sum = gr.File(label="下載EDA報表")
        gr.Button("生成EDA報表", variant="primary").click(
            lambda f: eda_report(load_table(f)) if f else None, file3, eda_sum
        )

    # 4. Dimensionality reduction / clustering / cluster representatives
    with gr.Tab("4️⃣ 降維/分群/結構探索"):
        file4 = gr.File(label="分子檔")
        use_fp = gr.Dropdown(['ecfp4','maccs','rdkitfp'], label="降維指紋", value="ecfp4")
        dr_method = gr.Radio(['PCA','UMAP','tSNE'], label="降維方法", value="PCA")
        cl_method = gr.Radio(['KMeans','DBSCAN'], label="分群方法", value="KMeans")
        # Keyword arguments keep this call valid on Gradio releases where
        # `step` cannot be passed positionally.
        nclus = gr.Slider(minimum=2, maximum=8, value=3, step=1, label="KMeans分群數")
        dr_img = gr.Image(label="降維/分群視覺化")
        rep_imgs = gr.Image(label="群代表分子圖(每群1個)")

        def dimred_and_cluster(file, fp, dr, cl, nclu):
            """Embed the fingerprints in 2-D, cluster the embedding, plot both."""
            df = load_table(file)
            df = calc_features(df, [fp], [], None)
            pc = apply_dim_red(df, fp, dr)
            # Clustering runs on the 2-D embedding, not on the raw fingerprints.
            if cl == 'KMeans':
                labels = KMeans(n_clusters=int(nclu), random_state=42).fit_predict(pc)
            else:
                labels = DBSCAN(eps=3, min_samples=2).fit_predict(pc)
            plotimg = plot_scatter(pc, labels, f"{dr}-{cl}")
            reps = cluster_reps(df, labels, fp)
            rep_img = batch_mol_imgs(reps)
            return plotimg, rep_img

        gr.Button("降維+分群分析", variant="primary").click(
            dimred_and_cluster, [file4, use_fp, dr_method, cl_method, nclus], [dr_img, rep_imgs]
        )

    gr.Markdown("---\n> 完整工作流:1️⃣資料導入 → 2️⃣特徵/描述子/官能基 → 3️⃣EDA分析 → 4️⃣降維/分群/結構探索")

demo.launch(share=True)