"""Gradio app for exploring a tabular dataset and training binary classifiers.

The app loads a CSV/Excel file, summarises it, optionally derives a binary
label from a ``count`` column (above/below the median), and then trains either
one configurable model or a fixed set of models for comparison.
"""

import warnings

warnings.filterwarnings("ignore")

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc
)


# =========================
# Basic utility functions
# =========================
def load_data(file_obj):
    """Read the uploaded CSV or Excel file into a DataFrame.

    ``file_obj`` may be either a plain path string or an object exposing the
    path through a ``name`` attribute; both are accepted.

    Raises:
        ValueError: if nothing was uploaded or the extension is unsupported.
    """
    if file_obj is None:
        raise ValueError("請先上傳 CSV 或 Excel 檔案。")

    # Fall back to the object itself when it has no ``name`` (i.e. it is
    # already a path), instead of crashing with AttributeError.
    file_path = str(getattr(file_obj, "name", file_obj))
    lower_name = file_path.lower()

    if lower_name.endswith(".csv"):
        return pd.read_csv(file_path)
    if lower_name.endswith((".xlsx", ".xls")):
        return pd.read_excel(file_path)

    raise ValueError("目前只支援 .csv、.xlsx、.xls 檔案。")


def build_model(
    model_name,
    knn_k,
    dt_criterion,
    dt_max_depth,
    rf_estimators,
    rf_max_depth,
    lr_c,
    svm_kernel,
    svm_c
):
    """Instantiate the classifier selected by ``model_name``.

    Only the hyper-parameters belonging to the selected model are used.
    For the tree-based models a ``max_depth`` of 0 means "unlimited".

    Raises:
        ValueError: if ``model_name`` is not one of the supported models.
    """
    if model_name == "KNN":
        return KNeighborsClassifier(n_neighbors=int(knn_k))

    if model_name == "Decision Tree":
        max_depth = None if int(dt_max_depth) == 0 else int(dt_max_depth)
        return DecisionTreeClassifier(
            criterion=dt_criterion,
            max_depth=max_depth,
            random_state=42
        )

    if model_name == "Random Forest":
        max_depth = None if int(rf_max_depth) == 0 else int(rf_max_depth)
        return RandomForestClassifier(
            n_estimators=int(rf_estimators),
            max_depth=max_depth,
            random_state=42
        )

    if model_name == "Logistic Regression":
        return LogisticRegression(
            C=float(lr_c),
            max_iter=2000,
            random_state=42
        )

    if model_name == "SVM":
        return SVC(
            kernel=svm_kernel,
            C=float(svm_c),
            probability=True,  # needed so predict_proba exists for ROC/AUC
            random_state=42
        )

    raise ValueError("不支援的模型。")


def preprocess_features(df, target_column):
    """Split ``df`` into an imputed, one-hot-encoded feature matrix and target.

    Numeric columns are median-imputed; all other columns are imputed with
    their most frequent value and one-hot encoded (first level dropped).

    Returns:
        (X, y): ``X`` is an all-float DataFrame, ``y`` the raw target Series.
    """
    df = df.copy().dropna(how="all")
    # A row without a target value cannot be used for supervised training and
    # would otherwise show up as an extra "NaN" class.
    df = df.dropna(subset=[target_column])

    y = df[target_column]
    X = df.drop(columns=[target_column])

    # SimpleImputer silently discards columns with no observed values, which
    # would make the assignment below fail on a shape mismatch.
    X = X.dropna(axis=1, how="all")

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    if numeric_cols:
        num_imputer = SimpleImputer(strategy="median")
        X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

    if categorical_cols:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
    # get_dummies may emit bool columns; a uniform float dtype keeps
    # ``X.values`` from degrading to an object array.
    X = X.astype(float)
    return X, y


def prepare_target(df, target_column, use_count_as_target):
    """Resolve the target column, optionally deriving it from ``count``.

    When ``use_count_as_target`` is true a ``label`` column is created that is
    1 where ``count`` is above its median and 0 otherwise. The ``count``
    column is then removed so it cannot leak the answer into the features.

    Returns:
        (df, target_column): a copy of the frame and the resolved target name.

    Raises:
        ValueError: if ``count`` is requested but absent, or the target column
            is missing from the data.
    """
    df = df.copy()

    if use_count_as_target:
        if "count" not in df.columns:
            raise ValueError("你勾選了 count 二元分類,但資料中沒有 count 欄位。")

        # Rows with an unknown count have an unknown label; do not silently
        # assign them to class 0.
        df = df.dropna(subset=["count"])
        median_value = df["count"].median()
        df["label"] = (df["count"] > median_value).astype(int)
        # The label is a deterministic function of count: keeping count as a
        # feature would be direct target leakage.
        df = df.drop(columns=["count"])
        target_column = "label"

    if target_column is None or target_column not in df.columns:
        raise ValueError("請選擇正確的目標欄位。")

    return df, target_column


def encode_target(y):
    """Encode the target as consecutive integers starting at 0.

    Every dtype is encoded (not only ``object``) so a binary target always
    becomes {0, 1}. The metric helpers used below assume a positive label of
    1 and would fail on targets such as {1, 2} or {"a", "b"} otherwise.
    """
    return LabelEncoder().fit_transform(y)


def _error_figure(message):
    """Return a small, axis-less figure that only displays ``message``."""
    fig, ax = plt.subplots(figsize=(6, 3))
    ax.text(0.5, 0.5, message, ha="center", va="center")
    ax.axis("off")
    fig.tight_layout()
    plt.close(fig)  # unregister from pyplot; the Figure object stays usable
    return fig


def _prepare_split(
    file_obj,
    target_column,
    use_count_as_target,
    test_size,
    use_scaling,
    not_binary_message
):
    """Load, preprocess and split the data shared by both training tabs.

    Returns:
        (X_train, X_test, y_train, y_test) as NumPy arrays.

    Raises:
        ValueError: with ``not_binary_message`` if the target does not have
            exactly two classes, plus anything raised by the loading steps.
    """
    df = load_data(file_obj)
    df, target_column = prepare_target(df, target_column, use_count_as_target)

    X, y = preprocess_features(df, target_column)
    y = encode_target(y)

    if len(np.unique(y)) != 2:
        raise ValueError(not_binary_message)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=float(test_size),
        random_state=42,
        stratify=y
    )

    if use_scaling:
        # Fit on the training split only so test statistics do not leak.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    else:
        X_train = X_train.values
        X_test = X_test.values

    return X_train, X_test, y_train, y_test


def _basic_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) for binary predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
    )


# =========================
# Visualisation functions
# =========================
# Each helper works on its own Figure/Axes (never the pyplot "current figure")
# and closes the figure in pyplot before returning it, so figures do not
# accumulate in a long-running server process.
def plot_target_distribution(y_series, title="Label Distribution"):
    """Return a bar chart with the number of rows per class."""
    fig, ax = plt.subplots(figsize=(6, 4))
    counts = pd.Series(y_series).value_counts().sort_index()
    ax.bar(counts.index.astype(str), counts.values)
    ax.set_title(title)
    ax.set_xlabel("Class")
    ax.set_ylabel("Count")
    fig.tight_layout()
    plt.close(fig)
    return fig


def plot_confusion(y_true, y_pred):
    """Return a confusion-matrix figure for the given predictions."""
    fig, ax = plt.subplots(figsize=(5, 4))
    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred))
    disp.plot(ax=ax)
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    plt.close(fig)
    return fig


def plot_roc_curve(y_true, y_prob):
    """Return ``(figure, auc)`` for the ROC curve of positive-class scores."""
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
    ax.plot([0, 1], [0, 1], linestyle="--")  # chance-level diagonal
    ax.set_title("ROC Curve")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend(loc="lower right")
    fig.tight_layout()
    plt.close(fig)
    return fig, roc_auc


def plot_model_comparison(result_df):
    """Return a bar chart of the ``Accuracy`` column per ``Model``."""
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(result_df["Model"], result_df["Accuracy"])
    ax.set_title("Model Accuracy Comparison")
    ax.set_xlabel("Model")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0, 1)
    ax.tick_params(axis="x", labelrotation=15)
    fig.tight_layout()
    plt.close(fig)
    return fig


# =========================
# Data analysis
# =========================
def analyze_file(file_obj):
    """Summarise the uploaded file for the first tab.

    Returns:
        A 5-tuple: preview rows, column dtypes, missing-value statistics,
        a text summary, and an update for the target-column dropdown.
        On failure the tables are empty and the summary carries the error.
    """
    try:
        df = load_data(file_obj)

        preview_df = df.head(10)

        info_df = pd.DataFrame({
            "欄位名稱": df.columns,
            "資料型態": [str(dtype) for dtype in df.dtypes]
        })

        missing_df = pd.DataFrame({
            "欄位名稱": df.columns,
            "缺失值數量": df.isnull().sum().values,
            "缺失比例(%)": (df.isnull().mean().values * 100).round(2)
        })

        summary = [
            f"資料筆數:{df.shape[0]}",
            f"資料欄數:{df.shape[1]}",
            f"數值欄位數:{len(df.select_dtypes(include=[np.number]).columns)}",
            f"類別欄位數:{len(df.select_dtypes(exclude=[np.number]).columns)}",
            f"總缺失值數:{int(df.isnull().sum().sum())}",
        ]

        columns = list(df.columns)
        # Prefer "count" as the default target, else the last column.
        if columns:
            default_target = "count" if "count" in columns else columns[-1]
        else:
            default_target = None

        has_count_message = (
            "有偵測到 count 欄位,可直接轉成二元分類。"
            if "count" in df.columns
            else "未偵測到 count 欄位。"
        )

        return (
            preview_df,
            info_df,
            missing_df,
            "\n".join(summary) + f"\n{has_count_message}",
            gr.update(choices=columns, value=default_target),
        )

    except Exception as e:
        # UI boundary: surface the error in the summary box instead of crashing.
        empty_df = pd.DataFrame()
        return (
            empty_df,
            empty_df,
            empty_df,
            f"資料分析失敗:{e}",
            gr.update(choices=[], value=None),
        )


def target_distribution(file_obj, target_column, use_count_as_target):
    """Return the class-distribution figure, or an error figure on failure."""
    try:
        df = load_data(file_obj)
        df, target_column = prepare_target(df, target_column, use_count_as_target)
        return plot_target_distribution(
            df[target_column],
            title=f"{target_column} Distribution"
        )
    except Exception as e:
        return _error_figure(f"無法產生分布圖:\n{e}")


# =========================
# Single-model training
# =========================
def train_single_model(
    file_obj,
    target_column,
    use_count_as_target,
    test_size,
    use_scaling,
    model_name,
    knn_k,
    dt_criterion,
    dt_max_depth,
    rf_estimators,
    rf_max_depth,
    lr_c,
    svm_kernel,
    svm_c
):
    """Train the selected model and report its test-set performance.

    Returns:
        A 4-tuple: metrics text, classification-report table, confusion-matrix
        figure, and ROC figure (``None`` when the model has no probabilities).
        On failure the text carries the error and an error figure is returned.
    """
    try:
        X_train, X_test, y_train, y_test = _prepare_split(
            file_obj,
            target_column,
            use_count_as_target,
            test_size,
            use_scaling,
            "目前版本只支援二元分類,因為需要輸出 ROC/AUC。"
        )

        model = build_model(
            model_name=model_name,
            knn_k=knn_k,
            dt_criterion=dt_criterion,
            dt_max_depth=dt_max_depth,
            rf_estimators=rf_estimators,
            rf_max_depth=rf_max_depth,
            lr_c=lr_c,
            svm_kernel=svm_kernel,
            svm_c=svm_c
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        y_prob = None
        if hasattr(model, "predict_proba"):
            # Column 1 is the probability of the encoded positive class (1).
            y_prob = model.predict_proba(X_test)[:, 1]

        acc, pre, rec, f1 = _basic_scores(y_test, y_pred)

        auc_text = "無法計算"
        roc_fig = None
        if y_prob is not None:
            roc_fig, roc_auc = plot_roc_curve(y_test, y_prob)
            auc_text = f"{roc_auc:.4f}"

        result_text = (
            f"模型名稱:{model_name}\n"
            f"Accuracy:{acc:.4f}\n"
            f"Precision:{pre:.4f}\n"
            f"Recall:{rec:.4f}\n"
            f"F1-score:{f1:.4f}\n"
            f"AUC:{auc_text}"
        )

        report_df = pd.DataFrame(
            classification_report(y_test, y_pred, output_dict=True)
        ).transpose()
        # The row labels (class / average names) live in the index, which the
        # table widget does not display; promote them to a regular column.
        report_df = (
            report_df.round(4)
            .reset_index()
            .rename(columns={"index": "Class"})
        )

        cm_fig = plot_confusion(y_test, y_pred)

        return result_text, report_df, cm_fig, roc_fig

    except Exception as e:
        # UI boundary: report the failure in the output widgets.
        return f"模型訓練失敗:{e}", pd.DataFrame(), _error_figure(f"錯誤:{e}"), None


# =========================
# Multi-model comparison
# =========================
def compare_models(
    file_obj,
    target_column,
    use_count_as_target,
    test_size,
    use_scaling
):
    """Train a fixed set of models with default settings and rank them.

    Returns:
        A 3-tuple: summary text for the most accurate model, the comparison
        table sorted by accuracy (descending), and an accuracy bar chart.
        On failure the text carries the error and an error figure is returned.
    """
    try:
        X_train, X_test, y_train, y_test = _prepare_split(
            file_obj,
            target_column,
            use_count_as_target,
            test_size,
            use_scaling,
            "目前版本只支援二元分類比較。"
        )

        models = [
            ("KNN", KNeighborsClassifier(n_neighbors=5)),
            ("Decision Tree", DecisionTreeClassifier(random_state=42)),
            ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
            ("Logistic Regression", LogisticRegression(max_iter=2000, random_state=42)),
            ("SVM", SVC(kernel="rbf", probability=True, random_state=42)),
        ]

        rows = []
        for name, model in models:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            acc, pre, rec, f1 = _basic_scores(y_test, y_pred)

            auc_score = np.nan
            if hasattr(model, "predict_proba"):
                y_prob = model.predict_proba(X_test)[:, 1]
                fpr, tpr, _ = roc_curve(y_test, y_prob)
                auc_score = auc(fpr, tpr)

            rows.append({
                "Model": name,
                "Accuracy": round(acc, 4),
                "Precision": round(pre, 4),
                "Recall": round(rec, 4),
                "F1-score": round(f1, 4),
                "AUC": None if pd.isna(auc_score) else round(auc_score, 4)
            })

        result_df = (
            pd.DataFrame(rows)
            .sort_values(by="Accuracy", ascending=False)
            .reset_index(drop=True)
        )
        compare_fig = plot_model_comparison(result_df)

        best_model = result_df.iloc[0]
        summary = (
            f"最佳模型:{best_model['Model']}\n"
            f"Accuracy:{best_model['Accuracy']}\n"
            f"Precision:{best_model['Precision']}\n"
            f"Recall:{best_model['Recall']}\n"
            f"F1-score:{best_model['F1-score']}\n"
            f"AUC:{best_model['AUC']}"
        )

        return summary, result_df, compare_fig

    except Exception as e:
        # UI boundary: report the failure in the output widgets.
        return f"模型比較失敗:{e}", pd.DataFrame(), _error_figure(f"錯誤:{e}")


# =========================
# UI
# =========================
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

with gr.Blocks(title="機器學習模型訓練工具", css=custom_css) as demo:
    gr.Markdown("""
    # 機器學習模型訓練
    - 資料上傳與預覽
    - 欄位型態與缺失值分析
    - `count` 欄位轉二元分類
    - KNN / Decision Tree / Random Forest / Logistic Regression / SVM
    - Accuracy / Precision / Recall / F1-score / AUC
    - Confusion Matrix / ROC Curve
    - 多模型比較
    """)

    with gr.Tab("1. 資料分析"):
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="上傳 CSV 或 Excel 檔案",
                    file_types=[".csv", ".xlsx", ".xls"]
                )
                analyze_btn = gr.Button("分析資料", variant="primary")

                target_dropdown = gr.Dropdown(label="目標欄位", choices=[], value=None)
                use_count_checkbox = gr.Checkbox(
                    label="若資料有 count 欄位,將其依中位數轉成二元分類",
                    value=True
                )
                dist_btn = gr.Button("顯示類別分布")

            with gr.Column(scale=2):
                summary_output = gr.Textbox(label="資料摘要", lines=8)
                preview_output = gr.Dataframe(label="資料預覽")
                info_output = gr.Dataframe(label="欄位型態")
                missing_output = gr.Dataframe(label="缺失值統計")
                dist_plot = gr.Plot(label="類別分布圖")

    with gr.Tab("2. 單一模型訓練"):
        with gr.Row():
            with gr.Column(scale=1):
                test_size_slider = gr.Slider(
                    label="測試集比例",
                    minimum=0.1,
                    maximum=0.5,
                    step=0.1,
                    value=0.2
                )
                use_scaling_checkbox = gr.Checkbox(
                    label="使用 StandardScaler",
                    value=True
                )

                model_dropdown = gr.Dropdown(
                    label="選擇模型",
                    choices=[
                        "KNN",
                        "Decision Tree",
                        "Random Forest",
                        "Logistic Regression",
                        "SVM"
                    ],
                    value="KNN"
                )

                gr.Markdown("## 模型參數")

                knn_k = gr.Slider(label="KNN:k 值", minimum=1, maximum=15, value=5, step=1)

                dt_criterion = gr.Dropdown(
                    label="Decision Tree:criterion",
                    choices=["gini", "entropy"],
                    value="gini"
                )
                dt_max_depth = gr.Slider(
                    label="Decision Tree:max_depth(0 代表不限)",
                    minimum=0,
                    maximum=20,
                    value=5,
                    step=1
                )

                rf_estimators = gr.Slider(
                    label="Random Forest:n_estimators",
                    minimum=10,
                    maximum=300,
                    value=100,
                    step=10
                )
                rf_max_depth = gr.Slider(
                    label="Random Forest:max_depth(0 代表不限)",
                    minimum=0,
                    maximum=20,
                    value=5,
                    step=1
                )

                lr_c = gr.Slider(
                    label="Logistic Regression:C",
                    minimum=0.01,
                    maximum=10.0,
                    value=1.0,
                    step=0.01
                )

                svm_kernel = gr.Dropdown(
                    label="SVM:kernel",
                    choices=["linear", "rbf"],
                    value="rbf"
                )
                svm_c = gr.Slider(
                    label="SVM:C",
                    minimum=0.01,
                    maximum=10.0,
                    value=1.0,
                    step=0.01
                )

                train_btn = gr.Button("開始訓練單一模型", variant="primary")

            with gr.Column(scale=2):
                single_result_output = gr.Textbox(label="模型結果", lines=8)
                report_output = gr.Dataframe(label="Classification Report")
                cm_output = gr.Plot(label="Confusion Matrix")
                roc_output = gr.Plot(label="ROC Curve")

    with gr.Tab("3. 多模型比較"):
        with gr.Row():
            with gr.Column(scale=1):
                compare_btn = gr.Button("比較所有模型", variant="primary")
            with gr.Column(scale=2):
                compare_summary = gr.Textbox(label="最佳模型摘要", lines=8)
                compare_table = gr.Dataframe(label="模型比較表")
                compare_plot = gr.Plot(label="模型 Accuracy 比較圖")

    analyze_btn.click(
        fn=analyze_file,
        inputs=[file_input],
        outputs=[
            preview_output,
            info_output,
            missing_output,
            summary_output,
            target_dropdown
        ]
    )

    dist_btn.click(
        fn=target_distribution,
        inputs=[file_input, target_dropdown, use_count_checkbox],
        outputs=[dist_plot]
    )

    train_btn.click(
        fn=train_single_model,
        inputs=[
            file_input,
            target_dropdown,
            use_count_checkbox,
            test_size_slider,
            use_scaling_checkbox,
            model_dropdown,
            knn_k,
            dt_criterion,
            dt_max_depth,
            rf_estimators,
            rf_max_depth,
            lr_c,
            svm_kernel,
            svm_c
        ],
        outputs=[
            single_result_output,
            report_output,
            cm_output,
            roc_output
        ]
    )

    compare_btn.click(
        fn=compare_models,
        inputs=[
            file_input,
            target_dropdown,
            use_count_checkbox,
            test_size_slider,
            use_scaling_checkbox
        ],
        outputs=[
            compare_summary,
            compare_table,
            compare_plot
        ]
    )

# Only start the server when run as a script, so importing this module
# (e.g. for testing the helpers) does not block on a running web server.
if __name__ == "__main__":
    demo.launch()