Spaces:

william1324
/

Learningmodel

Sleeping

App Files Files Community

william1324 committed on Apr 13

Commit

37ea271

verified ·

1 Parent(s): 1087b36

Create app.py

Browse files

Files changed (1) hide show

app.py +398 -0

app.py ADDED Viewed

	@@ -0,0 +1,398 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.metrics import (
+    accuracy_score,
+    classification_report,
+    confusion_matrix,
+    ConfusionMatrixDisplay,
+    roc_curve,
+    auc
+)
def load_data(file_obj):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        file_obj: The uploaded file. Either an object exposing the file
            path through a ``name`` attribute, or the path itself
            (``str`` or path-like).

    Returns:
        pandas.DataFrame: The parsed table.

    Raises:
        ValueError: If nothing was uploaded or the file extension is not
            ``.csv``, ``.xlsx`` or ``.xls``.
    """
    if file_obj is None:
        raise ValueError("請先上傳 CSV 或 Excel 檔案。")
    # Fall back to the object itself when it has no ``name`` attribute, so
    # a plain path string or Path works as well as a file wrapper.
    file_path = str(getattr(file_obj, "name", file_obj))
    # The extension check is case-insensitive.
    lower_name = file_path.lower()
    if lower_name.endswith(".csv"):
        return pd.read_csv(file_path)
    if lower_name.endswith((".xlsx", ".xls")):
        return pd.read_excel(file_path)
    raise ValueError("只支援 CSV、XLSX、XLS 檔案。")
def preprocess_data(df, target_column):
    """Split a table into imputed, one-hot encoded features and a target.

    Steps, in order:
      1. Drop rows that are entirely empty and rows whose target is missing.
      2. Drop feature columns that contain no value at all.
      3. Fill numeric gaps with the column median.
      4. Fill non-numeric gaps with the most frequent value, then one-hot
         encode those columns (first level dropped).

    The input frame is not modified.

    Args:
        df: Source ``pandas.DataFrame``.
        target_column: Name of the column to predict.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the processed feature frame and
        ``y`` is the target series, both sharing the same index.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    df = df.copy()
    df = df.dropna(how="all")
    # A row without a label cannot be used for supervised training and would
    # make the downstream estimators reject the target.
    df = df.dropna(subset=[target_column])
    y = df[target_column]
    X = df.drop(columns=[target_column])
    # A column with no observed value has no median/mode to impute from.
    X = X.dropna(axis=1, how="all")
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if numeric_cols:
        # Cast to float so the output dtype does not depend on whether a
        # column happened to contain gaps.
        numeric_part = X[numeric_cols].astype(float)
        X[numeric_cols] = numeric_part.fillna(numeric_part.median())
    if categorical_cols:
        for col in categorical_cols:
            modes = X[col].mode(dropna=True)
            if not modes.empty:
                # ``mode`` returns ties in sorted order; take the first.
                X[col] = X[col].fillna(modes.iloc[0])
        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
    return X, y
def build_model(
    model_name,
    knn_k,
    dt_criterion,
    dt_max_depth,
    rf_estimators,
    rf_max_depth,
    lr_c,
    svm_kernel,
    svm_c
):
    """Instantiate the classifier selected by ``model_name``.

    Only the hyper-parameters belonging to the selected model are read;
    the others are ignored.

    Args:
        model_name: One of ``"KNN"``, ``"Decision Tree"``,
            ``"Random Forest"``, ``"Logistic Regression"``, ``"SVM"``.
        knn_k: Number of neighbours for KNN.
        dt_criterion: Split criterion for the decision tree.
        dt_max_depth: Decision tree depth limit; ``0`` means unlimited.
        rf_estimators: Number of trees in the random forest.
        rf_max_depth: Random forest depth limit; ``0`` means unlimited.
        lr_c: ``C`` for logistic regression.
        svm_kernel: Kernel name for the SVM.
        svm_c: ``C`` for the SVM.

    Returns:
        An unfitted estimator.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    seed = 42

    def _depth_limit(raw_depth):
        # The UI uses 0 to mean "no limit", which the estimators spell None.
        depth = int(raw_depth)
        return depth if depth != 0 else None

    if model_name == "KNN":
        estimator = KNeighborsClassifier(n_neighbors=int(knn_k))
    elif model_name == "Decision Tree":
        estimator = DecisionTreeClassifier(
            criterion=dt_criterion,
            max_depth=_depth_limit(dt_max_depth),
            random_state=seed,
        )
    elif model_name == "Random Forest":
        estimator = RandomForestClassifier(
            n_estimators=int(rf_estimators),
            max_depth=_depth_limit(rf_max_depth),
            random_state=seed,
        )
    elif model_name == "Logistic Regression":
        estimator = LogisticRegression(
            C=float(lr_c),
            max_iter=1000,
            random_state=seed,
        )
    elif model_name == "SVM":
        # probability=True so the fitted model exposes predict_proba.
        estimator = SVC(
            kernel=svm_kernel,
            C=float(svm_c),
            probability=True,
            random_state=seed,
        )
    else:
        raise ValueError("不支援的模型。")
    return estimator
def plot_confusion(y_true, y_pred):
    """Draw the confusion matrix of a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        The matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred))
    disp.plot(ax=ax)
    # Lay out this specific figure rather than pyplot's "current" one.
    fig.tight_layout()
    # pyplot keeps a reference to every figure it creates until it is closed,
    # so a long-running app would accumulate one per call. The Figure object
    # itself remains usable by the caller after being closed here.
    plt.close(fig)
    return fig
def plot_roc(y_true, y_prob):
    """Draw the ROC curve for binary scores and compute its AUC.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Score or probability of the positive class for each sample.

    Returns:
        tuple: ``(fig, roc_auc)`` with the matplotlib figure and the area
        under the curve.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], linestyle="--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    ax.legend(loc="lower right")
    # Lay out this specific figure rather than pyplot's "current" one.
    fig.tight_layout()
    # Release pyplot's reference so repeated calls do not pile up open
    # figures; the returned Figure object stays usable.
    plt.close(fig)
    return fig, roc_auc
def analyze_file(file_obj):
    """Summarise an uploaded file for the "analyze" button.

    Args:
        file_obj: The uploaded file, as accepted by ``load_data``.

    Returns:
        tuple: ``(preview, dtypes_table, missing_table, summary, dropdown)``
        where the first three are DataFrames (first 10 rows, column dtypes,
        missing-value counts), ``summary`` is a one-line shape description
        and ``dropdown`` is a Gradio update listing the columns with the
        first one preselected. On any failure the three tables are empty,
        ``summary`` carries the error message and the dropdown is cleared.
    """
    try:
        data = load_data(file_obj)
        n_rows, n_cols = data.shape
        column_names = list(data.columns)

        dtype_table = pd.DataFrame({
            "欄位名稱": data.columns,
            "資料型態": list(map(str, data.dtypes)),
        })
        missing_table = pd.DataFrame({
            "欄位名稱": data.columns,
            "缺失值數量": data.isnull().sum().values,
        })

        default_target = column_names[0] if column_names else None
        dropdown_update = gr.update(choices=column_names, value=default_target)
        return (
            data.head(10),
            dtype_table,
            missing_table,
            f"資料維度：{n_rows} 筆 × {n_cols} 欄",
            dropdown_update,
        )
    except Exception as err:
        # UI boundary: report the problem in the summary box instead of
        # letting the callback fail.
        blank = pd.DataFrame()
        return blank, blank, blank, f"錯誤：{err}", gr.update(choices=[], value=None)
def train_model(
    file_obj,
    target_column,
    use_count_as_target,
    test_size,
    use_scaling,
    model_name,
    knn_k,
    dt_criterion,
    dt_max_depth,
    rf_estimators,
    rf_max_depth,
    lr_c,
    svm_kernel,
    svm_c
):
    """Train and evaluate a binary classifier on the uploaded file.

    Args:
        file_obj: The uploaded file, as accepted by ``load_data``.
        target_column: Name of the column to predict. Ignored when
            ``use_count_as_target`` is true.
        use_count_as_target: When true, derive a binary ``label`` column
            from ``count`` (1 if above the median, else 0) and predict it.
        test_size: Fraction of rows held out for evaluation.
        use_scaling: Whether to standardise features (fit on the training
            split only).
        model_name: Model to train; see ``build_model``.
        knn_k, dt_criterion, dt_max_depth, rf_estimators, rf_max_depth,
        lr_c, svm_kernel, svm_c: Hyper-parameters forwarded to
            ``build_model``.

    Returns:
        tuple: ``(result_text, report_df, cm_fig, roc_fig)``. On any failure
        ``result_text`` carries the error message, ``report_df`` is empty and
        both figures are ``None``.
    """
    try:
        df = load_data(file_obj)
        if use_count_as_target:
            if "count" not in df.columns:
                raise ValueError("你勾選了用 count 轉二元分類，但資料中沒有 count 欄位。")
            # Rows without a count cannot be assigned a label.
            df = df.dropna(subset=["count"])
            median_value = df["count"].median()
            df["label"] = (df["count"] > median_value).astype(int)
            # The label is a direct function of ``count``; keeping it as a
            # feature would leak the answer to the model.
            df = df.drop(columns=["count"])
            target_column = "label"
        if target_column is None or target_column not in df.columns:
            raise ValueError("請先選擇正確的目標欄位。")
        # Unlabelled rows cannot be used for training or evaluation.
        df = df.dropna(subset=[target_column])
        X, y = preprocess_data(df, target_column)
        # Always map the classes to 0/1 so the ROC computation has an
        # unambiguous positive class whatever the original label values or
        # dtype were. Targets that are already 0/1 are left unchanged.
        encoder = LabelEncoder()
        y = encoder.fit_transform(y)
        unique_classes = np.unique(y)
        if len(unique_classes) != 2:
            raise ValueError("目前此版本只支援二元分類，因為需要輸出 ROC / AUC。")
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=float(test_size),
            random_state=42,
            stratify=y
        )
        if use_scaling:
            # Fit on the training split only, then apply to the test split.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        else:
            X_train = X_train.values
            X_test = X_test.values
        model = build_model(
            model_name=model_name,
            knn_k=knn_k,
            dt_criterion=dt_criterion,
            dt_max_depth=dt_max_depth,
            rf_estimators=rf_estimators,
            rf_max_depth=rf_max_depth,
            lr_c=lr_c,
            svm_kernel=svm_kernel,
            svm_c=svm_c
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Column 1 is the probability of the encoded class 1.
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
        acc = accuracy_score(y_test, y_pred)
        report_df = pd.DataFrame(
            classification_report(y_test, y_pred, output_dict=True)
        ).transpose()
        cm_fig = plot_confusion(y_test, y_pred)
        if y_prob is not None:
            roc_fig, roc_auc = plot_roc(y_test, y_prob)
            auc_text = f"AUC：{roc_auc:.4f}"
        else:
            roc_fig = None
            auc_text = "AUC：無法計算"
        result_text = f"模型：{model_name}\nAccuracy：{acc:.4f}\n{auc_text}"
        return result_text, report_df, cm_fig, roc_fig
    except Exception as e:
        # UI boundary: surface the problem in the result box instead of
        # letting the callback fail.
        empty_df = pd.DataFrame()
        return f"錯誤：{e}", empty_df, None, None
# ---------------------------------------------------------------------------
# Gradio UI: controls on the left, data summary and model results on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(title="機器學習模型訓練工具") as demo:
    gr.Markdown("# 機器學習模型訓練工具")
    gr.Markdown(
        "支援 CSV / Excel 上傳、資料檢視、前處理、模型訓練、Classification Report、Confusion Matrix、ROC Curve。"
    )

    with gr.Row():
        # Left column: upload, target selection, split/scaling options and
        # per-model hyper-parameters.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="上傳 CSV 或 Excel 檔案",
                file_types=[".csv", ".xlsx", ".xls"],
            )
            analyze_button = gr.Button("分析資料")
            # Choices are filled in by analyze_file once a file is analysed.
            target_dropdown = gr.Dropdown(label="選擇目標欄位", choices=[], value=None)
            use_count_checkbox = gr.Checkbox(
                label="若資料有 count 欄位，將 count 依中位數轉為二元分類", value=True
            )
            test_size_slider = gr.Slider(
                label="測試集比例", minimum=0.1, maximum=0.5, value=0.2, step=0.1
            )
            use_scaling_checkbox = gr.Checkbox(label="使用 StandardScaler", value=True)
            model_dropdown = gr.Dropdown(
                label="選擇模型",
                choices=["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SVM"],
                value="KNN",
            )

            gr.Markdown("## 模型參數")
            knn_k = gr.Slider(label="KNN：k 值", minimum=1, maximum=15, value=5, step=1)
            dt_criterion = gr.Dropdown(
                label="Decision Tree：criterion", choices=["gini", "entropy"], value="gini"
            )
            dt_max_depth = gr.Slider(
                label="Decision Tree：max_depth（0 代表不限）",
                minimum=0, maximum=20, value=5, step=1,
            )
            rf_estimators = gr.Slider(
                label="Random Forest：n_estimators",
                minimum=10, maximum=300, value=100, step=10,
            )
            rf_max_depth = gr.Slider(
                label="Random Forest：max_depth（0 代表不限）",
                minimum=0, maximum=20, value=5, step=1,
            )
            lr_c = gr.Slider(
                label="Logistic Regression：C",
                minimum=0.01, maximum=10.0, value=1.0, step=0.01,
            )
            svm_kernel = gr.Dropdown(
                label="SVM：kernel", choices=["linear", "rbf"], value="rbf"
            )
            svm_c = gr.Slider(
                label="SVM：C", minimum=0.01, maximum=10.0, value=1.0, step=0.01
            )
            train_button = gr.Button("開始訓練", variant="primary")

        # Right column: data overview first, then training results.
        with gr.Column(scale=2):
            summary_text = gr.Textbox(label="資料摘要")
            preview_output = gr.Dataframe(label="資料預覽")
            info_output = gr.Dataframe(label="欄位型態")
            missing_output = gr.Dataframe(label="缺失值統計")
            result_text = gr.Textbox(label="模型結果")
            report_output = gr.Dataframe(label="Classification Report")
            cm_output = gr.Plot(label="Confusion Matrix")
            roc_output = gr.Plot(label="ROC Curve")

    # Output order must match the tuple returned by analyze_file.
    analyze_button.click(
        fn=analyze_file,
        inputs=[file_input],
        outputs=[preview_output, info_output, missing_output, summary_text, target_dropdown],
    )

    # Input order must match train_model's positional parameters.
    train_button.click(
        fn=train_model,
        inputs=[
            file_input, target_dropdown, use_count_checkbox,
            test_size_slider, use_scaling_checkbox, model_dropdown,
            knn_k,
            dt_criterion, dt_max_depth,
            rf_estimators, rf_max_depth,
            lr_c,
            svm_kernel, svm_c,
        ],
        outputs=[result_text, report_output, cm_output, roc_output],
    )

demo.launch()