# Spaces: william1324/modeltraining (Paused)
# File size: 8,810 Bytes
# 2975e51

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc
)


# Set the browser-tab title and use the wide page layout.
st.set_page_config(
    layout="wide",
    page_title="機器學習模型訓練工具",
)

# Page headline followed by a one-line summary of the app's features.
st.title("機器學習模型訓練工具開發")
st.write("支援資料上傳、前處理、模型訓練、模型評估與視覺化。")


def load_data(uploaded_file):
    """Parse an uploaded CSV or Excel file into a DataFrame.

    The parser is selected from the extension of ``uploaded_file.name``,
    compared case-insensitively.

    Args:
        uploaded_file: File-like object with a ``name`` attribute that
            pandas can read from.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the extension is
        not ``.csv``, ``.xlsx`` or ``.xls``.
    """
    lowered_name = uploaded_file.name.lower()

    if lowered_name.endswith(".csv"):
        return pd.read_csv(uploaded_file)

    if lowered_name.endswith((".xlsx", ".xls")):
        return pd.read_excel(uploaded_file)

    # Unknown extension: signal "unsupported" to the caller.
    return None


def preprocess_data(df, target_column):
    """Split a raw table into an imputed, one-hot-encoded X and a target y.

    Steps, in order:
      1. Work on a copy so the caller's frame is left untouched.
      2. Drop rows that are entirely empty, then rows whose target value is
         missing (an unlabeled row cannot be used for supervised training).
      3. Drop feature columns that contain no values at all.
      4. Impute numeric features with the column median and non-numeric
         features with the most frequent value.
      5. One-hot encode the non-numeric features (first level dropped).

    NOTE: the imputation statistics are learned from every row passed in.
    If the caller splits into train/test afterwards, the test rows have
    contributed to those statistics.

    Args:
        df: Input ``pandas.DataFrame`` containing features and the target.
        target_column: Name of the column in ``df`` to use as the target.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the processed feature DataFrame
        and ``y`` is the target Series; both share the same index.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    df = df.copy()
    df = df.dropna(how="all")
    # A row with no label would otherwise push NaN into the target and
    # break label encoding / stratified splitting further down the line.
    df = df.dropna(subset=[target_column])

    y = df[target_column]
    X = df.drop(columns=[target_column])

    # SimpleImputer drops columns with no observed values from its output
    # by default, which would make the column-wise assignments below fail
    # with a shape mismatch. Such columns carry no information, so remove
    # them up front.
    X = X.dropna(axis=1, how="all")

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    if numeric_cols:
        num_imputer = SimpleImputer(strategy="median")
        X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

    if categorical_cols:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
        # drop_first avoids a redundant indicator column per feature.
        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    return X, y


def build_model(model_name, params):
    """Create an unfitted scikit-learn classifier for the chosen model.

    Args:
        model_name: One of ``"KNN"``, ``"Decision Tree"``,
            ``"Random Forest"``, ``"Logistic Regression"`` or ``"SVM"``
            (matched exactly, case-sensitive).
        params: Dict of hyperparameters. Keys read per model:
            KNN -> ``n_neighbors``; Decision Tree -> ``criterion``,
            ``max_depth``; Random Forest -> ``n_estimators``,
            ``max_depth``; Logistic Regression -> ``C``;
            SVM -> ``kernel``, ``C``.

    Returns:
        The configured estimator, or ``None`` if ``model_name`` is not one
        of the supported names.

    Raises:
        KeyError: If a hyperparameter required by the chosen model is
            missing from ``params``.
    """
    seed = 42  # shared random_state for every estimator that accepts one

    # Each entry is a zero-argument factory so that only the selected
    # model's hyperparameters are looked up in ``params``.
    factories = {
        "KNN": lambda: KNeighborsClassifier(
            n_neighbors=params["n_neighbors"],
        ),
        "Decision Tree": lambda: DecisionTreeClassifier(
            criterion=params["criterion"],
            max_depth=params["max_depth"],
            random_state=seed,
        ),
        "Random Forest": lambda: RandomForestClassifier(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            random_state=seed,
        ),
        "Logistic Regression": lambda: LogisticRegression(
            C=params["C"],
            max_iter=1000,
            random_state=seed,
        ),
        "SVM": lambda: SVC(
            kernel=params["kernel"],
            C=params["C"],
            probability=True,  # needed so predict_proba is available
            random_state=seed,
        ),
    }

    factory = factories.get(model_name)
    if factory is None:
        return None
    return factory()


def plot_confusion_matrix(y_true, y_pred):
    """Render the confusion matrix of a prediction in the Streamlit page.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The figure is drawn with ``st.pyplot`` as a side effect.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    try:
        disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred))
        disp.plot(ax=ax)
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates alive until it is closed;
        # without this each training run would leave another figure behind.
        plt.close(fig)


def plot_roc_curve(y_true, y_prob):
    """Draw the ROC curve in the Streamlit page and return its AUC.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Scores for the positive class, aligned with ``y_true``.

    Returns:
        The area under the ROC curve computed from ``y_true``/``y_prob``.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
        # Diagonal reference line: the curve of a random classifier.
        ax.plot([0, 1], [0, 1], linestyle="--")
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title("ROC Curve")
        ax.legend(loc="lower right")
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates alive until it is closed;
        # without this each training run would leave another figure behind.
        plt.close(fig)

    return roc_auc


# ---------------------------------------------------------------------------
# Main page flow: upload -> inspect -> choose target/model -> train -> report.
# ---------------------------------------------------------------------------
st.sidebar.header("操作區")
uploaded_file = st.sidebar.file_uploader("請上傳 CSV 或 Excel 檔", type=["csv", "xlsx", "xls"])

if uploaded_file is not None:
    df = load_data(uploaded_file)

    if df is None:
        st.error("檔案格式不支援。")
        st.stop()

    # --- Data overview -----------------------------------------------------
    st.subheader("原始資料預覽")
    st.dataframe(df.head())

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("資料基本資訊")
        st.write(f"資料維度：{df.shape[0]} 筆 × {df.shape[1]} 欄")
        st.write("欄位型態：")
        st.dataframe(pd.DataFrame(df.dtypes, columns=["dtype"]))

    with col2:
        st.subheader("缺失值統計")
        st.dataframe(pd.DataFrame(df.isnull().sum(), columns=["missing_count"]))

    # --- Target selection --------------------------------------------------
    st.subheader("欄位選擇")
    all_columns = df.columns.tolist()

    # Columns that must be withheld from the features because the target
    # is computed directly from them.
    leakage_columns = []

    # The median-based binarisation only makes sense for a numeric `count`;
    # otherwise fall back to the plain target selector.
    if "count" in all_columns and pd.api.types.is_numeric_dtype(df["count"]):
        st.info("偵測到 count 欄位，可依作業需求轉為二元分類標籤。")
        use_count_as_target = st.checkbox(
            "將 count 轉為二元分類標籤（大於中位數=1，否則=0）",
            value=True
        )

        if use_count_as_target:
            median_value = df["count"].median()
            df["label"] = (df["count"] > median_value).astype(int)
            target_column = "label"
            # `label` is a deterministic function of `count`; keeping
            # `count` as a feature would hand the answer to the model.
            leakage_columns = ["count"]
            st.write(f"`count` 中位數 = {median_value}")
            st.write("已建立新目標欄位：`label`")
        else:
            target_column = st.selectbox("請選擇目標欄位", all_columns)
    else:
        target_column = st.selectbox("請選擇目標欄位", all_columns)

    st.subheader("目標欄位分布")
    st.write(df[target_column].value_counts())

    # --- Sidebar: split, scaling and model hyperparameters -----------------
    test_size = st.sidebar.slider("測試集比例 (Test Size)", 0.1, 0.5, 0.2, 0.1)
    use_scaling = st.sidebar.checkbox("使用 StandardScaler", value=True)

    model_name = st.sidebar.selectbox(
        "選擇模型",
        ["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SVM"]
    )

    params = {}

    if model_name == "KNN":
        params["n_neighbors"] = st.sidebar.slider("k 值", 1, 15, 5)

    elif model_name == "Decision Tree":
        params["criterion"] = st.sidebar.selectbox("criterion", ["gini", "entropy"])
        max_depth_input = st.sidebar.number_input("max_depth（0 代表不限）", min_value=0, value=5, step=1)
        # 0 in the UI means "unlimited", which scikit-learn spells as None.
        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)

    elif model_name == "Random Forest":
        params["n_estimators"] = st.sidebar.slider("n_estimators", 10, 300, 100, 10)
        max_depth_input = st.sidebar.number_input("max_depth（0 代表不限）", min_value=0, value=5, step=1)
        # 0 in the UI means "unlimited", which scikit-learn spells as None.
        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)

    elif model_name == "Logistic Regression":
        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)

    elif model_name == "SVM":
        params["kernel"] = st.sidebar.selectbox("kernel", ["linear", "rbf"])
        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)

    run_button = st.sidebar.button("開始訓練模型")

    # --- Training and evaluation -------------------------------------------
    if run_button:
        try:
            # Withhold the source column(s) of a derived target from the
            # feature matrix (empty list -> nothing is dropped).
            X, y = preprocess_data(df.drop(columns=leakage_columns), target_column)

            # Encode every non-numeric target (object, pandas string,
            # categorical, ...) to integers so that ROC/AUC below can treat
            # class 1 as the positive class.
            if not pd.api.types.is_numeric_dtype(y):
                le = LabelEncoder()
                y = le.fit_transform(y)

            unique_classes = np.unique(y)
            if len(unique_classes) != 2:
                st.error("目前程式設計為二元分類評估（ROC/AUC）。請選擇二元分類目標欄位。")
                st.stop()

            # Stratify so both classes keep their proportions in each split.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=test_size,
                random_state=42,
                stratify=y
            )

            if use_scaling:
                # Fit on the training split only, then reuse for the test split.
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)
            else:
                X_train = X_train.values
                X_test = X_test.values

            model = build_model(model_name, params)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            # Column 1 holds the probability of the second (positive) class.
            y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            st.success("模型訓練完成")

            col3, col4 = st.columns(2)

            with col3:
                st.subheader("Accuracy")
                acc = accuracy_score(y_test, y_pred)
                st.write(f"{acc:.4f}")

            with col4:
                if y_prob is not None:
                    fpr, tpr, _ = roc_curve(y_test, y_prob)
                    roc_auc = auc(fpr, tpr)
                    st.subheader("AUC")
                    st.write(f"{roc_auc:.4f}")

            st.subheader("Classification Report")
            report = classification_report(y_test, y_pred, output_dict=True)
            report_df = pd.DataFrame(report).transpose()
            st.dataframe(report_df)

            st.subheader("Confusion Matrix")
            plot_confusion_matrix(y_test, y_pred)

            if y_prob is not None:
                st.subheader("ROC Curve")
                plot_roc_curve(y_test, y_prob)

        except Exception as e:
            # Top-level UI boundary: surface any failure to the user
            # instead of letting the script crash.
            st.error(f"執行時發生錯誤：{e}")

else:
    st.info("請先在左側上傳資料檔案。")