Spaces:

william1324
/

modeltraining

Paused

App Files Files Community

william1324 committed on Apr 13

Commit

2975e51

verified ·

1 Parent(s): 0731034

app.py

Browse files

Files changed (1) hide show

app.py +273 -0

app.py ADDED Viewed

	@@ -0,0 +1,273 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.metrics import (
+    accuracy_score,
+    classification_report,
+    confusion_matrix,
+    ConfusionMatrixDisplay,
+    roc_curve,
+    auc
+)
+# Page-level setup: these calls run unconditionally at the top of the script,
+# before any file has been uploaded. set_page_config selects the wide layout
+# and sets the browser tab title; title/write render the static page header.
+st.set_page_config(page_title="機器學習模型訓練工具", layout="wide")
+st.title("機器學習模型訓練工具開發")
+st.write("支援資料上傳、前處理、模型訓練、模型評估與視覺化。")
+def load_data(uploaded_file):
+    """Read an uploaded CSV or Excel file into a DataFrame.
+
+    The parser is picked from the lower-cased file name suffix: ``.csv`` is
+    read with ``pd.read_csv``; ``.xlsx`` and ``.xls`` are read with
+    ``pd.read_excel``. Any other suffix returns ``None`` so the caller can
+    report an unsupported format.
+    """
+    lowered_name = uploaded_file.name.lower()
+    if lowered_name.endswith(".csv"):
+        return pd.read_csv(uploaded_file)
+    if lowered_name.endswith((".xlsx", ".xls")):
+        return pd.read_excel(uploaded_file)
+    return None
+def preprocess_data(df, target_column):
+    """Split a raw DataFrame into an imputed, one-hot encoded X and a target y.
+
+    Steps, in order:
+      1. Work on a copy and drop rows that are entirely empty.
+      2. Drop rows whose target value is missing (they cannot be used for
+         supervised training and would otherwise show up as an extra class).
+      3. Drop feature columns that have no observed value at all.
+      4. Impute numeric features with the median and non-numeric features
+         with the most frequent value.
+      5. One-hot encode the non-numeric features (first level dropped).
+
+    Args:
+        df: Input DataFrame containing the features and ``target_column``.
+        target_column: Name of the column to use as the target.
+
+    Returns:
+        A tuple ``(X, y)`` with the processed feature DataFrame and the
+        target Series, aligned on the same index.
+
+    Raises:
+        KeyError: If ``target_column`` is not a column of ``df``.
+    """
+    df = df.copy()
+    df = df.dropna(how="all")
+    # A row without a label carries no training signal; keeping it would put
+    # NaN into y.
+    df = df.dropna(subset=[target_column])
+    y = df[target_column]
+    X = df.drop(columns=[target_column])
+    # SimpleImputer discards columns that contain only missing values, so the
+    # imputed array would have fewer columns than the slice it is assigned
+    # back to. Remove such columns up front.
+    X = X.dropna(axis=1, how="all")
+    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
+    if numeric_cols:
+        num_imputer = SimpleImputer(strategy="median")
+        X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])
+    if categorical_cols:
+        cat_imputer = SimpleImputer(strategy="most_frequent")
+        X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
+        # drop_first avoids one redundant, perfectly collinear dummy per column.
+        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
+    return X, y
+def build_model(model_name, params):
+    """Instantiate the classifier selected in the UI.
+
+    Args:
+        model_name: One of "KNN", "Decision Tree", "Random Forest",
+            "Logistic Regression" or "SVM".
+        params: Dict of hyperparameters; only the keys needed by the chosen
+            model are read.
+
+    Returns:
+        An unfitted estimator, or ``None`` when ``model_name`` is not one of
+        the names above.
+    """
+    # Each entry is a zero-argument factory so that ``params`` is only
+    # indexed for the model that is actually requested.
+    factories = {
+        "KNN": lambda: KNeighborsClassifier(n_neighbors=params["n_neighbors"]),
+        "Decision Tree": lambda: DecisionTreeClassifier(
+            criterion=params["criterion"],
+            max_depth=params["max_depth"],
+            random_state=42,
+        ),
+        "Random Forest": lambda: RandomForestClassifier(
+            n_estimators=params["n_estimators"],
+            max_depth=params["max_depth"],
+            random_state=42,
+        ),
+        "Logistic Regression": lambda: LogisticRegression(
+            C=params["C"],
+            max_iter=1000,
+            random_state=42,
+        ),
+        # probability=True is what gives SVC a usable predict_proba.
+        "SVM": lambda: SVC(
+            kernel=params["kernel"],
+            C=params["C"],
+            probability=True,
+            random_state=42,
+        ),
+    }
+    make = factories.get(model_name)
+    if make is None:
+        return None
+    return make()
+def plot_confusion_matrix(y_true, y_pred):
+    """Render the confusion matrix of ``y_pred`` against ``y_true`` in the app.
+
+    Args:
+        y_true: Ground-truth labels.
+        y_pred: Predicted labels, same length as ``y_true``.
+    """
+    fig, ax = plt.subplots(figsize=(5, 4))
+    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred))
+    disp.plot(ax=ax)
+    st.pyplot(fig)
+    # Figures created through pyplot stay registered until closed; the script
+    # reruns on every interaction, so close explicitly to avoid piling them up.
+    plt.close(fig)
+def plot_roc_curve(y_true, y_prob):
+    """Render the ROC curve in the app and return its AUC.
+
+    Args:
+        y_true: Ground-truth binary labels.
+        y_prob: Scores for the positive class, same length as ``y_true``.
+
+    Returns:
+        The area under the ROC curve computed from ``y_true`` and ``y_prob``.
+    """
+    fpr, tpr, _ = roc_curve(y_true, y_prob)
+    roc_auc = auc(fpr, tpr)
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
+    # Diagonal reference line (TPR == FPR).
+    ax.plot([0, 1], [0, 1], linestyle="--")
+    ax.set_xlabel("False Positive Rate")
+    ax.set_ylabel("True Positive Rate")
+    ax.set_title("ROC Curve")
+    ax.legend(loc="lower right")
+    st.pyplot(fig)
+    # Figures created through pyplot stay registered until closed; the script
+    # reruns on every interaction, so close explicitly to avoid piling them up.
+    plt.close(fig)
+    return roc_auc
+# ---------------------------------------------------------------------------
+# Main script flow: upload -> preview -> target selection -> model settings
+# -> train/evaluate. Everything below runs top to bottom on each rerun.
+# ---------------------------------------------------------------------------
+st.sidebar.header("操作區")
+uploaded_file = st.sidebar.file_uploader("請上傳 CSV 或 Excel 檔", type=["csv", "xlsx", "xls"])
+if uploaded_file is not None:
+    df = load_data(uploaded_file)
+    if df is None:
+        st.error("檔案格式不支援。")
+        st.stop()
+    # --- Data preview and basic statistics ---
+    st.subheader("原始資料預覽")
+    st.dataframe(df.head())
+    col1, col2 = st.columns(2)
+    with col1:
+        st.subheader("資料基本資訊")
+        st.write(f"資料維度：{df.shape[0]} 筆 × {df.shape[1]} 欄")
+        st.write("欄位型態：")
+        st.dataframe(pd.DataFrame(df.dtypes, columns=["dtype"]))
+    with col2:
+        st.subheader("缺失值統計")
+        st.dataframe(pd.DataFrame(df.isnull().sum(), columns=["missing_count"]))
+    # --- Target column selection ---
+    st.subheader("欄位選擇")
+    all_columns = df.columns.tolist()
+    # Columns the target is derived from; they must not be used as features,
+    # otherwise the model can read the answer directly (target leakage).
+    leaky_columns = []
+    if "count" in all_columns:
+        st.info("偵測到 count 欄位，可依作業需求轉為二元分類標籤。")
+        use_count_as_target = st.checkbox(
+            "將 count 轉為二元分類標籤（大於中位數=1，否則=0）",
+            value=True
+        )
+        if use_count_as_target:
+            median_value = df["count"].median()
+            # NOTE: this overwrites any existing column named "label".
+            df["label"] = (df["count"] > median_value).astype(int)
+            target_column = "label"
+            leaky_columns.append("count")
+            st.write(f"`count` 中位數 = {median_value}")
+            st.write("已建立新目標欄位：`label`")
+        else:
+            target_column = st.selectbox("請選擇目標欄位", all_columns)
+    else:
+        target_column = st.selectbox("請選擇目標欄位", all_columns)
+    st.subheader("目標欄位分布")
+    st.write(df[target_column].value_counts())
+    # --- Sidebar: split, scaling and model hyperparameters ---
+    test_size = st.sidebar.slider("測試集比例 (Test Size)", 0.1, 0.5, 0.2, 0.1)
+    use_scaling = st.sidebar.checkbox("使用 StandardScaler", value=True)
+    model_name = st.sidebar.selectbox(
+        "選擇模型",
+        ["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SVM"]
+    )
+    params = {}
+    if model_name == "KNN":
+        params["n_neighbors"] = st.sidebar.slider("k 值", 1, 15, 5)
+    elif model_name == "Decision Tree":
+        params["criterion"] = st.sidebar.selectbox("criterion", ["gini", "entropy"])
+        max_depth_input = st.sidebar.number_input("max_depth（0 代表不限）", min_value=0, value=5, step=1)
+        # The UI uses 0 as "unlimited"; scikit-learn expects None for that.
+        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)
+    elif model_name == "Random Forest":
+        params["n_estimators"] = st.sidebar.slider("n_estimators", 10, 300, 100, 10)
+        max_depth_input = st.sidebar.number_input("max_depth（0 代表不限）", min_value=0, value=5, step=1)
+        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)
+    elif model_name == "Logistic Regression":
+        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)
+    elif model_name == "SVM":
+        params["kernel"] = st.sidebar.selectbox("kernel", ["linear", "rbf"])
+        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)
+    run_button = st.sidebar.button("開始訓練模型")
+    # --- Training and evaluation ---
+    if run_button:
+        try:
+            # Remove the source column(s) of a derived label before building X.
+            model_df = df.drop(columns=leaky_columns)
+            X, y = preprocess_data(model_df, target_column)
+            # Always map the target onto 0/1: roc_curve needs an unambiguous
+            # positive class, and numeric binary targets such as {1, 2} would
+            # otherwise make it raise. Targets already coded 0/1 are unchanged.
+            le = LabelEncoder()
+            y = le.fit_transform(y)
+            unique_classes = le.classes_
+            if len(unique_classes) != 2:
+                st.error("目前程式設計為二元分類評估（ROC/AUC）。請選擇二元分類目標欄位。")
+                st.stop()
+            # Show how the original class values map to the 0/1 used below.
+            st.caption(
+                "類別編碼："
+                + "、".join(f"{cls} → {idx}" for idx, cls in enumerate(unique_classes))
+            )
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y,
+                test_size=test_size,
+                random_state=42,
+                stratify=y
+            )
+            if use_scaling:
+                # Fit the scaler on the training split only, then reuse it.
+                scaler = StandardScaler()
+                X_train = scaler.fit_transform(X_train)
+                X_test = scaler.transform(X_test)
+            else:
+                X_train = X_train.values
+                X_test = X_test.values
+            model = build_model(model_name, params)
+            model.fit(X_train, y_train)
+            y_pred = model.predict(X_test)
+            # Column 1 of predict_proba is the score of the encoded class 1.
+            y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
+            st.success("模型訓練完成")
+            col3, col4 = st.columns(2)
+            with col3:
+                st.subheader("Accuracy")
+                acc = accuracy_score(y_test, y_pred)
+                st.write(f"{acc:.4f}")
+            with col4:
+                if y_prob is not None:
+                    fpr, tpr, _ = roc_curve(y_test, y_prob)
+                    roc_auc = auc(fpr, tpr)
+                    st.subheader("AUC")
+                    st.write(f"{roc_auc:.4f}")
+            st.subheader("Classification Report")
+            report = classification_report(y_test, y_pred, output_dict=True)
+            report_df = pd.DataFrame(report).transpose()
+            st.dataframe(report_df)
+            st.subheader("Confusion Matrix")
+            plot_confusion_matrix(y_test, y_pred)
+            if y_prob is not None:
+                st.subheader("ROC Curve")
+                plot_roc_curve(y_test, y_prob)
+        except Exception as e:
+            # UI boundary: surface any preprocessing/training failure to the
+            # user instead of crashing the page.
+            st.error(f"執行時發生錯誤：{e}")
+else:
+    st.info("請先在左側上傳資料檔案。")