"""Streamlit app for training and evaluating binary classifiers.

Workflow: upload a CSV/Excel file, pick (or derive) a binary target column,
choose a model and its hyper-parameters in the sidebar, then train and display
accuracy, AUC, a classification report, a confusion matrix and a ROC curve.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc
)

st.set_page_config(page_title="機器學習模型訓練工具", layout="wide")
st.title("機器學習模型訓練工具開發")
st.write("支援資料上傳、前處理、模型訓練、模型評估與視覺化。")


def load_data(uploaded_file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute, as
            returned by ``st.file_uploader``.

    Returns:
        The parsed DataFrame, or ``None`` when the file extension is not one
        of ``.csv``, ``.xlsx`` or ``.xls``.
    """
    file_name = uploaded_file.name.lower()
    if file_name.endswith(".csv"):
        return pd.read_csv(uploaded_file)
    if file_name.endswith((".xlsx", ".xls")):
        return pd.read_excel(uploaded_file)
    return None


def preprocess_data(df, target_column):
    """Split ``df`` into an imputed, one-hot encoded feature matrix and target.

    Rows that are entirely empty or have a missing target are dropped, and
    feature columns with no observed value at all are removed. Numeric
    features are median-imputed; non-numeric features are imputed with the
    most frequent value and then one-hot encoded (first level dropped).

    NOTE: imputation statistics are computed on the whole dataset because
    callers in this file split into train/test only after this function.

    Args:
        df: Input DataFrame containing ``target_column``.
        target_column: Name of the column to use as the target.

    Returns:
        A tuple ``(X, y)`` of the processed feature DataFrame and the target
        Series (sharing the same index).
    """
    df = df.dropna(how="all")
    # A row without a target cannot be used for supervised training and would
    # otherwise break the stratified split / model fitting.
    df = df.dropna(subset=[target_column])

    y = df[target_column]
    X = df.drop(columns=[target_column]).copy()
    # SimpleImputer discards columns without any observed value, which would
    # make the column-wise assignment below fail on a shape mismatch.
    X = X.dropna(axis=1, how="all")

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    if numeric_cols:
        num_imputer = SimpleImputer(strategy="median")
        X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

    if categorical_cols:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    return X, y


def build_model(model_name, params):
    """Instantiate the classifier selected in the sidebar.

    Args:
        model_name: One of ``"KNN"``, ``"Decision Tree"``, ``"Random Forest"``,
            ``"Logistic Regression"`` or ``"SVM"``.
        params: Dict holding the hyper-parameters required by that model.

    Returns:
        An unfitted scikit-learn estimator, or ``None`` for an unknown name.
    """
    if model_name == "KNN":
        return KNeighborsClassifier(n_neighbors=params["n_neighbors"])

    if model_name == "Decision Tree":
        return DecisionTreeClassifier(
            criterion=params["criterion"],
            max_depth=params["max_depth"],
            random_state=42
        )

    if model_name == "Random Forest":
        return RandomForestClassifier(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            random_state=42
        )

    if model_name == "Logistic Regression":
        return LogisticRegression(
            C=params["C"],
            max_iter=1000,
            random_state=42
        )

    if model_name == "SVM":
        # probability=True is required so that predict_proba is available
        # for the ROC/AUC evaluation.
        return SVC(
            kernel=params["kernel"],
            C=params["C"],
            probability=True,
            random_state=42
        )

    return None


def plot_confusion_matrix(y_true, y_pred, display_labels=None):
    """Render the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        display_labels: Optional tick labels for the classes; when ``None``
            scikit-learn's default integer labels are used.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    disp = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, y_pred),
        display_labels=display_labels
    )
    disp.plot(ax=ax)
    st.pyplot(fig)
    # Streamlit re-executes the script on every interaction; close the figure
    # so pyplot does not keep accumulating open figures.
    plt.close(fig)


def plot_roc_curve(y_true, y_prob):
    """Render the ROC curve and return its area under the curve.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted scores/probabilities of the positive class.

    Returns:
        The AUC value computed from the plotted curve.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
    ax.plot([0, 1], [0, 1], linestyle="--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    ax.legend(loc="lower right")
    st.pyplot(fig)
    # See plot_confusion_matrix: release the figure after rendering.
    plt.close(fig)

    return roc_auc


st.sidebar.header("操作區")

uploaded_file = st.sidebar.file_uploader("請上傳 CSV 或 Excel 檔", type=["csv", "xlsx", "xls"])

if uploaded_file is not None:
    df = load_data(uploaded_file)

    if df is None:
        st.error("檔案格式不支援。")
        st.stop()

    st.subheader("原始資料預覽")
    st.dataframe(df.head())

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("資料基本資訊")
        st.write(f"資料維度:{df.shape[0]} 筆 × {df.shape[1]} 欄")
        st.write("欄位型態:")
        st.dataframe(pd.DataFrame(df.dtypes, columns=["dtype"]))

    with col2:
        st.subheader("缺失值統計")
        st.dataframe(pd.DataFrame(df.isnull().sum(), columns=["missing_count"]))

    st.subheader("欄位選擇")

    all_columns = df.columns.tolist()

    target_column = None
    # Columns that must never be used as features because the target is
    # derived directly from them.
    leakage_columns = []

    if "count" in all_columns:
        st.info("偵測到 count 欄位,可依作業需求轉為二元分類標籤。")
        use_count_as_target = st.checkbox(
            "將 count 轉為二元分類標籤(大於中位數=1,否則=0)",
            value=True
        )

        if use_count_as_target:
            # A missing count compares as False and would silently be
            # labelled 0, so rows without a count are excluded first.
            df = df.dropna(subset=["count"]).copy()
            median_value = df["count"].median()
            df["label"] = (df["count"] > median_value).astype(int)
            target_column = "label"
            # `label` is a deterministic function of `count`; keeping `count`
            # as a feature would leak the target into the model.
            leakage_columns = ["count"]
            st.write(f"`count` 中位數 = {median_value}")
            st.write("已建立新目標欄位:`label`")
            st.write("為避免資料洩漏,訓練時不會將 `count` 作為特徵。")

    if target_column is None:
        target_column = st.selectbox("請選擇目標欄位", all_columns)

    st.subheader("目標欄位分布")
    st.write(df[target_column].value_counts())

    test_size = st.sidebar.slider("測試集比例 (Test Size)", 0.1, 0.5, 0.2, 0.1)
    use_scaling = st.sidebar.checkbox("使用 StandardScaler", value=True)

    model_name = st.sidebar.selectbox(
        "選擇模型",
        ["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SVM"]
    )

    params = {}

    if model_name == "KNN":
        params["n_neighbors"] = st.sidebar.slider("k 值", 1, 15, 5)

    elif model_name == "Decision Tree":
        params["criterion"] = st.sidebar.selectbox("criterion", ["gini", "entropy"])
        max_depth_input = st.sidebar.number_input("max_depth(0 代表不限)", min_value=0, value=5, step=1)
        # scikit-learn expects None (not 0) for an unlimited depth.
        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)

    elif model_name == "Random Forest":
        params["n_estimators"] = st.sidebar.slider("n_estimators", 10, 300, 100, 10)
        max_depth_input = st.sidebar.number_input("max_depth(0 代表不限)", min_value=0, value=5, step=1)
        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)

    elif model_name == "Logistic Regression":
        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)

    elif model_name == "SVM":
        params["kernel"] = st.sidebar.selectbox("kernel", ["linear", "rbf"])
        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)

    run_button = st.sidebar.button("開始訓練模型")

    if run_button:
        # Top-level UI boundary: any failure in the pipeline is reported to
        # the user instead of surfacing a raw traceback.
        try:
            feature_df = df.drop(columns=leakage_columns)
            X, y = preprocess_data(feature_df, target_column)

            # Always encode the target to 0/1: roc_curve cannot infer the
            # positive class for binary labels such as {1, 2} or strings.
            encoder = LabelEncoder()
            y = encoder.fit_transform(y)
            class_names = [str(name) for name in encoder.classes_]

            if len(class_names) != 2:
                st.error("目前程式設計為二元分類評估(ROC/AUC)。請選擇二元分類目標欄位。")
                st.stop()

            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=test_size,
                random_state=42,
                stratify=y
            )

            if use_scaling:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)
            else:
                # One-hot columns are boolean; request a float array rather
                # than the object array a mixed-dtype frame would yield.
                X_train = X_train.to_numpy(dtype=float)
                X_test = X_test.to_numpy(dtype=float)

            model = build_model(model_name, params)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            st.success("模型訓練完成")

            col3, col4 = st.columns(2)

            with col3:
                st.subheader("Accuracy")
                acc = accuracy_score(y_test, y_pred)
                st.write(f"{acc:.4f}")

            with col4:
                if y_prob is not None:
                    fpr, tpr, _ = roc_curve(y_test, y_prob)
                    roc_auc = auc(fpr, tpr)
                    st.subheader("AUC")
                    st.write(f"{roc_auc:.4f}")

            st.subheader("Classification Report")
            report = classification_report(
                y_test,
                y_pred,
                labels=np.arange(len(class_names)),
                target_names=class_names,
                output_dict=True
            )
            report_df = pd.DataFrame(report).transpose()
            st.dataframe(report_df)

            st.subheader("Confusion Matrix")
            plot_confusion_matrix(y_test, y_pred, display_labels=class_names)

            if y_prob is not None:
                st.subheader("ROC Curve")
                plot_roc_curve(y_test, y_prob)

        except Exception as e:
            st.error(f"執行時發生錯誤:{e}")

else:
    st.info("請先在左側上傳資料檔案。")