Spaces:
Paused
Paused
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.svm import SVC | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| classification_report, | |
| confusion_matrix, | |
| ConfusionMatrixDisplay, | |
| roc_curve, | |
| auc | |
| ) | |
# Page chrome: browser-tab title and wide layout, then the visible heading
# and a one-line summary of what the tool supports.
st.set_page_config(
    layout="wide",
    page_title="機器學習模型訓練工具",
)
st.title("機器學習模型訓練工具開發")
st.write("支援資料上傳、前處理、模型訓練、模型評估與視覺化。")
def load_data(uploaded_file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    The reader is picked from the file-name extension, compared
    case-insensitively.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute, as
            returned by the Streamlit file uploader used in this script.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the extension is
        none of ``.csv``, ``.xlsx`` or ``.xls``.
    """
    lowered_name = uploaded_file.name.lower()

    if lowered_name.endswith(".csv"):
        return pd.read_csv(uploaded_file)

    if lowered_name.endswith((".xlsx", ".xls")):
        return pd.read_excel(uploaded_file)

    # Unknown extension: let the caller decide how to report it.
    return None
def preprocess_data(df, target_column):
    """Split *df* into an imputed, one-hot-encoded feature frame and a target.

    Steps, in order:
      1. drop rows that are entirely empty and rows whose target is missing;
      2. separate the target column from the features;
      3. drop feature columns that contain no observed value at all;
      4. fill numeric gaps with the column median and non-numeric gaps with
         the most frequent value;
      5. one-hot encode the non-numeric columns (first level dropped).

    Args:
        df: Input table. It is not modified; every step works on a new frame.
        target_column: Name of the column to predict.

    Returns:
        ``(X, y)``: the processed feature DataFrame and the target Series,
        both restricted to the rows that survived step 1.
    """
    # A row without a target value cannot be used for supervised training
    # and would otherwise surface later as an extra "NaN" class.
    cleaned = df.dropna(how="all").dropna(subset=[target_column])

    y = cleaned[target_column]
    X = cleaned.drop(columns=[target_column])

    # SimpleImputer discards columns that have no observed value, so the
    # column-wise assignments below would fail with a shape mismatch.
    # Such columns carry no information; remove them up front.
    X = X.dropna(axis=1, how="all").copy()

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    # NOTE(review): the imputers are fitted on the full table, i.e. before
    # the caller's train/test split, so test rows influence the fill values.
    if numeric_cols:
        num_imputer = SimpleImputer(strategy="median")
        X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

    if categorical_cols:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    return X, y
def build_model(model_name, params):
    """Instantiate the classifier selected in the UI.

    Args:
        model_name: One of ``"KNN"``, ``"Decision Tree"``,
            ``"Random Forest"``, ``"Logistic Regression"`` or ``"SVM"``.
        params: Hyper-parameters for the chosen model. Keys read per model:
            KNN -> ``n_neighbors``; Decision Tree -> ``criterion``,
            ``max_depth``; Random Forest -> ``n_estimators``, ``max_depth``;
            Logistic Regression -> ``C``; SVM -> ``kernel``, ``C``.

    Returns:
        An unfitted estimator, or ``None`` when *model_name* is not one of
        the names above.
    """
    # Factories are lazy so that only the selected model's keys are looked
    # up in *params*.
    factories = {
        "KNN": lambda: KNeighborsClassifier(
            n_neighbors=params["n_neighbors"],
        ),
        "Decision Tree": lambda: DecisionTreeClassifier(
            criterion=params["criterion"],
            max_depth=params["max_depth"],
            random_state=42,
        ),
        "Random Forest": lambda: RandomForestClassifier(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            random_state=42,
        ),
        "Logistic Regression": lambda: LogisticRegression(
            C=params["C"],
            max_iter=1000,
            random_state=42,
        ),
        # probability=True makes predict_proba available on the fitted SVC.
        "SVM": lambda: SVC(
            kernel=params["kernel"],
            C=params["C"],
            probability=True,
            random_state=42,
        ),
    }

    make_estimator = factories.get(model_name)
    if make_estimator is None:
        return None
    return make_estimator()
def plot_confusion_matrix(y_true, y_pred):
    """Render the confusion matrix of *y_pred* against *y_true* in the page.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with *y_true*.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred))
    disp.plot(ax=ax)
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; the script
    # reruns on every interaction, so release each one once it is rendered.
    plt.close(fig)
def plot_roc_curve(y_true, y_prob):
    """Render the ROC curve in the page and return its area under the curve.

    The plot shows the curve (AUC in the legend) together with a dashed
    diagonal reference line.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Scores for the positive class, aligned with *y_true*
            (the caller in this script passes ``predict_proba(...)[:, 1]``).

    Returns:
        The AUC computed from the ROC points.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
    ax.plot([0, 1], [0, 1], linestyle="--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    ax.legend(loc="lower right")
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; the script
    # reruns on every interaction, so release each one once it is rendered.
    plt.close(fig)

    return roc_auc
# ---------------------------------------------------------------------------
# Page flow: upload -> inspect -> choose target -> configure model -> train
# ---------------------------------------------------------------------------
st.sidebar.header("操作區")
uploaded_file = st.sidebar.file_uploader("請上傳 CSV 或 Excel 檔", type=["csv", "xlsx", "xls"])

if uploaded_file is not None:
    df = load_data(uploaded_file)
    if df is None:
        st.error("檔案格式不支援。")
        st.stop()

    # --- Raw data overview -------------------------------------------------
    st.subheader("原始資料預覽")
    st.dataframe(df.head())

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("資料基本資訊")
        st.write(f"資料維度:{df.shape[0]} 筆 × {df.shape[1]} 欄")
        st.write("欄位型態:")
        st.dataframe(pd.DataFrame(df.dtypes, columns=["dtype"]))
    with col2:
        st.subheader("缺失值統計")
        st.dataframe(pd.DataFrame(df.isnull().sum(), columns=["missing_count"]))

    # --- Target selection --------------------------------------------------
    st.subheader("欄位選擇")
    all_columns = df.columns.tolist()

    # Columns the target is computed from; they must not be used as features.
    leaky_columns = []

    if "count" in all_columns:
        st.info("偵測到 count 欄位,可依作業需求轉為二元分類標籤。")
        use_count_as_target = st.checkbox(
            "將 count 轉為二元分類標籤(大於中位數=1,否則=0)",
            value=True
        )
        if use_count_as_target:
            median_value = df["count"].median()
            df["label"] = (df["count"] > median_value).astype(int)
            target_column = "label"
            # `label` is a pure function of `count`; leaving `count` among the
            # features would hand the model the answer.
            # NOTE(review): other columns that add up to `count` would leak in
            # the same way - confirm against the dataset in use.
            leaky_columns = ["count"]
            st.write(f"`count` 中位數 = {median_value}")
            st.write("已建立新目標欄位:`label`")
        else:
            target_column = st.selectbox("請選擇目標欄位", all_columns)
    else:
        target_column = st.selectbox("請選擇目標欄位", all_columns)

    st.subheader("目標欄位分布")
    st.write(df[target_column].value_counts())

    # --- Sidebar: split, scaling and model settings ------------------------
    test_size = st.sidebar.slider("測試集比例 (Test Size)", 0.1, 0.5, 0.2, 0.1)
    use_scaling = st.sidebar.checkbox("使用 StandardScaler", value=True)
    model_name = st.sidebar.selectbox(
        "選擇模型",
        ["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SVM"]
    )

    # Only the widgets of the selected model are rendered.
    params = {}
    if model_name == "KNN":
        params["n_neighbors"] = st.sidebar.slider("k 值", 1, 15, 5)
    elif model_name == "Decision Tree":
        params["criterion"] = st.sidebar.selectbox("criterion", ["gini", "entropy"])
        max_depth_input = st.sidebar.number_input("max_depth(0 代表不限)", min_value=0, value=5, step=1)
        # 0 in the UI stands for "unlimited", which the estimators spell None.
        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)
    elif model_name == "Random Forest":
        params["n_estimators"] = st.sidebar.slider("n_estimators", 10, 300, 100, 10)
        max_depth_input = st.sidebar.number_input("max_depth(0 代表不限)", min_value=0, value=5, step=1)
        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)
    elif model_name == "Logistic Regression":
        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)
    elif model_name == "SVM":
        params["kernel"] = st.sidebar.selectbox("kernel", ["linear", "rbf"])
        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)

    # --- Training and evaluation -------------------------------------------
    run_button = st.sidebar.button("開始訓練模型")
    if run_button:
        # Top-level boundary: report any pipeline failure inside the page.
        try:
            X, y = preprocess_data(df.drop(columns=leaky_columns), target_column)

            # Always map the classes onto 0..n-1. With two classes this gives
            # 0/1 labels, which roc_curve accepts without an explicit
            # pos_label, whatever the original dtype or values were
            # (strings, booleans, {1, 2}, ...).
            y = LabelEncoder().fit_transform(y)

            unique_classes = np.unique(y)
            if len(unique_classes) != 2:
                st.error("目前程式設計為二元分類評估(ROC/AUC)。請選擇二元分類目標欄位。")
                st.stop()

            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=test_size,
                random_state=42,
                stratify=y
            )

            if use_scaling:
                # Fit on the training split only, then apply to both splits.
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)
            else:
                X_train = X_train.values
                X_test = X_test.values

            model = build_model(model_name, params)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            # Column 1 holds the probability of the class encoded as 1.
            y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            st.success("模型訓練完成")

            col3, col4 = st.columns(2)
            with col3:
                st.subheader("Accuracy")
                acc = accuracy_score(y_test, y_pred)
                st.write(f"{acc:.4f}")
            with col4:
                if y_prob is not None:
                    fpr, tpr, _ = roc_curve(y_test, y_prob)
                    roc_auc = auc(fpr, tpr)
                    st.subheader("AUC")
                    st.write(f"{roc_auc:.4f}")

            st.subheader("Classification Report")
            report = classification_report(y_test, y_pred, output_dict=True)
            report_df = pd.DataFrame(report).transpose()
            st.dataframe(report_df)

            st.subheader("Confusion Matrix")
            plot_confusion_matrix(y_test, y_pred)

            if y_prob is not None:
                st.subheader("ROC Curve")
                plot_roc_curve(y_test, y_prob)
        except Exception as e:
            st.error(f"執行時發生錯誤:{e}")
else:
    # Nothing uploaded yet: prompt the user to use the sidebar uploader.
    st.info("請先在左側上傳資料檔案。")