# modeltraining / streamlit_app.py
# william1324's picture
# Rename app.py to streamlit_app.py
# 7c3e574 verified
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
accuracy_score,
classification_report,
confusion_matrix,
ConfusionMatrixDisplay,
roc_curve,
auc
)
# Page-level configuration: browser tab title and the wide layout.
st.set_page_config(page_title="機器學習模型訓練工具", layout="wide")
# Main heading followed by a one-line summary of what the tool supports
# (data upload, preprocessing, model training, evaluation and visualization).
st.title("機器學習模型訓練工具開發")
st.write("支援資料上傳、前處理、模型訓練、模型評估與視覺化。")
def load_data(uploaded_file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    The parser is picked from the extension of ``uploaded_file.name``,
    compared case-insensitively.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the extension is
        not ``.csv``, ``.xlsx`` or ``.xls``.
    """
    lowered_name = uploaded_file.name.lower()

    if lowered_name.endswith(".csv"):
        return pd.read_csv(uploaded_file)

    if lowered_name.endswith((".xlsx", ".xls")):
        return pd.read_excel(uploaded_file)

    # Unknown extension: the caller is expected to handle None.
    return None
def preprocess_data(df, target_column):
    """Split ``df`` into an imputed, one-hot encoded feature matrix and a target.

    Steps:
      1. Drop rows that are entirely empty and rows whose target is missing
         (a row without a label cannot be used for supervised training and
         would otherwise put NaN into ``y``).
      2. Drop feature columns that have no observed value at all; there is
         nothing to impute them from.
      3. Fill numeric gaps with the column median and return those columns
         as floats.
      4. Fill non-numeric gaps with the most frequent value (the smallest
         one on ties) and one-hot encode them with ``drop_first=True``.

    NOTE: the imputation statistics are computed on every remaining row,
    i.e. before any train/test split performed by the caller.

    Args:
        df: Input table; it is not modified.
        target_column: Name of the column to use as the target.

    Returns:
        Tuple ``(X, y)``: the processed feature DataFrame and the target
        Series, sharing the same row index.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    cleaned = df.dropna(how="all").dropna(subset=[target_column])
    y = cleaned[target_column]

    # Explicit copy so the column assignments below never write into (or
    # warn about) an intermediate frame.
    X = (
        cleaned.drop(columns=[target_column])
        .dropna(axis="columns", how="all")
        .copy()
    )

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    if numeric_cols:
        medians = X[numeric_cols].median()
        X[numeric_cols] = X[numeric_cols].fillna(medians).astype(float)

    if categorical_cols:
        # DataFrame.mode() lists the modes of each column in sorted order,
        # so the first row holds the smallest most-frequent value.
        modes = X[categorical_cols].mode().iloc[0]
        X[categorical_cols] = X[categorical_cols].fillna(modes)
        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    return X, y
def build_model(model_name, params):
    """Create an unfitted classifier for the model chosen in the UI.

    Args:
        model_name: One of "KNN", "Decision Tree", "Random Forest",
            "Logistic Regression" or "SVM".
        params: Hyper-parameter dict. Only the keys of the selected model
            are read: ``n_neighbors`` (KNN); ``criterion`` and ``max_depth``
            (Decision Tree); ``n_estimators`` and ``max_depth`` (Random
            Forest); ``C`` (Logistic Regression); ``kernel`` and ``C`` (SVM).

    Returns:
        The configured estimator, or ``None`` for an unrecognised name.
    """
    # Factories are lazy so that only the selected model's params are read.
    factories = (
        (
            "KNN",
            lambda: KNeighborsClassifier(n_neighbors=params["n_neighbors"]),
        ),
        (
            "Decision Tree",
            lambda: DecisionTreeClassifier(
                criterion=params["criterion"],
                max_depth=params["max_depth"],
                random_state=42,
            ),
        ),
        (
            "Random Forest",
            lambda: RandomForestClassifier(
                n_estimators=params["n_estimators"],
                max_depth=params["max_depth"],
                random_state=42,
            ),
        ),
        (
            "Logistic Regression",
            lambda: LogisticRegression(
                C=params["C"],
                max_iter=1000,
                random_state=42,
            ),
        ),
        (
            "SVM",
            lambda: SVC(
                kernel=params["kernel"],
                C=params["C"],
                probability=True,
                random_state=42,
            ),
        ),
    )

    for known_name, make_estimator in factories:
        if model_name == known_name:
            return make_estimator()
    return None
def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` in the page.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    try:
        matrix = confusion_matrix(y_true, y_pred)
        ConfusionMatrixDisplay(confusion_matrix=matrix).plot(ax=ax)
        st.pyplot(fig)
    finally:
        # Figures created through pyplot stay registered until they are
        # closed; without this every training run would leave one behind.
        plt.close(fig)
def plot_roc_curve(y_true, y_prob):
    """Draw the ROC curve in the page and return the area under it.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Scores handed to ``roc_curve`` as the prediction scores
            (the caller in this file passes the second ``predict_proba``
            column).

    Returns:
        The AUC computed from the ROC curve points.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
        # Dashed diagonal as a visual reference line.
        ax.plot([0, 1], [0, 1], linestyle="--")
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title("ROC Curve")
        ax.legend(loc="lower right")
        st.pyplot(fig)
    finally:
        # Figures created through pyplot stay registered until they are
        # closed; without this every training run would leave one behind.
        plt.close(fig)
    return roc_auc
# ---- Sidebar: data upload ---------------------------------------------------
st.sidebar.header("操作區")
uploaded_file = st.sidebar.file_uploader("請上傳 CSV 或 Excel 檔", type=["csv", "xlsx", "xls"])

if uploaded_file is not None:
    df = load_data(uploaded_file)
    if df is None:
        # load_data returns None for extensions it does not recognise.
        st.error("檔案格式不支援。")
        st.stop()

    # ---- Data overview ------------------------------------------------------
    st.subheader("原始資料預覽")
    st.dataframe(df.head())

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("資料基本資訊")
        st.write(f"資料維度:{df.shape[0]} 筆 × {df.shape[1]} 欄")
        st.write("欄位型態:")
        st.dataframe(pd.DataFrame(df.dtypes, columns=["dtype"]))
    with col2:
        st.subheader("缺失值統計")
        st.dataframe(pd.DataFrame(df.isnull().sum(), columns=["missing_count"]))

    # ---- Target selection ---------------------------------------------------
    st.subheader("欄位選擇")
    all_columns = df.columns.tolist()

    # Columns the target was derived from. They are removed from the feature
    # matrix before training: keeping them would let every model read the
    # label straight off the input (target leakage).
    leakage_columns = []

    if "count" in all_columns:
        st.info("偵測到 count 欄位,可依作業需求轉為二元分類標籤。")
        use_count_as_target = st.checkbox(
            "將 count 轉為二元分類標籤(大於中位數=1,否則=0)",
            value=True
        )
        if use_count_as_target:
            median_value = df["count"].median()
            df["label"] = (df["count"] > median_value).astype(int)
            target_column = "label"
            leakage_columns.append("count")
            st.write(f"`count` 中位數 = {median_value}")
            st.write("已建立新目標欄位:`label`")
            st.write("訓練時會排除 `count` 欄位,以避免資料洩漏。")
        else:
            target_column = st.selectbox("請選擇目標欄位", all_columns)
    else:
        target_column = st.selectbox("請選擇目標欄位", all_columns)

    st.subheader("目標欄位分布")
    st.write(df[target_column].value_counts())

    # ---- Sidebar: split, scaling and model settings -------------------------
    test_size = st.sidebar.slider("測試集比例 (Test Size)", 0.1, 0.5, 0.2, 0.1)
    use_scaling = st.sidebar.checkbox("使用 StandardScaler", value=True)
    model_name = st.sidebar.selectbox(
        "選擇模型",
        ["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SVM"]
    )

    # Only the hyper-parameters of the selected model are shown and collected.
    params = {}
    if model_name == "KNN":
        params["n_neighbors"] = st.sidebar.slider("k 值", 1, 15, 5)
    elif model_name == "Decision Tree":
        params["criterion"] = st.sidebar.selectbox("criterion", ["gini", "entropy"])
        max_depth_input = st.sidebar.number_input("max_depth(0 代表不限)", min_value=0, value=5, step=1)
        # 0 in the UI means "no limit", which the estimators express as None.
        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)
    elif model_name == "Random Forest":
        params["n_estimators"] = st.sidebar.slider("n_estimators", 10, 300, 100, 10)
        max_depth_input = st.sidebar.number_input("max_depth(0 代表不限)", min_value=0, value=5, step=1)
        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)
    elif model_name == "Logistic Regression":
        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)
    elif model_name == "SVM":
        params["kernel"] = st.sidebar.selectbox("kernel", ["linear", "rbf"])
        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)

    run_button = st.sidebar.button("開始訓練模型")

    # ---- Training and evaluation --------------------------------------------
    if run_button:
        try:
            X, y = preprocess_data(df.drop(columns=leakage_columns), target_column)

            # Encode every non-numeric target (object, string, category, ...)
            # as integers so the ROC computation below has a well-defined
            # positive class.
            if not pd.api.types.is_numeric_dtype(y):
                y = LabelEncoder().fit_transform(y)

            unique_classes = np.unique(y)
            if len(unique_classes) != 2:
                st.error("目前程式設計為二元分類評估(ROC/AUC)。請選擇二元分類目標欄位。")
                st.stop()

            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=test_size,
                random_state=42,
                stratify=y
            )

            if use_scaling:
                # Fit the scaler on the training split only, then reuse it.
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)
            else:
                X_train = X_train.values
                X_test = X_test.values

            model = build_model(model_name, params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            st.success("模型訓練完成")

            col3, col4 = st.columns(2)
            with col3:
                st.subheader("Accuracy")
                acc = accuracy_score(y_test, y_pred)
                st.write(f"{acc:.4f}")
            with col4:
                if y_prob is not None:
                    fpr, tpr, _ = roc_curve(y_test, y_prob)
                    roc_auc = auc(fpr, tpr)
                    st.subheader("AUC")
                    st.write(f"{roc_auc:.4f}")

            st.subheader("Classification Report")
            report = classification_report(y_test, y_pred, output_dict=True)
            report_df = pd.DataFrame(report).transpose()
            st.dataframe(report_df)

            st.subheader("Confusion Matrix")
            plot_confusion_matrix(y_test, y_pred)

            if y_prob is not None:
                st.subheader("ROC Curve")
                plot_roc_curve(y_test, y_prob)
        except Exception as e:
            # Top-level boundary of the app: show the failure in the page.
            st.error(f"執行時發生錯誤:{e}")
else:
    st.info("請先在左側上傳資料檔案。")