Spaces:

william1324
/

modeltraining

Paused

App Files Files Community

modeltraining / streamlit_app.py

william1324

Rename app.py to streamlit_app.py

7c3e574 verified about 2 months ago

raw

history blame contribute delete

8.81 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt

	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler, LabelEncoder
	from sklearn.impute import SimpleImputer

	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import SVC

	from sklearn.metrics import (
	accuracy_score,
	classification_report,
	confusion_matrix,
	ConfusionMatrixDisplay,
	roc_curve,
	auc
	)


	# Page-wide settings: browser tab title and the wide layout.
	# NOTE(review): Streamlit documents set_page_config as a call that should run
	# before other st.* commands on the page - keep it first; confirm against the
	# installed Streamlit version.
	st.set_page_config(page_title="機器學習模型訓練工具", layout="wide")

	# Page heading, followed by a one-line summary of what the tool supports
	# (data upload, preprocessing, model training, evaluation and visualization).
	st.title("機器學習模型訓練工具開發")
	st.write("支援資料上傳、前處理、模型訓練、模型評估與視覺化。")


	def load_data(uploaded_file):
	    """Read an uploaded CSV or Excel file into a DataFrame.

	    The reader is chosen from the file name's extension, compared
	    case-insensitively.

	    Args:
	        uploaded_file: File-like object exposing a ``name`` attribute and
	            accepted by ``pd.read_csv`` / ``pd.read_excel``.

	    Returns:
	        The parsed ``DataFrame``, or ``None`` when the extension is not
	        ``.csv``, ``.xlsx`` or ``.xls``.
	    """
	    lowered_name = uploaded_file.name.lower()

	    if lowered_name.endswith(".csv"):
	        return pd.read_csv(uploaded_file)

	    if lowered_name.endswith((".xlsx", ".xls")):
	        return pd.read_excel(uploaded_file)

	    # Unsupported extension: the caller decides how to report it.
	    return None


	def preprocess_data(df, target_column):
	    """Split *df* into a model-ready feature matrix and its target series.

	    Steps, in order:

	    1. Drop rows that are entirely empty and rows whose target is missing
	       (a row without a label cannot be used for supervised training).
	    2. Drop feature columns that contain no observed value at all; they
	       have no median/mode to impute from and carry no signal.
	    3. Fill missing numeric values with the column median and missing
	       non-numeric values with the column's most frequent value.
	    4. One-hot encode the non-numeric columns (first level dropped),
	       emitting float columns so the whole matrix is numeric.

	    The input frame is not modified.

	    Args:
	        df: Source ``DataFrame`` containing features and the target.
	        target_column: Name of the column to use as the target.

	    Returns:
	        ``(X, y)`` where ``X`` is the imputed, encoded feature
	        ``DataFrame`` and ``y`` is the target ``Series`` aligned with it.

	    Raises:
	        KeyError: If ``target_column`` is not a column of ``df``.
	    """
	    # dropna returns new frames, so the caller's DataFrame stays untouched.
	    cleaned = df.dropna(how="all")
	    cleaned = cleaned.dropna(subset=[target_column])

	    y = cleaned[target_column]
	    X = cleaned.drop(columns=[target_column]).dropna(axis=1, how="all")

	    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
	    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

	    if numeric_cols:
	        X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

	    if categorical_cols:
	        for col in categorical_cols:
	            modes = X[col].mode()
	            # mode() is empty only when the column has no observed value.
	            if not modes.empty:
	                X[col] = X[col].fillna(modes.iloc[0])

	        # dtype=float keeps the matrix homogeneous (no bool/object mix).
	        X = pd.get_dummies(
	            X, columns=categorical_cols, drop_first=True, dtype=float
	        )

	    return X, y


	def build_model(model_name, params):
	    """Create an unfitted classifier for the given model name.

	    Args:
	        model_name: One of ``"KNN"``, ``"Decision Tree"``,
	            ``"Random Forest"``, ``"Logistic Regression"`` or ``"SVM"``.
	        params: Hyper-parameters for the chosen model. Only the keys the
	            selected model needs are read (``n_neighbors``; ``criterion``
	            and ``max_depth``; ``n_estimators`` and ``max_depth``; ``C``;
	            ``kernel`` and ``C``).

	    Returns:
	        The configured estimator, or ``None`` for an unrecognised name.

	    Raises:
	        KeyError: If ``params`` lacks a key required by the chosen model.
	    """
	    # Factories are lazy so that only the selected model touches `params`.
	    factories = {
	        "KNN": lambda: KNeighborsClassifier(
	            n_neighbors=params["n_neighbors"],
	        ),
	        "Decision Tree": lambda: DecisionTreeClassifier(
	            criterion=params["criterion"],
	            max_depth=params["max_depth"],
	            random_state=42,
	        ),
	        "Random Forest": lambda: RandomForestClassifier(
	            n_estimators=params["n_estimators"],
	            max_depth=params["max_depth"],
	            random_state=42,
	        ),
	        "Logistic Regression": lambda: LogisticRegression(
	            C=params["C"],
	            max_iter=1000,
	            random_state=42,
	        ),
	        # probability=True so the fitted SVC exposes predict_proba.
	        "SVM": lambda: SVC(
	            kernel=params["kernel"],
	            C=params["C"],
	            probability=True,
	            random_state=42,
	        ),
	    }

	    make_estimator = factories.get(model_name)
	    if make_estimator is None:
	        return None
	    return make_estimator()


	def plot_confusion_matrix(y_true, y_pred):
	    """Draw the confusion matrix of the predictions in the Streamlit page.

	    Args:
	        y_true: Ground-truth labels.
	        y_pred: Labels predicted by the model, aligned with ``y_true``.
	    """
	    fig, ax = plt.subplots(figsize=(5, 4))
	    try:
	        matrix = confusion_matrix(y_true, y_pred)
	        ConfusionMatrixDisplay(confusion_matrix=matrix).plot(ax=ax)
	        st.pyplot(fig)
	    finally:
	        # pyplot keeps every figure alive until it is closed; the script is
	        # re-executed on each interaction, so unclosed figures accumulate.
	        plt.close(fig)


	def plot_roc_curve(y_true, y_prob):
	    """Draw the ROC curve in the Streamlit page and return its AUC.

	    Args:
	        y_true: Ground-truth binary labels.
	        y_prob: Scores for the positive class, aligned with ``y_true``.

	    Returns:
	        The area under the ROC curve computed from ``y_true``/``y_prob``.
	    """
	    fpr, tpr, _ = roc_curve(y_true, y_prob)
	    roc_auc = auc(fpr, tpr)

	    fig, ax = plt.subplots(figsize=(6, 4))
	    try:
	        ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
	        # Diagonal reference line from (0, 0) to (1, 1).
	        ax.plot([0, 1], [0, 1], linestyle="--")
	        ax.set_xlabel("False Positive Rate")
	        ax.set_ylabel("True Positive Rate")
	        ax.set_title("ROC Curve")
	        ax.legend(loc="lower right")
	        st.pyplot(fig)
	    finally:
	        # pyplot keeps every figure alive until it is closed; the script is
	        # re-executed on each interaction, so unclosed figures accumulate.
	        plt.close(fig)

	    return roc_auc


	# ---------------------------------------------------------------------------
	# Page flow: upload -> data overview -> target selection -> model settings
	# -> train and evaluate. Streamlit re-runs this script on every interaction.
	# ---------------------------------------------------------------------------
	st.sidebar.header("操作區")
	uploaded_file = st.sidebar.file_uploader("請上傳 CSV 或 Excel 檔", type=["csv", "xlsx", "xls"])

	if uploaded_file is not None:
	    df = load_data(uploaded_file)

	    if df is None:
	        st.error("檔案格式不支援。")
	        st.stop()

	    # --- Data overview ------------------------------------------------------
	    st.subheader("原始資料預覽")
	    st.dataframe(df.head())

	    col1, col2 = st.columns(2)

	    with col1:
	        st.subheader("資料基本資訊")
	        st.write(f"資料維度：{df.shape[0]} 筆 × {df.shape[1]} 欄")
	        st.write("欄位型態：")
	        st.dataframe(pd.DataFrame(df.dtypes, columns=["dtype"]))

	    with col2:
	        st.subheader("缺失值統計")
	        st.dataframe(pd.DataFrame(df.isnull().sum(), columns=["missing_count"]))

	    # --- Target selection ---------------------------------------------------
	    st.subheader("欄位選擇")
	    all_columns = df.columns.tolist()

	    # Columns the target was computed from. They must never be fed to the
	    # model as features, otherwise it can read the answer straight from them.
	    leakage_columns = []

	    use_count_as_target = False
	    if "count" in all_columns:
	        st.info("偵測到 count 欄位，可依作業需求轉為二元分類標籤。")
	        use_count_as_target = st.checkbox(
	            "將 count 轉為二元分類標籤（大於中位數=1，否則=0）",
	            value=True
	        )

	    if use_count_as_target:
	        median_value = df["count"].median()
	        df["label"] = (df["count"] > median_value).astype(int)
	        target_column = "label"
	        leakage_columns.append("count")
	        st.write(f"`count` 中位數 = {median_value}")
	        st.write("已建立新目標欄位：`label`")
	    else:
	        target_column = st.selectbox("請選擇目標欄位", all_columns)

	    st.subheader("目標欄位分布")
	    st.write(df[target_column].value_counts())

	    # --- Sidebar: split, scaling and model hyper-parameters -----------------
	    test_size = st.sidebar.slider("測試集比例 (Test Size)", 0.1, 0.5, 0.2, 0.1)
	    use_scaling = st.sidebar.checkbox("使用 StandardScaler", value=True)

	    model_name = st.sidebar.selectbox(
	        "選擇模型",
	        ["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SVM"]
	    )

	    params = {}

	    if model_name == "KNN":
	        params["n_neighbors"] = st.sidebar.slider("k 值", 1, 15, 5)

	    elif model_name == "Decision Tree":
	        params["criterion"] = st.sidebar.selectbox("criterion", ["gini", "entropy"])
	        max_depth_input = st.sidebar.number_input("max_depth（0 代表不限）", min_value=0, value=5, step=1)
	        # 0 in the UI means "unlimited", which the estimators spell as None.
	        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)

	    elif model_name == "Random Forest":
	        params["n_estimators"] = st.sidebar.slider("n_estimators", 10, 300, 100, 10)
	        max_depth_input = st.sidebar.number_input("max_depth（0 代表不限）", min_value=0, value=5, step=1)
	        params["max_depth"] = None if max_depth_input == 0 else int(max_depth_input)

	    elif model_name == "Logistic Regression":
	        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)

	    elif model_name == "SVM":
	        params["kernel"] = st.sidebar.selectbox("kernel", ["linear", "rbf"])
	        params["C"] = st.sidebar.slider("C", 0.01, 10.0, 1.0, 0.01)

	    run_button = st.sidebar.button("開始訓練模型")

	    # --- Train and evaluate -------------------------------------------------
	    if run_button:
	        try:
	            # Rows without a target value cannot be used for training, and
	            # the columns the label was derived from are removed so the
	            # model cannot recover the label directly from them.
	            training_df = df.dropna(subset=[target_column])
	            training_df = training_df.drop(columns=leakage_columns)

	            X, y = preprocess_data(training_df, target_column)

	            if y.nunique() != 2:
	                st.error("目前程式設計為二元分類評估（ROC/AUC）。請選擇二元分類目標欄位。")
	                st.stop()

	            # Always map the two classes to 0/1: roc_curve only infers the
	            # positive class when labels are {0, 1} or {-1, 1}, so targets
	            # such as {1, 2} or strings would otherwise fail at evaluation.
	            y = LabelEncoder().fit_transform(y)

	            X_train, X_test, y_train, y_test = train_test_split(
	                X, y,
	                test_size=test_size,
	                random_state=42,
	                stratify=y
	            )

	            if use_scaling:
	                # Fit on the training split only; reuse its statistics on test.
	                scaler = StandardScaler()
	                X_train = scaler.fit_transform(X_train)
	                X_test = scaler.transform(X_test)
	            else:
	                # Force a homogeneous float matrix even if some encoded
	                # columns are boolean.
	                X_train = X_train.to_numpy(dtype=float)
	                X_test = X_test.to_numpy(dtype=float)

	            model = build_model(model_name, params)
	            model.fit(X_train, y_train)

	            y_pred = model.predict(X_test)
	            # Column 1 holds the probability of the class encoded as 1.
	            y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

	            st.success("模型訓練完成")

	            col3, col4 = st.columns(2)

	            with col3:
	                st.subheader("Accuracy")
	                acc = accuracy_score(y_test, y_pred)
	                st.write(f"{acc:.4f}")

	            with col4:
	                if y_prob is not None:
	                    fpr, tpr, _ = roc_curve(y_test, y_prob)
	                    roc_auc = auc(fpr, tpr)
	                    st.subheader("AUC")
	                    st.write(f"{roc_auc:.4f}")

	            st.subheader("Classification Report")
	            report = classification_report(y_test, y_pred, output_dict=True)
	            report_df = pd.DataFrame(report).transpose()
	            st.dataframe(report_df)

	            st.subheader("Confusion Matrix")
	            plot_confusion_matrix(y_test, y_pred)

	            if y_prob is not None:
	                st.subheader("ROC Curve")
	                plot_roc_curve(y_test, y_prob)

	        except Exception as e:
	            # Top-level UI boundary: show the failure instead of crashing.
	            st.error(f"執行時發生錯誤：{e}")

	else:
	    st.info("請先在左側上傳資料檔案。")