Spaces:
Sleeping
Sleeping
jeff7522553
committed on
Commit
·
1e4bbff
1
Parent(s):
0a19352
視覺化中文->英文
Browse files
app.py
CHANGED
|
@@ -16,11 +16,10 @@ import json
|
|
| 16 |
# --- 初始設定與資料載入 ---
|
| 17 |
warnings.filterwarnings("ignore", category=UserWarning)
|
| 18 |
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 19 |
-
plt.rcParams['font.family'] = ['
|
| 20 |
plt.rcParams['axes.unicode_minus'] = False
|
| 21 |
|
| 22 |
|
| 23 |
-
# 參考 gemini 的建議,再來調整
|
| 24 |
|
| 25 |
def load_data():
|
| 26 |
"""
|
|
@@ -56,15 +55,15 @@ def generate_feature_plot(feature):
|
|
| 56 |
fig, ax = plt.subplots()
|
| 57 |
if feature in NUMERICAL_FEATURES:
|
| 58 |
sns.histplot(data=df_processed, x=feature, hue='Response', kde=True, ax=ax, palette='viridis', multiple="stack")
|
| 59 |
-
ax.set_title(f'"{feature}"
|
| 60 |
else:
|
| 61 |
sns.countplot(data=df_processed, x=feature, hue='Response', ax=ax, palette='viridis')
|
| 62 |
-
ax.set_title(f'"{feature}"
|
| 63 |
plt.tight_layout()
|
| 64 |
return fig
|
| 65 |
|
| 66 |
# --- 核心訓練與評估函式 ---
|
| 67 |
-
def train_and_evaluate(history_log, model_name, features,
|
| 68 |
"""
|
| 69 |
當使用者點擊 "執行模型訓練" 按鈕時觸發。
|
| 70 |
整合了資料準備、模型訓練、評估、結果視覺化以及紀錄日誌的完整流程。
|
|
@@ -76,22 +75,32 @@ def train_and_evaluate(history_log, model_name, features, lr_c, lr_solver, dt_cr
|
|
| 76 |
# --- 1. 資料準備 ---
|
| 77 |
X = df_processed[features]
|
| 78 |
y = df_processed['Response']
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
if numerical_cols_in_x:
|
| 82 |
scaler = StandardScaler()
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
# --- 2. 模型選擇與訓練 ---
|
| 87 |
params = {}
|
| 88 |
if model_name == '羅吉斯回歸':
|
| 89 |
-
params = {'C': lr_c, 'solver': lr_solver}
|
|
|
|
| 90 |
X_train_sm = sm.add_constant(X_train); X_test_sm = sm.add_constant(X_test)
|
| 91 |
logit_model = sm.Logit(y_train, X_train_sm)
|
| 92 |
result = logit_model.fit(disp=0)
|
| 93 |
y_pred_proba = result.predict(X_test_sm); y_pred = (y_pred_proba > 0.5).astype(int)
|
| 94 |
-
importances, title = result.tvalues.drop('const', errors='ignore'), '
|
| 95 |
else:
|
| 96 |
if model_name == '決策樹':
|
| 97 |
params = {'criterion': dt_criterion, 'max_depth': dt_max_depth}
|
|
@@ -107,23 +116,23 @@ def train_and_evaluate(history_log, model_name, features, lr_c, lr_solver, dt_cr
|
|
| 107 |
model.fit(X_train, y_train)
|
| 108 |
y_pred = model.predict(X_test); y_pred_proba = model.predict_proba(X_test)[:, 1]
|
| 109 |
|
| 110 |
-
if model_name == 'SVM' and svm_kernel == 'linear': importances, title = model.coef_[0], '
|
| 111 |
-
elif model_name in ['決策樹', 'XGBoost']: importances, title = model.feature_importances_, '
|
| 112 |
-
else: importances, title = None, '
|
| 113 |
|
| 114 |
# --- 3. 評估與繪圖 ---
|
| 115 |
accuracy = accuracy_score(y_test, y_pred)
|
| 116 |
-
report = classification_report(y_test, y_pred, target_names=['
|
| 117 |
auc_score = f"ROC-AUC 分數: {roc_auc_score(y_test, y_pred_proba):.4f}"
|
| 118 |
cm = confusion_matrix(y_test, y_pred)
|
| 119 |
-
fig_cm, ax_cm = plt.subplots(); sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm, xticklabels=['
|
| 120 |
|
| 121 |
fig_imp, ax_imp = plt.subplots()
|
| 122 |
if importances is not None:
|
| 123 |
feature_imp = pd.Series(importances, index=features).sort_values(ascending=False)
|
| 124 |
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax_imp); ax_imp.set_title(title)
|
| 125 |
else:
|
| 126 |
-
ax_imp.text(0.5, 0.5, '
|
| 127 |
plt.tight_layout()
|
| 128 |
|
| 129 |
# --- 4. 紀錄日誌 ---
|
|
@@ -164,7 +173,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 164 |
gr.Markdown("## 2. 模型選擇與超參數調整")
|
| 165 |
model_selector = gr.Dropdown(['羅吉斯回歸', '決策樹', 'XGBoost', 'SVM'], label="選擇模型", value='決策樹')
|
| 166 |
with gr.Group(visible=False) as lr_box:
|
| 167 |
-
gr.Markdown("#### 羅吉斯回歸")
|
| 168 |
with gr.Group(visible=True) as dt_box:
|
| 169 |
gr.Markdown("#### 決策樹"); dt_criterion = gr.Radio(['gini', 'entropy'], value='gini', label="評估標準"); dt_max_depth = gr.Slider(3, 30, value=8, step=1, label="最大深度")
|
| 170 |
with gr.Group(visible=False) as xgb_box:
|
|
@@ -194,7 +203,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 194 |
|
| 195 |
run_btn.click(
|
| 196 |
train_and_evaluate,
|
| 197 |
-
inputs=[log_state, model_selector, feature_selector,
|
| 198 |
outputs=[model_output_report, model_output_auc, model_plot_cm, model_plot_importance, log_df_display, log_state]
|
| 199 |
)
|
| 200 |
|
|
|
|
| 16 |
# --- 初始設定與資料載入 ---
|
| 17 |
warnings.filterwarnings("ignore", category=UserWarning)
|
| 18 |
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 19 |
+
# plt.rcParams['font.family'] = ['SimHei']
|
| 20 |
plt.rcParams['axes.unicode_minus'] = False
|
| 21 |
|
| 22 |
|
|
|
|
| 23 |
|
| 24 |
def load_data():
|
| 25 |
"""
|
|
|
|
| 55 |
fig, ax = plt.subplots()
|
| 56 |
if feature in NUMERICAL_FEATURES:
|
| 57 |
sns.histplot(data=df_processed, x=feature, hue='Response', kde=True, ax=ax, palette='viridis', multiple="stack")
|
| 58 |
+
ax.set_title(f'Histogram of "{feature}" (colored by Response)')
|
| 59 |
else:
|
| 60 |
sns.countplot(data=df_processed, x=feature, hue='Response', ax=ax, palette='viridis')
|
| 61 |
+
ax.set_title(f'Count Plot of "{feature}" (colored by Response)')
|
| 62 |
plt.tight_layout()
|
| 63 |
return fig
|
| 64 |
|
| 65 |
# --- 核心訓練與評估函式 ---
|
| 66 |
+
def train_and_evaluate(history_log, model_name, features, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel):
|
| 67 |
"""
|
| 68 |
當使用者點擊 "執行模型訓練" 按鈕時觸發。
|
| 69 |
整合了資料準備、模型訓練、評估、結果視覺化以及紀錄日誌的完整流程。
|
|
|
|
| 75 |
# --- 1. 資料準備 ---
|
| 76 |
X = df_processed[features]
|
| 77 |
y = df_processed['Response']
|
| 78 |
+
|
| 79 |
+
# 2. 先切分資料,再進行標準化,避免資料外洩
|
| 80 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
|
| 81 |
+
|
| 82 |
+
# 複製 X_train 和 X_test 以避免 SettingWithCopyWarning
|
| 83 |
+
X_train_scaled = X_train.copy()
|
| 84 |
+
X_test_scaled = X_test.copy()
|
| 85 |
+
# 3. 準備 Scaler
|
| 86 |
+
numerical_cols_in_x = [f for f in NUMERICAL_FEATURES if f in X_train.columns]
|
| 87 |
if numerical_cols_in_x:
|
| 88 |
scaler = StandardScaler()
|
| 89 |
+
# 4. 只在 X_train 上 fit_transform
|
| 90 |
+
X_train_scaled[numerical_cols_in_x] = scaler.fit_transform(X_train[numerical_cols_in_x])
|
| 91 |
+
# 5. 在 X_test 上 "只" transform
|
| 92 |
+
X_test_scaled[numerical_cols_in_x] = scaler.transform(X_test[numerical_cols_in_x])
|
| 93 |
|
| 94 |
# --- 2. 模型選擇與訓練 ---
|
| 95 |
params = {}
|
| 96 |
if model_name == '羅吉斯回歸':
|
| 97 |
+
# params = {'C': lr_c, 'solver': lr_solver}
|
| 98 |
+
params = {} # statsmodels 不使用這些參數
|
| 99 |
X_train_sm = sm.add_constant(X_train); X_test_sm = sm.add_constant(X_test)
|
| 100 |
logit_model = sm.Logit(y_train, X_train_sm)
|
| 101 |
result = logit_model.fit(disp=0)
|
| 102 |
y_pred_proba = result.predict(X_test_sm); y_pred = (y_pred_proba > 0.5).astype(int)
|
| 103 |
+
importances, title = result.tvalues.drop('const', errors='ignore'), 'Feature t-values'
|
| 104 |
else:
|
| 105 |
if model_name == '決策樹':
|
| 106 |
params = {'criterion': dt_criterion, 'max_depth': dt_max_depth}
|
|
|
|
| 116 |
model.fit(X_train, y_train)
|
| 117 |
y_pred = model.predict(X_test); y_pred_proba = model.predict_proba(X_test)[:, 1]
|
| 118 |
|
| 119 |
+
if model_name == 'SVM' and svm_kernel == 'linear': importances, title = model.coef_[0], 'Feature Coefficients'
|
| 120 |
+
elif model_name in ['決策樹', 'XGBoost']: importances, title = model.feature_importances_, 'Feature Importance'
|
| 121 |
+
else: importances, title = None, 'Feature Importance'
|
| 122 |
|
| 123 |
# --- 3. 評估與繪圖 ---
|
| 124 |
accuracy = accuracy_score(y_test, y_pred)
|
| 125 |
+
report = classification_report(y_test, y_pred, target_names=['not purchase insurance (0)', 'purchase insurance (1)'])
|
| 126 |
auc_score = f"ROC-AUC 分數: {roc_auc_score(y_test, y_pred_proba):.4f}"
|
| 127 |
cm = confusion_matrix(y_test, y_pred)
|
| 128 |
+
fig_cm, ax_cm = plt.subplots(); sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm, xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1']); ax_cm.set_title('Confusion Matrix'); ax_cm.set_xlabel('Predicted Label'); ax_cm.set_ylabel('Actual Label'); plt.tight_layout()
|
| 129 |
|
| 130 |
fig_imp, ax_imp = plt.subplots()
|
| 131 |
if importances is not None:
|
| 132 |
feature_imp = pd.Series(importances, index=features).sort_values(ascending=False)
|
| 133 |
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax_imp); ax_imp.set_title(title)
|
| 134 |
else:
|
| 135 |
+
ax_imp.text(0.5, 0.5, 'This model/kernel cannot directly display feature importance', ha='center', va='center'); ax_imp.set_title(title)
|
| 136 |
plt.tight_layout()
|
| 137 |
|
| 138 |
# --- 4. 紀錄日誌 ---
|
|
|
|
| 173 |
gr.Markdown("## 2. 模型選擇與超參數調整")
|
| 174 |
model_selector = gr.Dropdown(['羅吉斯回歸', '決策樹', 'XGBoost', 'SVM'], label="選擇模型", value='決策樹')
|
| 175 |
with gr.Group(visible=False) as lr_box:
|
| 176 |
+
gr.Markdown("#### 羅吉斯回歸")
|
| 177 |
with gr.Group(visible=True) as dt_box:
|
| 178 |
gr.Markdown("#### 決策樹"); dt_criterion = gr.Radio(['gini', 'entropy'], value='gini', label="評估標準"); dt_max_depth = gr.Slider(3, 30, value=8, step=1, label="最大深度")
|
| 179 |
with gr.Group(visible=False) as xgb_box:
|
|
|
|
| 203 |
|
| 204 |
run_btn.click(
|
| 205 |
train_and_evaluate,
|
| 206 |
+
inputs=[log_state, model_selector, feature_selector, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel],
|
| 207 |
outputs=[model_output_report, model_output_auc, model_plot_cm, model_plot_importance, log_df_display, log_state]
|
| 208 |
)
|
| 209 |
|