jeff7522553 committed
Commit 1e4bbff · 1 Parent(s): 0a19352

Visualization text: Chinese -> English

Files changed (1)
  1. app.py +28 -19
app.py CHANGED
@@ -16,11 +16,10 @@ import json
 # --- 初始設定與資料載入 ---
 warnings.filterwarnings("ignore", category=UserWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
-plt.rcParams['font.family'] = ['Microsoft JhengHei']
+# plt.rcParams['font.family'] = ['SimHei']
 plt.rcParams['axes.unicode_minus'] = False
 
 
-# 參考 gemini 的建議,再來調整
 
 def load_data():
     """
@@ -56,15 +55,15 @@ def generate_feature_plot(feature):
     fig, ax = plt.subplots()
     if feature in NUMERICAL_FEATURES:
         sns.histplot(data=df_processed, x=feature, hue='Response', kde=True, ax=ax, palette='viridis', multiple="stack")
-        ax.set_title(f'"{feature}" 的直方圖 ( Response 分色)')
+        ax.set_title(f'Histogram of "{feature}" (colored by Response)')
     else:
         sns.countplot(data=df_processed, x=feature, hue='Response', ax=ax, palette='viridis')
-        ax.set_title(f'"{feature}" 的計數長條圖 ( Response 分色)')
+        ax.set_title(f'Count Plot of "{feature}" (colored by Response)')
     plt.tight_layout()
     return fig
 
 # --- 核心訓練與評估函式 ---
-def train_and_evaluate(history_log, model_name, features, lr_c, lr_solver, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel):
+def train_and_evaluate(history_log, model_name, features, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel):
     """
     當使用者點擊 "執行模型訓練" 按鈕時觸發。
     整合了資料準備、模型訓練、評估、結果視覺化以及紀錄日誌的完整流程。
@@ -76,22 +75,32 @@ def train_and_evaluate(history_log, model_name, features, lr_c, lr_solver, dt_cr
     # --- 1. 資料準備 ---
     X = df_processed[features]
     y = df_processed['Response']
-    X_scaled = X.copy()
-    numerical_cols_in_x = [f for f in NUMERICAL_FEATURES if f in X_scaled.columns]
+
+    # 2. 先切分資料,再進行標準化,避免資料外洩
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
+
+    # 複製 X_train 和 X_test 以避免 SettingWithCopyWarning
+    X_train_scaled = X_train.copy()
+    X_test_scaled = X_test.copy()
+    # 3. 準備 Scaler
+    numerical_cols_in_x = [f for f in NUMERICAL_FEATURES if f in X_train.columns]
     if numerical_cols_in_x:
         scaler = StandardScaler()
-        X_scaled[numerical_cols_in_x] = scaler.fit_transform(X_scaled[numerical_cols_in_x])
-    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)
+        # 4. 只在 X_train 上 fit_transform
+        X_train_scaled[numerical_cols_in_x] = scaler.fit_transform(X_train[numerical_cols_in_x])
+        # 5. 在 X_test 上 "只" transform
+        X_test_scaled[numerical_cols_in_x] = scaler.transform(X_test[numerical_cols_in_x])
 
     # --- 2. 模型選擇與訓練 ---
     params = {}
     if model_name == '羅吉斯回歸':
-        params = {'C': lr_c, 'solver': lr_solver}
+        # params = {'C': lr_c, 'solver': lr_solver}
+        params = {} # statsmodels 不使用這些參數
         X_train_sm = sm.add_constant(X_train); X_test_sm = sm.add_constant(X_test)
         logit_model = sm.Logit(y_train, X_train_sm)
         result = logit_model.fit(disp=0)
         y_pred_proba = result.predict(X_test_sm); y_pred = (y_pred_proba > 0.5).astype(int)
-        importances, title = result.tvalues.drop('const', errors='ignore'), '特徵 t-值 (Feature t-values)'
+        importances, title = result.tvalues.drop('const', errors='ignore'), 'Feature t-values'
     else:
         if model_name == '決策樹':
             params = {'criterion': dt_criterion, 'max_depth': dt_max_depth}
@@ -107,23 +116,23 @@ def train_and_evaluate(history_log, model_name, features, lr_c, lr_solver, dt_cr
         model.fit(X_train, y_train)
         y_pred = model.predict(X_test); y_pred_proba = model.predict_proba(X_test)[:, 1]
 
-        if model_name == 'SVM' and svm_kernel == 'linear': importances, title = model.coef_[0], '特徵係數 (Feature Coefficients)'
-        elif model_name in ['決策樹', 'XGBoost']: importances, title = model.feature_importances_, '特徵重要性 (Feature Importance)'
-        else: importances, title = None, '特徵重要性'
+        if model_name == 'SVM' and svm_kernel == 'linear': importances, title = model.coef_[0], 'Feature Coefficients'
+        elif model_name in ['決策樹', 'XGBoost']: importances, title = model.feature_importances_, 'Feature Importance'
+        else: importances, title = None, 'Feature Importance'
 
     # --- 3. 評估與繪圖 ---
     accuracy = accuracy_score(y_test, y_pred)
-    report = classification_report(y_test, y_pred, target_names=['不感興趣 (0)', '感興趣 (1)'])
+    report = classification_report(y_test, y_pred, target_names=['not purchase insurance (0)', 'purchase insurance (1)'])
     auc_score = f"ROC-AUC 分數: {roc_auc_score(y_test, y_pred_proba):.4f}"
     cm = confusion_matrix(y_test, y_pred)
-    fig_cm, ax_cm = plt.subplots(); sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm, xticklabels=['預測為 0', '預測為 1'], yticklabels=['實際為 0', '實際為 1']); ax_cm.set_title('混淆矩陣'); ax_cm.set_xlabel('預測標籤'); ax_cm.set_ylabel('實際標籤'); plt.tight_layout()
+    fig_cm, ax_cm = plt.subplots(); sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm, xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1']); ax_cm.set_title('Confusion Matrix'); ax_cm.set_xlabel('Predicted Label'); ax_cm.set_ylabel('Actual Label'); plt.tight_layout()
 
     fig_imp, ax_imp = plt.subplots()
     if importances is not None:
         feature_imp = pd.Series(importances, index=features).sort_values(ascending=False)
         sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax_imp); ax_imp.set_title(title)
     else:
-        ax_imp.text(0.5, 0.5, '此模型/核心無法直接顯示特徵重要性', ha='center', va='center'); ax_imp.set_title(title)
+        ax_imp.text(0.5, 0.5, 'This model/kernel cannot directly display feature importance', ha='center', va='center'); ax_imp.set_title(title)
     plt.tight_layout()
 
     # --- 4. 紀錄日誌 ---
@@ -164,7 +173,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("## 2. 模型選擇與超參數調整")
         model_selector = gr.Dropdown(['羅吉斯回歸', '決策樹', 'XGBoost', 'SVM'], label="選擇模型", value='決策樹')
         with gr.Group(visible=False) as lr_box:
-            gr.Markdown("#### 羅吉斯回歸"); lr_c = gr.Slider(0.01, 10.0, value=1.0, step=0.01, label="C (正規化強度, statsmodels中未使用)"); lr_solver = gr.Dropdown(['lbfgs', 'liblinear', 'saga'], value='lbfgs', label="優化演算法 (statsmodels中未使用)")
+            gr.Markdown("#### 羅吉斯回歸")
         with gr.Group(visible=True) as dt_box:
            gr.Markdown("#### 決策樹"); dt_criterion = gr.Radio(['gini', 'entropy'], value='gini', label="評估標準"); dt_max_depth = gr.Slider(3, 30, value=8, step=1, label="最大深度")
         with gr.Group(visible=False) as xgb_box:
@@ -194,7 +203,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
     run_btn.click(
         train_and_evaluate,
-        inputs=[log_state, model_selector, feature_selector, lr_c, lr_solver, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel],
+        inputs=[log_state, model_selector, feature_selector, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel],
         outputs=[model_output_report, model_output_auc, model_plot_cm, model_plot_importance, log_df_display, log_state]
     )
 
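
Note on the preprocessing change: the reworked block in train_and_evaluate now splits the data before standardizing, fits the StandardScaler on the training fold only, and merely transforms the test fold, so test-set statistics cannot leak into preprocessing. As a reference, here is a minimal, self-contained sketch of that pattern; the synthetic DataFrame and the 'age'/'premium' column names are illustrative assumptions, not taken from app.py.

    # Minimal sketch of the "split first, then scale" pattern used in the new hunk.
    # Synthetic data; column names 'age' / 'premium' are placeholders, not from app.py.
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({
        'age':     [23, 45, 31, 52, 38, 27, 60, 41],
        'premium': [200, 900, 400, 1200, 650, 300, 1500, 800],
        'target':  [0, 1, 0, 1, 1, 0, 1, 0],
    })
    X, y = df[['age', 'premium']], df['target']

    # Split before any fitting so the test fold never influences the scaler.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    # Fit on the training fold only...
    X_train_scaled[['age', 'premium']] = scaler.fit_transform(X_train[['age', 'premium']])
    # ...then reuse those train-set statistics to transform the test fold.
    X_test_scaled[['age', 'premium']] = scaler.transform(X_test[['age', 'premium']])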
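Note on the font change: the commit drops the hard-coded 'Microsoft JhengHei' matplotlib font (leaving a commented-out 'SimHei' line) and moves plot titles, tick labels, and report names to English, which avoids missing-glyph boxes when the hosting environment has no CJK font installed. If Chinese plot text were ever wanted again, one possible alternative is to bundle a font file with the app and register it explicitly; this is only a sketch, and 'fonts/NotoSansTC-Regular.otf' is a hypothetical path that does not exist in this repository.

    # Hedged alternative sketch: register a bundled CJK font instead of relying on system fonts.
    # The font path below is hypothetical; no such file ships with this repository.
    from pathlib import Path

    import matplotlib.pyplot as plt
    from matplotlib import font_manager

    font_path = Path("fonts/NotoSansTC-Regular.otf")
    if font_path.exists():
        # Make the font visible to matplotlib, then select it by its internal family name.
        font_manager.fontManager.addfont(str(font_path))
        family = font_manager.FontProperties(fname=str(font_path)).get_name()
        plt.rcParams['font.family'] = [family]
    plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly with CJK fonts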