jeff7522553 committed
Commit 1e4bbff · 1 Parent(s): 0a19352

Visualization text: Chinese -> English

Files changed (1)
  1. app.py +28 -19
app.py CHANGED
@@ -16,11 +16,10 @@ import json
 # --- 初始設定與資料載入 ---
 warnings.filterwarnings("ignore", category=UserWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
-plt.rcParams['font.family'] = ['Microsoft JhengHei']
+# plt.rcParams['font.family'] = ['SimHei']
 plt.rcParams['axes.unicode_minus'] = False
 
 
-# 參考 gemini 的建議,再來調整
 
 def load_data():
     """
@@ -56,15 +55,15 @@ def generate_feature_plot(feature):
     fig, ax = plt.subplots()
     if feature in NUMERICAL_FEATURES:
         sns.histplot(data=df_processed, x=feature, hue='Response', kde=True, ax=ax, palette='viridis', multiple="stack")
-        ax.set_title(f'"{feature}" 的直方圖 ( Response 分色)')
+        ax.set_title(f'Histogram of "{feature}" (colored by Response)')
     else:
         sns.countplot(data=df_processed, x=feature, hue='Response', ax=ax, palette='viridis')
-        ax.set_title(f'"{feature}" 的計數長條圖 ( Response 分色)')
+        ax.set_title(f'Count Plot of "{feature}" (colored by Response)')
     plt.tight_layout()
     return fig
 
 # --- 核心訓練與評估函式 ---
-def train_and_evaluate(history_log, model_name, features, lr_c, lr_solver, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel):
+def train_and_evaluate(history_log, model_name, features, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel):
     """
     當使用者點擊 "執行模型訓練" 按鈕時觸發。
     整合了資料準備、模型訓練、評估、結果視覺化以及紀錄日誌的完整流程。
@@ -76,22 +75,32 @@ def train_and_evaluate(history_log, model_name, features, lr_c, lr_solver, dt_cr
     # --- 1. 資料準備 ---
     X = df_processed[features]
     y = df_processed['Response']
-    X_scaled = X.copy()
-    numerical_cols_in_x = [f for f in NUMERICAL_FEATURES if f in X_scaled.columns]
+
+    # 2. 先切分資料,再進行標準化,避免資料外洩
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
+
+    # 複製 X_train 和 X_test 以避免 SettingWithCopyWarning
+    X_train_scaled = X_train.copy()
+    X_test_scaled = X_test.copy()
+    # 3. 準備 Scaler
+    numerical_cols_in_x = [f for f in NUMERICAL_FEATURES if f in X_train.columns]
     if numerical_cols_in_x:
         scaler = StandardScaler()
-        X_scaled[numerical_cols_in_x] = scaler.fit_transform(X_scaled[numerical_cols_in_x])
-    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)
+        # 4. 只在 X_train 上 fit_transform
+        X_train_scaled[numerical_cols_in_x] = scaler.fit_transform(X_train[numerical_cols_in_x])
+        # 5. 在 X_test 上 "只" transform
+        X_test_scaled[numerical_cols_in_x] = scaler.transform(X_test[numerical_cols_in_x])
 
     # --- 2. 模型選擇與訓練 ---
     params = {}
     if model_name == '羅吉斯回歸':
-        params = {'C': lr_c, 'solver': lr_solver}
+        # params = {'C': lr_c, 'solver': lr_solver}
+        params = {} # statsmodels 不使用這些參數
         X_train_sm = sm.add_constant(X_train); X_test_sm = sm.add_constant(X_test)
         logit_model = sm.Logit(y_train, X_train_sm)
         result = logit_model.fit(disp=0)
         y_pred_proba = result.predict(X_test_sm); y_pred = (y_pred_proba > 0.5).astype(int)
-        importances, title = result.tvalues.drop('const', errors='ignore'), '特徵 t-值 (Feature t-values)'
+        importances, title = result.tvalues.drop('const', errors='ignore'), 'Feature t-values'
     else:
         if model_name == '決策樹':
             params = {'criterion': dt_criterion, 'max_depth': dt_max_depth}
@@ -107,23 +116,23 @@ def train_and_evaluate(history_log, model_name, features, lr_c, lr_solver, dt_cr
         model.fit(X_train, y_train)
         y_pred = model.predict(X_test); y_pred_proba = model.predict_proba(X_test)[:, 1]
 
-        if model_name == 'SVM' and svm_kernel == 'linear': importances, title = model.coef_[0], '特徵係數 (Feature Coefficients)'
-        elif model_name in ['決策樹', 'XGBoost']: importances, title = model.feature_importances_, '特徵重要性 (Feature Importance)'
-        else: importances, title = None, '特徵重要性'
+        if model_name == 'SVM' and svm_kernel == 'linear': importances, title = model.coef_[0], 'Feature Coefficients'
+        elif model_name in ['決策樹', 'XGBoost']: importances, title = model.feature_importances_, 'Feature Importance'
+        else: importances, title = None, 'Feature Importance'
 
     # --- 3. 評估與繪圖 ---
     accuracy = accuracy_score(y_test, y_pred)
-    report = classification_report(y_test, y_pred, target_names=['不感興趣 (0)', '感興趣 (1)'])
+    report = classification_report(y_test, y_pred, target_names=['not purchase insurance (0)', 'purchase insurance (1)'])
     auc_score = f"ROC-AUC 分數: {roc_auc_score(y_test, y_pred_proba):.4f}"
     cm = confusion_matrix(y_test, y_pred)
-    fig_cm, ax_cm = plt.subplots(); sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm, xticklabels=['預測為 0', '預測為 1'], yticklabels=['實際為 0', '實際為 1']); ax_cm.set_title('混淆矩陣'); ax_cm.set_xlabel('預測標籤'); ax_cm.set_ylabel('實際標籤'); plt.tight_layout()
+    fig_cm, ax_cm = plt.subplots(); sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm, xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1']); ax_cm.set_title('Confusion Matrix'); ax_cm.set_xlabel('Predicted Label'); ax_cm.set_ylabel('Actual Label'); plt.tight_layout()
 
     fig_imp, ax_imp = plt.subplots()
     if importances is not None:
         feature_imp = pd.Series(importances, index=features).sort_values(ascending=False)
         sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax_imp); ax_imp.set_title(title)
     else:
-        ax_imp.text(0.5, 0.5, '此模型/核心無法直接顯示特徵重要性', ha='center', va='center'); ax_imp.set_title(title)
+        ax_imp.text(0.5, 0.5, 'This model/kernel cannot directly display feature importance', ha='center', va='center'); ax_imp.set_title(title)
     plt.tight_layout()
 
     # --- 4. 紀錄日誌 ---
@@ -164,7 +173,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("## 2. 模型選擇與超參數調整")
         model_selector = gr.Dropdown(['羅吉斯回歸', '決策樹', 'XGBoost', 'SVM'], label="選擇模型", value='決策樹')
         with gr.Group(visible=False) as lr_box:
-            gr.Markdown("#### 羅吉斯回歸"); lr_c = gr.Slider(0.01, 10.0, value=1.0, step=0.01, label="C (正規化強度, statsmodels中未使用)"); lr_solver = gr.Dropdown(['lbfgs', 'liblinear', 'saga'], value='lbfgs', label="優化演算法 (statsmodels中未使用)")
+            gr.Markdown("#### 羅吉斯回歸")
         with gr.Group(visible=True) as dt_box:
            gr.Markdown("#### 決策樹"); dt_criterion = gr.Radio(['gini', 'entropy'], value='gini', label="評估標準"); dt_max_depth = gr.Slider(3, 30, value=8, step=1, label="最大深度")
         with gr.Group(visible=False) as xgb_box:
@@ -194,7 +203,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
     run_btn.click(
         train_and_evaluate,
-        inputs=[log_state, model_selector, feature_selector, lr_c, lr_solver, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel],
+        inputs=[log_state, model_selector, feature_selector, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel],
         outputs=[model_output_report, model_output_auc, model_plot_cm, model_plot_importance, log_df_display, log_state]
     )
 
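
Note on the preprocessing change: the reworked block in train_and_evaluate now splits the data before standardizing, fits the StandardScaler on the training fold only, and merely transforms the test fold, so test-set statistics cannot leak into preprocessing. As a reference, here is a minimal, self-contained sketch of that pattern; the synthetic DataFrame and the 'age'/'premium' column names are illustrative assumptions, not taken from app.py.

    # Minimal sketch of the "split first, then scale" pattern used in the new hunk.
    # Synthetic data; column names 'age' / 'premium' are placeholders, not from app.py.
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({
        'age':     [23, 45, 31, 52, 38, 27, 60, 41],
        'premium': [200, 900, 400, 1200, 650, 300, 1500, 800],
        'target':  [0, 1, 0, 1, 1, 0, 1, 0],
    })
    X, y = df[['age', 'premium']], df['target']

    # Split before any fitting so the test fold never influences the scaler.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    # Fit on the training fold only...
    X_train_scaled[['age', 'premium']] = scaler.fit_transform(X_train[['age', 'premium']])
    # ...then reuse those train-set statistics to transform the test fold.
    X_test_scaled[['age', 'premium']] = scaler.transform(X_test[['age', 'premium']])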
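Note on the font change: the commit drops the hard-coded 'Microsoft JhengHei' matplotlib font (leaving a commented-out 'SimHei' line) and moves plot titles, tick labels, and report names to English, which avoids missing-glyph boxes when the hosting environment has no CJK font installed. If Chinese plot text were ever wanted again, one possible alternative is to bundle a font file with the app and register it explicitly; this is only a sketch, and 'fonts/NotoSansTC-Regular.otf' is a hypothetical path that does not exist in this repository.

    # Hedged alternative sketch: register a bundled CJK font instead of relying on system fonts.
    # The font path below is hypothetical; no such file ships with this repository.
    from pathlib import Path

    import matplotlib.pyplot as plt
    from matplotlib import font_manager

    font_path = Path("fonts/NotoSansTC-Regular.otf")
    if font_path.exists():
        # Make the font visible to matplotlib, then select it by its internal family name.
        font_manager.fontManager.addfont(str(font_path))
        family = font_manager.FontProperties(fname=str(font_path)).get_name()
        plt.rcParams['font.family'] = [family]
    plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly with CJK fonts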