Spaces:
Sleeping
Sleeping
File size: 14,420 Bytes
0a19352 5b16257 0a19352 1e4bbff 0a19352 e970db4 0a19352 e970db4 0a19352 e970db4 0a19352 1e4bbff 0a19352 1e4bbff 0a19352 1e4bbff 0a19352 1e4bbff 0a19352 1e4bbff 0a19352 1e4bbff 0a19352 1e4bbff 0a19352 1e4bbff 0a19352 5b16257 e970db4 5b16257 e970db4 5b16257 0a19352 1e4bbff 0a19352 1e4bbff 0a19352 5b16257 0a19352 5b16257 0a19352 5b16257 0a19352 e970db4 0a19352 1e4bbff 0a19352 5b16257 0a19352 5b16257 0a19352 1e4bbff 5b16257 0a19352 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import warnings
import json
# --- Initial setup and data loading ---
# Silence noisy library warnings (sklearn / xgboost / seaborn emit many of these).
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# plt.rcParams['font.family'] = ['SimHei']
# Use an ASCII hyphen for negative ticks — presumably to avoid missing-glyph
# boxes when a CJK font is active (see the commented-out SimHei line above).
plt.rcParams['axes.unicode_minus'] = False
def processDisplayDataframe(df):
    """
    Return a display-ready copy of *df* with every numeric column
    rendered as a fixed 4-decimal string.

    Parameters
    ----------
    df : pd.DataFrame
        Any frame about to be shown in a gr.DataFrame widget.

    Returns
    -------
    pd.DataFrame
        A formatted COPY; the caller's frame is left untouched.
        (BUG FIX: the original assigned into *df* directly, silently
        converting the caller's numeric columns to strings.)
    """
    out = df.copy()
    num_cols = out.select_dtypes(include=np.number).columns

    def _fmt(x):
        return f"{x:.4f}"

    try:
        # DataFrame.map is the pandas >= 2.1 element-wise API ...
        out[num_cols] = out[num_cols].map(_fmt)
    except AttributeError:
        # ... on older pandas it is still called applymap.
        out[num_cols] = out[num_cols].applymap(_fmt)
    return out
def load_data():
    """
    Load the sampled dataset and apply the fixed preprocessing steps.

    Runs exactly once, at application start-up.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        (raw frame exactly as read from disk,
         encoded copy ready for modelling)
    """
    raw = pd.read_csv('sampled_data.csv')

    encoded = raw.copy()
    # The row id carries no predictive information.
    encoded = encoded.drop('id', axis=1)
    # Binary / ordinal encoding of the categorical columns.
    encoded['Gender'] = encoded['Gender'].apply(lambda g: 1 if g == 'Male' else 0)
    encoded['Vehicle_Age'] = encoded['Vehicle_Age'].map(
        {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    )
    encoded['Vehicle_Damage'] = encoded['Vehicle_Damage'].apply(
        lambda d: 1 if d == 'Yes' else 0
    )
    return raw, encoded
# Load once at import time: raw frame for display, encoded frame for modelling.
df_original, df_processed = load_data()
# Every column except the prediction target is a candidate feature.
ALL_FEATURES = [col for col in df_processed.columns if col != 'Response']
# Columns that were numeric in the ORIGINAL csv and are candidate features;
# these are the ones that later get standard-scaled before training.
NUMERICAL_FEATURES = [f for f in df_original.select_dtypes(include=np.number).columns.tolist() if f in ALL_FEATURES]
# --- EDA 相關函式 ---
def update_eda_section(selected_features):
    """
    Refresh the EDA panel for the features currently ticked in the UI.

    Returns, in Gradio output order: the descriptive-statistics table,
    the correlation-with-target table, an update for the plot-selector
    dropdown, and the plot of the first selected feature.
    """
    if not selected_features:
        # Nothing selected: blank out every EDA widget.
        return pd.DataFrame(), pd.DataFrame(), gr.update(choices=[], value=None), None

    describe_table = (
        df_processed[selected_features]
        .describe()
        .T
        .reset_index()
        .rename(columns={'index': 'Feature'})
    )

    target_corr = (
        df_processed[selected_features + ['Response']]
        .corr(numeric_only=True)['Response']
        .drop('Response')
        .to_frame()
        .reset_index()
    )
    target_corr.columns = ['Feature', 'Correlation with Response']

    first_plot = generate_feature_plot(selected_features[0])
    selector_update = gr.update(choices=selected_features, value=selected_features[0])

    return (
        processDisplayDataframe(describe_table),
        processDisplayDataframe(target_corr),
        selector_update,
        first_plot,
    )
def generate_feature_plot(feature):
    """
    Build a seaborn figure for a single feature, colored by 'Response'.

    Numeric features get a stacked histogram with a KDE overlay; any
    other feature gets a count plot. Returns None when no feature is
    chosen (e.g. the dropdown was cleared).
    """
    if not feature:
        return None

    fig, axis = plt.subplots()
    if feature in NUMERICAL_FEATURES:
        sns.histplot(data=df_processed, x=feature, hue='Response', kde=True, ax=axis, palette='viridis', multiple="stack")
        axis.set_title(f'Histogram of "{feature}" (colored by Response)')
    else:
        sns.countplot(data=df_processed, x=feature, hue='Response', ax=axis, palette='viridis')
        axis.set_title(f'Count Plot of "{feature}" (colored by Response)')
    plt.tight_layout()
    return fig
# --- 核心訓練與評估函式 ---
def train_and_evaluate(history_log, model_name, features, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel):
    """
    Triggered when the user clicks the "run model training" button.

    Full pipeline: data preparation, model training, evaluation,
    result visualization, and appending a row to the history log.

    Parameters
    ----------
    history_log : list[list[str]]
        Accumulated log rows (newest first), kept in a gr.State.
    model_name : str
        One of '羅吉斯回歸', '決策樹', 'XGBoost', 'SVM'.
    features : list[str]
        Feature column names selected in the UI.
    dt_* / xgb_* / svm_* :
        Hyper-parameters for the respective models; only the group
        matching ``model_name`` is read.

    Returns
    -------
    tuple
        11 values matching the ``run_btn.click`` outputs: per-class
        report, averaged report, five metric strings, confusion-matrix
        figure, importance figure, log DataFrame, updated raw log list.
    """
    if not features:
        # BUG FIX: the original returned only 6 values here while the click
        # handler wires 11 outputs, which makes Gradio raise a length
        # mismatch. Return a full-length tuple with the error message shown
        # in the accuracy textbox and the log left untouched.
        log_df = pd.DataFrame(history_log, columns=LOG_COLUMNS)
        return (None, None, "錯誤:請至少選擇一個特徵!", None, None, None, None,
                None, None, log_df, history_log)

    # --- 1. Data preparation ---
    X = df_processed[features]
    y = df_processed['Response']
    # Split BEFORE scaling so no test-set statistics leak into the scaler.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    # Copy so the assignments below do not trigger SettingWithCopyWarning.
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    numerical_cols_in_x = [f for f in NUMERICAL_FEATURES if f in X_train.columns]
    if numerical_cols_in_x:
        scaler = StandardScaler()
        # Fit on the training split only, then apply the same transform to test.
        X_train_scaled[numerical_cols_in_x] = scaler.fit_transform(X_train[numerical_cols_in_x])
        X_test_scaled[numerical_cols_in_x] = scaler.transform(X_test[numerical_cols_in_x])

    # --- 2. Model selection & training ---
    # BUG FIX: the original prepared X_train_scaled / X_test_scaled above but
    # then fit and predicted on the UNSCALED frames, so the scaling never took
    # effect (scale-sensitive models such as SVM were trained on raw values).
    # All models now use the scaled data as intended.
    params = {}
    if model_name == '羅吉斯回歸':
        params = {}  # statsmodels Logit takes no tunable hyper-parameters here
        X_train_sm = sm.add_constant(X_train_scaled)
        X_test_sm = sm.add_constant(X_test_scaled)
        result = sm.Logit(y_train, X_train_sm).fit(disp=0)
        y_pred_proba = result.predict(X_test_sm)
        y_pred = (y_pred_proba > 0.5).astype(int)
        # t-values double as a significance-based "importance" display.
        importances, title = result.tvalues.drop('const', errors='ignore'), 'Feature t-values'
    else:
        if model_name == '決策樹':
            params = {'criterion': dt_criterion, 'max_depth': dt_max_depth}
            model = DecisionTreeClassifier(**params, random_state=42, class_weight='balanced')
        elif model_name == 'XGBoost':
            params = {'n_estimators': int(xgb_n_estimators), 'max_depth': int(xgb_max_depth), 'learning_rate': xgb_learning_rate}
            # Compensate for class imbalance: ratio of negatives to positives.
            scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
            # NOTE: `use_label_encoder=False` was dropped — the parameter was
            # deprecated in XGBoost 1.x and removed in 2.0.
            model = xgb.XGBClassifier(**params, scale_pos_weight=scale_pos_weight, eval_metric='logloss', random_state=42)
        elif model_name == 'SVM':
            params = {'C': svm_c, 'kernel': svm_kernel}
            model = SVC(**params, probability=True, random_state=42, class_weight='balanced')
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
        if model_name == 'SVM' and svm_kernel == 'linear':
            importances, title = model.coef_[0], 'Feature Coefficients'
        elif model_name in ['決策樹', 'XGBoost']:
            importances, title = model.feature_importances_, 'Feature Importance'
        else:
            # Non-linear SVM kernels expose no per-feature weights.
            importances, title = None, 'Feature Importance'

    # --- 3. Evaluation & plots ---
    accuracy_value = accuracy_score(y_test, y_pred)
    precision_value = precision_score(y_test, y_pred)
    recall_value = recall_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    roc_auc_value = roc_auc_score(y_test, y_pred_proba)
    accuracy_text = f"準確率 分數: {accuracy_value:.4f}"
    precision_text = f"精確率 分數: {precision_value:.4f}"
    recall_text = f"召回率 分數: {recall_value:.4f}"
    f1_score_text = f"F1 分數: {f1_score_value:.4f}"
    roc_auc_text = f"ROC-AUC 分數: {roc_auc_value:.4f}"

    report_dict = classification_report(y_test, y_pred, target_names=['not purchase insurance (0)', 'purchase insurance (1)'], output_dict=True)
    # Per-class rows (transposed so each class is one row of the table).
    classfy_report = pd.DataFrame({
        'not purchase insurance (0)': report_dict['not purchase insurance (0)'],
        'purchase insurance (1)': report_dict['purchase insurance (1)'],
    }, columns=['not purchase insurance (0)', 'purchase insurance (1)']).T
    classfy_report.insert(0, "index", classfy_report.index)
    classfy_report = processDisplayDataframe(classfy_report)
    # Macro / weighted averages as a separate small table.
    avg_report = pd.DataFrame([
        report_dict["macro avg"],
        report_dict["weighted avg"],
    ], index=["macro avg", "weighted avg"])
    avg_report.insert(0, "index", avg_report.index)
    avg_report = processDisplayDataframe(avg_report)

    cm = confusion_matrix(y_test, y_pred)
    fig_cm, ax_cm = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm,
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    ax_cm.set_title('Confusion Matrix')
    ax_cm.set_xlabel('Predicted Label')
    ax_cm.set_ylabel('Actual Label')
    plt.tight_layout()

    fig_imp, ax_imp = plt.subplots()
    if importances is not None:
        feature_imp = pd.Series(importances, index=features).sort_values(ascending=False)
        sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax_imp)
        ax_imp.set_title(title)
    else:
        ax_imp.text(0.5, 0.5, 'This model/kernel cannot directly display feature importance', ha='center', va='center')
        ax_imp.set_title(title)
    plt.tight_layout()

    # --- 4. Log this run ---
    new_log_entry = [
        pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
        model_name,
        ', '.join(features),
        json.dumps(params),
        f"{accuracy_value:.4f}",
        f"{precision_value:.4f}",
        f"{recall_value:.4f}",
        f"{f1_score_value:.4f}",
        f"{roc_auc_value:.4f}",
    ]
    # Newest entry first.
    updated_log = [new_log_entry] + history_log
    log_df = pd.DataFrame(updated_log, columns=LOG_COLUMNS)
    return classfy_report, avg_report, accuracy_text, precision_text, recall_text, f1_score_text, roc_auc_text, fig_cm, fig_imp, log_df, updated_log
# --- Gradio interface layout ---
# Column headers for the run-history table; the order must match
# new_log_entry as built inside train_and_evaluate.
LOG_COLUMNS = ["時間", "模型", "特徵", "參數", "準確率", "精確率", "召回率", "F1 分數", "ROC-AUC 分數"]
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Hidden state component that persists the raw history-log rows
    # across interactions (mirrored into the visible log table).
    log_state = gr.State([])
    gr.Markdown("# 投保預測模型建置專案")
    gr.Markdown("在左側選擇特徵並點擊按鈕進行探索,或調整參數後點擊按鈕以訓練模型。")
    with gr.Row():
        # Left column: feature selection, EDA, model & hyper-parameter choice.
        with gr.Column(scale=1):
            gr.Markdown("## 1. 特徵選擇與探索")
            feature_selector = gr.CheckboxGroup(ALL_FEATURES, label="選擇特徵", value=['Previously_Insured', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Vehicle_Age', 'Age'])
            with gr.Row():
                select_all_btn = gr.Button("全部選取"); deselect_all_btn = gr.Button("全部清除")
            with gr.Accordion("特徵探索 (EDA)", open=True):
                eda_run_btn = gr.Button("執行資料探索", variant="secondary")
                eda_stats = gr.DataFrame(label="敘述性統計")
                eda_corr = gr.DataFrame(label="與目標 'Response' 的相關係數")
                eda_plot_selector = gr.Dropdown(label="選擇要視覺化的特徵")
                eda_plot = gr.Plot(label="視覺化")
            gr.Markdown("## 2. 模型選擇與超參數調整")
            model_selector = gr.Dropdown(['羅吉斯回歸', '決策樹', 'XGBoost', 'SVM'], label="選擇模型", value='決策樹')
            # One hyper-parameter group per model; show_hyperparameters below
            # toggles visibility so only the selected model's group is shown.
            with gr.Group(visible=False) as lr_box:
                gr.Markdown("#### 羅吉斯回歸")
            with gr.Group(visible=True) as dt_box:
                gr.Markdown("#### 決策樹"); dt_criterion = gr.Radio(['gini', 'entropy'], value='gini', label="評估標準"); dt_max_depth = gr.Slider(3, 30, value=8, step=1, label="最大深度")
            with gr.Group(visible=False) as xgb_box:
                gr.Markdown("#### XGBoost"); xgb_n_estimators = gr.Slider(50, 500, value=100, step=10, label="樹的數量"); xgb_max_depth = gr.Slider(3, 15, value=5, step=1, label="最大深度"); xgb_learning_rate = gr.Slider(0.01, 0.3, value=0.1, step=0.01, label="學習率")
            with gr.Group(visible=False) as svm_box:
                gr.Markdown("#### SVM"); svm_c = gr.Slider(0.01, 10.0, value=1.0, step=0.01, label="C (懲罰參數)"); svm_kernel = gr.Radio(['linear', 'rbf', 'poly'], value='linear', label="核心")
            run_btn = gr.Button("🚀 執行模型訓練", variant="primary")
        # Right column: evaluation outputs and the history log.
        with gr.Column(scale=2):
            gr.Markdown("## 3. 模型評估結果")
            model_output_accuracy = gr.Textbox(label="準確率 分數")
            model_output_precision = gr.Textbox(label="精確率 分數")
            model_output_recall = gr.Textbox(label="召回率 分數")
            model_output_f1_score = gr.Textbox(label="F1 分數")
            model_output_auc = gr.Textbox(label="AUC 分數")
            model_output_report = gr.DataFrame(label="分類報告")
            model_output_report_avg = gr.DataFrame(label="平均報告")
            model_plot_cm = gr.Plot(label="混淆矩陣")
            model_plot_importance = gr.Plot(label="特徵重要性/係數")
            with gr.Accordion("操作紀錄 (History Log)", open=False):
                # NOTE(review): datatype lists 5 entries but LOG_COLUMNS has 9
                # columns — confirm whether the remaining columns were meant to
                # be declared as well.
                log_df_display = gr.DataFrame(headers=LOG_COLUMNS, datatype=["str", "str", "str", "str", "str"])
    # --- Event wiring ---
    eda_run_btn.click(update_eda_section, inputs=feature_selector, outputs=[eda_stats, eda_corr, eda_plot_selector, eda_plot])
    eda_plot_selector.change(generate_feature_plot, inputs=eda_plot_selector, outputs=eda_plot)
    # Show only the hyper-parameter group of the currently selected model.
    def show_hyperparameters(model_name): return {lr_box: gr.update(visible=model_name == '羅吉斯回歸'), dt_box: gr.update(visible=model_name == '決策樹'), xgb_box: gr.update(visible=model_name == 'XGBoost'), svm_box: gr.update(visible=model_name == 'SVM')}
    model_selector.change(show_hyperparameters, inputs=model_selector, outputs=[lr_box, dt_box, xgb_box, svm_box])
    # Convenience handlers for the select-all / clear buttons.
    def select_all_features(): return gr.update(value=ALL_FEATURES)
    def deselect_all_features(): return gr.update(value=[])
    select_all_btn.click(select_all_features, None, feature_selector)
    deselect_all_btn.click(deselect_all_features, None, feature_selector)
    run_btn.click(
        train_and_evaluate,
        inputs=[log_state, model_selector, feature_selector, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel],
        outputs=[model_output_report, model_output_report_avg, model_output_accuracy, model_output_precision, model_output_recall, model_output_f1_score, model_output_auc, model_plot_cm, model_plot_importance, log_df_display, log_state]
    )
# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()