File size: 14,420 Bytes
0a19352
 
 
 
 
 
 
 
 
 
 
5b16257
0a19352
 
 
 
 
 
1e4bbff
0a19352
 
e970db4
 
 
0a19352
e970db4
 
0a19352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e970db4
 
 
0a19352
 
 
 
 
 
 
1e4bbff
0a19352
 
1e4bbff
0a19352
 
 
 
1e4bbff
0a19352
 
 
 
 
 
 
 
 
 
 
1e4bbff
 
 
 
 
 
 
 
 
0a19352
 
1e4bbff
 
 
 
0a19352
 
 
 
1e4bbff
 
0a19352
 
 
 
1e4bbff
0a19352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e4bbff
 
 
0a19352
 
5b16257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e970db4
5b16257
 
 
 
e970db4
5b16257
0a19352
1e4bbff
0a19352
 
 
 
 
 
1e4bbff
0a19352
 
 
 
 
 
 
 
5b16257
 
 
 
 
0a19352
 
 
 
 
5b16257
0a19352
 
5b16257
0a19352
 
 
 
 
e970db4
0a19352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e4bbff
0a19352
 
 
 
 
 
 
 
 
5b16257
 
 
 
0a19352
5b16257
 
0a19352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e4bbff
5b16257
0a19352
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import warnings
import json

# --- Initial setup and data loading ---
# Silence the two warning categories that would otherwise clutter the console.
_IGNORED_WARNING_CATEGORIES = (UserWarning, FutureWarning)
for _warning_category in _IGNORED_WARNING_CATEGORIES:
    warnings.filterwarnings("ignore", category=_warning_category)
# Optional CJK font override, kept disabled:
# plt.rcParams['font.family'] = ['SimHei']
plt.rcParams.update({'axes.unicode_minus': False})

def processDisplayDataframe(df):
    """Return a display copy of *df* with numeric columns formatted to 4 decimals.

    Every column whose dtype is numeric (``np.number``) is converted to strings
    of the form ``"1.2346"``; all other columns are passed through unchanged.
    The input frame is not modified.

    Args:
        df: The DataFrame to format for display.

    Returns:
        A new DataFrame with the same shape, index and column order as *df*.
    """
    formatted = df.copy()
    num_cols = formatted.select_dtypes(include=np.number).columns
    if len(num_cols) == 0:
        # Nothing to format; avoid assigning through an empty column selection.
        return formatted

    # Column-wise Series.map works on every pandas version, whereas
    # DataFrame.map only exists from pandas 2.1 onwards.
    formatted[num_cols] = formatted[num_cols].apply(
        lambda column: column.map("{:.4f}".format)
    )
    return formatted

def load_data():
    """Read ``sampled_data.csv`` and build the numerically encoded model frame.

    Returns:
        A ``(raw, encoded)`` tuple. ``raw`` is the CSV exactly as read.
        ``encoded`` is a copy without the ``id`` column in which ``Gender``
        is 1 for ``'Male'`` and 0 otherwise, ``Vehicle_Damage`` is 1 for
        ``'Yes'`` and 0 otherwise, and ``Vehicle_Age`` is mapped to the
        ordinal codes 0, 1, 2.
    """
    raw = pd.read_csv('sampled_data.csv')

    # drop() returns a new frame, so ``raw`` stays untouched.
    encoded = raw.drop('id', axis=1)

    # Binary flags: the named category becomes 1, everything else 0.
    encoded['Gender'] = (encoded['Gender'] == 'Male').astype('int64')
    encoded['Vehicle_Damage'] = (encoded['Vehicle_Damage'] == 'Yes').astype('int64')

    # Ordinal encoding; labels missing from the mapping become NaN.
    vehicle_age_codes = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    encoded['Vehicle_Age'] = encoded['Vehicle_Age'].map(vehicle_age_codes)

    return raw, encoded

# Load the dataset once at import time; the callbacks below read these frames.
df_original, df_processed = load_data()

# Every column of the encoded frame except the target is a candidate feature.
ALL_FEATURES = df_processed.columns[df_processed.columns != 'Response'].tolist()

# Candidate features that were already numeric in the raw CSV.
_raw_numeric_columns = df_original.select_dtypes(include=np.number).columns.tolist()
NUMERICAL_FEATURES = [name for name in _raw_numeric_columns if name in ALL_FEATURES]

# --- EDA helpers ---
def update_eda_section(selected_features):
    """Build the EDA panel contents for the currently selected features.

    Args:
        selected_features: List of feature names chosen in the checkbox group.

    Returns:
        A 4-tuple ``(stats, correlations, dropdown_update, figure)``:
        descriptive statistics per feature, each feature's correlation with
        ``Response``, an update that fills the plot dropdown with the selected
        features, and the plot for the first selected feature. With no
        selection, two empty frames, a cleared dropdown and ``None`` are
        returned.
    """
    if not selected_features:
        return pd.DataFrame(), pd.DataFrame(), gr.update(choices=[], value=None), None

    default_feature = selected_features[0]

    # One row per feature, with the feature name as a regular column.
    summary = df_processed[selected_features].describe().T
    summary = summary.reset_index().rename(columns={'index': 'Feature'})

    # Correlation of every selected feature with the target only.
    correlation_matrix = df_processed[selected_features + ['Response']].corr(numeric_only=True)
    target_correlation = correlation_matrix['Response'].drop('Response').to_frame().reset_index()
    target_correlation.columns = ['Feature', 'Correlation with Response']

    initial_plot = generate_feature_plot(default_feature)
    dropdown_update = gr.update(choices=selected_features, value=default_feature)

    summary = processDisplayDataframe(summary)
    target_correlation = processDisplayDataframe(target_correlation)
    return summary, target_correlation, dropdown_update, initial_plot

def generate_feature_plot(feature):
    """Plot the distribution of one feature, coloured by ``Response``.

    Features listed in ``NUMERICAL_FEATURES`` are drawn as a stacked histogram
    with a KDE overlay; all other features are drawn as a count plot.

    Args:
        feature: Column name in ``df_processed`` to visualise. A falsy value
            (``None`` or ``""``) means nothing is selected.

    Returns:
        The matplotlib ``Figure``, or ``None`` when no feature is given.
    """
    if not feature:
        return None

    fig, ax = plt.subplots()
    try:
        if feature in NUMERICAL_FEATURES:
            sns.histplot(data=df_processed, x=feature, hue='Response', kde=True, ax=ax, palette='viridis', multiple="stack")
            ax.set_title(f'Histogram of "{feature}" (colored by Response)')
        else:
            sns.countplot(data=df_processed, x=feature, hue='Response', ax=ax, palette='viridis')
            ax.set_title(f'Count Plot of "{feature}" (colored by Response)')
        # Lay out this specific figure rather than pyplot's implicit current one.
        fig.tight_layout()
    finally:
        # Unregister the figure from pyplot so repeated callbacks do not
        # accumulate open figures; the Figure object itself can still be saved.
        plt.close(fig)
    return fig

# --- Core training and evaluation ---
def train_and_evaluate(history_log, model_name, features, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel):
    """Train the selected model, evaluate it on a hold-out split and log the run.

    Runs when the user clicks the model-training button. Covers data
    preparation, model fitting, metric computation, result plots and the
    history log.

    Args:
        history_log: List of previous log rows (newest first); may be empty.
        model_name: One of '羅吉斯回歸', '決策樹', 'XGBoost', 'SVM'.
        features: Column names of ``df_processed`` to use as predictors.
        dt_criterion: Split criterion for the decision tree.
        dt_max_depth: Maximum depth for the decision tree.
        xgb_n_estimators: Number of boosting rounds for XGBoost.
        xgb_max_depth: Maximum tree depth for XGBoost.
        xgb_learning_rate: Learning rate for XGBoost.
        svm_c: Regularisation parameter ``C`` for the SVM.
        svm_kernel: Kernel name for the SVM.

    Returns:
        An 11-tuple in the order of the click handler's outputs: per-class
        report frame, averaged report frame, accuracy / precision / recall /
        F1 / ROC-AUC texts, confusion-matrix figure, importance figure, log
        DataFrame, and the updated log list. When no feature is selected the
        error message is placed in the accuracy text slot and the remaining
        result slots are empty.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    history_log = list(history_log) if history_log else []

    if not features:
        # Return one value per wired output so the callback itself does not fail.
        return (
            pd.DataFrame(),
            pd.DataFrame(),
            "錯誤:請至少選擇一個特徵!",
            "",
            "",
            "",
            "",
            None,
            None,
            pd.DataFrame(history_log, columns=LOG_COLUMNS),
            history_log,
        )

    # --- 1. Data preparation ---
    X = df_processed[features]
    y = df_processed['Response']

    # Split first, then standardise, so test-set statistics never leak into training.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Work on copies to avoid SettingWithCopyWarning.
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    numerical_cols_in_x = [f for f in NUMERICAL_FEATURES if f in X_train.columns]
    if numerical_cols_in_x:
        scaler = StandardScaler()
        # Fit on the training split only; the test split is transformed with those statistics.
        X_train_scaled[numerical_cols_in_x] = scaler.fit_transform(X_train[numerical_cols_in_x])
        X_test_scaled[numerical_cols_in_x] = scaler.transform(X_test[numerical_cols_in_x])

    # --- 2. Model selection and training (always on the standardised frames) ---
    params = {}
    if model_name == '羅吉斯回歸':
        # statsmodels' Logit takes no hyper-parameters from the UI.
        # has_constant='add' guarantees both splits get the intercept column,
        # even if a feature happens to be constant within one split.
        X_train_sm = sm.add_constant(X_train_scaled, has_constant='add')
        X_test_sm = sm.add_constant(X_test_scaled, has_constant='add')
        result = sm.Logit(y_train, X_train_sm).fit(disp=0)
        y_pred_proba = result.predict(X_test_sm)
        y_pred = (y_pred_proba > 0.5).astype(int)
        importances = result.tvalues.drop('const', errors='ignore')
        title = 'Feature t-values'
    else:
        if model_name == '決策樹':
            # Sliders may deliver floats; the tree depth must be an integer.
            params = {'criterion': dt_criterion, 'max_depth': int(dt_max_depth)}
            model = DecisionTreeClassifier(**params, random_state=42, class_weight='balanced')
        elif model_name == 'XGBoost':
            params = {'n_estimators': int(xgb_n_estimators), 'max_depth': int(xgb_max_depth), 'learning_rate': xgb_learning_rate}
            # Weight the positive class by the negative/positive ratio of the training split.
            class_counts = y_train.value_counts()
            scale_pos_weight = float(class_counts[0] / class_counts[1])
            model = xgb.XGBClassifier(**params, scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss', random_state=42)
        elif model_name == 'SVM':
            params = {'C': svm_c, 'kernel': svm_kernel}
            model = SVC(**params, probability=True, random_state=42, class_weight='balanced')
        else:
            raise ValueError(f"Unsupported model: {model_name!r}")

        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

        if model_name == 'SVM' and svm_kernel == 'linear':
            importances, title = model.coef_[0], 'Feature Coefficients'
        elif model_name in ['決策樹', 'XGBoost']:
            importances, title = model.feature_importances_, 'Feature Importance'
        else:
            # Non-linear SVM kernels expose no per-feature weights.
            importances, title = None, 'Feature Importance'

    # --- 3. Evaluation and plots ---
    accuracy_value = accuracy_score(y_test, y_pred)
    precision_value = precision_score(y_test, y_pred)
    recall_value = recall_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    roc_auc_value = roc_auc_score(y_test, y_pred_proba)

    accuracy_text = f"準確率 分數: {accuracy_value:.4f}"
    precision_text = f"精確率 分數: {precision_value:.4f}"
    recall_text = f"召回率 分數: {recall_value:.4f}"
    f1_score_text = f"F1 分數: {f1_score_value:.4f}"
    roc_auc_text = f"ROC-AUC 分數: {roc_auc_value:.4f}"

    class_labels = ['not purchase insurance (0)', 'purchase insurance (1)']
    report_dict = classification_report(y_test, y_pred, target_names=class_labels, output_dict=True)

    # One row per class, with the row label repeated as a visible "index" column.
    classfy_report = pd.DataFrame([report_dict[label] for label in class_labels], index=class_labels)
    classfy_report.insert(0, "index", classfy_report.index)
    classfy_report = processDisplayDataframe(classfy_report)

    average_labels = ["macro avg", "weighted avg"]
    avg_report = pd.DataFrame([report_dict[label] for label in average_labels], index=average_labels)
    avg_report.insert(0, "index", avg_report.index)
    avg_report = processDisplayDataframe(avg_report)

    cm = confusion_matrix(y_test, y_pred)
    fig_cm, ax_cm = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm, xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
    ax_cm.set_title('Confusion Matrix')
    ax_cm.set_xlabel('Predicted Label')
    ax_cm.set_ylabel('Actual Label')
    fig_cm.tight_layout()
    # Unregister from pyplot so repeated runs do not accumulate open figures.
    plt.close(fig_cm)

    fig_imp, ax_imp = plt.subplots()
    if importances is not None:
        feature_imp = pd.Series(importances, index=features).sort_values(ascending=False)
        sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax_imp)
    else:
        ax_imp.text(0.5, 0.5, 'This model/kernel cannot directly display feature importance', ha='center', va='center')
    ax_imp.set_title(title)
    fig_imp.tight_layout()
    plt.close(fig_imp)

    # --- 4. History log ---
    new_log_entry = [
        pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
        model_name,
        ', '.join(features),
        json.dumps(params),
        f"{accuracy_value:.4f}",
        f"{precision_value:.4f}",
        f"{recall_value:.4f}",
        f"{f1_score_value:.4f}",
        f"{roc_auc_value:.4f}",
    ]
    # Newest entry goes first.
    updated_log = [new_log_entry] + history_log
    log_df = pd.DataFrame(updated_log, columns=LOG_COLUMNS)

    return classfy_report, avg_report, accuracy_text, precision_text, recall_text, f1_score_text, roc_auc_text, fig_cm, fig_imp, log_df, updated_log

# --- Gradio interface ---
# Column headers of the history log table; one entry per field of a log row.
LOG_COLUMNS = ["時間", "模型", "特徵", "參數", "準確率", "精確率", "召回率", "F1 分數", "ROC-AUC 分數"]

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Hidden state holding the history log rows between callbacks.
    log_state = gr.State([])

    gr.Markdown("# 投保預測模型建置專案")
    gr.Markdown("在左側選擇特徵並點擊按鈕進行探索,或調整參數後點擊按鈕以訓練模型。")
    with gr.Row():
        # Left column: feature selection, EDA and model configuration.
        with gr.Column(scale=1):
            gr.Markdown("## 1. 特徵選擇與探索")
            feature_selector = gr.CheckboxGroup(ALL_FEATURES, label="選擇特徵", value=['Previously_Insured', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Vehicle_Age', 'Age'])
            with gr.Row():
                select_all_btn = gr.Button("全部選取")
                deselect_all_btn = gr.Button("全部清除")
            with gr.Accordion("特徵探索 (EDA)", open=True):
                eda_run_btn = gr.Button("執行資料探索", variant="secondary")
                eda_stats = gr.DataFrame(label="敘述性統計")
                eda_corr = gr.DataFrame(label="與目標 'Response' 的相關係數")
                eda_plot_selector = gr.Dropdown(label="選擇要視覺化的特徵")
                eda_plot = gr.Plot(label="視覺化")
            gr.Markdown("## 2. 模型選擇與超參數調整")
            model_selector = gr.Dropdown(['羅吉斯回歸', '決策樹', 'XGBoost', 'SVM'], label="選擇模型", value='決策樹')
            # One group per model; only the group of the selected model is visible.
            with gr.Group(visible=False) as lr_box:
                gr.Markdown("#### 羅吉斯回歸")
            with gr.Group(visible=True) as dt_box:
                gr.Markdown("#### 決策樹")
                dt_criterion = gr.Radio(['gini', 'entropy'], value='gini', label="評估標準")
                dt_max_depth = gr.Slider(3, 30, value=8, step=1, label="最大深度")
            with gr.Group(visible=False) as xgb_box:
                gr.Markdown("#### XGBoost")
                xgb_n_estimators = gr.Slider(50, 500, value=100, step=10, label="樹的數量")
                xgb_max_depth = gr.Slider(3, 15, value=5, step=1, label="最大深度")
                xgb_learning_rate = gr.Slider(0.01, 0.3, value=0.1, step=0.01, label="學習率")
            with gr.Group(visible=False) as svm_box:
                gr.Markdown("#### SVM")
                svm_c = gr.Slider(0.01, 10.0, value=1.0, step=0.01, label="C (懲罰參數)")
                svm_kernel = gr.Radio(['linear', 'rbf', 'poly'], value='linear', label="核心")
            run_btn = gr.Button("🚀 執行模型訓練", variant="primary")
        # Right column: evaluation results.
        with gr.Column(scale=2):
            gr.Markdown("## 3. 模型評估結果")
            model_output_accuracy = gr.Textbox(label="準確率 分數")
            model_output_precision = gr.Textbox(label="精確率 分數")
            model_output_recall = gr.Textbox(label="召回率 分數")
            model_output_f1_score = gr.Textbox(label="F1 分數")
            model_output_auc = gr.Textbox(label="AUC 分數")
            model_output_report = gr.DataFrame(label="分類報告")
            model_output_report_avg = gr.DataFrame(label="平均報告")
            model_plot_cm = gr.Plot(label="混淆矩陣")
            model_plot_importance = gr.Plot(label="特徵重要性/係數")

    with gr.Accordion("操作紀錄 (History Log)", open=False):
        # One datatype per header so the two lists can never drift apart.
        log_df_display = gr.DataFrame(headers=LOG_COLUMNS, datatype=["str"] * len(LOG_COLUMNS))

    # --- Event wiring ---
    eda_run_btn.click(update_eda_section, inputs=feature_selector, outputs=[eda_stats, eda_corr, eda_plot_selector, eda_plot])
    eda_plot_selector.change(generate_feature_plot, inputs=eda_plot_selector, outputs=eda_plot)

    def show_hyperparameters(model_name):
        """Show only the hyper-parameter group that belongs to *model_name*."""
        return {
            lr_box: gr.update(visible=model_name == '羅吉斯回歸'),
            dt_box: gr.update(visible=model_name == '決策樹'),
            xgb_box: gr.update(visible=model_name == 'XGBoost'),
            svm_box: gr.update(visible=model_name == 'SVM'),
        }

    model_selector.change(show_hyperparameters, inputs=model_selector, outputs=[lr_box, dt_box, xgb_box, svm_box])

    def select_all_features():
        """Tick every feature in the checkbox group."""
        return gr.update(value=ALL_FEATURES)

    def deselect_all_features():
        """Untick every feature in the checkbox group."""
        return gr.update(value=[])

    select_all_btn.click(select_all_features, None, feature_selector)
    deselect_all_btn.click(deselect_all_features, None, feature_selector)

    run_btn.click(
        train_and_evaluate,
        inputs=[log_state, model_selector, feature_selector, dt_criterion, dt_max_depth, xgb_n_estimators, xgb_max_depth, xgb_learning_rate, svm_c, svm_kernel],
        outputs=[model_output_report, model_output_report_avg, model_output_accuracy, model_output_precision, model_output_recall, model_output_f1_score, model_output_auc, model_plot_cm, model_plot_importance, log_df_display, log_state]
    )

# Start the Gradio server only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()