# 基础数据处理与可视化库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 机器学习预处理与模型选择
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_validate,
    RandomizedSearchCV, learning_curve
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 机器学习模型
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 评估指标
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve, average_precision_score,
    ConfusionMatrixDisplay
)

# 校准曲线
from sklearn.calibration import calibration_curve

# 模型可解释性
import shap
from sklearn.inspection import permutation_importance, PartialDependenceDisplay

# 聚类与降维
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Silence every warning for the rest of the session so library notices do
# not clutter the notebook output.
# NOTE(review): this blanket filter also hides warnings that may matter
# (e.g. convergence or feature-name warnings) — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn whitegrid style sheet, the Set2 palette,
# and the default font size / DPI / figure size for figures created below.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
plt.rcParams['figure.figsize'] = (10, 6)

# 1.1 Load the data set and print a basic overview
df = pd.read_csv('online_shoppers_intention.csv')

n_rows, n_cols = df.shape
print("=== 数据集形状 ===")
print(f"行数: {n_rows}, 列数: {n_cols}\n")

# First rows, as a quick check of how the columns were parsed
display(df.head())

print("\n=== 数据集基本信息 ===")
df.info()

print("\n=== 数值变量描述性统计 ===")
summary_stats = df.describe().round(2)
display(summary_stats)

# Per-column missing-value counts; only columns with gaps are listed
print("\n=== 缺失值统计 ===")
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("无缺失值")

# Fully duplicated rows are only counted here, not removed
print("\n=== 重复行统计 ===")
duplicate_count = df.duplicated().sum()
print(f"重复行数量: {duplicate_count}")

=== 数据集形状 ===
行数: 12330, 列数: 18

=== 数据集基本信息 ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType              12330 non-null  int64  
 15  VisitorType              12330 non-null  object 
 16  Weekend                  12330 non-null  bool   
 17  Revenue                  12330 non-null  bool   
dtypes: bool(2), float64(7), int64(7), object(2)
memory usage: 1.5+ MB

=== 数值变量描述性统计 ===

=== 缺失值统计 ===
无缺失值

=== 重复行统计 ===
重复行数量: 125

# 1.2 Distribution of the target variable Revenue (class-imbalance check)
print("=== 目标变量 Revenue 分布 ===")
rev_counts = df['Revenue'].value_counts()
rev_pct = df['Revenue'].value_counts(normalize=True)
revenue_summary = pd.DataFrame({'Count': rev_counts, 'Percentage': rev_pct.round(4)})
print(revenue_summary)

# Number of Revenue == False rows per Revenue == True row
imbalance_ratio = rev_counts[False] / rev_counts[True]
print(f"\n不平衡比例 (Negative : Positive) = {imbalance_ratio:.2f} : 1")

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
count_ax, pie_ax = axes

# Left panel: absolute counts, annotated with the bar heights
sns.countplot(x='Revenue', data=df, ax=count_ax)
count_ax.set_title('Revenue Count Distribution')
count_ax.bar_label(count_ax.containers[0])

# Right panel: class shares; the second wedge is pulled out slightly
df['Revenue'].value_counts().plot.pie(
    autopct='%1.1f%%', ax=pie_ax, startangle=90, explode=[0, 0.05]
)
pie_ax.set_ylabel('')
pie_ax.set_title('Revenue Proportion')

plt.tight_layout()
plt.show()

=== 目标变量 Revenue 分布 ===
         Count  Percentage
Revenue                   
False    10422      0.8453
True      1908      0.1547

不平衡比例 (Negative : Positive) = 5.46 : 1

# 1.3 Continuous variables and their relationship to the target
continuous_cols = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay'
]

# One boxplot per variable, split by Revenue, laid out on a 2 x 5 grid
fig, axes = plt.subplots(2, 5, figsize=(22, 8))
axes = axes.flatten()

for ax, col in zip(axes, continuous_cols):
    sns.boxplot(data=df, x='Revenue', y=col, ax=ax)
    ax.set_title(f'{col}')

plt.suptitle('Continuous Variables by Revenue (Boxplot)', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

# KDE comparison of three selected variables, coloured by Revenue
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, ("PageValues", "BounceRates", "ExitRates")):
    sns.kdeplot(data=df, x=col, hue="Revenue", fill=True, ax=ax)
    ax.set_title(f'{col} Distribution by Revenue')

# Only the PageValues panel is clipped, to the 0-100 range
axes[0].set_xlim(0, 100)

plt.tight_layout()
plt.show()

# 1.4 Categorical variables and their relationship to the target
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
month_ax = axes[0]

# Explicit month order for the x axis of the month plots
month_order = ['Feb', 'Mar', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
sns.countplot(data=df, x='Month', hue='Revenue', order=month_order, ax=month_ax)
month_ax.set_title('Purchasing Intention across Months')
month_ax.tick_params(axis='x', rotation=45)

# Remaining two panels share the same recipe
for ax, column, title in (
    (axes[1], 'VisitorType', 'Purchasing Intention by Visitor Type'),
    (axes[2], 'Weekend', 'Purchasing Intention by Weekend'),
):
    sns.countplot(data=df, x=column, hue='Revenue', ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# Conversion rate per month: the mean of the boolean Revenue column
monthly_conv = df.groupby('Month')['Revenue'].mean().reindex(month_order)
plt.figure(figsize=(8, 4))
monthly_conv.plot.bar(color='coral', edgecolor='black')
plt.title('Monthly Conversion Rate')
plt.ylabel('Conversion Rate')
plt.xticks(rotation=45)
plt.show()

# 1.5 Correlation heatmap of the numeric columns
plt.figure(figsize=(14, 10))
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()
# Hide the upper triangle (diagonal included) so each pair is shown once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
)
plt.title('Correlation Heatmap of Numerical Variables')
plt.tight_layout()
plt.show()

# Correlation of every numeric feature with Revenue, highest first.
# Revenue is cast to int on a copy so that it is picked up as numeric.
df_temp = df.assign(Revenue=df['Revenue'].astype(int))
revenue_corr = df_temp.select_dtypes(include=[np.number]).corr()['Revenue']
target_corr = revenue_corr.drop('Revenue').sort_values(ascending=False)
print("\n=== 特征与 Revenue 的相关系数 (降序) ===")
print(target_corr.round(4))

=== 特征与 Revenue 的相关系数 (降序) ===
PageValues                 0.4926
ProductRelated             0.1585
ProductRelated_Duration    0.1524
Administrative             0.1389
Informational              0.0952
Administrative_Duration    0.0936
Informational_Duration     0.0703
Browser                    0.0240
TrafficType               -0.0051
Region                    -0.0116
OperatingSystems          -0.0147
SpecialDay                -0.0823
BounceRates               -0.1507
ExitRates                 -0.2071
Name: Revenue, dtype: float64

# 2.1 Feature engineering
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator where denominator > 0, and 0 elsewhere."""
    return np.where(denominator > 0, numerator / denominator, 0)


df_prep = df.copy()

# Cast the boolean target and the boolean Weekend flag to integers
for flag_col in ('Revenue', 'Weekend'):
    df_prep[flag_col] = df_prep[flag_col].astype(int)

# ---- Derived features ----
# Total number of pages visited across the three page categories
df_prep['TotalPages'] = (
    df_prep['Administrative'] + df_prep['Informational'] + df_prep['ProductRelated']
)

# Total time spent across the three page categories
df_prep['TotalDuration'] = (
    df_prep['Administrative_Duration'] +
    df_prep['Informational_Duration'] +
    df_prep['ProductRelated_Duration']
)

# Average time per product page (0 when no product page was visited)
df_prep['AvgProductDuration'] = _safe_ratio(
    df_prep['ProductRelated_Duration'], df_prep['ProductRelated']
)

# Share of product pages among all visited pages
df_prep['ProductPageRatio'] = _safe_ratio(
    df_prep['ProductRelated'], df_prep['TotalPages']
)

# Interaction between bounce rate and exit rate
df_prep['Bounce_Exit_Ratio'] = _safe_ratio(
    df_prep['BounceRates'], df_prep['ExitRates']
)

# ---- log1p transform of the skewed variables ----
skew_cols = ['PageValues', 'ProductRelated_Duration', 'Administrative_Duration',
             'Informational_Duration', 'TotalDuration']
log_cols = [f'{c}_log' for c in skew_cols]

for source_col, log_col in zip(skew_cols, log_cols):
    df_prep[log_col] = np.log1p(df_prep[source_col])

print("=== 新增特征列表 ===")
new_cols = ['TotalPages', 'TotalDuration', 'AvgProductDuration',
            'ProductPageRatio', 'Bounce_Exit_Ratio'] + log_cols
print(new_cols)
print(f"\n特征工程后数据形状: {df_prep.shape}")

=== 新增特征列表 ===
['TotalPages', 'TotalDuration', 'AvgProductDuration', 'ProductPageRatio', 'Bounce_Exit_Ratio', 'PageValues_log', 'ProductRelated_Duration_log', 'Administrative_Duration_log', 'Informational_Duration_log', 'TotalDuration_log']

特征工程后数据形状: (12330, 28)

# 2.2 One-hot encode the categorical columns (first level of each dropped)
categorical_cols = ['Month', 'OperatingSystems', 'Browser', 'Region',
                    'TrafficType', 'VisitorType']
df_encoded = pd.get_dummies(df_prep, columns=categorical_cols, drop_first=True)

print(f"编码后数据形状: {df_encoded.shape}")

# 2.3 Stratified 80/20 train/test split
y = df_encoded['Revenue']
X = df_encoded.drop(columns='Revenue')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

splits = (("训练集", X_train, y_train), ("测试集", X_test, y_test))
for split_name, features, target in splits:
    print(f"{split_name}: X={features.shape}, y={target.shape}")
for split_name, _, target in splits:
    print(f"\n{split_name}类别分布:\n{target.value_counts()}")

# Negative-to-positive ratio in the training set, used later as
# scale_pos_weight for the cost-sensitive XGBoost variant
neg_count = y_train.eq(0).sum()
pos_count = y_train.eq(1).sum()
scale_pos_weight_val = neg_count / pos_count
print(f"\nscale_pos_weight = {scale_pos_weight_val:.2f}")

# Feature names, kept for the interpretability analysis further down
feature_names = list(X.columns)

编码后数据形状: (12330, 79)
训练集: X=(9864, 78), y=(9864,)
测试集: X=(2466, 78), y=(2466,)

训练集类别分布:
Revenue
0    8338
1    1526
Name: count, dtype: int64

测试集类别分布:
Revenue
0    2084
1     382
Name: count, dtype: int64

scale_pos_weight = 5.46

# 3.1 Shared cross-validation setup used by the experiments below:
# a shuffled, seeded stratified K-fold with 3 splits, and the list of
# scorer names passed to cross_validate.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

def run_cv(estimator, X, y, cv, name):
    """Cross-validate ``estimator`` and summarise each metric.

    Scoring uses the module-level ``scoring_metrics`` list and runs with
    ``n_jobs=-1``.

    Returns a dict holding ``'Model'`` (set to ``name``) followed by
    ``'<metric>_mean'`` and ``'<metric>_std'`` for every metric, computed
    over the per-fold test scores.
    """
    fold_scores = cross_validate(
        estimator, X, y, cv=cv, scoring=scoring_metrics, n_jobs=-1
    )
    summary = {'Model': name}
    for metric in scoring_metrics:
        scores = fold_scores[f'test_{metric}']
        summary.update({
            f'{metric}_mean': scores.mean(),
            f'{metric}_std': scores.std(),
        })
    return summary

# 3.2 Baseline models (mostly default parameters, each behind SMOTE),
# evaluated with the shared cross-validation helper
def _smote_then(model, scale=False):
    """Build an imblearn pipeline: optional StandardScaler -> SMOTE -> model."""
    steps = [('scaler', StandardScaler())] if scale else []
    steps.append(('smote', SMOTE(random_state=42)))
    steps.append(('model', model))
    return ImbPipeline(steps)


baseline_configs = {
    # Only the linear model gets a scaling step
    'Logistic Regression': _smote_then(
        LogisticRegression(random_state=42, max_iter=1000), scale=True
    ),
    'Decision Tree': _smote_then(
        DecisionTreeClassifier(random_state=42, max_depth=5)
    ),
    'Random Forest': _smote_then(
        RandomForestClassifier(random_state=42, n_estimators=100)
    ),
    'XGBoost': _smote_then(
        XGBClassifier(random_state=42, eval_metric='logloss', verbosity=0)
    ),
    'LightGBM': _smote_then(
        LGBMClassifier(random_state=42, verbose=-1)
    ),
}

baseline_results = []
for name, pipe in baseline_configs.items():
    print(f"正在交叉验证: {name} ...")
    baseline_results.append(run_cv(pipe, X_train, y_train, cv, name))

baseline_df = pd.DataFrame(baseline_results).set_index('Model')

# Display table: one "mean ± std" string per model and metric
four_decimals = '{:.4f}'.format
display_cols = {
    metric: (
        baseline_df[f'{metric}_mean'].map(four_decimals)
        + ' ± '
        + baseline_df[f'{metric}_std'].map(four_decimals)
    )
    for metric in scoring_metrics
}
display(pd.DataFrame(display_cols, index=baseline_df.index))

正在交叉验证: Logistic Regression ...
正在交叉验证: Decision Tree ...
正在交叉验证: Random Forest ...
正在交叉验证: XGBoost ...
正在交叉验证: LightGBM ...

# 3.3 Baseline model performance chart: grouped bars (one group per model,
# one bar per metric) with the cross-validation std as error bars
fig, ax = plt.subplots(figsize=(12, 6))
plot_metrics = ['f1_mean', 'roc_auc_mean', 'recall_mean', 'precision_mean']
plot_labels = ['F1-score', 'ROC-AUC', 'Recall', 'Precision']

x = np.arange(len(baseline_df))
width = 0.2
for i, (col, label) in enumerate(zip(plot_metrics, plot_labels)):
    ax.bar(x + i * width, baseline_df[col], width, label=label)
    # Error bar = std of the same metric across folds
    ax.errorbar(x + i * width, baseline_df[col],
                yerr=baseline_df[col.replace('mean', 'std')],
                fmt='none', ecolor='black', capsize=3)

ax.set_xlabel('Model')
ax.set_ylabel('Score')
# Take the fold count from the splitter itself so the title cannot drift
# from the configured number of splits.
ax.set_title(
    f'Baseline Models - {cv.get_n_splits()}-Fold Stratified CV Performance (with SMOTE)'
)
# Centre each tick under its group of four bars
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(baseline_df.index, rotation=30, ha='right')
ax.legend(loc='lower right')
ax.set_ylim(0, 1.05)
plt.tight_layout()
plt.show()

# 4.1 定义三种不平衡处理策略的 Pipeline

def get_imbalance_configs(model_name):
    """Build the imbalance-handling variants to compare for one model.

    For 'Logistic Regression', 'Random Forest', 'XGBoost' and 'LightGBM'
    the result maps a strategy label to an unfitted estimator, in this
    order: 'No Balancing', 'SMOTE', then a cost-sensitive variant
    ('scale_pos_weight' for XGBoost, 'class_weight=balanced' otherwise).
    Every logistic-regression variant starts with a StandardScaler step.
    Any other ``model_name`` yields an empty dict.

    The XGBoost cost-sensitive variant reads the module-level
    ``scale_pos_weight_val``.
    """
    def with_smote(model, leading_steps=()):
        # imblearn pipeline: optional leading steps -> SMOTE -> model
        return ImbPipeline([
            *leading_steps,
            ('smote', SMOTE(random_state=42)),
            ('model', model),
        ])

    if model_name == 'Logistic Regression':
        def make_lr(**extra):
            return LogisticRegression(random_state=42, max_iter=1000, **extra)

        def scaled(model):
            return SkPipeline([('scaler', StandardScaler()), ('model', model)])

        return {
            'No Balancing': scaled(make_lr()),
            'SMOTE': with_smote(make_lr(), [('scaler', StandardScaler())]),
            'class_weight=balanced': scaled(make_lr(class_weight='balanced')),
        }

    if model_name == 'Random Forest':
        def make_rf(**extra):
            return RandomForestClassifier(
                random_state=42, n_estimators=100, **extra
            )

        return {
            'No Balancing': make_rf(),
            'SMOTE': with_smote(make_rf()),
            'class_weight=balanced': make_rf(class_weight='balanced'),
        }

    if model_name == 'XGBoost':
        def make_xgb(**extra):
            return XGBClassifier(
                random_state=42, eval_metric='logloss', verbosity=0, **extra
            )

        return {
            'No Balancing': make_xgb(),
            'SMOTE': with_smote(make_xgb()),
            'scale_pos_weight': make_xgb(scale_pos_weight=scale_pos_weight_val),
        }

    if model_name == 'LightGBM':
        def make_lgbm(**extra):
            return LGBMClassifier(random_state=42, verbose=-1, **extra)

        return {
            'No Balancing': make_lgbm(),
            'SMOTE': with_smote(make_lgbm()),
            'class_weight=balanced': make_lgbm(class_weight='balanced'),
        }

    # Unknown model name: nothing to compare
    return {}

# 4.2 Comparison experiment: cross-validate every strategy returned by
# get_imbalance_configs for each of the four models
compare_models = ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM']
imbalance_results = []

for model_name in compare_models:
    configs = get_imbalance_configs(model_name)
    for strategy_name, estimator in configs.items():
        label = f"{model_name} | {strategy_name}"
        print(f"正在验证: {label} ...")
        # Tag each result row with its model and strategy for later grouping
        row = {
            **run_cv(estimator, X_train, y_train, cv, label),
            'Base_Model': model_name,
            'Strategy': strategy_name,
        }
        imbalance_results.append(row)

imb_df = pd.DataFrame(imbalance_results)

正在验证: Logistic Regression | No Balancing ...
正在验证: Logistic Regression | SMOTE ...
正在验证: Logistic Regression | class_weight=balanced ...
正在验证: Random Forest | No Balancing ...
正在验证: Random Forest | SMOTE ...
正在验证: Random Forest | class_weight=balanced ...
正在验证: XGBoost | No Balancing ...
正在验证: XGBoost | SMOTE ...
正在验证: XGBoost | scale_pos_weight ...
正在验证: LightGBM | No Balancing ...
正在验证: LightGBM | SMOTE ...
正在验证: LightGBM | class_weight=balanced ...

# 4.3 Visual comparison of the imbalance-handling strategies:
# one panel per model, one bar group per strategy (F1 / ROC-AUC / Recall)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for idx, model_name in enumerate(compare_models):
    ax = axes[idx]
    subset = imb_df[imb_df['Base_Model'] == model_name]

    x = np.arange(len(subset))
    width = 0.25

    ax.bar(x - width, subset['f1_mean'], width, label='F1', color='#66c2a5')
    ax.bar(x, subset['roc_auc_mean'], width, label='ROC-AUC', color='#fc8d62')
    ax.bar(x + width, subset['recall_mean'], width, label='Recall', color='#8da0cb')

    ax.set_title(f'{model_name}')
    ax.set_xticks(x)
    ax.set_xticklabels(subset['Strategy'], rotation=20, ha='right')
    ax.set_ylim(0, 1.05)
    ax.legend(fontsize=9)

# Take the fold count from the splitter itself so the title cannot drift
# from the configured number of splits.
plt.suptitle(
    f'Imbalance Handling Strategy Comparison ({cv.get_n_splits()}-Fold CV)',
    fontsize=14
)
plt.tight_layout()
plt.show()

# Best strategy per model, ranked by mean cross-validated F1
print("\n=== 各模型最佳不平衡处理策略 (按 F1-score) ===")
for model_name in compare_models:
    subset = imb_df[imb_df['Base_Model'] == model_name]
    best = subset.loc[subset['f1_mean'].idxmax()]
    print(f"  {model_name}: {best['Strategy']} (F1 = {best['f1_mean']:.4f})")

=== 各模型最佳不平衡处理策略 (按 F1-score) ===
  Logistic Regression: SMOTE (F1 = 0.6537)
  Random Forest: SMOTE (F1 = 0.6824)
  XGBoost: scale_pos_weight (F1 = 0.6641)
  LightGBM: class_weight=balanced (F1 = 0.6740)

# 5.1 Random Forest hyper-parameter tuning (randomized search)
print("=" * 60)
print("超参数调优: Random Forest (SMOTE Pipeline)")
print("=" * 60)

# SMOTE is a pipeline step, so the search fits it together with the model
# on whatever training portion each CV split provides.
pipe_rf_tune = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42))
])

# Candidate values; the 'model__' prefix addresses the pipeline step
# named 'model'.
param_dist_rf = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2', None]
}

# 20 sampled candidates, scored by F1 with the shared `cv` splitter;
# random_state fixes which candidates are sampled.
search_rf = RandomizedSearchCV(
    pipe_rf_tune,
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0
)
search_rf.fit(X_train, y_train)

print(f"Best F1 (CV): {search_rf.best_score_:.4f}")
print(f"Best Params: {search_rf.best_params_}")

============================================================
超参数调优: Random Forest (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6854
Best Params: {'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__min_samples_leaf': 4, 'model__max_features': 'log2', 'model__max_depth': None}

# 5.2 XGBoost hyper-parameter tuning (randomized search)
print("=" * 60)
print("超参数调优: XGBoost (SMOTE Pipeline)")
print("=" * 60)

# SMOTE is a pipeline step, so the search fits it together with the model
# on whatever training portion each CV split provides.
pipe_xgb_tune = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(random_state=42, eval_metric='logloss', verbosity=0))
])

# Candidate values; the 'model__' prefix addresses the pipeline step
# named 'model'.
param_dist_xgb = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [3, 5, 7, 10],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__min_child_weight': [1, 3, 5, 7],
    'model__gamma': [0, 0.1, 0.2, 0.3]
}

# 20 sampled candidates, scored by F1 with the shared `cv` splitter;
# random_state fixes which candidates are sampled.
search_xgb = RandomizedSearchCV(
    pipe_xgb_tune,
    param_distributions=param_dist_xgb,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0
)
search_xgb.fit(X_train, y_train)

print(f"Best F1 (CV): {search_xgb.best_score_:.4f}")
print(f"Best Params: {search_xgb.best_params_}")

============================================================
超参数调优: XGBoost (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6871
Best Params: {'model__subsample': 0.9, 'model__n_estimators': 100, 'model__min_child_weight': 3, 'model__max_depth': 3, 'model__learning_rate': 0.1, 'model__gamma': 0, 'model__colsample_bytree': 0.9}

# 5.3 LightGBM hyper-parameter tuning (randomized search)
print("=" * 60)
print("超参数调优: LightGBM (SMOTE Pipeline)")
print("=" * 60)

# SMOTE is a pipeline step, so the search fits it together with the model
# on whatever training portion each CV split provides.
pipe_lgbm_tune = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', LGBMClassifier(random_state=42, verbose=-1))
])

# Candidate values; the 'model__' prefix addresses the pipeline step
# named 'model'.
param_dist_lgbm = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [3, 5, 7, 10, -1],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__num_leaves': [15, 31, 63, 127],
    'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__min_child_samples': [5, 10, 20, 50],
    'model__reg_alpha': [0, 0.01, 0.1, 1],
    'model__reg_lambda': [0, 0.01, 0.1, 1]
}

# 20 sampled candidates, scored by F1 with the shared `cv` splitter;
# random_state fixes which candidates are sampled.
search_lgbm = RandomizedSearchCV(
    pipe_lgbm_tune,
    param_distributions=param_dist_lgbm,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0
)
search_lgbm.fit(X_train, y_train)

print(f"Best F1 (CV): {search_lgbm.best_score_:.4f}")
print(f"Best Params: {search_lgbm.best_params_}")

============================================================
超参数调优: LightGBM (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6850
Best Params: {'model__subsample': 1.0, 'model__reg_lambda': 1, 'model__reg_alpha': 0.01, 'model__num_leaves': 63, 'model__n_estimators': 300, 'model__min_child_samples': 10, 'model__max_depth': 10, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.7}

# 5.4 Before/after comparison of hyper-parameter tuning.
# The header reports the fold count of the splitter actually used, so it
# stays correct if `cv` is reconfigured.
print(f"\n=== 超参数调优前后 F1-score 对比 ({cv.get_n_splits()}-Fold CV) ===")

# Fitted searches keyed by the model name used in baseline_df
best_searches = {
    'Random Forest': search_rf,
    'XGBoost': search_xgb,
    'LightGBM': search_lgbm
}

tuning_comparison = pd.DataFrame({
    'Model': list(best_searches),
    # Baseline = mean CV F1 of the untuned SMOTE pipeline
    'Before Tuning (F1)': [
        baseline_df.loc[model, 'f1_mean'] for model in best_searches
    ],
    # Tuned = best mean CV F1 found by the randomized search
    'After Tuning (F1)': [
        search.best_score_ for search in best_searches.values()
    ]
}).set_index('Model')

tuning_comparison['Improvement'] = (
    tuning_comparison['After Tuning (F1)'] - tuning_comparison['Before Tuning (F1)']
)

display(tuning_comparison.round(4))

# Pick the model with the highest tuned CV F1
best_model_name = tuning_comparison['After Tuning (F1)'].idxmax()
best_search = best_searches[best_model_name]
print(f"\n>>> 最优模型: {best_model_name} (Tuned F1 = {best_search.best_score_:.4f})")

=== 超参数调优前后 F1-score 对比 (5-Fold CV) ===

>>> 最优模型: XGBoost (Tuned F1 = 0.6871)

# 6.1 Probability predictions of the tuned best model on the test set
from sklearn.model_selection import cross_val_predict

best_pipeline = best_search.best_estimator_
y_proba_best = best_pipeline.predict_proba(X_test)[:, 1]

# 6.2 Precision / recall / F1 at every threshold on the test set.
# These curves are kept for plotting and diagnostics only; the decision
# threshold itself is NOT picked from them.
precisions_curve, recalls_curve, thresholds_pr = precision_recall_curve(y_test, y_proba_best)
f1_scores_curve = 2 * (precisions_curve * recalls_curve) / (precisions_curve + recalls_curve + 1e-10)

# Choose the F1-maximising threshold from out-of-fold predictions on the
# training data. Picking it on the test labels and then reporting test
# metrics at that threshold would make the reported score optimistic.
y_proba_oof = cross_val_predict(
    best_pipeline, X_train, y_train, cv=cv, method='predict_proba', n_jobs=-1
)[:, 1]
prec_oof, rec_oof, thr_oof = precision_recall_curve(y_train, y_proba_oof)
f1_oof = 2 * (prec_oof * rec_oof) / (prec_oof + rec_oof + 1e-10)
# The final precision/recall point has no matching threshold, so skip it
best_threshold = thr_oof[np.argmax(f1_oof[:-1])]

# Position on the test-set curve closest to the chosen threshold
best_threshold_idx = int(np.argmin(np.abs(thresholds_pr - best_threshold)))
# Test-set F1 at the threshold selected on training data
best_f1_at_threshold = f1_score(y_test, (y_proba_best >= best_threshold).astype(int))

print(f"默认阈值 0.5 的 F1: {f1_score(y_test, (y_proba_best >= 0.5).astype(int)):.4f}")
print(f"最优阈值: {best_threshold:.4f}")
print(f"最优阈值下的 F1: {best_f1_at_threshold:.4f}")

默认阈值 0.5 的 F1: 0.6723
最优阈值: 0.5889
最优阈值下的 F1: 0.6750

# 6.3 Threshold vs precision / recall / F1, and the precision-recall curve
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
threshold_ax, pr_ax = axes

# Left panel: each metric as a function of the decision threshold.
# The curves have one more point than there are thresholds, hence [:-1].
for curve, curve_label in (
    (precisions_curve, 'Precision'),
    (recalls_curve, 'Recall'),
    (f1_scores_curve, 'F1-score'),
):
    threshold_ax.plot(thresholds_pr, curve[:-1], label=curve_label, linewidth=2)

threshold_ax.axvline(x=best_threshold, color='red', linestyle='--',
                     label=f'Best Threshold = {best_threshold:.3f}')
threshold_ax.axvline(x=0.5, color='gray', linestyle=':', label='Default 0.5')
threshold_ax.set_xlabel('Decision Threshold')
threshold_ax.set_ylabel('Score')
threshold_ax.set_title('Threshold Optimization: Precision / Recall / F1')
threshold_ax.legend()

# Right panel: precision-recall curve with average precision in the title
ap = average_precision_score(y_test, y_proba_best)
pr_ax.plot(recalls_curve, precisions_curve, linewidth=2)
pr_ax.fill_between(recalls_curve, precisions_curve, alpha=0.2)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title(f'Precision-Recall Curve (AP = {ap:.3f})')

plt.tight_layout()
plt.show()

# 6.4 Default threshold (0.5) vs the selected threshold
y_pred_default = (y_proba_best >= 0.5).astype(int)
y_pred_optimal = (y_proba_best >= best_threshold).astype(int)

# Classification reports, default first
for header, predictions in (
    ("=== 默认阈值 (0.5) ===", y_pred_default),
    (f"\n=== 最优阈值 ({best_threshold:.4f}) ===", y_pred_optimal),
):
    print(header)
    print(classification_report(y_test, predictions, digits=4))

# Confusion matrices side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    (axes[0], y_pred_default, 'Blues', 'Confusion Matrix (Threshold = 0.5)'),
    (axes[1], y_pred_optimal, 'Oranges',
     f'Confusion Matrix (Threshold = {best_threshold:.3f})'),
)
for ax, predictions, colormap, title in panels:
    ConfusionMatrixDisplay.from_predictions(y_test, predictions, ax=ax,
                                            cmap=colormap, colorbar=False)
    ax.set_title(title)

plt.tight_layout()
plt.show()

=== 默认阈值 (0.5) ===
              precision    recall  f1-score   support

           0     0.9485    0.9199    0.9340      2084
           1     0.6247    0.7277    0.6723       382

    accuracy                         0.8901      2466
   macro avg     0.7866    0.8238    0.8031      2466
weighted avg     0.8984    0.8901    0.8934      2466


=== 最优阈值 (0.5889) ===
              precision    recall  f1-score   support

           0     0.9449    0.9299    0.9374      2084
           1     0.6482    0.7042    0.6750       382

    accuracy                         0.8950      2466
   macro avg     0.7965    0.8171    0.8062      2466
weighted avg     0.8989    0.8950    0.8967      2466

# 7.1 Train all final models and collect their test-set probabilities
def _tuned_params(search):
    """Return a search's best params with the 'model__' step prefix removed."""
    return {k.replace('model__', ''): v for k, v in search.best_params_.items()}


# Tree models: SMOTE on the raw training features, i.e. the same
# smote -> model order as the cross-validated pipelines.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Linear model: scale first, then oversample, mirroring the
# scaler -> smote -> model pipeline that was cross-validated. The scaler is
# fitted on real training rows only, not on SMOTE-generated samples.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_res_scaled, y_train_res_scaled = SMOTE(random_state=42).fit_resample(
    X_train_scaled, y_train
)
X_test_scaled = scaler.transform(X_test)

# Final model collection: name -> {'model': fitted estimator,
#                                  'y_proba': positive-class test probabilities}
final_models = {}

# Logistic Regression (standardised features)
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_res_scaled, y_train_res_scaled)
final_models['Logistic Regression'] = {
    'model': lr_model,
    'y_proba': lr_model.predict_proba(X_test_scaled)[:, 1]
}

# The tree models below are fitted on DataFrames, so they are also scored
# on the X_test DataFrame: this keeps feature names and per-column dtypes
# consistent between fit and predict.

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)
dt_model.fit(X_train_res, y_train_res)
final_models['Decision Tree'] = {
    'model': dt_model,
    'y_proba': dt_model.predict_proba(X_test)[:, 1]
}

# Random Forest (tuned)
rf_best_params = _tuned_params(search_rf)
rf_tuned = RandomForestClassifier(**rf_best_params, random_state=42)
rf_tuned.fit(X_train_res, y_train_res)
final_models['Random Forest (Tuned)'] = {
    'model': rf_tuned,
    'y_proba': rf_tuned.predict_proba(X_test)[:, 1]
}

# XGBoost (tuned)
xgb_best_params = _tuned_params(search_xgb)
xgb_tuned = XGBClassifier(**xgb_best_params, random_state=42,
                          eval_metric='logloss', verbosity=0)
xgb_tuned.fit(X_train_res, y_train_res)
final_models['XGBoost (Tuned)'] = {
    'model': xgb_tuned,
    'y_proba': xgb_tuned.predict_proba(X_test)[:, 1]
}

# LightGBM (tuned)
lgbm_best_params = _tuned_params(search_lgbm)
lgbm_tuned = LGBMClassifier(**lgbm_best_params, random_state=42, verbose=-1)
lgbm_tuned.fit(X_train_res, y_train_res)
final_models['LightGBM (Tuned)'] = {
    'model': lgbm_tuned,
    'y_proba': lgbm_tuned.predict_proba(X_test)[:, 1]
}

# 7.2 ROC and precision-recall curves for all final models
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
roc_ax, pr_ax = axes

for name, info in final_models.items():
    scores = info['y_proba']

    # Left panel: ROC curve, labelled with the AUC
    fpr, tpr, _ = roc_curve(y_test, scores)
    auc_val = roc_auc_score(y_test, scores)
    roc_ax.plot(fpr, tpr, linewidth=2, label=f"{name} (AUC={auc_val:.3f})")

    # Right panel: precision-recall curve, labelled with average precision
    prec_c, rec_c, _ = precision_recall_curve(y_test, scores)
    ap_val = average_precision_score(y_test, scores)
    pr_ax.plot(rec_c, prec_c, linewidth=2, label=f"{name} (AP={ap_val:.3f})")

# Diagonal reference line on the ROC panel
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve Comparison')
roc_ax.legend(fontsize=9)

pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve Comparison')
pr_ax.legend(fontsize=9)

plt.tight_layout()
plt.show()

# 7.3 Summary table of the final models, each at its own F1-optimal threshold
def _f1_optimal_threshold(y_true, y_score):
    """Return the threshold that maximises F1 on the given labels and scores."""
    prec_c, rec_c, thr_c = precision_recall_curve(y_true, y_score)
    f1_c = 2 * (prec_c * rec_c) / (prec_c + rec_c + 1e-10)
    # The last precision/recall point has no matching threshold
    return thr_c[np.argmax(f1_c[:-1])]


# NOTE(review): each threshold is chosen on y_test and the metrics are then
# computed on the same y_test, so the thresholded scores are optimistic.
final_results = []
for name, info in final_models.items():
    y_proba = info['y_proba']
    opt_thr = _f1_optimal_threshold(y_test, y_proba)
    y_pred = (y_proba >= opt_thr).astype(int)

    # Thresholded metrics use y_pred; ranking metrics use raw probabilities
    summary = {'Model': name, 'Optimal Threshold': opt_thr}
    summary['Accuracy'] = accuracy_score(y_test, y_pred)
    summary['Precision'] = precision_score(y_test, y_pred)
    summary['Recall'] = recall_score(y_test, y_pred)
    summary['F1-score'] = f1_score(y_test, y_pred)
    summary['ROC-AUC'] = roc_auc_score(y_test, y_proba)
    summary['Avg Precision'] = average_precision_score(y_test, y_proba)
    final_results.append(summary)

final_results_df = pd.DataFrame(final_results).set_index('Model')
display(final_results_df.round(4))

# Overall winner = highest F1 in the table
overall_best_name = final_results_df['F1-score'].idxmax()
overall_best_row = final_results_df.loc[overall_best_name]
print(f"\n>>> 综合最优模型: {overall_best_name}")
print(f"    F1 = {overall_best_row['F1-score']:.4f}")
print(f"    ROC-AUC = {overall_best_row['ROC-AUC']:.4f}")

>>> 综合最优模型: XGBoost (Tuned)
    F1 = 0.6750
    ROC-AUC = 0.9235

# Model used for the interpretability analysis below.
# The tuned Random Forest is hard-coded here as a demonstration; the
# author's stated rationale is that SHAP's TreeExplainer supports it best.
# NOTE(review): this is a fixed choice, not derived from overall_best_name.
explain_model = rf_tuned
explain_model_name = "Random Forest (Tuned)"

print(f"可解释性分析模型: {explain_model_name}")

可解释性分析模型: Random Forest (Tuned)

# 8.1 SHAP value analysis
# Use only the first n_shap_samples test rows to limit computation
n_shap_samples = 200
X_test_sample = X_test.iloc[:n_shap_samples].copy()

explainer = shap.TreeExplainer(explain_model)
shap_exp = explainer(X_test_sample)

# A binary classifier may yield 3-D SHAP values; in that case keep only
# index 1 of the last axis (and of base_values when those are 2-D) and
# rebuild a 2-D Explanation. Otherwise use the explanation as returned.
# NOTE(review): index 1 is presumably the positive class — confirm.
if isinstance(shap_exp.values, np.ndarray) and len(shap_exp.values.shape) == 3:
    shap_values_pos = shap.Explanation(
        values=shap_exp.values[:, :, 1],
        base_values=(shap_exp.base_values[:, 1]
                     if len(shap_exp.base_values.shape) > 1
                     else shap_exp.base_values),
        data=shap_exp.data,
        feature_names=feature_names
    )
else:
    shap_values_pos = shap_exp

# SHAP beeswarm plot (global feature importance plus direction of effect)
print("=== SHAP Beeswarm Plot (Top 20 Features) ===")
shap.plots.beeswarm(shap_values_pos, max_display=20, show=True)

=== SHAP Beeswarm Plot (Top 20 Features) ===

# SHAP bar plot (mean absolute SHAP value per feature), top 20 features
print("=== SHAP Bar Plot (Mean |SHAP|) ===")
shap.plots.bar(shap_values_pos, max_display=20, show=True)

=== SHAP Bar Plot (Mean |SHAP|) ===

# SHAP dependence plots for the two features with the largest mean |SHAP|
# argsort on the negated means gives indices in descending order of importance
top_features_idx = np.argsort(-np.abs(shap_values_pos.values).mean(axis=0))[:2]
top_feature_names = [feature_names[i] for i in top_features_idx]

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
for i, feat in enumerate(top_feature_names):
    # show=False defers rendering so both panels land in the same figure
    shap.plots.scatter(shap_values_pos[:, feat], ax=axes[i], show=False)
    axes[i].set_title(f'SHAP Dependence: {feat}')
plt.tight_layout()
plt.show()

# 8.2 Permutation importance (model-agnostic feature importance)
print("=== Permutation Importance (10 repeats) ===")
perm_result = permutation_importance(
    explain_model,
    X_test,
    y_test,
    scoring='f1',
    n_repeats=10,
    n_jobs=-1,
    random_state=42,
)

# One row per feature, ordered from most to least important.
perm_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance_mean': perm_result.importances_mean,
        'Importance_std': perm_result.importances_std,
    }
).sort_values('Importance_mean', ascending=False)

# Horizontal bar chart of the 15 largest mean importances with std error bars.
top_perm = perm_df.head(15)
bar_positions = range(len(top_perm))

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(
    bar_positions,
    top_perm['Importance_mean'],
    xerr=top_perm['Importance_std'],
    color='steelblue',
    edgecolor='black',
)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_perm['Feature'])
ax.invert_yaxis()  # put the most important feature at the top
ax.set(
    xlabel='Mean Permutation Importance (F1 decrease)',
    title='Top 15 Permutation Importance',
)
plt.tight_layout()
plt.show()

=== Permutation Importance (10 repeats) ===

# 8.3 Cross-check: SHAP ranking vs. permutation-importance ranking
print("\n=== SHAP 与 Permutation Importance Top 10 对比 ===")

# Mean |SHAP| per feature, largest first.
mean_abs_by_feature = np.abs(shap_values_pos.values).mean(axis=0)
shap_importance = pd.Series(mean_abs_by_feature, index=feature_names)
shap_importance = shap_importance.sort_values(ascending=False)

perm_importance = perm_df.set_index('Feature')['Importance_mean']
perm_ranked = perm_importance.sort_values(ascending=False)

# Side-by-side table of the ten top-ranked features under each method.
top10_ranks = range(1, 11)
comparison_top10 = pd.DataFrame(
    {
        'SHAP Rank': top10_ranks,
        'SHAP Feature': shap_importance.index[:10].tolist(),
        'Perm Rank': top10_ranks,
        'Perm Feature': perm_ranked.index[:10].tolist(),
    }
)
display(comparison_top10)

=== SHAP 与 Permutation Importance Top 10 对比 ===

# 8.4 Partial dependence plots (PDP) -- marginal effect of single features
print("=== Partial Dependence Plots ===")

# Candidate continuous features; the first four of them, in SHAP-importance
# order, are plotted.
continuous_original = [
    'PageValues', 'ExitRates', 'BounceRates',
    'ProductRelated_Duration', 'ProductRelated',
    'TotalDuration', 'TotalPages', 'PageValues_log',
]
top_continuous_features = [
    feat for feat in shap_importance.index if feat in continuous_original
][:4]

n_panels = len(top_continuous_features)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4))
if n_panels == 1:
    # plt.subplots returns a bare Axes for a single panel; wrap it in a list
    axes = [axes]

PartialDependenceDisplay.from_estimator(
    explain_model,
    X_test,
    top_continuous_features,
    ax=axes,
    grid_resolution=50,
)

plt.suptitle('Partial Dependence Plots (PDP)', fontsize=14, y=1.05)
plt.tight_layout()
plt.show()

=== Partial Dependence Plots ===

# 9.1 Calibration curve
# Checks how reliable the predicted probabilities are.
print("=== 校准曲线 (Calibration Curve) ===")

fig, ax = plt.subplots(figsize=(8, 6))

# Diagonal reference line: predicted probability equals observed frequency.
ax.plot([0, 1], [0, 1], 'k--', label='Perfectly Calibrated')

# One reliability curve (10 bins) per final model.
for name, info in final_models.items():
    prob_true, prob_pred = calibration_curve(
        y_test, info['y_proba'], n_bins=10
    )
    ax.plot(prob_pred, prob_true, label=name, marker='o', linewidth=2)

ax.set(
    xlabel='Mean Predicted Probability',
    ylabel='Fraction of Positives',
    title='Calibration Curve (Reliability Diagram)',
)
ax.legend(fontsize=9, loc='lower right')
plt.tight_layout()
plt.show()

=== 校准曲线 (Calibration Curve) ===

# 9.2 Learning curves -- diagnose over-/under-fitting
print("=== 学习曲线 (Learning Curve) ===")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
lc_models = {
    'Random Forest (Tuned)': rf_tuned,
    'XGBoost (Tuned)': xgb_tuned,
    'LightGBM (Tuned)': lgbm_tuned
}

for idx, (name, model) in enumerate(lc_models.items()):
    # Resample INSIDE each CV fold. Cross-validating directly on an already
    # resampled training set (X_train_res / y_train_res, presumably the SMOTE
    # output -- TODO confirm) lets synthetic points derived from validation
    # rows leak into the training folds and inflates the validation F1.
    # learning_curve clones the estimator, so the fitted `model` is untouched.
    lc_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model),
    ])
    train_sizes, train_scores, val_scores = learning_curve(
        lc_pipeline, X_train, y_train,
        cv=5, scoring='f1',
        train_sizes=np.linspace(0.2, 1.0, 5),
        n_jobs=-1, random_state=42
    )

    # Aggregate the per-fold scores for every training-set size.
    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)

    # Shaded band = mean +/- one standard deviation across folds.
    axes[idx].fill_between(train_sizes, train_mean - train_std,
                           train_mean + train_std, alpha=0.1, color='blue')
    axes[idx].fill_between(train_sizes, val_mean - val_std,
                           val_mean + val_std, alpha=0.1, color='orange')
    axes[idx].plot(train_sizes, train_mean, 'o-', color='blue',
                   label='Training F1')
    axes[idx].plot(train_sizes, val_mean, 'o-', color='orange',
                   label='Validation F1')
    axes[idx].set_title(f'Learning Curve: {name}')
    axes[idx].set_xlabel('Training Set Size')
    axes[idx].set_ylabel('F1 Score')
    axes[idx].legend(loc='lower right')
    axes[idx].set_ylim(0, 1.1)

plt.tight_layout()
plt.show()

=== 学习曲线 (Learning Curve) ===

# 9.3 Error analysis -- which samples does the model tend to misclassify?

# Use the predicted probabilities of the overall best model.
best_final_model = final_models[overall_best_name]
y_proba_final = best_final_model['y_proba']

# Threshold that maximises F1 along the precision-recall curve.
prec_c, rec_c, thr_c = precision_recall_curve(y_test, y_proba_final)
f1_c = 2 * (prec_c * rec_c) / (prec_c + rec_c + 1e-10)  # epsilon avoids 0/0
# The last precision/recall point has no matching threshold, hence [:-1].
final_threshold = thr_c[np.argmax(f1_c[:-1])]
y_pred_final = (y_proba_final >= final_threshold).astype(int)

# Build the error-analysis frame from the raw rows of the test samples.
# y_test.index holds index *labels*, so the rows are selected with .loc;
# positional .iloc only gives the same rows while df keeps an untouched
# 0..n-1 index.
error_df = df.loc[y_test.index].copy()
error_df['y_true'] = y_test.values
error_df['y_pred'] = y_pred_final
error_df['y_proba'] = y_proba_final
error_df['correct'] = (error_df['y_true'] == error_df['y_pred'])

# Label each row as Correct / False Negative / False Positive.
missed_positive = (error_df['y_true'] == 1) & (error_df['y_pred'] == 0)
false_alarm = (error_df['y_true'] == 0) & (error_df['y_pred'] == 1)
error_df['error_type'] = 'Correct'
error_df.loc[missed_positive, 'error_type'] = 'False Negative'
error_df.loc[false_alarm, 'error_type'] = 'False Positive'

print("=== 错误类型分布 ===")
print(error_df['error_type'].value_counts())

=== 错误类型分布 ===
error_type
Correct           2207
False Positive     146
False Negative     113
Name: count, dtype: int64

# 9.3a Feature profile of each confusion-matrix group
print("\n=== False Negative vs True Positive 特征均值对比 ===")
analysis_cols = [
    'PageValues',
    'BounceRates',
    'ExitRates',
    'ProductRelated',
    'ProductRelated_Duration',
]

# Split the test rows into the four confusion-matrix groups.
tp_samples = error_df[(error_df['y_true'] == 1) & (error_df['y_pred'] == 1)]
tn_samples = error_df[(error_df['y_true'] == 0) & (error_df['y_pred'] == 0)]
fn_samples = error_df[error_df['error_type'] == 'False Negative']
fp_samples = error_df[error_df['error_type'] == 'False Positive']

# Column order of the resulting table follows the insertion order below.
samples_by_column = {
    'True Positive (Mean)': tp_samples,
    'False Negative (Mean)': fn_samples,
    'True Negative (Mean)': tn_samples,
    'False Positive (Mean)': fp_samples,
}
error_analysis = pd.DataFrame(
    {
        column: samples[analysis_cols].mean()
        for column, samples in samples_by_column.items()
    }
)
display(error_analysis.round(4))

=== False Negative vs True Positive 特征均值对比 ===

# 9.3b False-negative rate by month and by visitor type
print("\n=== 各月份误判率 (False Negative Rate among actual positives) ===")
actual_positive = error_df[error_df['y_true'] == 1]


def _fn_rate(group):
    """Return the share of rows in *group* labelled 'False Negative'."""
    return (group['error_type'] == 'False Negative').mean()


monthly_fn_rate = (
    actual_positive.groupby('Month')
    .apply(_fn_rate)
    .sort_values(ascending=False)
)
print(monthly_fn_rate.round(4))

print("\n=== 各访客类型误判率 ===")
visitor_fn_rate = (
    actual_positive.groupby('VisitorType')
    .apply(_fn_rate)
    .sort_values(ascending=False)
)
print(visitor_fn_rate.round(4))

# Bar charts of both breakdowns, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
month_ax, visitor_ax = axes

monthly_fn_rate.plot(kind='bar', ax=month_ax, color='salmon', edgecolor='black')
month_ax.set(title='False Negative Rate by Month', ylabel='FN Rate')
month_ax.tick_params(axis='x', rotation=45)

visitor_fn_rate.plot(kind='bar', ax=visitor_ax, color='lightblue', edgecolor='black')
visitor_ax.set(title='False Negative Rate by Visitor Type', ylabel='FN Rate')

plt.tight_layout()
plt.show()

=== 各月份误判率 (False Negative Rate among actual positives) ===
Month
Aug     0.6000
Jul     0.5500
Nov     0.3678
Sep     0.3636
Oct     0.2353
Dec     0.2188
June    0.1667
May     0.1370
Mar     0.0909
Feb     0.0000
dtype: float64

=== 各访客类型误判率 ===
VisitorType
Returning_Visitor    0.3103
New_Visitor          0.2584
Other                0.0000
dtype: float64

# 9.4 Cost-sensitive business analysis
print("=== 成本敏感分析 ===")

# Assumed business cost model (amounts in CNY, per user):
#   - sending a coupon costs 5
#   - a successful conversion earns 50
#   - missing a real buyer (false negative) is an opportunity cost of 50
#   - a coupon sent to a non-buyer (false positive) costs 5
# NOTE(review): the coupon cost is only charged for false positives below,
# not for true positives -- confirm that 50 is meant as a net gain.

gain_per_tp = 50   # successful conversion
loss_per_fn = 50   # missed customer
cost_per_fp = 5    # coupon sent by mistake

def calculate_business_value(y_true, y_pred):
    """Return confusion-matrix counts and the net business value.

    The monetary figures come from the module-level constants
    ``gain_per_tp``, ``cost_per_fp`` and ``loss_per_fn``.

    Args:
        y_true: True binary labels (0 / 1).
        y_pred: Predicted binary labels (0 / 1).

    Returns:
        dict with the keys 'TP', 'FP', 'FN', 'TN', 'Total Gain',
        'Total Cost' and 'Net Business Value'.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
    # both inputs (e.g. an extreme threshold on a single-class sample);
    # without it the four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    total_gain = tp * gain_per_tp
    total_cost = fp * cost_per_fp + fn * loss_per_fn
    net_value = total_gain - total_cost
    return {
        'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
        'Total Gain': total_gain,
        'Total Cost': total_cost,
        'Net Business Value': net_value
    }

# Compare the business value obtained at different decision thresholds.
# The thresholds are sorted and de-duplicated: the line plot below connects
# points in list order (an unsorted list makes it zigzag), and a duplicated
# threshold would make the .loc lookup at the end return a Series.
thresholds_to_compare = sorted({0.3, 0.4, float(best_threshold), 0.5, 0.6, 0.7})
business_results = []

for thr in thresholds_to_compare:
    y_pred_thr = (y_proba_final >= thr).astype(int)
    bv = calculate_business_value(y_test, y_pred_thr)
    bv['Threshold'] = thr
    business_results.append(bv)

bv_df = pd.DataFrame(business_results).set_index('Threshold')
display(bv_df)

# Plot net business value against the threshold.
plt.figure(figsize=(8, 5))
plt.plot(bv_df.index, bv_df['Net Business Value'], 'o-', linewidth=2,
         markersize=8, color='green')
# Vertical marker at the F1-optimal threshold for reference.
plt.axvline(x=best_threshold, color='red', linestyle='--',
            label=f'F1-Optimal Threshold = {best_threshold:.3f}')
plt.xlabel('Decision Threshold')
plt.ylabel('Net Business Value (¥)')
plt.title('Business Value at Different Thresholds')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Threshold with the highest net business value among those compared.
best_bv_threshold = bv_df['Net Business Value'].idxmax()
print(f"\n>>> 商业价值最大化阈值: {best_bv_threshold}")
print(f"    净商业价值: ¥{bv_df.loc[best_bv_threshold, 'Net Business Value']}")

=== 成本敏感分析 ===

>>> 商业价值最大化阈值: 0.3
    净商业价值: ¥10695

# 10.1 Select the clustering features and standardise them
cluster_features = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
]

X_cluster = df[cluster_features].copy()

# Fit the scaler on the selected columns, then transform the same data.
scaler_cluster = StandardScaler().fit(X_cluster)
X_cluster_scaled = scaler_cluster.transform(X_cluster)

print(f"聚类数据形状: {X_cluster_scaled.shape}")

聚类数据形状: (12330, 10)

# 10.2 Choose K with the elbow method and the silhouette score
k_range = range(2, 9)
inertias = []
sil_scores = []

# Fit one KMeans per candidate K and record inertia and silhouette.
for k in k_range:
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_temp = kmeans_temp.fit_predict(X_cluster_scaled)
    sil_k = silhouette_score(X_cluster_scaled, labels_temp)
    inertias.append(kmeans_temp.inertia_)
    sil_scores.append(sil_k)
    print(f"K={k}: Inertia={kmeans_temp.inertia_:.0f}, Silhouette={sil_k:.4f}")

# Best K = the candidate with the highest silhouette score.
best_k = k_range[int(np.argmax(sil_scores))]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
elbow_ax, sil_ax = axes

elbow_ax.plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
elbow_ax.set(
    xlabel='Number of Clusters (k)',
    ylabel='Inertia (Within-Cluster Sum of Squares)',
    title='Elbow Method',
)

sil_ax.plot(k_range, sil_scores, 'ro-', linewidth=2, markersize=8)
sil_ax.set(
    xlabel='Number of Clusters (k)',
    ylabel='Silhouette Score',
    title='Silhouette Score Analysis',
)

# Mark the selected K on the silhouette panel.
sil_ax.axvline(x=best_k, color='green', linestyle='--',
               label=f'Best k = {best_k}')
sil_ax.legend()

plt.tight_layout()
plt.show()

print(f"\n>>> 基于轮廓系数的最优 K = {best_k}")

K=2: Inertia=99672, Silhouette=0.4359
K=3: Inertia=80117, Silhouette=0.4517
K=4: Inertia=70377, Silhouette=0.4304
K=5: Inertia=62615, Silhouette=0.3981
K=6: Inertia=55748, Silhouette=0.4076
K=7: Inertia=49918, Silhouette=0.4079
K=8: Inertia=46254, Silhouette=0.3727

>>> 基于轮廓系数的最优 K = 3

# 10.3 Run the final KMeans with the selected K
kmeans_final = KMeans(n_clusters=best_k, n_init=10, random_state=42)
cluster_labels = kmeans_final.fit_predict(X_cluster_scaled)

# Copy of the raw data with the cluster assignment appended as a new column.
df_clustered = df.assign(Cluster=cluster_labels)

print("=== 各聚类簇的样本数量 ===")
cluster_sizes = df_clustered['Cluster'].value_counts().sort_index()
print(cluster_sizes)

print("\n=== 各聚类簇的购买转化率 ===")
conversion_by_cluster = df_clustered.groupby('Cluster')['Revenue'].mean()
print(conversion_by_cluster.round(4))

=== 各聚类簇的样本数量 ===
Cluster
0    9652
1    1628
2    1050
Name: count, dtype: int64

=== 各聚类簇的购买转化率 ===
Cluster
0    0.1502
1    0.2776
2    0.0057
Name: Revenue, dtype: float64

# 10.4 PCA projection to two components for visualisation
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster_scaled)

print(f"PCA 解释方差比: {pca.explained_variance_ratio_.round(4)}")
print(f"PCA 累计解释方差: {pca.explained_variance_ratio_.sum():.4f}")

pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: points coloured by cluster label.
scatter1 = axes[0].scatter(pc1, pc2, c=cluster_labels, cmap='viridis',
                           alpha=0.5, s=15)
axes[0].set(
    xlabel='PCA Component 1',
    ylabel='PCA Component 2',
    title=f'K-Means Clustering (K={best_k}) - PCA Projection',
)
plt.colorbar(scatter1, ax=axes[0], label='Cluster')

# Right panel: the same points coloured by the actual Revenue label,
# for comparison with the clustering.
scatter2 = axes[1].scatter(pc1, pc2, c=df['Revenue'].astype(int),
                           cmap='coolwarm', alpha=0.5, s=15)
axes[1].set(
    xlabel='PCA Component 1',
    ylabel='PCA Component 2',
    title='Actual Revenue Labels - PCA Projection',
)
plt.colorbar(scatter2, ax=axes[1], label='Revenue')

plt.tight_layout()
plt.show()

PCA 解释方差比: [0.34   0.1675]
PCA 累计解释方差: 0.5076

# 10.5 Cluster profiling
print("=== 详细聚类画像 (各簇特征均值) ===")

profile_features = cluster_features + ['Revenue']
profile_groups = df_clustered.groupby('Cluster')[profile_features]

# Detailed profile: mean and median of every feature per cluster.
cluster_profile = profile_groups.agg(['mean', 'median'])
display(cluster_profile.round(4))

# Simplified profile: means only.
profile_mean = profile_groups.mean()
display(profile_mean.round(4))

# 10.5a Cluster profile heatmap
# Colours use a per-column min-max scaling so features on different scales
# are comparable; the annotations show the actual (unscaled) means.
fig, ax = plt.subplots(figsize=(14, best_k + 2))
profile_norm = profile_mean.copy()
for col in profile_norm.columns:
    col_min = profile_norm[col].min()
    col_max = profile_norm[col].max()
    if not (col_max > col_min):
        # Column has no spread: map it to 0 instead of dividing by zero.
        profile_norm[col] = 0
    else:
        profile_norm[col] = (profile_norm[col] - col_min) / (col_max - col_min)

sns.heatmap(
    profile_norm,
    annot=profile_mean.round(2).values,
    fmt='',
    cmap='YlOrRd',
    linewidths=1,
    ax=ax,
)
ax.set_title('Cluster Profile Heatmap (Normalized, Values = Actual Means)')
ax.set_ylabel('Cluster')
plt.tight_layout()
plt.show()

=== 详细聚类画像 (各簇特征均值) ===

# 10.5b Cluster radar (spider) chart
from matplotlib.patches import FancyBboxPatch

radar_features = [
    'Administrative', 'ProductRelated', 'BounceRates',
    'ExitRates', 'PageValues', 'Revenue',
]

# Min-max scale each feature to the 0-1 range so all axes are comparable.
radar_data = profile_mean[radar_features].copy()
for col in radar_data.columns:
    col_min = radar_data[col].min()
    col_max = radar_data[col].max()
    if not (col_max > col_min):
        # No spread across clusters: use 0 rather than dividing by zero.
        radar_data[col] = 0
    else:
        radar_data[col] = (radar_data[col] - col_min) / (col_max - col_min)

# One angle per feature; repeat the first angle to close the polygon.
angles = np.linspace(0, 2 * np.pi, len(radar_features), endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
colors = plt.cm.Set2(np.linspace(0, 1, best_k))

for i in range(best_k):
    values = radar_data.iloc[i].tolist()
    values += values[:1]  # close the polygon, matching `angles`
    cluster_color = colors[i]
    ax.plot(angles, values, 'o-', linewidth=2, color=cluster_color,
            label=f'Cluster {i}')
    ax.fill(angles, values, color=cluster_color, alpha=0.15)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(radar_features, fontsize=10)
ax.set_title('Cluster Profile Radar Chart', fontsize=14, pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.tight_layout()
plt.show()

# 10.6 Cluster naming and business interpretation
print("=== 聚类命名与业务解读 ===\n")

# Print a short profile for each cluster and derive a name from it.
for cluster_id in range(best_k):
    p = profile_mean.loc[cluster_id]
    conv_rate = p['Revenue']
    page_val = p['PageValues']
    bounce = p['BounceRates']
    exit_r = p['ExitRates']
    prod_pages = p['ProductRelated']
    prod_dur = p['ProductRelated_Duration']
    count = (df_clustered['Cluster'] == cluster_id).sum()

    print(f"--- Cluster {cluster_id} ({count} users, {count/len(df_clustered)*100:.1f}%) ---")
    print(
        f"  Conversion Rate: {conv_rate:.2%}",
        f"  Avg PageValues: {page_val:.2f}",
        f"  Avg BounceRate: {bounce:.4f}",
        f"  Avg ExitRate: {exit_r:.4f}",
        f"  Avg ProductRelated Pages: {prod_pages:.1f}",
        f"  Avg ProductRelated Duration: {prod_dur:.1f}s",
        sep="\n",
    )

    # Rule-based naming; the rules are checked in order and the first match wins.
    if conv_rate > 0.20 and page_val > 10:
        segment = "High-Intent Purchasers (高意向购买者)"
    elif bounce > 0.03 or exit_r > 0.05:
        segment = "Low-Engagement Bouncers (低参与度跳出用户)"
    elif prod_pages > 30:
        segment = "Active Browsers (活跃浏览用户)"
    else:
        segment = "Casual Visitors (普通访客)"
    print(f"  >>> 命名: {segment}")
    print()

=== 聚类命名与业务解读 ===

--- Cluster 0 (9652 users, 78.3%) ---
  Conversion Rate: 15.02%
  Avg PageValues: 5.88
  Avg BounceRate: 0.0092
  Avg ExitRate: 0.0321
  Avg ProductRelated Pages: 23.9
  Avg ProductRelated Duration: 873.8s
  >>> 命名: Casual Visitors (普通访客)

--- Cluster 1 (1628 users, 13.2%) ---
  Conversion Rate: 27.76%
  Avg PageValues: 9.76
  Avg BounceRate: 0.0063
  Avg ExitRate: 0.0194
  Avg ProductRelated Pages: 96.7
  Avg ProductRelated Duration: 3831.1s
  >>> 命名: Active Browsers (活跃浏览用户)

--- Cluster 2 (1050 users, 8.5%) ---
  Conversion Rate: 0.57%
  Avg PageValues: 0.00
  Avg BounceRate: 0.1665
  Avg ExitRate: 0.1805
  Avg ProductRelated Pages: 3.0
  Avg ProductRelated Duration: 57.3s
  >>> 命名: Low-Engagement Bouncers (低参与度跳出用户)

# 11.1 Build the dataset WITHOUT feature engineering
df_no_fe = df.copy()
for flag_col in ('Revenue', 'Weekend'):
    # Cast the two flag columns to integers.
    df_no_fe[flag_col] = df_no_fe[flag_col].astype(int)
df_no_fe = pd.get_dummies(df_no_fe, columns=categorical_cols, drop_first=True)

y_no_fe = df_no_fe['Revenue']
X_no_fe = df_no_fe.drop(columns='Revenue')

X_train_no_fe, X_test_no_fe, y_train_no_fe, y_test_no_fe = train_test_split(
    X_no_fe,
    y_no_fe,
    test_size=0.2,
    stratify=y_no_fe,
    random_state=42,
)


def _smote_rf_pipeline():
    """Return a fresh SMOTE + RandomForest pipeline (identical settings)."""
    return ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', RandomForestClassifier(random_state=42, n_estimators=200)),
    ])


# The same RF configuration is used for both variants so only the features differ.
pipe_rf_no_fe = _smote_rf_pipeline()
pipe_rf_with_fe = _smote_rf_pipeline()

# Cross-validate both variants.
print("=== 无特征工程 ===")
result_no_fe = run_cv(pipe_rf_no_fe, X_train_no_fe, y_train_no_fe, cv, "RF (No FE)")

print("=== 有特征工程 ===")
result_with_fe = run_cv(pipe_rf_with_fe, X_train, y_train, cv, "RF (With FE)")

fe_compare = pd.DataFrame([result_no_fe, result_with_fe]).set_index('Model')

# Render every metric as a "mean ± std" string, one column per metric.
display_fe = {
    metric: fe_compare.apply(
        lambda r, m=metric: f"{r[f'{m}_mean']:.4f} ± {r[f'{m}_std']:.4f}",
        axis=1,
    )
    for metric in scoring_metrics
}
print("\n=== 特征工程前后对比 ===")
display(pd.DataFrame(display_fe, index=fe_compare.index))

=== 无特征工程 ===
=== 有特征工程 ===

=== 特征工程前后对比 ===

# Closing summary: recommended model, both candidate thresholds and the K
# chosen for clustering.
banner = "=" * 60
print(banner)
print("分析完成！")
print(banner)
print(f"最终推荐模型: {overall_best_name}")
print(f"推荐决策阈值 (F1最优): {final_threshold:.4f}")
print(f"推荐决策阈值 (商业价值最优): {best_bv_threshold}")
print(f"最优聚类数: {best_k}")

============================================================
分析完成！
============================================================
最终推荐模型: XGBoost (Tuned)
推荐决策阈值 (F1最优): 0.5889
推荐决策阈值 (商业价值最优): 0.3
最优聚类数: 3

	Administrative	Administrative_Duration	Informational	Informational_Duration	ProductRelated	ProductRelated_Duration	BounceRates	ExitRates	PageValues	SpecialDay	OperatingSystems	Browser	Region	TrafficType
count	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00	12330.00
mean	2.32	80.82	0.50	34.47	31.73	1194.75	0.02	0.04	5.89	0.06	2.12	2.36	3.15	4.07
std	3.32	176.78	1.27	140.75	44.48	1913.67	0.05	0.05	18.57	0.20	0.91	1.72	2.40	4.03
min	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	1.00	1.00	1.00	1.00
25%	0.00	0.00	0.00	0.00	7.00	184.14	0.00	0.01	0.00	0.00	2.00	2.00	1.00	2.00
50%	1.00	7.50	0.00	0.00	18.00	598.94	0.00	0.03	0.00	0.00	2.00	2.00	3.00	2.00
75%	4.00	93.26	0.00	0.00	38.00	1464.16	0.02	0.05	0.00	0.00	3.00	2.00	4.00	4.00
max	27.00	3398.75	24.00	2549.38	705.00	63973.52	0.20	0.20	361.76	1.00	8.00	13.00	9.00	20.00

	accuracy	precision	recall	f1	roc_auc
Model
Logistic Regression	0.8683 ± 0.0037	0.5513 ± 0.0099	0.8034 ± 0.0124	0.6537 ± 0.0033	0.9164 ± 0.0033
Decision Tree	0.8875 ± 0.0060	0.6110 ± 0.0203	0.7556 ± 0.0228	0.6752 ± 0.0117	0.9056 ± 0.0012
Random Forest	0.8940 ± 0.0049	0.6362 ± 0.0162	0.7359 ± 0.0091	0.6824 ± 0.0124	0.9211 ± 0.0034
XGBoost	0.8939 ± 0.0035	0.6575 ± 0.0128	0.6560 ± 0.0295	0.6563 ± 0.0155	0.9212 ± 0.0028
LightGBM	0.8947 ± 0.0036	0.6545 ± 0.0124	0.6763 ± 0.0170	0.6651 ± 0.0120	0.9263 ± 0.0038

	Optimal Threshold	Accuracy	Precision	Recall	F1-score	ROC-AUC	Avg Precision
Model
Logistic Regression	0.4372	0.8816	0.6098	0.6545	0.6313	0.8887	0.6177
Decision Tree	0.6044	0.8881	0.6157	0.7382	0.6714	0.9108	0.6308
Random Forest (Tuned)	0.5966	0.8958	0.6574	0.6832	0.6701	0.9189	0.6926
XGBoost (Tuned)	0.5889	0.8950	0.6482	0.7042	0.6750	0.9235	0.6951
LightGBM (Tuned)	0.4827	0.8897	0.6239	0.7251	0.6707	0.9299	0.7202

	True Positive (Mean)	False Negative (Mean)	True Negative (Mean)	False Positive (Mean)
PageValues	34.2793	3.6043	0.4245	24.1529
BounceRates	0.0044	0.0110	0.0254	0.0075
ExitRates	0.0186	0.0293	0.0475	0.0241
ProductRelated	48.3048	60.0708	27.8839	49.1644
ProductRelated_Duration	1949.1567	2218.8107	1011.3520	2000.5623

	TP	FP	FN	TN	Total Gain	Total Cost	Net Business Value
Threshold
0.300000	311	261	71	1823	15550	4855	10695
0.400000	287	194	95	1890	14350	5720	8630
0.588857	269	146	113	1938	13450	6380	7070
0.500000	278	167	104	1917	13900	6035	7865
0.600000	264	144	118	1940	13200	6620	6580
0.700000	237	108	145	1976	11850	7790	4060

1. 业务理解 (Business Understanding) & 数据理解 (Data Understanding)¶

2. 数据准备 (Data Preparation)¶

3. 基线建模 + 分层交叉验证¶

4. 类别不平衡处理方案对比¶

5. 超参数调优¶

6. 阈值优化¶

7. 最终模型全面评估¶

8. 模型可解释性分析¶

9. 高级诊断分析¶

10. 客户分群聚类分析¶

11. 特征工程效果验证¶

12. 结论与部署建议 (Deployment Recommendations)¶

核心结论¶

业务部署建议¶

	ProductRelated	ProductRelated_Duration	BounceRates	ExitRates	Month	OperatingSystems	Browser	Region	TrafficType	VisitorType	Weekend	Revenue
0	1	0.000000	0.20	0.20	Feb	1	1	1	1	Returning_Visitor	False	False
1	2	64.000000	0.00	0.10	Feb	2	2	1	2	Returning_Visitor	False	False
2	1	0.000000	0.20	0.20	Feb	4	1	9	3	Returning_Visitor	False	False
3	2	2.666667	0.05	0.14	Feb	3	2	2	4	Returning_Visitor	False	False
4	10	627.500000	0.02	0.05	Feb	3	3	1	4	Returning_Visitor	True	False

	Before Tuning (F1)	After Tuning (F1)	Improvement
Model
Random Forest	0.6824	0.6854	0.0030
XGBoost	0.6563	0.6871	0.0308
LightGBM	0.6651	0.6850	0.0199

	SHAP Rank	SHAP Feature	Perm Rank	Perm Feature
0	1	PageValues	1	PageValues
1	2	PageValues_log	2	PageValues_log
2	3	Month_Nov	3	Month_Nov
3	4	TrafficType_2	4	Administrative
4	5	ExitRates	5	Month_Mar
5	6	OperatingSystems_2	6	ProductRelated_Duration
6	7	Administrative_Duration	7	Informational
7	8	Administrative_Duration_log	8	ProductRelated_Duration_log
8	9	ProductPageRatio	9	TotalPages
9	10	Administrative	10	Administrative_Duration_log

	Administrative		Administrative_Duration		Informational		Informational_Duration		ProductRelated		...	BounceRates		ExitRates		PageValues		SpecialDay		Revenue
	mean	median	mean	median	mean	median	mean	median	mean	median	...	mean	median	mean	median	mean	median	mean	median	mean	median
Cluster
0	1.7090	1.0	51.1829	4.0000	0.2165	0.0	8.1459	0.0000	23.8882	17.0	...	0.0092	0.0000	0.0321	0.0250	5.8774	0.0000	0.0617	0.0	0.1502	0.0
1	7.3710	7.0	307.7312	197.9757	2.5203	2.0	212.7327	93.2083	96.7439	75.0	...	0.0063	0.0041	0.0194	0.0175	9.7577	1.8372	0.0301	0.0	0.2776	0.0
2	0.0486	0.0	1.4189	0.0000	0.0152	0.0	0.0869	0.0000	3.0295	1.0	...	0.1665	0.2000	0.1805	0.2000	0.0000	0.0000	0.1074	0.0	0.0057	0.0

	accuracy	precision	recall	f1	roc_auc
Model
RF (No FE)	0.8969 ± 0.0056	0.6556 ± 0.0243	0.7064 ± 0.0158	0.6796 ± 0.0116	0.9240 ± 0.0030
RF (With FE)	0.8937 ± 0.0040	0.6338 ± 0.0134	0.7412 ± 0.0118	0.6832 ± 0.0100	0.9229 ± 0.0033