In [35]:
# 基础数据处理与可视化库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 机器学习预处理与模型选择
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_validate,
    RandomizedSearchCV, learning_curve
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 机器学习模型
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 评估指标
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve, average_precision_score,
    ConfusionMatrixDisplay
)

# 校准曲线
from sklearn.calibration import calibration_curve

# 模型可解释性
import shap
from sklearn.inspection import permutation_importance, PartialDependenceDisplay

# 聚类与降维
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# 忽略警告
import warnings
warnings.filterwarnings('ignore')

# 设置画图风格
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
plt.rcParams['figure.figsize'] = (10, 6)

1. 业务理解 (Business Understanding) & 数据理解 (Data Understanding)¶

加载数据集,深入观察基本信息、数据分布、缺失值、异常值及特征间关系。

In [36]:
# 1.1 加载数据 & 基本信息
df = pd.read_csv('online_shoppers_intention.csv')

print("=== 数据集形状 ===")
print(f"行数: {df.shape[0]}, 列数: {df.shape[1]}\n")

display(df.head())

print("\n=== 数据集基本信息 ===")
df.info()

print("\n=== 数值变量描述性统计 ===")
display(df.describe().round(2))

print("\n=== 缺失值统计 ===")
missing = df.isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "无缺失值")

print("\n=== 重复行统计 ===")
print(f"重复行数量: {df.duplicated().sum()}")
=== 数据集形状 ===
行数: 12330, 列数: 18

Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay Month OperatingSystems Browser Region TrafficType VisitorType Weekend Revenue
0 0 0.0 0 0.0 1 0.000000 0.20 0.20 0.0 0.0 Feb 1 1 1 1 Returning_Visitor False False
1 0 0.0 0 0.0 2 64.000000 0.00 0.10 0.0 0.0 Feb 2 2 1 2 Returning_Visitor False False
2 0 0.0 0 0.0 1 0.000000 0.20 0.20 0.0 0.0 Feb 4 1 9 3 Returning_Visitor False False
3 0 0.0 0 0.0 2 2.666667 0.05 0.14 0.0 0.0 Feb 3 2 2 4 Returning_Visitor False False
4 0 0.0 0 0.0 10 627.500000 0.02 0.05 0.0 0.0 Feb 3 3 1 4 Returning_Visitor True False
=== 数据集基本信息 ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType              12330 non-null  int64  
 15  VisitorType              12330 non-null  object 
 16  Weekend                  12330 non-null  bool   
 17  Revenue                  12330 non-null  bool   
dtypes: bool(2), float64(7), int64(7), object(2)
memory usage: 1.5+ MB

=== 数值变量描述性统计 ===
Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
count 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00 12330.00
mean 2.32 80.82 0.50 34.47 31.73 1194.75 0.02 0.04 5.89 0.06 2.12 2.36 3.15 4.07
std 3.32 176.78 1.27 140.75 44.48 1913.67 0.05 0.05 18.57 0.20 0.91 1.72 2.40 4.03
min 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 1.00 1.00
25% 0.00 0.00 0.00 0.00 7.00 184.14 0.00 0.01 0.00 0.00 2.00 2.00 1.00 2.00
50% 1.00 7.50 0.00 0.00 18.00 598.94 0.00 0.03 0.00 0.00 2.00 2.00 3.00 2.00
75% 4.00 93.26 0.00 0.00 38.00 1464.16 0.02 0.05 0.00 0.00 3.00 2.00 4.00 4.00
max 27.00 3398.75 24.00 2549.38 705.00 63973.52 0.20 0.20 361.76 1.00 8.00 13.00 9.00 20.00
=== 缺失值统计 ===
无缺失值

=== 重复行统计 ===
重复行数量: 125
In [37]:
# 1.2 目标变量 Revenue 分布 (类别不平衡分析)
print("=== 目标变量 Revenue 分布 ===")
rev_counts = df['Revenue'].value_counts()
rev_pct = df['Revenue'].value_counts(normalize=True)
print(pd.DataFrame({'Count': rev_counts, 'Percentage': rev_pct.round(4)}))

imbalance_ratio = rev_counts[False] / rev_counts[True]
print(f"\n不平衡比例 (Negative : Positive) = {imbalance_ratio:.2f} : 1")

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.countplot(x='Revenue', data=df, ax=axes[0])
axes[0].set_title('Revenue Count Distribution')
axes[0].bar_label(axes[0].containers[0])

df['Revenue'].value_counts().plot.pie(
    autopct='%1.1f%%', ax=axes[1], startangle=90, explode=[0, 0.05]
)
axes[1].set_ylabel('')
axes[1].set_title('Revenue Proportion')
plt.tight_layout()
plt.show()
=== 目标变量 Revenue 分布 ===
         Count  Percentage
Revenue                   
False    10422      0.8453
True      1908      0.1547

不平衡比例 (Negative : Positive) = 5.46 : 1
No description has been provided for this image
In [38]:
# 1.3 连续变量分布与目标变量的关系
continuous_cols = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay'
]

fig, axes = plt.subplots(2, 5, figsize=(22, 8))
axes = axes.flatten()

for i, col in enumerate(continuous_cols):
    sns.boxplot(data=df, x='Revenue', y=col, ax=axes[i])
    axes[i].set_title(f'{col}')

plt.suptitle('Continuous Variables by Revenue (Boxplot)', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

# KDE 对比关键变量
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

sns.kdeplot(data=df, x="PageValues", hue="Revenue", fill=True, ax=axes[0])
axes[0].set_title('PageValues Distribution by Revenue')
axes[0].set_xlim(0, 100)

sns.kdeplot(data=df, x="BounceRates", hue="Revenue", fill=True, ax=axes[1])
axes[1].set_title('BounceRates Distribution by Revenue')

sns.kdeplot(data=df, x="ExitRates", hue="Revenue", fill=True, ax=axes[2])
axes[2].set_title('ExitRates Distribution by Revenue')

plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
In [39]:
# 1.4 类别变量与目标变量关系
fig, axes = plt.subplots(1, 3, figsize=(20, 5))

month_order = ['Feb', 'Mar', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
sns.countplot(data=df, x='Month', hue='Revenue', order=month_order, ax=axes[0])
axes[0].set_title('Purchasing Intention across Months')
axes[0].tick_params(axis='x', rotation=45)

sns.countplot(data=df, x='VisitorType', hue='Revenue', ax=axes[1])
axes[1].set_title('Purchasing Intention by Visitor Type')

sns.countplot(data=df, x='Weekend', hue='Revenue', ax=axes[2])
axes[2].set_title('Purchasing Intention by Weekend')

plt.tight_layout()
plt.show()

# 各月份购买转化率
monthly_conv = df.groupby('Month')['Revenue'].mean().reindex(month_order)
plt.figure(figsize=(8, 4))
monthly_conv.plot(kind='bar', color='coral', edgecolor='black')
plt.title('Monthly Conversion Rate')
plt.ylabel('Conversion Rate')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [40]:
# 1.5 相关性热图
plt.figure(figsize=(14, 10))
corr_matrix = df.select_dtypes(include=[np.number]).corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm',
            fmt=".2f", linewidths=0.5, center=0)
plt.title('Correlation Heatmap of Numerical Variables')
plt.tight_layout()
plt.show()

# 与目标变量 Revenue 的相关性排序
df_temp = df.copy()
df_temp['Revenue'] = df_temp['Revenue'].astype(int)
target_corr = df_temp.select_dtypes(include=[np.number]).corr()['Revenue'].drop('Revenue').sort_values(ascending=False)
print("\n=== 特征与 Revenue 的相关系数 (降序) ===")
print(target_corr.round(4))
No description has been provided for this image
=== 特征与 Revenue 的相关系数 (降序) ===
PageValues                 0.4926
ProductRelated             0.1585
ProductRelated_Duration    0.1524
Administrative             0.1389
Informational              0.0952
Administrative_Duration    0.0936
Informational_Duration     0.0703
Browser                    0.0240
TrafficType               -0.0051
Region                    -0.0116
OperatingSystems          -0.0147
SpecialDay                -0.0823
BounceRates               -0.1507
ExitRates                 -0.2071
Name: Revenue, dtype: float64

2. 数据准备 (Data Preparation)¶

包括 特征工程、特征编码、数据集划分。

In [41]:
# 2.1 特征工程 (Feature Engineering) —— 升级方向6
df_prep = df.copy()

# 目标变量与布尔变量转换
df_prep['Revenue'] = df_prep['Revenue'].astype(int)
df_prep['Weekend'] = df_prep['Weekend'].astype(int)

# ---- 构造新特征 ----
# 总页面访问数
df_prep['TotalPages'] = (
    df_prep['Administrative'] + df_prep['Informational'] + df_prep['ProductRelated']
)

# 总浏览时长
df_prep['TotalDuration'] = (
    df_prep['Administrative_Duration'] +
    df_prep['Informational_Duration'] +
    df_prep['ProductRelated_Duration']
)

# 平均每页产品浏览时长 (避免除零)
df_prep['AvgProductDuration'] = np.where(
    df_prep['ProductRelated'] > 0,
    df_prep['ProductRelated_Duration'] / df_prep['ProductRelated'],
    0
)

# 产品页面占比 (产品相关页面 / 总页面)
df_prep['ProductPageRatio'] = np.where(
    df_prep['TotalPages'] > 0,
    df_prep['ProductRelated'] / df_prep['TotalPages'],
    0
)

# 跳出率与退出率的交互特征
df_prep['Bounce_Exit_Ratio'] = np.where(
    df_prep['ExitRates'] > 0,
    df_prep['BounceRates'] / df_prep['ExitRates'],
    0
)

# ---- 对偏态变量做对数变换 ----
skew_cols = ['PageValues', 'ProductRelated_Duration', 'Administrative_Duration',
             'Informational_Duration', 'TotalDuration']

for col in skew_cols:
    df_prep[f'{col}_log'] = np.log1p(df_prep[col])

print("=== 新增特征列表 ===")
new_cols = ['TotalPages', 'TotalDuration', 'AvgProductDuration',
            'ProductPageRatio', 'Bounce_Exit_Ratio'] + [f'{c}_log' for c in skew_cols]
print(new_cols)
print(f"\n特征工程后数据形状: {df_prep.shape}")
=== 新增特征列表 ===
['TotalPages', 'TotalDuration', 'AvgProductDuration', 'ProductPageRatio', 'Bounce_Exit_Ratio', 'PageValues_log', 'ProductRelated_Duration_log', 'Administrative_Duration_log', 'Informational_Duration_log', 'TotalDuration_log']

特征工程后数据形状: (12330, 28)
In [42]:
# 2.2 特征编码 (One-Hot Encoding)
categorical_cols = ['Month', 'OperatingSystems', 'Browser', 'Region',
                    'TrafficType', 'VisitorType']
df_encoded = pd.get_dummies(df_prep, columns=categorical_cols, drop_first=True)

print(f"编码后数据形状: {df_encoded.shape}")

# 2.3 数据集划分 (分层抽样)
X = df_encoded.drop('Revenue', axis=1)
y = df_encoded['Revenue']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"训练集: X={X_train.shape}, y={y_train.shape}")
print(f"测试集: X={X_test.shape}, y={y_test.shape}")
print(f"\n训练集类别分布:\n{y_train.value_counts()}")
print(f"\n测试集类别分布:\n{y_test.value_counts()}")

# 计算 scale_pos_weight (后续 XGBoost/cost-sensitive 使用)
neg_count = (y_train == 0).sum()
pos_count = (y_train == 1).sum()
scale_pos_weight_val = neg_count / pos_count
print(f"\nscale_pos_weight = {scale_pos_weight_val:.2f}")

# 保存特征名列表 (后续可解释性分析使用)
feature_names = X.columns.tolist()
编码后数据形状: (12330, 79)
训练集: X=(9864, 78), y=(9864,)
测试集: X=(2466, 78), y=(2466,)

训练集类别分布:
Revenue
0    8338
1    1526
Name: count, dtype: int64

测试集类别分布:
Revenue
0    2084
1     382
Name: count, dtype: int64

scale_pos_weight = 5.46

3. 基线建模 + 分层交叉验证¶

使用 imblearn Pipeline 确保 SMOTE 仅在 CV 训练折中应用,杜绝数据泄露。 线性模型使用 StandardScaler,树模型直接训练。

In [43]:
# 3.1 定义统一的交叉验证框架
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

def run_cv(estimator, X, y, cv, name):
    """
    执行交叉验证并返回平均值和标准差。
    """
    results = cross_validate(estimator, X, y, cv=cv,
                             scoring=scoring_metrics, n_jobs=-1)
    row = {'Model': name}
    for metric in scoring_metrics:
        key = f'test_{metric}'
        row[f'{metric}_mean'] = results[key].mean()
        row[f'{metric}_std'] = results[key].std()
    return row
In [44]:
# 3.2 基线模型 (默认参数 + SMOTE Pipeline) 交叉验证
baseline_configs = {
    'Logistic Regression': ImbPipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('model', LogisticRegression(random_state=42, max_iter=1000))
    ]),
    'Decision Tree': ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', DecisionTreeClassifier(random_state=42, max_depth=5))
    ]),
    'Random Forest': ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', RandomForestClassifier(random_state=42, n_estimators=100))
    ]),
    'XGBoost': ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', XGBClassifier(random_state=42, eval_metric='logloss',
                                verbosity=0))
    ]),
    'LightGBM': ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', LGBMClassifier(random_state=42, verbose=-1))
    ]),
}

baseline_results = []
for name, pipe in baseline_configs.items():
    print(f"正在交叉验证: {name} ...")
    row = run_cv(pipe, X_train, y_train, cv, name)
    baseline_results.append(row)

baseline_df = pd.DataFrame(baseline_results).set_index('Model')

# 格式化显示: mean ± std
display_cols = {}
for metric in scoring_metrics:
    display_cols[metric] = baseline_df.apply(
        lambda r: f"{r[f'{metric}_mean']:.4f} ± {r[f'{metric}_std']:.4f}", axis=1
    )
display(pd.DataFrame(display_cols, index=baseline_df.index))
正在交叉验证: Logistic Regression ...
正在交叉验证: Decision Tree ...
正在交叉验证: Random Forest ...
正在交叉验证: XGBoost ...
正在交叉验证: LightGBM ...
accuracy precision recall f1 roc_auc
Model
Logistic Regression 0.8683 ± 0.0037 0.5513 ± 0.0099 0.8034 ± 0.0124 0.6537 ± 0.0033 0.9164 ± 0.0033
Decision Tree 0.8875 ± 0.0060 0.6110 ± 0.0203 0.7556 ± 0.0228 0.6752 ± 0.0117 0.9056 ± 0.0012
Random Forest 0.8940 ± 0.0049 0.6362 ± 0.0162 0.7359 ± 0.0091 0.6824 ± 0.0124 0.9211 ± 0.0034
XGBoost 0.8939 ± 0.0035 0.6575 ± 0.0128 0.6560 ± 0.0295 0.6563 ± 0.0155 0.9212 ± 0.0028
LightGBM 0.8947 ± 0.0036 0.6545 ± 0.0124 0.6763 ± 0.0170 0.6651 ± 0.0120 0.9263 ± 0.0038
In [45]:
# 3.3 基线模型性能可视化
fig, ax = plt.subplots(figsize=(12, 6))
plot_metrics = ['f1_mean', 'roc_auc_mean', 'recall_mean', 'precision_mean']
plot_labels = ['F1-score', 'ROC-AUC', 'Recall', 'Precision']

x = np.arange(len(baseline_df))
width = 0.2
for i, (col, label) in enumerate(zip(plot_metrics, plot_labels)):
    bars = ax.bar(x + i * width, baseline_df[col], width, label=label)
    # 添加 std 误差线
    ax.errorbar(x + i * width, baseline_df[col],
                yerr=baseline_df[col.replace('mean', 'std')],
                fmt='none', ecolor='black', capsize=3)

ax.set_xlabel('Model')
ax.set_ylabel('Score')
ax.set_title('Baseline Models - 5-Fold Stratified CV Performance (with SMOTE)')
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(baseline_df.index, rotation=30, ha='right')
ax.legend(loc='lower right')
ax.set_ylim(0, 1.05)
plt.tight_layout()
plt.show()
No description has been provided for this image

4. 类别不平衡处理方案对比¶

比较三种策略: (1) 无处理 (Baseline)、(2) SMOTE、(3) class_weight / scale_pos_weight。 全部使用交叉验证以保证公平性。

In [46]:
# 4.1 定义三种不平衡处理策略的 Pipeline

def get_imbalance_configs(model_name):
    """
    针对给定模型,返回三种不平衡处理策略。
    """
    configs = {}

    if model_name == 'Logistic Regression':
        configs['No Balancing'] = SkPipeline([
            ('scaler', StandardScaler()),
            ('model', LogisticRegression(random_state=42, max_iter=1000))
        ])
        configs['SMOTE'] = ImbPipeline([
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=42)),
            ('model', LogisticRegression(random_state=42, max_iter=1000))
        ])
        configs['class_weight=balanced'] = SkPipeline([
            ('scaler', StandardScaler()),
            ('model', LogisticRegression(random_state=42, max_iter=1000,
                                         class_weight='balanced'))
        ])

    elif model_name == 'Random Forest':
        configs['No Balancing'] = RandomForestClassifier(
            random_state=42, n_estimators=100
        )
        configs['SMOTE'] = ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('model', RandomForestClassifier(random_state=42, n_estimators=100))
        ])
        configs['class_weight=balanced'] = RandomForestClassifier(
            random_state=42, n_estimators=100, class_weight='balanced'
        )

    elif model_name == 'XGBoost':
        configs['No Balancing'] = XGBClassifier(
            random_state=42, eval_metric='logloss', verbosity=0
        )
        configs['SMOTE'] = ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('model', XGBClassifier(random_state=42, eval_metric='logloss',
                                    verbosity=0))
        ])
        configs['scale_pos_weight'] = XGBClassifier(
            random_state=42, eval_metric='logloss', verbosity=0,
            scale_pos_weight=scale_pos_weight_val
        )

    elif model_name == 'LightGBM':
        configs['No Balancing'] = LGBMClassifier(
            random_state=42, verbose=-1
        )
        configs['SMOTE'] = ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('model', LGBMClassifier(random_state=42, verbose=-1))
        ])
        configs['class_weight=balanced'] = LGBMClassifier(
            random_state=42, verbose=-1, class_weight='balanced'
        )

    return configs
In [47]:
# 4.2 对比实验 —— 对4个代表性模型 × 3种策略进行交叉验证
compare_models = ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM']
imbalance_results = []

for model_name in compare_models:
    configs = get_imbalance_configs(model_name)
    for strategy_name, estimator in configs.items():
        label = f"{model_name} | {strategy_name}"
        print(f"正在验证: {label} ...")
        row = run_cv(estimator, X_train, y_train, cv, label)
        row['Base_Model'] = model_name
        row['Strategy'] = strategy_name
        imbalance_results.append(row)

imb_df = pd.DataFrame(imbalance_results)
正在验证: Logistic Regression | No Balancing ...
正在验证: Logistic Regression | SMOTE ...
正在验证: Logistic Regression | class_weight=balanced ...
正在验证: Random Forest | No Balancing ...
正在验证: Random Forest | SMOTE ...
正在验证: Random Forest | class_weight=balanced ...
正在验证: XGBoost | No Balancing ...
正在验证: XGBoost | SMOTE ...
正在验证: XGBoost | scale_pos_weight ...
正在验证: LightGBM | No Balancing ...
正在验证: LightGBM | SMOTE ...
正在验证: LightGBM | class_weight=balanced ...
In [48]:
# 4.3 不平衡策略对比可视化
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for idx, model_name in enumerate(compare_models):
    ax = axes[idx]
    subset = imb_df[imb_df['Base_Model'] == model_name]

    x = np.arange(len(subset))
    width = 0.25

    ax.bar(x - width, subset['f1_mean'], width, label='F1', color='#66c2a5')
    ax.bar(x, subset['roc_auc_mean'], width, label='ROC-AUC', color='#fc8d62')
    ax.bar(x + width, subset['recall_mean'], width, label='Recall', color='#8da0cb')

    ax.set_title(f'{model_name}')
    ax.set_xticks(x)
    ax.set_xticklabels(subset['Strategy'], rotation=20, ha='right')
    ax.set_ylim(0, 1.05)
    ax.legend(fontsize=9)

plt.suptitle('Imbalance Handling Strategy Comparison (5-Fold CV)', fontsize=14)
plt.tight_layout()
plt.show()

# 找出每个模型的最佳策略
print("\n=== 各模型最佳不平衡处理策略 (按 F1-score) ===")
for model_name in compare_models:
    subset = imb_df[imb_df['Base_Model'] == model_name]
    best = subset.loc[subset['f1_mean'].idxmax()]
    print(f"  {model_name}: {best['Strategy']} (F1 = {best['f1_mean']:.4f})")
No description has been provided for this image
=== 各模型最佳不平衡处理策略 (按 F1-score) ===
  Logistic Regression: SMOTE (F1 = 0.6537)
  Random Forest: SMOTE (F1 = 0.6824)
  XGBoost: scale_pos_weight (F1 = 0.6641)
  LightGBM: class_weight=balanced (F1 = 0.6740)

5. 超参数调优¶

对三个最有潜力的树模型 (Random Forest, XGBoost, LightGBM) 进行 RandomizedSearchCV, 以 F1-score 为优化目标 (因为类别不平衡,优化 accuracy 无意义)。

In [49]:
# 5.1 Random Forest 超参数调优
print("=" * 60)
print("超参数调优: Random Forest (SMOTE Pipeline)")
print("=" * 60)

pipe_rf_tune = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42))
])

param_dist_rf = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2', None]
}

search_rf = RandomizedSearchCV(
    pipe_rf_tune,
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0
)
search_rf.fit(X_train, y_train)

print(f"Best F1 (CV): {search_rf.best_score_:.4f}")
print(f"Best Params: {search_rf.best_params_}")
============================================================
超参数调优: Random Forest (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6854
Best Params: {'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__min_samples_leaf': 4, 'model__max_features': 'log2', 'model__max_depth': None}
In [50]:
# 5.2 XGBoost 超参数调优
print("=" * 60)
print("超参数调优: XGBoost (SMOTE Pipeline)")
print("=" * 60)

pipe_xgb_tune = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(random_state=42, eval_metric='logloss', verbosity=0))
])

param_dist_xgb = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [3, 5, 7, 10],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__min_child_weight': [1, 3, 5, 7],
    'model__gamma': [0, 0.1, 0.2, 0.3]
}

search_xgb = RandomizedSearchCV(
    pipe_xgb_tune,
    param_distributions=param_dist_xgb,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0
)
search_xgb.fit(X_train, y_train)

print(f"Best F1 (CV): {search_xgb.best_score_:.4f}")
print(f"Best Params: {search_xgb.best_params_}")
============================================================
超参数调优: XGBoost (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6871
Best Params: {'model__subsample': 0.9, 'model__n_estimators': 100, 'model__min_child_weight': 3, 'model__max_depth': 3, 'model__learning_rate': 0.1, 'model__gamma': 0, 'model__colsample_bytree': 0.9}
In [52]:
# 5.3 LightGBM 超参数调优
print("=" * 60)
print("超参数调优: LightGBM (SMOTE Pipeline)")
print("=" * 60)

pipe_lgbm_tune = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', LGBMClassifier(random_state=42, verbose=-1))
])

param_dist_lgbm = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [3, 5, 7, 10, -1],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__num_leaves': [15, 31, 63, 127],
    'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__min_child_samples': [5, 10, 20, 50],
    'model__reg_alpha': [0, 0.01, 0.1, 1],
    'model__reg_lambda': [0, 0.01, 0.1, 1]
}

search_lgbm = RandomizedSearchCV(
    pipe_lgbm_tune,
    param_distributions=param_dist_lgbm,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0
)
search_lgbm.fit(X_train, y_train)

print(f"Best F1 (CV): {search_lgbm.best_score_:.4f}")
print(f"Best Params: {search_lgbm.best_params_}")
============================================================
超参数调优: LightGBM (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6850
Best Params: {'model__subsample': 1.0, 'model__reg_lambda': 1, 'model__reg_alpha': 0.01, 'model__num_leaves': 63, 'model__n_estimators': 300, 'model__min_child_samples': 10, 'model__max_depth': 10, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.7}
In [53]:
# 5.4 调参前后对比
print("\n=== 超参数调优前后 F1-score 对比 (5-Fold CV) ===")

tuning_comparison = pd.DataFrame({
    'Model': ['Random Forest', 'XGBoost', 'LightGBM'],
    'Before Tuning (F1)': [
        baseline_df.loc['Random Forest', 'f1_mean'],
        baseline_df.loc['XGBoost', 'f1_mean'],
        baseline_df.loc['LightGBM', 'f1_mean']
    ],
    'After Tuning (F1)': [
        search_rf.best_score_,
        search_xgb.best_score_,
        search_lgbm.best_score_
    ]
}).set_index('Model')

tuning_comparison['Improvement'] = (
    tuning_comparison['After Tuning (F1)'] - tuning_comparison['Before Tuning (F1)']
)

display(tuning_comparison.round(4))

# 选出最优模型
best_searches = {
    'Random Forest': search_rf,
    'XGBoost': search_xgb,
    'LightGBM': search_lgbm
}
best_model_name = tuning_comparison['After Tuning (F1)'].idxmax()
best_search = best_searches[best_model_name]
print(f"\n>>> 最优模型: {best_model_name} (Tuned F1 = {best_search.best_score_:.4f})")
=== 超参数调优前后 F1-score 对比 (5-Fold CV) ===
Before Tuning (F1) After Tuning (F1) Improvement
Model
Random Forest 0.6824 0.6854 0.0030
XGBoost 0.6563 0.6871 0.0308
LightGBM 0.6651 0.6850 0.0199
>>> 最优模型: XGBoost (Tuned F1 = 0.6871)

6. 阈值优化¶

默认阈值 0.5 不一定最优。通过 Precision-Recall Curve 找到使 F1 最大化的阈值。 同时展示不同阈值对 Precision/Recall/F1 的影响曲线。

In [54]:
# 6.1 使用最优调参模型在测试集上获取概率预测
best_pipeline = best_search.best_estimator_
y_proba_best = best_pipeline.predict_proba(X_test)[:, 1]

# 6.2 计算各阈值下的 Precision, Recall, F1
precisions_curve, recalls_curve, thresholds_pr = precision_recall_curve(y_test, y_proba_best)
f1_scores_curve = 2 * (precisions_curve * recalls_curve) / (precisions_curve + recalls_curve + 1e-10)

# 最优阈值 (最大化 F1)
best_threshold_idx = np.argmax(f1_scores_curve[:-1])  # 最后一个点可能无意义
best_threshold = thresholds_pr[best_threshold_idx]
best_f1_at_threshold = f1_scores_curve[best_threshold_idx]

print(f"默认阈值 0.5 的 F1: {f1_score(y_test, (y_proba_best >= 0.5).astype(int)):.4f}")
print(f"最优阈值: {best_threshold:.4f}")
print(f"最优阈值下的 F1: {best_f1_at_threshold:.4f}")
默认阈值 0.5 的 F1: 0.6723
最优阈值: 0.5889
最优阈值下的 F1: 0.6750
In [55]:
# 6.3 阈值 vs Precision / Recall / F1 曲线
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# 左图: Threshold vs Metrics
axes[0].plot(thresholds_pr, precisions_curve[:-1], label='Precision', linewidth=2)
axes[0].plot(thresholds_pr, recalls_curve[:-1], label='Recall', linewidth=2)
axes[0].plot(thresholds_pr, f1_scores_curve[:-1], label='F1-score', linewidth=2)
axes[0].axvline(x=best_threshold, color='red', linestyle='--',
                label=f'Best Threshold = {best_threshold:.3f}')
axes[0].axvline(x=0.5, color='gray', linestyle=':', label='Default 0.5')
axes[0].set_xlabel('Decision Threshold')
axes[0].set_ylabel('Score')
axes[0].set_title('Threshold Optimization: Precision / Recall / F1')
axes[0].legend()

# 右图: Precision-Recall Curve
axes[1].plot(recalls_curve, precisions_curve, linewidth=2)
axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')
ap = average_precision_score(y_test, y_proba_best)
axes[1].set_title(f'Precision-Recall Curve (AP = {ap:.3f})')
axes[1].fill_between(recalls_curve, precisions_curve, alpha=0.2)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [56]:
# 6.4 对比默认阈值 vs 最优阈值
y_pred_default = (y_proba_best >= 0.5).astype(int)
y_pred_optimal = (y_proba_best >= best_threshold).astype(int)

print("=== 默认阈值 (0.5) ===")
print(classification_report(y_test, y_pred_default, digits=4))

print(f"\n=== 最优阈值 ({best_threshold:.4f}) ===")
print(classification_report(y_test, y_pred_optimal, digits=4))

# 混淆矩阵对比
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

ConfusionMatrixDisplay.from_predictions(y_test, y_pred_default, ax=axes[0],
                                         cmap='Blues', colorbar=False)
axes[0].set_title('Confusion Matrix (Threshold = 0.5)')

ConfusionMatrixDisplay.from_predictions(y_test, y_pred_optimal, ax=axes[1],
                                         cmap='Oranges', colorbar=False)
axes[1].set_title(f'Confusion Matrix (Threshold = {best_threshold:.3f})')

plt.tight_layout()
plt.show()
=== 默认阈值 (0.5) ===
              precision    recall  f1-score   support

           0     0.9485    0.9199    0.9340      2084
           1     0.6247    0.7277    0.6723       382

    accuracy                         0.8901      2466
   macro avg     0.7866    0.8238    0.8031      2466
weighted avg     0.8984    0.8901    0.8934      2466


=== 最优阈值 (0.5889) ===
              precision    recall  f1-score   support

           0     0.9449    0.9299    0.9374      2084
           1     0.6482    0.7042    0.6750       382

    accuracy                         0.8950      2466
   macro avg     0.7965    0.8171    0.8062      2466
weighted avg     0.8989    0.8950    0.8967      2466

No description has been provided for this image

7. 最终模型全面评估¶

包括 ROC Curve 和 Precision-Recall Curve 对比所有调优后/基线模型, 以及最优模型的详细分类报告。

In [57]:
# 7.1 训练所有最终模型并收集预测概率

# 用 SMOTE 处理训练集 (供单独模型训练使用)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# 为线性模型准备标准化数据
scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# 最终模型集合 (包含调优后参数)
final_models = {}

# Logistic Regression (使用标准化数据)
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_res_scaled, y_train_res)
final_models['Logistic Regression'] = {
    'model': lr_model,
    'y_proba': lr_model.predict_proba(X_test_scaled)[:, 1]
}

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)
dt_model.fit(X_train_res, y_train_res)
final_models['Decision Tree'] = {
    'model': dt_model,
    'y_proba': dt_model.predict_proba(X_test.values)[:, 1]
}

# Random Forest (调优后)
rf_best_params = {k.replace('model__', ''): v
                  for k, v in search_rf.best_params_.items()}
rf_tuned = RandomForestClassifier(**rf_best_params, random_state=42)
rf_tuned.fit(X_train_res, y_train_res)
final_models['Random Forest (Tuned)'] = {
    'model': rf_tuned,
    'y_proba': rf_tuned.predict_proba(X_test.values)[:, 1]
}

# XGBoost (调优后)
xgb_best_params = {k.replace('model__', ''): v
                   for k, v in search_xgb.best_params_.items()}
xgb_tuned = XGBClassifier(**xgb_best_params, random_state=42,
                          eval_metric='logloss', verbosity=0)
xgb_tuned.fit(X_train_res, y_train_res)
final_models['XGBoost (Tuned)'] = {
    'model': xgb_tuned,
    'y_proba': xgb_tuned.predict_proba(X_test.values)[:, 1]
}

# LightGBM (调优后)
lgbm_best_params = {k.replace('model__', ''): v
                    for k, v in search_lgbm.best_params_.items()}
lgbm_tuned = LGBMClassifier(**lgbm_best_params, random_state=42, verbose=-1)
lgbm_tuned.fit(X_train_res, y_train_res)
final_models['LightGBM (Tuned)'] = {
    'model': lgbm_tuned,
    'y_proba': lgbm_tuned.predict_proba(X_test.values)[:, 1]
}
In [58]:
# 7.2 ROC Curve 对比 (所有模型)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# 左图: ROC Curve
for name, info in final_models.items():
    fpr, tpr, _ = roc_curve(y_test, info['y_proba'])
    auc_val = roc_auc_score(y_test, info['y_proba'])
    axes[0].plot(fpr, tpr, linewidth=2, label=f"{name} (AUC={auc_val:.3f})")

axes[0].plot([0, 1], [0, 1], 'k--', alpha=0.5)
axes[0].set_xlabel('False Positive Rate')
axes[0].set_ylabel('True Positive Rate')
axes[0].set_title('ROC Curve Comparison')
axes[0].legend(fontsize=9)

# 右图: Precision-Recall Curve
for name, info in final_models.items():
    prec_c, rec_c, _ = precision_recall_curve(y_test, info['y_proba'])
    ap_val = average_precision_score(y_test, info['y_proba'])
    axes[1].plot(rec_c, prec_c, linewidth=2, label=f"{name} (AP={ap_val:.3f})")

axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')
axes[1].set_title('Precision-Recall Curve Comparison')
axes[1].legend(fontsize=9)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [59]:
# 7.3 最终模型对比汇总表 (使用最优阈值)
final_results = []
for name, info in final_models.items():
    y_proba = info['y_proba']
    # 使用最优阈值 (基于 F1)
    prec_c, rec_c, thr_c = precision_recall_curve(y_test, y_proba)
    f1_c = 2 * (prec_c * rec_c) / (prec_c + rec_c + 1e-10)
    opt_idx = np.argmax(f1_c[:-1])
    opt_thr = thr_c[opt_idx]

    y_pred = (y_proba >= opt_thr).astype(int)

    final_results.append({
        'Model': name,
        'Optimal Threshold': opt_thr,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_proba),
        'Avg Precision': average_precision_score(y_test, y_proba)
    })

final_results_df = pd.DataFrame(final_results).set_index('Model')
display(final_results_df.round(4))

# 找出综合最优模型
overall_best_name = final_results_df['F1-score'].idxmax()
print(f"\n>>> 综合最优模型: {overall_best_name}")
print(f"    F1 = {final_results_df.loc[overall_best_name, 'F1-score']:.4f}")
print(f"    ROC-AUC = {final_results_df.loc[overall_best_name, 'ROC-AUC']:.4f}")
Optimal Threshold Accuracy Precision Recall F1-score ROC-AUC Avg Precision
Model
Logistic Regression 0.4372 0.8816 0.6098 0.6545 0.6313 0.8887 0.6177
Decision Tree 0.6044 0.8881 0.6157 0.7382 0.6714 0.9108 0.6308
Random Forest (Tuned) 0.5966 0.8958 0.6574 0.6832 0.6701 0.9189 0.6926
XGBoost (Tuned) 0.5889 0.8950 0.6482 0.7042 0.6750 0.9235 0.6951
LightGBM (Tuned) 0.4827 0.8897 0.6239 0.7251 0.6707 0.9299 0.7202
>>> 综合最优模型: XGBoost (Tuned)
    F1 = 0.6750
    ROC-AUC = 0.9235

8. 模型可解释性分析¶

使用 SHAP Values、Permutation Importance 和 Partial Dependence Plot (PDP) 多角度打开机器学习"黑盒"。

In [60]:
# 选取表现最好的树模型进行可解释性分析
# (使用 RF Tuned 作为演示,因为 SHAP TreeExplainer 对其支持最好)
explain_model = rf_tuned
explain_model_name = "Random Forest (Tuned)"

print(f"可解释性分析模型: {explain_model_name}")
可解释性分析模型: Random Forest (Tuned)
In [61]:
# 8.1 SHAP Values 分析
# 取测试集样本 (控制计算量)
n_shap_samples = 200
X_test_sample = X_test.iloc[:n_shap_samples].copy()

explainer = shap.TreeExplainer(explain_model)
shap_exp = explainer(X_test_sample)

# 处理二分类可能产生的三维 SHAP values
if isinstance(shap_exp.values, np.ndarray) and len(shap_exp.values.shape) == 3:
    shap_values_pos = shap.Explanation(
        values=shap_exp.values[:, :, 1],
        base_values=(shap_exp.base_values[:, 1]
                     if len(shap_exp.base_values.shape) > 1
                     else shap_exp.base_values),
        data=shap_exp.data,
        feature_names=feature_names
    )
else:
    shap_values_pos = shap_exp

# SHAP Beeswarm Plot (全局特征重要性 + 方向)
print("=== SHAP Beeswarm Plot (Top 20 Features) ===")
shap.plots.beeswarm(shap_values_pos, max_display=20, show=True)
=== SHAP Beeswarm Plot (Top 20 Features) ===
No description has been provided for this image
In [62]:
# SHAP Bar Plot (平均绝对 SHAP 值)
print("=== SHAP Bar Plot (Mean |SHAP|) ===")
shap.plots.bar(shap_values_pos, max_display=20, show=True)
=== SHAP Bar Plot (Mean |SHAP|) ===
No description has been provided for this image
In [63]:
# SHAP Dependence Plot (Top 2 特征的详细交互)
top_features_idx = np.argsort(-np.abs(shap_values_pos.values).mean(axis=0))[:2]
top_feature_names = [feature_names[i] for i in top_features_idx]

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
for i, feat in enumerate(top_feature_names):
    shap.plots.scatter(shap_values_pos[:, feat], ax=axes[i], show=False)
    axes[i].set_title(f'SHAP Dependence: {feat}')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [64]:
# 8.2 Permutation Importance (模型无关的特征重要性评估)
print("=== Permutation Importance (10 repeats) ===")
perm_result = permutation_importance(
    explain_model, X_test, y_test,
    n_repeats=10, random_state=42, scoring='f1', n_jobs=-1
)

perm_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance_mean': perm_result.importances_mean,
    'Importance_std': perm_result.importances_std
}).sort_values('Importance_mean', ascending=False)

# 可视化 Top 15
fig, ax = plt.subplots(figsize=(10, 7))
top_perm = perm_df.head(15)
ax.barh(range(len(top_perm)), top_perm['Importance_mean'],
        xerr=top_perm['Importance_std'], color='steelblue', edgecolor='black')
ax.set_yticks(range(len(top_perm)))
ax.set_yticklabels(top_perm['Feature'])
ax.invert_yaxis()
ax.set_xlabel('Mean Permutation Importance (F1 decrease)')
ax.set_title('Top 15 Permutation Importance')
plt.tight_layout()
plt.show()
=== Permutation Importance (10 repeats) ===
No description has been provided for this image
In [65]:
# 8.3 SHAP vs Permutation Importance 交叉验证
print("\n=== SHAP 与 Permutation Importance Top 10 对比 ===")

shap_importance = pd.Series(
    np.abs(shap_values_pos.values).mean(axis=0),
    index=feature_names
).sort_values(ascending=False)

perm_importance = perm_df.set_index('Feature')['Importance_mean']

comparison_top10 = pd.DataFrame({
    'SHAP Rank': range(1, 11),
    'SHAP Feature': shap_importance.head(10).index.tolist(),
    'Perm Rank': range(1, 11),
    'Perm Feature': perm_importance.sort_values(ascending=False).head(10).index.tolist()
})
display(comparison_top10)
=== SHAP 与 Permutation Importance Top 10 对比 ===
SHAP Rank SHAP Feature Perm Rank Perm Feature
0 1 PageValues 1 PageValues
1 2 PageValues_log 2 PageValues_log
2 3 Month_Nov 3 Month_Nov
3 4 TrafficType_2 4 Administrative
4 5 ExitRates 5 Month_Mar
5 6 OperatingSystems_2 6 ProductRelated_Duration
6 7 Administrative_Duration 7 Informational
7 8 Administrative_Duration_log 8 ProductRelated_Duration_log
8 9 ProductPageRatio 9 TotalPages
9 10 Administrative 10 Administrative_Duration_log
In [66]:
# 8.4 Partial Dependence Plot (PDP) —— 展示特征边际效应
print("=== Partial Dependence Plots ===")

# 选取 SHAP 重要性最高的 4 个连续特征
top_continuous_features = []
continuous_original = ['PageValues', 'ExitRates', 'BounceRates',
                       'ProductRelated_Duration', 'ProductRelated',
                       'TotalDuration', 'TotalPages', 'PageValues_log']
for feat in shap_importance.index:
    if feat in continuous_original:
        top_continuous_features.append(feat)
    if len(top_continuous_features) >= 4:
        break

fig, axes = plt.subplots(1, len(top_continuous_features),
                         figsize=(5 * len(top_continuous_features), 4))
if len(top_continuous_features) == 1:
    axes = [axes]

PartialDependenceDisplay.from_estimator(
    explain_model, X_test, top_continuous_features,
    ax=axes, grid_resolution=50
)

plt.suptitle('Partial Dependence Plots (PDP)', fontsize=14, y=1.05)
plt.tight_layout()
plt.show()
=== Partial Dependence Plots ===
No description has been provided for this image

9. 高级诊断分析¶

包括 校准曲线 (Calibration Curve)、学习曲线 (Learning Curve)、 错误分析 (Error Analysis) 和 成本敏感分析 (Cost-Sensitive Analysis)。

In [67]:
# 9.1 校准曲线 (Calibration Curve)
# 检验模型输出概率的可靠性
print("=== 校准曲线 (Calibration Curve) ===")

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot([0, 1], [0, 1], 'k--', label='Perfectly Calibrated')

for name, info in final_models.items():
    prob_true, prob_pred = calibration_curve(y_test, info['y_proba'], n_bins=10)
    ax.plot(prob_pred, prob_true, marker='o', linewidth=2, label=name)

ax.set_xlabel('Mean Predicted Probability')
ax.set_ylabel('Fraction of Positives')
ax.set_title('Calibration Curve (Reliability Diagram)')
ax.legend(fontsize=9, loc='lower right')
plt.tight_layout()
plt.show()
=== 校准曲线 (Calibration Curve) ===
No description has been provided for this image
In [68]:
# 9.2 学习曲线 (Learning Curve) —— 检测过拟合/欠拟合
print("=== 学习曲线 (Learning Curve) ===")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
lc_models = {
    'Random Forest (Tuned)': rf_tuned,
    'XGBoost (Tuned)': xgb_tuned,
    'LightGBM (Tuned)': lgbm_tuned
}

for idx, (name, model) in enumerate(lc_models.items()):
    train_sizes, train_scores, val_scores = learning_curve(
        model, X_train_res, y_train_res,
        cv=5, scoring='f1',
        train_sizes=np.linspace(0.2, 1.0, 5),
        n_jobs=-1, random_state=42
    )

    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)

    axes[idx].fill_between(train_sizes, train_mean - train_std,
                           train_mean + train_std, alpha=0.1, color='blue')
    axes[idx].fill_between(train_sizes, val_mean - val_std,
                           val_mean + val_std, alpha=0.1, color='orange')
    axes[idx].plot(train_sizes, train_mean, 'o-', color='blue',
                   label='Training F1')
    axes[idx].plot(train_sizes, val_mean, 'o-', color='orange',
                   label='Validation F1')
    axes[idx].set_title(f'Learning Curve: {name}')
    axes[idx].set_xlabel('Training Set Size')
    axes[idx].set_ylabel('F1 Score')
    axes[idx].legend(loc='lower right')
    axes[idx].set_ylim(0, 1.1)

plt.tight_layout()
plt.show()
=== 学习曲线 (Learning Curve) ===
No description has been provided for this image
In [69]:
# 9.3 错误分析 (Error Analysis) —— 分析哪些样本容易被误判

# 使用综合最优模型
best_final_model = final_models[overall_best_name]
y_proba_final = best_final_model['y_proba']

# 使用最优阈值
prec_c, rec_c, thr_c = precision_recall_curve(y_test, y_proba_final)
f1_c = 2 * (prec_c * rec_c) / (prec_c + rec_c + 1e-10)
final_threshold = thr_c[np.argmax(f1_c[:-1])]
y_pred_final = (y_proba_final >= final_threshold).astype(int)

# 构建错误分析 DataFrame
error_df = df.iloc[y_test.index].copy()
error_df['y_true'] = y_test.values
error_df['y_pred'] = y_pred_final
error_df['y_proba'] = y_proba_final
error_df['correct'] = (error_df['y_true'] == error_df['y_pred'])
error_df['error_type'] = 'Correct'
error_df.loc[(error_df['y_true'] == 1) & (error_df['y_pred'] == 0), 'error_type'] = 'False Negative'
error_df.loc[(error_df['y_true'] == 0) & (error_df['y_pred'] == 1), 'error_type'] = 'False Positive'

print("=== 错误类型分布 ===")
print(error_df['error_type'].value_counts())
=== 错误类型分布 ===
error_type
Correct           2207
False Positive     146
False Negative     113
Name: count, dtype: int64
In [70]:
# 9.3a 误判样本的特征分析
print("\n=== False Negative vs True Positive 特征均值对比 ===")
analysis_cols = ['PageValues', 'BounceRates', 'ExitRates',
                 'ProductRelated', 'ProductRelated_Duration']

fn_samples = error_df[error_df['error_type'] == 'False Negative']
tp_samples = error_df[(error_df['y_true'] == 1) & (error_df['y_pred'] == 1)]
fp_samples = error_df[error_df['error_type'] == 'False Positive']
tn_samples = error_df[(error_df['y_true'] == 0) & (error_df['y_pred'] == 0)]

error_analysis = pd.DataFrame({
    'True Positive (Mean)': tp_samples[analysis_cols].mean(),
    'False Negative (Mean)': fn_samples[analysis_cols].mean(),
    'True Negative (Mean)': tn_samples[analysis_cols].mean(),
    'False Positive (Mean)': fp_samples[analysis_cols].mean()
})
display(error_analysis.round(4))
=== False Negative vs True Positive 特征均值对比 ===
True Positive (Mean) False Negative (Mean) True Negative (Mean) False Positive (Mean)
PageValues 34.2793 3.6043 0.4245 24.1529
BounceRates 0.0044 0.0110 0.0254 0.0075
ExitRates 0.0186 0.0293 0.0475 0.0241
ProductRelated 48.3048 60.0708 27.8839 49.1644
ProductRelated_Duration 1949.1567 2218.8107 1011.3520 2000.5623
In [71]:
# 9.3b 各月份/用户类型的误判率
print("\n=== 各月份误判率 (False Negative Rate among actual positives) ===")
actual_positive = error_df[error_df['y_true'] == 1]
monthly_fn_rate = actual_positive.groupby('Month').apply(
    lambda g: (g['error_type'] == 'False Negative').mean()
).sort_values(ascending=False)
print(monthly_fn_rate.round(4))

print("\n=== 各访客类型误判率 ===")
visitor_fn_rate = actual_positive.groupby('VisitorType').apply(
    lambda g: (g['error_type'] == 'False Negative').mean()
).sort_values(ascending=False)
print(visitor_fn_rate.round(4))

# 可视化
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

monthly_fn_rate.plot(kind='bar', ax=axes[0], color='salmon', edgecolor='black')
axes[0].set_title('False Negative Rate by Month')
axes[0].set_ylabel('FN Rate')
axes[0].tick_params(axis='x', rotation=45)

visitor_fn_rate.plot(kind='bar', ax=axes[1], color='lightblue', edgecolor='black')
axes[1].set_title('False Negative Rate by Visitor Type')
axes[1].set_ylabel('FN Rate')

plt.tight_layout()
plt.show()
=== 各月份误判率 (False Negative Rate among actual positives) ===
Month
Aug     0.6000
Jul     0.5500
Nov     0.3678
Sep     0.3636
Oct     0.2353
Dec     0.2188
June    0.1667
May     0.1370
Mar     0.0909
Feb     0.0000
dtype: float64

=== 各访客类型误判率 ===
VisitorType
Returning_Visitor    0.3103
New_Visitor          0.2584
Other                0.0000
dtype: float64
No description has been provided for this image
In [72]:
# 9.4 成本敏感分析 (Cost-Sensitive Business Analysis)
print("=== 成本敏感分析 ===")

# 假设业务成本模型:
# - 发送优惠券成本: ¥5 / 用户
# - 成功促成转化收益: ¥50 / 用户
# - 错过真实购买用户 (False Negative) 机会成本: ¥50
# - 错误发送优惠券 (False Positive) 成本: ¥5

cost_per_fp = 5    # 误发优惠券
gain_per_tp = 50   # 成功转化
loss_per_fn = 50   # 错失客户

def calculate_business_value(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    total_gain = tp * gain_per_tp
    total_cost = fp * cost_per_fp + fn * loss_per_fn
    net_value = total_gain - total_cost
    return {
        'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
        'Total Gain': total_gain,
        'Total Cost': total_cost,
        'Net Business Value': net_value
    }

# 比较不同阈值的商业价值
thresholds_to_compare = [0.3, 0.4, best_threshold, 0.5, 0.6, 0.7]
business_results = []

for thr in thresholds_to_compare:
    y_pred_thr = (y_proba_final >= thr).astype(int)
    bv = calculate_business_value(y_test, y_pred_thr)
    bv['Threshold'] = thr
    business_results.append(bv)

bv_df = pd.DataFrame(business_results).set_index('Threshold')
display(bv_df)

# 可视化商业价值
plt.figure(figsize=(8, 5))
plt.plot(bv_df.index, bv_df['Net Business Value'], 'o-', linewidth=2,
         markersize=8, color='green')
plt.axvline(x=best_threshold, color='red', linestyle='--',
            label=f'F1-Optimal Threshold = {best_threshold:.3f}')
plt.xlabel('Decision Threshold')
plt.ylabel('Net Business Value (¥)')
plt.title('Business Value at Different Thresholds')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

best_bv_threshold = bv_df['Net Business Value'].idxmax()
print(f"\n>>> 商业价值最大化阈值: {best_bv_threshold}")
print(f"    净商业价值: ¥{bv_df.loc[best_bv_threshold, 'Net Business Value']}")
=== 成本敏感分析 ===
TP FP FN TN Total Gain Total Cost Net Business Value
Threshold
0.300000 311 261 71 1823 15550 4855 10695
0.400000 287 194 95 1890 14350 5720 8630
0.588857 269 146 113 1938 13450 6380 7070
0.500000 278 167 104 1917 13900 6035 7865
0.600000 264 144 118 1940 13200 6620 6580
0.700000 237 108 145 1976 11850 7790 4060
No description has been provided for this image
>>> 商业价值最大化阈值: 0.3
    净商业价值: ¥10695

10. 客户分群聚类分析¶

使用 肘部法则 (Elbow Method) 和 轮廓系数 (Silhouette Score) 确定最优聚类数, 进行 K-Means 聚类,使用 PCA 可视化,并做 聚类画像 (Cluster Profiling) 和命名。

In [73]:
# 10.1 聚类特征选择与标准化
cluster_features = ['Administrative', 'Administrative_Duration', 'Informational',
                    'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
                    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

X_cluster = df[cluster_features].copy()

scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)

print(f"聚类数据形状: {X_cluster_scaled.shape}")
聚类数据形状: (12330, 10)
In [74]:
# 10.2 肘部法则 + 轮廓系数确定最优 K
k_range = range(2, 9)
inertias = []
sil_scores = []

for k in k_range:
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_temp = kmeans_temp.fit_predict(X_cluster_scaled)
    inertias.append(kmeans_temp.inertia_)
    sil_scores.append(silhouette_score(X_cluster_scaled, labels_temp))
    print(f"K={k}: Inertia={kmeans_temp.inertia_:.0f}, Silhouette={sil_scores[-1]:.4f}")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
axes[0].set_xlabel('Number of Clusters (k)')
axes[0].set_ylabel('Inertia (Within-Cluster Sum of Squares)')
axes[0].set_title('Elbow Method')

axes[1].plot(k_range, sil_scores, 'ro-', linewidth=2, markersize=8)
axes[1].set_xlabel('Number of Clusters (k)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Score Analysis')

# 标记最优 K
best_k = list(k_range)[np.argmax(sil_scores)]
axes[1].axvline(x=best_k, color='green', linestyle='--',
                label=f'Best k = {best_k}')
axes[1].legend()

plt.tight_layout()
plt.show()

print(f"\n>>> 基于轮廓系数的最优 K = {best_k}")
K=2: Inertia=99672, Silhouette=0.4359
K=3: Inertia=80117, Silhouette=0.4517
K=4: Inertia=70377, Silhouette=0.4304
K=5: Inertia=62615, Silhouette=0.3981
K=6: Inertia=55748, Silhouette=0.4076
K=7: Inertia=49918, Silhouette=0.4079
K=8: Inertia=46254, Silhouette=0.3727
No description has been provided for this image
>>> 基于轮廓系数的最优 K = 3
In [75]:
# 10.3 使用最优 K 执行 KMeans 聚类
kmeans_final = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster_labels = kmeans_final.fit_predict(X_cluster_scaled)

df_clustered = df.copy()
df_clustered['Cluster'] = cluster_labels

print("=== 各聚类簇的样本数量 ===")
print(df_clustered['Cluster'].value_counts().sort_index())

print("\n=== 各聚类簇的购买转化率 ===")
print(df_clustered.groupby('Cluster')['Revenue'].mean().round(4))
=== 各聚类簇的样本数量 ===
Cluster
0    9652
1    1628
2    1050
Name: count, dtype: int64

=== 各聚类簇的购买转化率 ===
Cluster
0    0.1502
1    0.2776
2    0.0057
Name: Revenue, dtype: float64
In [76]:
# 10.4 PCA 降维可视化
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster_scaled)

print(f"PCA 解释方差比: {pca.explained_variance_ratio_.round(4)}")
print(f"PCA 累计解释方差: {pca.explained_variance_ratio_.sum():.4f}")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# 按聚类标签着色
scatter1 = axes[0].scatter(X_pca[:, 0], X_pca[:, 1],
                           c=cluster_labels, cmap='viridis',
                           alpha=0.5, s=15)
axes[0].set_xlabel('PCA Component 1')
axes[0].set_ylabel('PCA Component 2')
axes[0].set_title(f'K-Means Clustering (K={best_k}) - PCA Projection')
plt.colorbar(scatter1, ax=axes[0], label='Cluster')

# 按实际 Revenue 着色 (对照)
scatter2 = axes[1].scatter(X_pca[:, 0], X_pca[:, 1],
                           c=df['Revenue'].astype(int), cmap='coolwarm',
                           alpha=0.5, s=15)
axes[1].set_xlabel('PCA Component 1')
axes[1].set_ylabel('PCA Component 2')
axes[1].set_title('Actual Revenue Labels - PCA Projection')
plt.colorbar(scatter2, ax=axes[1], label='Revenue')

plt.tight_layout()
plt.show()
PCA 解释方差比: [0.34   0.1675]
PCA 累计解释方差: 0.5076
No description has been provided for this image
In [77]:
# 10.5 聚类画像分析 (Cluster Profiling)
print("=== 详细聚类画像 (各簇特征均值) ===")

profile_features = cluster_features + ['Revenue']
cluster_profile = df_clustered.groupby('Cluster')[profile_features].agg(['mean', 'median'])
display(cluster_profile.round(4))

# 简化版画像 (均值)
profile_mean = df_clustered.groupby('Cluster')[profile_features].mean()
display(profile_mean.round(4))

# 10.5a 聚类画像热力图
fig, ax = plt.subplots(figsize=(14, best_k + 2))
profile_norm = profile_mean.copy()
for col in profile_norm.columns:
    col_min = profile_norm[col].min()
    col_max = profile_norm[col].max()
    if col_max > col_min:
        profile_norm[col] = (profile_norm[col] - col_min) / (col_max - col_min)
    else:
        profile_norm[col] = 0

sns.heatmap(profile_norm, annot=profile_mean.round(2).values,
            cmap='YlOrRd', fmt='', linewidths=1, ax=ax)
ax.set_title('Cluster Profile Heatmap (Normalized, Values = Actual Means)')
ax.set_ylabel('Cluster')
plt.tight_layout()
plt.show()
=== 详细聚类画像 (各簇特征均值) ===
Administrative Administrative_Duration Informational Informational_Duration ProductRelated ... BounceRates ExitRates PageValues SpecialDay Revenue
mean median mean median mean median mean median mean median ... mean median mean median mean median mean median mean median
Cluster
0 1.7090 1.0 51.1829 4.0000 0.2165 0.0 8.1459 0.0000 23.8882 17.0 ... 0.0092 0.0000 0.0321 0.0250 5.8774 0.0000 0.0617 0.0 0.1502 0.0
1 7.3710 7.0 307.7312 197.9757 2.5203 2.0 212.7327 93.2083 96.7439 75.0 ... 0.0063 0.0041 0.0194 0.0175 9.7577 1.8372 0.0301 0.0 0.2776 0.0
2 0.0486 0.0 1.4189 0.0000 0.0152 0.0 0.0869 0.0000 3.0295 1.0 ... 0.1665 0.2000 0.1805 0.2000 0.0000 0.0000 0.1074 0.0 0.0057 0.0

3 rows × 22 columns

Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay Revenue
Cluster
0 1.7090 51.1829 0.2165 8.1459 23.8882 873.8026 0.0092 0.0321 5.8774 0.0617 0.1502
1 7.3710 307.7312 2.5203 212.7327 96.7439 3831.1453 0.0063 0.0194 9.7577 0.0301 0.2776
2 0.0486 1.4189 0.0152 0.0869 3.0295 57.3080 0.1665 0.1805 0.0000 0.1074 0.0057
No description has been provided for this image
In [78]:
# 10.5b 聚类雷达图 (Spider Chart)
from matplotlib.patches import FancyBboxPatch

radar_features = ['Administrative', 'ProductRelated', 'BounceRates',
                  'ExitRates', 'PageValues', 'Revenue']

# 标准化到 0-1 范围用于雷达图
radar_data = profile_mean[radar_features].copy()
for col in radar_data.columns:
    col_min = radar_data[col].min()
    col_max = radar_data[col].max()
    if col_max > col_min:
        radar_data[col] = (radar_data[col] - col_min) / (col_max - col_min)
    else:
        radar_data[col] = 0

angles = np.linspace(0, 2 * np.pi, len(radar_features), endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
colors = plt.cm.Set2(np.linspace(0, 1, best_k))

for i in range(best_k):
    values = radar_data.iloc[i].tolist()
    values += values[:1]
    ax.plot(angles, values, 'o-', linewidth=2, label=f'Cluster {i}', color=colors[i])
    ax.fill(angles, values, alpha=0.15, color=colors[i])

ax.set_xticks(angles[:-1])
ax.set_xticklabels(radar_features, fontsize=10)
ax.set_title('Cluster Profile Radar Chart', fontsize=14, pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.tight_layout()
plt.show()
No description has been provided for this image
In [79]:
# 10.6 聚类命名与业务解读
print("=== 聚类命名与业务解读 ===\n")

# 根据画像自动生成聚类描述
for cluster_id in range(best_k):
    p = profile_mean.loc[cluster_id]
    conv_rate = p['Revenue']
    page_val = p['PageValues']
    bounce = p['BounceRates']
    exit_r = p['ExitRates']
    prod_pages = p['ProductRelated']
    prod_dur = p['ProductRelated_Duration']
    count = (df_clustered['Cluster'] == cluster_id).sum()

    print(f"--- Cluster {cluster_id} ({count} users, {count/len(df_clustered)*100:.1f}%) ---")
    print(f"  Conversion Rate: {conv_rate:.2%}")
    print(f"  Avg PageValues: {page_val:.2f}")
    print(f"  Avg BounceRate: {bounce:.4f}")
    print(f"  Avg ExitRate: {exit_r:.4f}")
    print(f"  Avg ProductRelated Pages: {prod_pages:.1f}")
    print(f"  Avg ProductRelated Duration: {prod_dur:.1f}s")

    # 自动命名逻辑
    if conv_rate > 0.20 and page_val > 10:
        print(f"  >>> 命名: High-Intent Purchasers (高意向购买者)")
    elif bounce > 0.03 or exit_r > 0.05:
        print(f"  >>> 命名: Low-Engagement Bouncers (低参与度跳出用户)")
    elif prod_pages > 30:
        print(f"  >>> 命名: Active Browsers (活跃浏览用户)")
    else:
        print(f"  >>> 命名: Casual Visitors (普通访客)")
    print()
=== 聚类命名与业务解读 ===

--- Cluster 0 (9652 users, 78.3%) ---
  Conversion Rate: 15.02%
  Avg PageValues: 5.88
  Avg BounceRate: 0.0092
  Avg ExitRate: 0.0321
  Avg ProductRelated Pages: 23.9
  Avg ProductRelated Duration: 873.8s
  >>> 命名: Casual Visitors (普通访客)

--- Cluster 1 (1628 users, 13.2%) ---
  Conversion Rate: 27.76%
  Avg PageValues: 9.76
  Avg BounceRate: 0.0063
  Avg ExitRate: 0.0194
  Avg ProductRelated Pages: 96.7
  Avg ProductRelated Duration: 3831.1s
  >>> 命名: Active Browsers (活跃浏览用户)

--- Cluster 2 (1050 users, 8.5%) ---
  Conversion Rate: 0.57%
  Avg PageValues: 0.00
  Avg BounceRate: 0.1665
  Avg ExitRate: 0.1805
  Avg ProductRelated Pages: 3.0
  Avg ProductRelated Duration: 57.3s
  >>> 命名: Low-Engagement Bouncers (低参与度跳出用户)

11. 特征工程效果验证¶

对比"原始特征"与"特征工程后"的模型性能,验证特征工程的价值。

In [80]:
# 11.1 准备无特征工程版本的数据
df_no_fe = df.copy()
df_no_fe['Revenue'] = df_no_fe['Revenue'].astype(int)
df_no_fe['Weekend'] = df_no_fe['Weekend'].astype(int)
df_no_fe = pd.get_dummies(df_no_fe, columns=categorical_cols, drop_first=True)

X_no_fe = df_no_fe.drop('Revenue', axis=1)
y_no_fe = df_no_fe['Revenue']

X_train_no_fe, X_test_no_fe, y_train_no_fe, y_test_no_fe = train_test_split(
    X_no_fe, y_no_fe, test_size=0.2, random_state=42, stratify=y_no_fe
)

# 使用同一个 RF 模型对比
pipe_rf_no_fe = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42, n_estimators=200))
])

pipe_rf_with_fe = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42, n_estimators=200))
])

# 交叉验证
print("=== 无特征工程 ===")
result_no_fe = run_cv(pipe_rf_no_fe, X_train_no_fe, y_train_no_fe, cv, "RF (No FE)")

print("=== 有特征工程 ===")
result_with_fe = run_cv(pipe_rf_with_fe, X_train, y_train, cv, "RF (With FE)")

fe_compare = pd.DataFrame([result_no_fe, result_with_fe]).set_index('Model')
display_fe = {}
for metric in scoring_metrics:
    display_fe[metric] = fe_compare.apply(
        lambda r: f"{r[f'{metric}_mean']:.4f} ± {r[f'{metric}_std']:.4f}", axis=1
    )
print("\n=== 特征工程前后对比 ===")
display(pd.DataFrame(display_fe, index=fe_compare.index))
=== 无特征工程 ===
=== 有特征工程 ===

=== 特征工程前后对比 ===
accuracy precision recall f1 roc_auc
Model
RF (No FE) 0.8969 ± 0.0056 0.6556 ± 0.0243 0.7064 ± 0.0158 0.6796 ± 0.0116 0.9240 ± 0.0030
RF (With FE) 0.8937 ± 0.0040 0.6338 ± 0.0134 0.7412 ± 0.0118 0.6832 ± 0.0100 0.9229 ± 0.0033

12. 结论与部署建议 (Deployment Recommendations)¶

核心结论¶

1. 最重要的特征 (Model Interpretability)

通过 SHAP 分析、Permutation Importance 和 PDP 三重验证发现:

  • PageValues (页面价值) 是影响购买意向的最核心正向指标
  • ExitRates / BounceRates 高则强烈暗示用户不会购买
  • Month_Nov (11月份) 等季节性特征对结果影响显著
  • 特征工程构造的 TotalDuration、ProductPageRatio 等特征提供了额外的预测信息

2. 模型表现 (Model Performance)

  • 集成树模型 (Random Forest, XGBoost, LightGBM) 全面优于 Logistic Regression 和 Decision Tree
  • 经 RandomizedSearchCV 调参后模型性能进一步提升
  • SMOTE 和 class_weight 策略各有优势,但通过交叉验证确认了各模型的最优策略
  • 阈值优化从默认 0.5 调整后,显著提升了 F1-score

3. 模型可靠性 (Model Reliability)

  • 5-fold 交叉验证确认模型结果稳健,非偶然
  • 校准曲线显示树模型概率输出基本可靠
  • 学习曲线表明模型无严重过拟合或欠拟合
  • 错误分析揭示了模型在特定月份/用户类型上的薄弱环节

4. 用户分层 (Customer Segmentation)

  • 通过 Elbow + Silhouette Score 确定了最优聚类数
  • 聚类画像成功识别出不同参与度的用户群体
  • 高活跃度/高 PageValues 的簇具有显著更高的购买转化率

业务部署建议¶

实时干预系统: 基于最优树模型建立实时预测 API,使用成本敏感分析 确定的最优阈值进行决策。当预测概率处于犹豫区间时,触发客服弹窗 或限时优惠券以促成转化。

页面优化: 深度研究高 PageValues 页面的设计特征, 降低全局 ExitRates 和 BounceRates。

精准营销: 利用聚类结果对用户分层,针对不同群体采取差异化营销策略。 对 High-Intent 群体推送高价值商品推荐,对 Low-Engagement 群体 尝试重定向广告。

季节性策略: 在 11 月等高峰期前提前部署资源和广告预算。 关注错误分析中发现的高误判月份,在这些时段适当调低阈值以 减少漏判。

In [81]:
print("=" * 60)
print("分析完成!")
print("=" * 60)
print(f"最终推荐模型: {overall_best_name}")
print(f"推荐决策阈值 (F1最优): {final_threshold:.4f}")
print(f"推荐决策阈值 (商业价值最优): {best_bv_threshold}")
print(f"最优聚类数: {best_k}")
============================================================
分析完成!
============================================================
最终推荐模型: XGBoost (Tuned)
推荐决策阈值 (F1最优): 0.5889
推荐决策阈值 (商业价值最优): 0.3
最优聚类数: 3
In [ ]: