# 基础数据处理与可视化库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# 机器学习预处理与模型选择
from sklearn.model_selection import (
train_test_split, StratifiedKFold, cross_validate,
RandomizedSearchCV, learning_curve
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
# 机器学习模型
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# 评估指标
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, confusion_matrix, classification_report,
roc_curve, precision_recall_curve, average_precision_score,
ConfusionMatrixDisplay
)
# 校准曲线
from sklearn.calibration import calibration_curve
# 模型可解释性
import shap
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
# 聚类与降维
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# 忽略警告
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Global plotting defaults: whitegrid theme, Set2 palette and shared rc settings.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")
plt.rcParams.update({
    'font.size': 12,
    'figure.dpi': 100,
    'figure.figsize': (10, 6),
})
1. 业务理解 (Business Understanding) & 数据理解 (Data Understanding)
加载数据集,深入观察基本信息、数据分布、缺失值、异常值及特征间关系。
# 1.1 Load the data set and print a structural overview
df = pd.read_csv('online_shoppers_intention.csv')

n_rows, n_cols = df.shape
print("=== 数据集形状 ===")
print(f"行数: {n_rows}, 列数: {n_cols}\n")
display(df.head())

print("\n=== 数据集基本信息 ===")
df.info()

print("\n=== 数值变量描述性统计 ===")
display(df.describe().round(2))

# Null count per column; only columns that actually contain nulls are listed.
print("\n=== 缺失值统计 ===")
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("无缺失值")

# Fully duplicated rows are only counted here, not removed.
print("\n=== 重复行统计 ===")
n_duplicates = df.duplicated().sum()
print(f"重复行数量: {n_duplicates}")
=== 数据集形状 === 行数: 12330, 列数: 18
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.000000 | 0.20 | 0.20 | 0.0 | 0.0 | Feb | 1 | 1 | 1 | 1 | Returning_Visitor | False | False |
| 1 | 0 | 0.0 | 0 | 0.0 | 2 | 64.000000 | 0.00 | 0.10 | 0.0 | 0.0 | Feb | 2 | 2 | 1 | 2 | Returning_Visitor | False | False |
| 2 | 0 | 0.0 | 0 | 0.0 | 1 | 0.000000 | 0.20 | 0.20 | 0.0 | 0.0 | Feb | 4 | 1 | 9 | 3 | Returning_Visitor | False | False |
| 3 | 0 | 0.0 | 0 | 0.0 | 2 | 2.666667 | 0.05 | 0.14 | 0.0 | 0.0 | Feb | 3 | 2 | 2 | 4 | Returning_Visitor | False | False |
| 4 | 0 | 0.0 | 0 | 0.0 | 10 | 627.500000 | 0.02 | 0.05 | 0.0 | 0.0 | Feb | 3 | 3 | 1 | 4 | Returning_Visitor | True | False |
=== 数据集基本信息 === <class 'pandas.core.frame.DataFrame'> RangeIndex: 12330 entries, 0 to 12329 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Administrative 12330 non-null int64 1 Administrative_Duration 12330 non-null float64 2 Informational 12330 non-null int64 3 Informational_Duration 12330 non-null float64 4 ProductRelated 12330 non-null int64 5 ProductRelated_Duration 12330 non-null float64 6 BounceRates 12330 non-null float64 7 ExitRates 12330 non-null float64 8 PageValues 12330 non-null float64 9 SpecialDay 12330 non-null float64 10 Month 12330 non-null object 11 OperatingSystems 12330 non-null int64 12 Browser 12330 non-null int64 13 Region 12330 non-null int64 14 TrafficType 12330 non-null int64 15 VisitorType 12330 non-null object 16 Weekend 12330 non-null bool 17 Revenue 12330 non-null bool dtypes: bool(2), float64(7), int64(7), object(2) memory usage: 1.5+ MB === 数值变量描述性统计 ===
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | OperatingSystems | Browser | Region | TrafficType | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 | 12330.00 |
| mean | 2.32 | 80.82 | 0.50 | 34.47 | 31.73 | 1194.75 | 0.02 | 0.04 | 5.89 | 0.06 | 2.12 | 2.36 | 3.15 | 4.07 |
| std | 3.32 | 176.78 | 1.27 | 140.75 | 44.48 | 1913.67 | 0.05 | 0.05 | 18.57 | 0.20 | 0.91 | 1.72 | 2.40 | 4.03 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| 25% | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | 184.14 | 0.00 | 0.01 | 0.00 | 0.00 | 2.00 | 2.00 | 1.00 | 2.00 |
| 50% | 1.00 | 7.50 | 0.00 | 0.00 | 18.00 | 598.94 | 0.00 | 0.03 | 0.00 | 0.00 | 2.00 | 2.00 | 3.00 | 2.00 |
| 75% | 4.00 | 93.26 | 0.00 | 0.00 | 38.00 | 1464.16 | 0.02 | 0.05 | 0.00 | 0.00 | 3.00 | 2.00 | 4.00 | 4.00 |
| max | 27.00 | 3398.75 | 24.00 | 2549.38 | 705.00 | 63973.52 | 0.20 | 0.20 | 361.76 | 1.00 | 8.00 | 13.00 | 9.00 | 20.00 |
=== 缺失值统计 === 无缺失值 === 重复行统计 === 重复行数量: 125
# 1.2 Distribution of the target variable Revenue (class-imbalance check)
print("=== 目标变量 Revenue 分布 ===")
rev_counts = df['Revenue'].value_counts()
rev_pct = df['Revenue'].value_counts(normalize=True)
rev_summary = pd.DataFrame({'Count': rev_counts, 'Percentage': rev_pct.round(4)})
print(rev_summary)

# Number of non-purchasing sessions per purchasing session.
imbalance_ratio = rev_counts[False] / rev_counts[True]
print(f"\n不平衡比例 (Negative : Positive) = {imbalance_ratio:.2f} : 1")

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
count_ax, pie_ax = axes

# Left panel: absolute counts with the value printed on each bar.
sns.countplot(x='Revenue', data=df, ax=count_ax)
count_ax.set_title('Revenue Count Distribution')
count_ax.bar_label(count_ax.containers[0])

# Right panel: class shares as a pie chart, second slice slightly pulled out.
rev_counts.plot.pie(
    autopct='%1.1f%%', ax=pie_ax, startangle=90, explode=[0, 0.05]
)
pie_ax.set_ylabel('')
pie_ax.set_title('Revenue Proportion')

plt.tight_layout()
plt.show()
=== 目标变量 Revenue 分布 ===
Count Percentage
Revenue
False 10422 0.8453
True 1908 0.1547
不平衡比例 (Negative : Positive) = 5.46 : 1
# 1.3 Continuous variables and their relationship with the target
continuous_cols = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay'
]

# One box plot per continuous variable, split by Revenue, on a 2 x 5 grid.
fig, axes = plt.subplots(2, 5, figsize=(22, 8))
axes = axes.flatten()
for ax, col in zip(axes, continuous_cols):
    sns.boxplot(data=df, x='Revenue', y=col, ax=ax)
    ax.set_title(f'{col}')
plt.suptitle('Continuous Variables by Revenue (Boxplot)', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

# KDE comparison of three key variables, one panel each.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col in zip(axes, ("PageValues", "BounceRates", "ExitRates")):
    sns.kdeplot(data=df, x=col, hue="Revenue", fill=True, ax=ax)
    ax.set_title(f'{col} Distribution by Revenue')
# Only the PageValues panel is restricted to the 0-100 range.
axes[0].set_xlim(0, 100)
plt.tight_layout()
plt.show()
# 1.4 Categorical variables and their relationship with the target
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
month_ax, visitor_ax, weekend_ax = axes

# Display order for the Month axis (also reused for the conversion-rate chart below).
month_order = ['Feb', 'Mar', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

sns.countplot(data=df, x='Month', hue='Revenue', order=month_order, ax=month_ax)
month_ax.set_title('Purchasing Intention across Months')
month_ax.tick_params(axis='x', rotation=45)

sns.countplot(data=df, x='VisitorType', hue='Revenue', ax=visitor_ax)
visitor_ax.set_title('Purchasing Intention by Visitor Type')

sns.countplot(data=df, x='Weekend', hue='Revenue', ax=weekend_ax)
weekend_ax.set_title('Purchasing Intention by Weekend')

plt.tight_layout()
plt.show()

# Conversion rate per month: mean of the boolean Revenue column within each month.
revenue_by_month = df.groupby('Month')['Revenue'].mean()
monthly_conv = revenue_by_month.reindex(month_order)
plt.figure(figsize=(8, 4))
monthly_conv.plot.bar(color='coral', edgecolor='black')
plt.title('Monthly Conversion Rate')
plt.ylabel('Conversion Rate')
plt.xticks(rotation=45)
plt.show()
# 1.5 Correlation heat map
plt.figure(figsize=(14, 10))
numeric_part = df.select_dtypes(include=[np.number])
corr_matrix = numeric_part.corr()
# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    center=0,
)
plt.title('Correlation Heatmap of Numerical Variables')
plt.tight_layout()
plt.show()

# Rank the numeric features by their correlation with the target Revenue.
# Revenue is cast to int on a copy so that it is picked up as a numeric column.
df_temp = df.copy()
df_temp['Revenue'] = df_temp['Revenue'].astype(int)
corr_with_target = df_temp.select_dtypes(include=[np.number]).corr()['Revenue']
target_corr = corr_with_target.drop('Revenue').sort_values(ascending=False)
print("\n=== 特征与 Revenue 的相关系数 (降序) ===")
print(target_corr.round(4))
=== 特征与 Revenue 的相关系数 (降序) === PageValues 0.4926 ProductRelated 0.1585 ProductRelated_Duration 0.1524 Administrative 0.1389 Informational 0.0952 Administrative_Duration 0.0936 Informational_Duration 0.0703 Browser 0.0240 TrafficType -0.0051 Region -0.0116 OperatingSystems -0.0147 SpecialDay -0.0823 BounceRates -0.1507 ExitRates -0.2071 Name: Revenue, dtype: float64
2. 数据准备 (Data Preparation)
包括 特征工程、特征编码、数据集划分。
# 2.1 Feature engineering (upgrade direction 6)
def _ratio_or_zero(numerator, denominator):
    """Divide element-wise, yielding 0 wherever the denominator is not positive."""
    return np.where(denominator > 0, numerator / denominator, 0)


df_prep = df.copy()

# Cast the target and the boolean Weekend flag to 0/1 integers.
for flag_col in ('Revenue', 'Weekend'):
    df_prep[flag_col] = df_prep[flag_col].astype(int)

# ---- Derived features ----
# Total number of pages visited across the three page categories.
df_prep['TotalPages'] = (
    df_prep['Administrative']
    .add(df_prep['Informational'])
    .add(df_prep['ProductRelated'])
)

# Total browsing time across the three page categories.
df_prep['TotalDuration'] = (
    df_prep['Administrative_Duration']
    .add(df_prep['Informational_Duration'])
    .add(df_prep['ProductRelated_Duration'])
)

# Average time per product page (0 when no product page was visited).
df_prep['AvgProductDuration'] = _ratio_or_zero(
    df_prep['ProductRelated_Duration'], df_prep['ProductRelated']
)

# Share of product-related pages among all pages (0 when no page was visited).
df_prep['ProductPageRatio'] = _ratio_or_zero(
    df_prep['ProductRelated'], df_prep['TotalPages']
)

# Interaction of bounce rate and exit rate (0 when the exit rate is 0).
df_prep['Bounce_Exit_Ratio'] = _ratio_or_zero(
    df_prep['BounceRates'], df_prep['ExitRates']
)

# ---- log1p transform of the skewed variables ----
skew_cols = ['PageValues', 'ProductRelated_Duration', 'Administrative_Duration',
             'Informational_Duration', 'TotalDuration']
for col in skew_cols:
    df_prep[f'{col}_log'] = np.log1p(df_prep[col])

print("=== 新增特征列表 ===")
derived_cols = ['TotalPages', 'TotalDuration', 'AvgProductDuration',
                'ProductPageRatio', 'Bounce_Exit_Ratio']
log_cols = [f'{c}_log' for c in skew_cols]
new_cols = derived_cols + log_cols
print(new_cols)
print(f"\n特征工程后数据形状: {df_prep.shape}")
=== 新增特征列表 === ['TotalPages', 'TotalDuration', 'AvgProductDuration', 'ProductPageRatio', 'Bounce_Exit_Ratio', 'PageValues_log', 'ProductRelated_Duration_log', 'Administrative_Duration_log', 'Informational_Duration_log', 'TotalDuration_log'] 特征工程后数据形状: (12330, 28)
# 2.2 Feature encoding (one-hot, first level of each variable dropped)
categorical_cols = ['Month', 'OperatingSystems', 'Browser', 'Region',
                    'TrafficType', 'VisitorType']
df_encoded = pd.get_dummies(df_prep, columns=categorical_cols, drop_first=True)
print(f"编码后数据形状: {df_encoded.shape}")

# 2.3 Train/test split (stratified on the target)
y = df_encoded['Revenue']
X = df_encoded.drop(columns='Revenue')
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"训练集: X={X_train.shape}, y={y_train.shape}")
print(f"测试集: X={X_test.shape}, y={y_test.shape}")
print(f"\n训练集类别分布:\n{y_train.value_counts()}")
print(f"\n测试集类别分布:\n{y_test.value_counts()}")

# Negative-to-positive ratio of the training labels, used later as
# scale_pos_weight for XGBoost.
neg_count = y_train.eq(0).sum()
pos_count = y_train.eq(1).sum()
scale_pos_weight_val = neg_count / pos_count
print(f"\nscale_pos_weight = {scale_pos_weight_val:.2f}")

# Column names of the feature matrix, kept for the interpretability section.
feature_names = list(X.columns)
编码后数据形状: (12330, 79) 训练集: X=(9864, 78), y=(9864,) 测试集: X=(2466, 78), y=(2466,) 训练集类别分布: Revenue 0 8338 1 1526 Name: count, dtype: int64 测试集类别分布: Revenue 0 2084 1 382 Name: count, dtype: int64 scale_pos_weight = 5.46
3. 基线建模 + 分层交叉验证
使用 imblearn Pipeline 确保 SMOTE 仅在 CV 训练折中应用,杜绝数据泄露。 线性模型使用 StandardScaler,树模型直接训练。
# 3.1 Shared cross-validation setup
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


def run_cv(estimator, X, y, cv, name):
    """Cross-validate ``estimator`` and summarise each metric in ``scoring_metrics``.

    Args:
        estimator: Unfitted estimator or pipeline handed to ``cross_validate``.
        X: Feature matrix.
        y: Target vector.
        cv: Cross-validation splitter.
        name: Label stored under the ``'Model'`` key of the result.

    Returns:
        dict: ``{'Model': name}`` plus, for every metric in the module-level
        ``scoring_metrics`` list, ``'<metric>_mean'`` and ``'<metric>_std'``
        computed over the test-fold scores.
    """
    fold_results = cross_validate(
        estimator, X, y,
        cv=cv,
        scoring=scoring_metrics,
        n_jobs=-1,
    )
    summary = {'Model': name}
    for metric in scoring_metrics:
        fold_scores = fold_results[f'test_{metric}']
        summary[f'{metric}_mean'] = fold_scores.mean()
        summary[f'{metric}_std'] = fold_scores.std()
    return summary
# 3.2 Baseline models (default parameters + SMOTE pipeline) under cross-validation
def _smote_pipeline(model, scale=False):
    """Build an imblearn pipeline: optional StandardScaler, then SMOTE, then ``model``."""
    steps = [('scaler', StandardScaler())] if scale else []
    steps.extend([('smote', SMOTE(random_state=42)), ('model', model)])
    return ImbPipeline(steps)


baseline_configs = {
    'Logistic Regression': _smote_pipeline(
        LogisticRegression(random_state=42, max_iter=1000), scale=True
    ),
    'Decision Tree': _smote_pipeline(
        DecisionTreeClassifier(random_state=42, max_depth=5)
    ),
    'Random Forest': _smote_pipeline(
        RandomForestClassifier(random_state=42, n_estimators=100)
    ),
    'XGBoost': _smote_pipeline(
        XGBClassifier(random_state=42, eval_metric='logloss', verbosity=0)
    ),
    'LightGBM': _smote_pipeline(
        LGBMClassifier(random_state=42, verbose=-1)
    ),
}

baseline_results = []
for name, pipe in baseline_configs.items():
    print(f"正在交叉验证: {name} ...")
    baseline_results.append(run_cv(pipe, X_train, y_train, cv, name))
baseline_df = pd.DataFrame(baseline_results).set_index('Model')

# Render every metric as a "mean ± std" string, one column per metric.
display_cols = {
    metric: [
        f"{mean:.4f} ± {std:.4f}"
        for mean, std in zip(baseline_df[f'{metric}_mean'],
                             baseline_df[f'{metric}_std'])
    ]
    for metric in scoring_metrics
}
display(pd.DataFrame(display_cols, index=baseline_df.index))
正在交叉验证: Logistic Regression ... 正在交叉验证: Decision Tree ... 正在交叉验证: Random Forest ... 正在交叉验证: XGBoost ... 正在交叉验证: LightGBM ...
| accuracy | precision | recall | f1 | roc_auc | |
|---|---|---|---|---|---|
| Model | |||||
| Logistic Regression | 0.8683 ± 0.0037 | 0.5513 ± 0.0099 | 0.8034 ± 0.0124 | 0.6537 ± 0.0033 | 0.9164 ± 0.0033 |
| Decision Tree | 0.8875 ± 0.0060 | 0.6110 ± 0.0203 | 0.7556 ± 0.0228 | 0.6752 ± 0.0117 | 0.9056 ± 0.0012 |
| Random Forest | 0.8940 ± 0.0049 | 0.6362 ± 0.0162 | 0.7359 ± 0.0091 | 0.6824 ± 0.0124 | 0.9211 ± 0.0034 |
| XGBoost | 0.8939 ± 0.0035 | 0.6575 ± 0.0128 | 0.6560 ± 0.0295 | 0.6563 ± 0.0155 | 0.9212 ± 0.0028 |
| LightGBM | 0.8947 ± 0.0036 | 0.6545 ± 0.0124 | 0.6763 ± 0.0170 | 0.6651 ± 0.0120 | 0.9263 ± 0.0038 |
# 3.3 Baseline model performance chart
fig, ax = plt.subplots(figsize=(12, 6))
plot_metrics = ['f1_mean', 'roc_auc_mean', 'recall_mean', 'precision_mean']
plot_labels = ['F1-score', 'ROC-AUC', 'Recall', 'Precision']
x = np.arange(len(baseline_df))
width = 0.2
for i, (col, label) in enumerate(zip(plot_metrics, plot_labels)):
    positions = x + i * width
    ax.bar(positions, baseline_df[col], width, label=label)
    # Error bars show the across-fold standard deviation of the same metric.
    ax.errorbar(positions, baseline_df[col],
                yerr=baseline_df[col.replace('mean', 'std')],
                fmt='none', ecolor='black', capsize=3)
ax.set_xlabel('Model')
ax.set_ylabel('Score')
# The fold count is read from the splitter so the title cannot drift from
# the configuration actually used (cv is built with n_splits=3).
ax.set_title(
    f'Baseline Models - {cv.get_n_splits()}-Fold Stratified CV Performance (with SMOTE)'
)
# Centre each tick under its group of four bars.
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(baseline_df.index, rotation=30, ha='right')
ax.legend(loc='lower right')
ax.set_ylim(0, 1.05)
plt.tight_layout()
plt.show()
4. 类别不平衡处理方案对比
比较三种策略: (1) 无处理 (Baseline)、(2) SMOTE、(3) class_weight / scale_pos_weight。 全部使用交叉验证以保证公平性。
# 4.1 Pipelines for the three imbalance-handling strategies
def get_imbalance_configs(model_name):
    """Return the three imbalance-handling variants of one model.

    Args:
        model_name: One of 'Logistic Regression', 'Random Forest',
            'XGBoost' or 'LightGBM'.

    Returns:
        dict: Strategy label -> unfitted estimator, in the order
        'No Balancing', 'SMOTE', then the weighting strategy
        ('class_weight=balanced', or 'scale_pos_weight' for XGBoost, which
        reads the module-level ``scale_pos_weight_val``). An empty dict is
        returned for any other ``model_name``.
    """
    def with_smote(model, scale=False):
        # SMOTE pipeline; the scaler (if any) comes before the resampling step.
        steps = [('scaler', StandardScaler())] if scale else []
        steps.extend([('smote', SMOTE(random_state=42)), ('model', model)])
        return ImbPipeline(steps)

    def scaled(model):
        return SkPipeline([('scaler', StandardScaler()), ('model', model)])

    def logreg(**extra):
        return LogisticRegression(random_state=42, max_iter=1000, **extra)

    def forest(**extra):
        return RandomForestClassifier(random_state=42, n_estimators=100, **extra)

    def xgb(**extra):
        return XGBClassifier(random_state=42, eval_metric='logloss',
                             verbosity=0, **extra)

    def lgbm(**extra):
        return LGBMClassifier(random_state=42, verbose=-1, **extra)

    # Builders are lazy so only the requested model family is instantiated.
    builders = {
        'Logistic Regression': lambda: {
            'No Balancing': scaled(logreg()),
            'SMOTE': with_smote(logreg(), scale=True),
            'class_weight=balanced': scaled(logreg(class_weight='balanced')),
        },
        'Random Forest': lambda: {
            'No Balancing': forest(),
            'SMOTE': with_smote(forest()),
            'class_weight=balanced': forest(class_weight='balanced'),
        },
        'XGBoost': lambda: {
            'No Balancing': xgb(),
            'SMOTE': with_smote(xgb()),
            'scale_pos_weight': xgb(scale_pos_weight=scale_pos_weight_val),
        },
        'LightGBM': lambda: {
            'No Balancing': lgbm(),
            'SMOTE': with_smote(lgbm()),
            'class_weight=balanced': lgbm(class_weight='balanced'),
        },
    }
    build = builders.get(model_name)
    if build is None:
        return {}
    return build()
# 4.2 Experiment: cross-validate 4 representative models x 3 strategies
compare_models = ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM']
imbalance_results = []
for model_name in compare_models:
    configs = get_imbalance_configs(model_name)
    for strategy_name in configs:
        estimator = configs[strategy_name]
        label = f"{model_name} | {strategy_name}"
        print(f"正在验证: {label} ...")
        row = run_cv(estimator, X_train, y_train, cv, label)
        # Keep model and strategy as separate columns for grouping later on.
        row.update(Base_Model=model_name, Strategy=strategy_name)
        imbalance_results.append(row)
imb_df = pd.DataFrame(imbalance_results)
正在验证: Logistic Regression | No Balancing ... 正在验证: Logistic Regression | SMOTE ... 正在验证: Logistic Regression | class_weight=balanced ... 正在验证: Random Forest | No Balancing ... 正在验证: Random Forest | SMOTE ... 正在验证: Random Forest | class_weight=balanced ... 正在验证: XGBoost | No Balancing ... 正在验证: XGBoost | SMOTE ... 正在验证: XGBoost | scale_pos_weight ... 正在验证: LightGBM | No Balancing ... 正在验证: LightGBM | SMOTE ... 正在验证: LightGBM | class_weight=balanced ...
# 4.3 Visual comparison of the imbalance-handling strategies
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()
width = 0.25
# (x offset, metric column, legend label, bar colour) for the three bar series.
bar_specs = [
    (-width, 'f1_mean', 'F1', '#66c2a5'),
    (0, 'roc_auc_mean', 'ROC-AUC', '#fc8d62'),
    (width, 'recall_mean', 'Recall', '#8da0cb'),
]
for ax, model_name in zip(axes, compare_models):
    subset = imb_df[imb_df['Base_Model'] == model_name]
    x = np.arange(len(subset))
    for offset, col, label, color in bar_specs:
        ax.bar(x + offset, subset[col], width, label=label, color=color)
    ax.set_title(f'{model_name}')
    ax.set_xticks(x)
    ax.set_xticklabels(subset['Strategy'], rotation=20, ha='right')
    ax.set_ylim(0, 1.05)
    ax.legend(fontsize=9)
# The fold count is read from the splitter (cv is built with n_splits=3)
# instead of being hard-coded in the title.
plt.suptitle(
    f'Imbalance Handling Strategy Comparison ({cv.get_n_splits()}-Fold CV)',
    fontsize=14
)
plt.tight_layout()
plt.show()

# Report, for every model, the strategy with the highest mean F1.
print("\n=== 各模型最佳不平衡处理策略 (按 F1-score) ===")
for model_name in compare_models:
    subset = imb_df[imb_df['Base_Model'] == model_name]
    best = subset.loc[subset['f1_mean'].idxmax()]
    print(f" {model_name}: {best['Strategy']} (F1 = {best['f1_mean']:.4f})")
=== 各模型最佳不平衡处理策略 (按 F1-score) === Logistic Regression: SMOTE (F1 = 0.6537) Random Forest: SMOTE (F1 = 0.6824) XGBoost: scale_pos_weight (F1 = 0.6641) LightGBM: class_weight=balanced (F1 = 0.6740)
5. 超参数调优
对三个最有潜力的树模型 (Random Forest, XGBoost, LightGBM) 进行 RandomizedSearchCV, 以 F1-score 为优化目标 (因为类别不平衡,优化 accuracy 无意义)。
# 5.1 Random Forest hyper-parameter tuning
banner = "=" * 60
print(banner)
print("超参数调优: Random Forest (SMOTE Pipeline)")
print(banner)

# SMOTE sits inside the pipeline, so it is refitted on each training fold.
pipe_rf_tune = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42)),
])

# Keys carry the 'model__' prefix so they address the pipeline's final step.
param_dist_rf = dict(
    model__n_estimators=[100, 200, 300, 500],
    model__max_depth=[5, 10, 15, 20, None],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
    model__max_features=['sqrt', 'log2', None],
)

# 20 random candidates, ranked by F1 on the shared CV splitter.
search_rf = RandomizedSearchCV(
    estimator=pipe_rf_tune,
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42,
)
search_rf.fit(X_train, y_train)
print(f"Best F1 (CV): {search_rf.best_score_:.4f}")
print(f"Best Params: {search_rf.best_params_}")
============================================================
超参数调优: Random Forest (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6854
Best Params: {'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__min_samples_leaf': 4, 'model__max_features': 'log2', 'model__max_depth': None}
# 5.2 XGBoost hyper-parameter tuning
banner = "=" * 60
print(banner)
print("超参数调优: XGBoost (SMOTE Pipeline)")
print(banner)

# SMOTE sits inside the pipeline, so it is refitted on each training fold.
pipe_xgb_tune = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(random_state=42, eval_metric='logloss', verbosity=0)),
])

# Keys carry the 'model__' prefix so they address the pipeline's final step.
param_dist_xgb = dict(
    model__n_estimators=[100, 200, 300, 500],
    model__max_depth=[3, 5, 7, 10],
    model__learning_rate=[0.01, 0.05, 0.1, 0.2],
    model__subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    model__colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    model__min_child_weight=[1, 3, 5, 7],
    model__gamma=[0, 0.1, 0.2, 0.3],
)

# 20 random candidates, ranked by F1 on the shared CV splitter.
search_xgb = RandomizedSearchCV(
    estimator=pipe_xgb_tune,
    param_distributions=param_dist_xgb,
    n_iter=20,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42,
)
search_xgb.fit(X_train, y_train)
print(f"Best F1 (CV): {search_xgb.best_score_:.4f}")
print(f"Best Params: {search_xgb.best_params_}")
============================================================
超参数调优: XGBoost (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6871
Best Params: {'model__subsample': 0.9, 'model__n_estimators': 100, 'model__min_child_weight': 3, 'model__max_depth': 3, 'model__learning_rate': 0.1, 'model__gamma': 0, 'model__colsample_bytree': 0.9}
# 5.3 LightGBM hyper-parameter tuning
print("=" * 60)
print("超参数调优: LightGBM (SMOTE Pipeline)")
print("=" * 60)

# SMOTE sits inside the pipeline, so it is refitted on each training fold.
pipe_lgbm_tune = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', LGBMClassifier(random_state=42, verbose=-1))
])

# Keys carry the 'model__' prefix so they address the pipeline's final step.
param_dist_lgbm = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [3, 5, 7, 10, -1],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__num_leaves': [15, 31, 63, 127],
    'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    # LightGBM only applies `subsample` (row bagging) when `subsample_freq`
    # is > 0; its default of 0 disables bagging, which would make the
    # `subsample` values above a no-op. It is set through the search space
    # (single value) so it is also recorded in `best_params_`.
    'model__subsample_freq': [1],
    'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__min_child_samples': [5, 10, 20, 50],
    'model__reg_alpha': [0, 0.01, 0.1, 1],
    'model__reg_lambda': [0, 0.01, 0.1, 1]
}

# 20 random candidates, ranked by F1 on the shared CV splitter.
search_lgbm = RandomizedSearchCV(
    pipe_lgbm_tune,
    param_distributions=param_dist_lgbm,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0
)
search_lgbm.fit(X_train, y_train)
print(f"Best F1 (CV): {search_lgbm.best_score_:.4f}")
print(f"Best Params: {search_lgbm.best_params_}")
============================================================
超参数调优: LightGBM (SMOTE Pipeline)
============================================================
Best F1 (CV): 0.6850
Best Params: {'model__subsample': 1.0, 'model__reg_lambda': 1, 'model__reg_alpha': 0.01, 'model__num_leaves': 63, 'model__n_estimators': 300, 'model__min_child_samples': 10, 'model__max_depth': 10, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.7}
# 5.4 F1 before vs. after tuning
# The fold count in the header is read from the splitter (cv is built with
# n_splits=3) rather than hard-coded.
print(f"\n=== 超参数调优前后 F1-score 对比 ({cv.get_n_splits()}-Fold CV) ===")

# Fitted searches keyed by the model name used in baseline_df.
best_searches = {
    'Random Forest': search_rf,
    'XGBoost': search_xgb,
    'LightGBM': search_lgbm
}
tuning_comparison = pd.DataFrame({
    'Model': list(best_searches),
    # Baseline value: mean CV F1 of the default-parameter SMOTE pipeline.
    'Before Tuning (F1)': [
        baseline_df.loc[model, 'f1_mean'] for model in best_searches
    ],
    # Tuned value: best mean CV F1 found by the randomized search.
    'After Tuning (F1)': [
        search.best_score_ for search in best_searches.values()
    ]
}).set_index('Model')
tuning_comparison['Improvement'] = (
    tuning_comparison['After Tuning (F1)'] - tuning_comparison['Before Tuning (F1)']
)
display(tuning_comparison.round(4))

# Pick the model whose tuned CV F1 is highest.
best_model_name = tuning_comparison['After Tuning (F1)'].idxmax()
best_search = best_searches[best_model_name]
print(f"\n>>> 最优模型: {best_model_name} (Tuned F1 = {best_search.best_score_:.4f})")
=== 超参数调优前后 F1-score 对比 (5-Fold CV) ===
| Before Tuning (F1) | After Tuning (F1) | Improvement | |
|---|---|---|---|
| Model | |||
| Random Forest | 0.6824 | 0.6854 | 0.0030 |
| XGBoost | 0.6563 | 0.6871 | 0.0308 |
| LightGBM | 0.6651 | 0.6850 | 0.0199 |
>>> 最优模型: XGBoost (Tuned F1 = 0.6871)
6. 阈值优化
默认阈值 0.5 不一定最优。通过 Precision-Recall Curve 找到使 F1 最大化的阈值。 同时展示不同阈值对 Precision/Recall/F1 的影响曲线。
# Local import: cross_val_predict is only needed for the threshold search below.
from sklearn.model_selection import cross_val_predict

# 6.1 Probability predictions of the best tuned pipeline on the test set
best_pipeline = best_search.best_estimator_
y_proba_best = best_pipeline.predict_proba(X_test)[:, 1]

# 6.2 Precision, recall and F1 at every threshold on the test set.
# These curves are kept for reporting and plotting; the threshold itself is
# NOT picked from them (see below).
precisions_curve, recalls_curve, thresholds_pr = precision_recall_curve(y_test, y_proba_best)
f1_scores_curve = 2 * (precisions_curve * recalls_curve) / (precisions_curve + recalls_curve + 1e-10)

# Select the F1-maximising threshold on out-of-fold predictions of the
# training set. Choosing it on the test set and then reporting test F1 at that
# same threshold would make the reported score optimistically biased.
y_proba_oof = cross_val_predict(
    best_pipeline, X_train, y_train,
    cv=cv, method='predict_proba', n_jobs=-1
)[:, 1]
prec_oof, rec_oof, thr_oof = precision_recall_curve(y_train, y_proba_oof)
f1_oof = 2 * (prec_oof * rec_oof) / (prec_oof + rec_oof + 1e-10)
# The last precision/recall point has no associated threshold, hence [:-1].
best_threshold = thr_oof[np.argmax(f1_oof[:-1])]

# Position on the test-set curve closest to the selected threshold.
best_threshold_idx = int(np.argmin(np.abs(thresholds_pr - best_threshold)))
# Test-set F1 obtained with the threshold selected on training data.
best_f1_at_threshold = f1_score(y_test, (y_proba_best >= best_threshold).astype(int))

print(f"默认阈值 0.5 的 F1: {f1_score(y_test, (y_proba_best >= 0.5).astype(int)):.4f}")
print(f"最优阈值: {best_threshold:.4f}")
print(f"最优阈值下的 F1: {best_f1_at_threshold:.4f}")
默认阈值 0.5 的 F1: 0.6723 最优阈值: 0.5889 最优阈值下的 F1: 0.6750
# 6.3 Threshold vs. precision / recall / F1 curves
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
thr_ax, pr_ax = axes

# Left panel: each metric as a function of the decision threshold.
# The curve arrays hold one more point than thresholds_pr, so the last is dropped.
metric_curves = (
    (precisions_curve, 'Precision'),
    (recalls_curve, 'Recall'),
    (f1_scores_curve, 'F1-score'),
)
for curve, label in metric_curves:
    thr_ax.plot(thresholds_pr, curve[:-1], label=label, linewidth=2)
thr_ax.axvline(x=best_threshold, color='red', linestyle='--',
               label=f'Best Threshold = {best_threshold:.3f}')
thr_ax.axvline(x=0.5, color='gray', linestyle=':', label='Default 0.5')
thr_ax.set_xlabel('Decision Threshold')
thr_ax.set_ylabel('Score')
thr_ax.set_title('Threshold Optimization: Precision / Recall / F1')
thr_ax.legend()

# Right panel: precision-recall curve with the area shaded and AP in the title.
ap = average_precision_score(y_test, y_proba_best)
pr_ax.plot(recalls_curve, precisions_curve, linewidth=2)
pr_ax.fill_between(recalls_curve, precisions_curve, alpha=0.2)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title(f'Precision-Recall Curve (AP = {ap:.3f})')

plt.tight_layout()
plt.show()
# 6.4 Default threshold vs. selected threshold
y_pred_default = (y_proba_best >= 0.5).astype(int)
y_pred_optimal = (y_proba_best >= best_threshold).astype(int)

print("=== 默认阈值 (0.5) ===")
print(classification_report(y_test, y_pred_default, digits=4))
print(f"\n=== 最优阈值 ({best_threshold:.4f}) ===")
print(classification_report(y_test, y_pred_optimal, digits=4))

# Side-by-side confusion matrices: (predictions, colour map, panel title).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (
    (y_pred_default, 'Blues', 'Confusion Matrix (Threshold = 0.5)'),
    (y_pred_optimal, 'Oranges', f'Confusion Matrix (Threshold = {best_threshold:.3f})'),
)
for ax, (predictions, cmap, title) in zip(axes, panels):
    ConfusionMatrixDisplay.from_predictions(y_test, predictions, ax=ax,
                                            cmap=cmap, colorbar=False)
    ax.set_title(title)
plt.tight_layout()
plt.show()
=== 默认阈值 (0.5) ===
precision recall f1-score support
0 0.9485 0.9199 0.9340 2084
1 0.6247 0.7277 0.6723 382
accuracy 0.8901 2466
macro avg 0.7866 0.8238 0.8031 2466
weighted avg 0.8984 0.8901 0.8934 2466
=== 最优阈值 (0.5889) ===
precision recall f1-score support
0 0.9449 0.9299 0.9374 2084
1 0.6482 0.7042 0.6750 382
accuracy 0.8950 2466
macro avg 0.7965 0.8171 0.8062 2466
weighted avg 0.8989 0.8950 0.8967 2466
7. 最终模型全面评估
包括 ROC Curve 和 Precision-Recall Curve 对比所有调优后/基线模型, 以及最优模型的详细分类报告。
# 7.1 Train every final model and collect its test-set probabilities
def _model_params(search):
    """Return ``search.best_params_`` with the 'model__' pipeline prefix removed."""
    return {k.replace('model__', ''): v for k, v in search.best_params_.items()}


def _fit_and_score(model, X_fit, X_eval):
    """Fit ``model`` on the resampled training labels and build its ``final_models`` entry."""
    model.fit(X_fit, y_train_res)
    return {'model': model, 'y_proba': model.predict_proba(X_eval)[:, 1]}


# Oversample the training set once with SMOTE (shared by the models trained below).
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Standardised copies for the linear model.
# NOTE(review): the scaler is fitted on the SMOTE-resampled data here, whereas
# the cross-validated Logistic Regression pipeline scales first and resamples
# second - confirm which order is intended.
scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Final model collection (tuned parameters where available).
# The unscaled models predict on X_test itself rather than X_test.values, so the
# evaluation input has the same type and column labels as the data passed to
# fit, matching how X_test is used in the other sections.
final_models = {}

# Logistic Regression (standardised data)
lr_model = LogisticRegression(random_state=42, max_iter=1000)
final_models['Logistic Regression'] = _fit_and_score(
    lr_model, X_train_res_scaled, X_test_scaled
)

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)
final_models['Decision Tree'] = _fit_and_score(dt_model, X_train_res, X_test)

# Random Forest (tuned)
rf_best_params = _model_params(search_rf)
rf_tuned = RandomForestClassifier(**rf_best_params, random_state=42)
final_models['Random Forest (Tuned)'] = _fit_and_score(rf_tuned, X_train_res, X_test)

# XGBoost (tuned)
xgb_best_params = _model_params(search_xgb)
xgb_tuned = XGBClassifier(**xgb_best_params, random_state=42,
                          eval_metric='logloss', verbosity=0)
final_models['XGBoost (Tuned)'] = _fit_and_score(xgb_tuned, X_train_res, X_test)

# LightGBM (tuned)
lgbm_best_params = _model_params(search_lgbm)
lgbm_tuned = LGBMClassifier(**lgbm_best_params, random_state=42, verbose=-1)
final_models['LightGBM (Tuned)'] = _fit_and_score(lgbm_tuned, X_train_res, X_test)
# 7.2 ROC and precision-recall curves for all final models
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
roc_ax, pr_ax = axes

# Each model contributes one line to each panel, in final_models order.
for name, info in final_models.items():
    scores = info['y_proba']

    fpr, tpr, _ = roc_curve(y_test, scores)
    auc_val = roc_auc_score(y_test, scores)
    roc_ax.plot(fpr, tpr, linewidth=2, label=f"{name} (AUC={auc_val:.3f})")

    prec_c, rec_c, _ = precision_recall_curve(y_test, scores)
    ap_val = average_precision_score(y_test, scores)
    pr_ax.plot(rec_c, prec_c, linewidth=2, label=f"{name} (AP={ap_val:.3f})")

# Left panel: ROC curves with the chance diagonal.
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve Comparison')
roc_ax.legend(fontsize=9)

# Right panel: precision-recall curves.
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve Comparison')
pr_ax.legend(fontsize=9)

plt.tight_layout()
plt.show()
# 7.3 Summary table of the final models (each at its own F1-maximising threshold)
# NOTE(review): every model's threshold is selected on y_test and the
# thresholded metrics are then computed on that same y_test, so Accuracy /
# Precision / Recall / F1 below are optimistic estimates.
thresholded_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-score': f1_score,
}
ranking_metrics = {
    'ROC-AUC': roc_auc_score,
    'Avg Precision': average_precision_score,
}

final_results = []
for name, info in final_models.items():
    y_proba = info['y_proba']

    # F1 at every candidate threshold; the final PR point has no threshold.
    prec_c, rec_c, thr_c = precision_recall_curve(y_test, y_proba)
    f1_c = 2 * (prec_c * rec_c) / (prec_c + rec_c + 1e-10)
    opt_idx = np.argmax(f1_c[:-1])
    opt_thr = thr_c[opt_idx]
    y_pred = (y_proba >= opt_thr).astype(int)

    row = {'Model': name, 'Optimal Threshold': opt_thr}
    for column, metric_fn in thresholded_metrics.items():
        row[column] = metric_fn(y_test, y_pred)
    for column, metric_fn in ranking_metrics.items():
        row[column] = metric_fn(y_test, y_proba)
    final_results.append(row)

final_results_df = pd.DataFrame(final_results).set_index('Model')
display(final_results_df.round(4))

# Overall winner: highest F1 in the table above.
overall_best_name = final_results_df['F1-score'].idxmax()
best_row = final_results_df.loc[overall_best_name]
print(f"\n>>> 综合最优模型: {overall_best_name}")
print(f" F1 = {best_row['F1-score']:.4f}")
print(f" ROC-AUC = {best_row['ROC-AUC']:.4f}")
| Optimal Threshold | Accuracy | Precision | Recall | F1-score | ROC-AUC | Avg Precision | |
|---|---|---|---|---|---|---|---|
| Model | |||||||
| Logistic Regression | 0.4372 | 0.8816 | 0.6098 | 0.6545 | 0.6313 | 0.8887 | 0.6177 |
| Decision Tree | 0.6044 | 0.8881 | 0.6157 | 0.7382 | 0.6714 | 0.9108 | 0.6308 |
| Random Forest (Tuned) | 0.5966 | 0.8958 | 0.6574 | 0.6832 | 0.6701 | 0.9189 | 0.6926 |
| XGBoost (Tuned) | 0.5889 | 0.8950 | 0.6482 | 0.7042 | 0.6750 | 0.9235 | 0.6951 |
| LightGBM (Tuned) | 0.4827 | 0.8897 | 0.6239 | 0.7251 | 0.6707 | 0.9299 | 0.7202 |
>>> 综合最优模型: XGBoost (Tuned)
F1 = 0.6750
ROC-AUC = 0.9235
8. 模型可解释性分析
使用 SHAP Values、Permutation Importance 和 Partial Dependence Plot (PDP) 多角度打开机器学习"黑盒"。
# Choose the tree model used for the interpretability analysis
# (the tuned Random Forest is used as the demonstration; the stated reason is
# that SHAP's TreeExplainer supports it best)
explain_model = rf_tuned
explain_model_name = "Random Forest (Tuned)"
print(f"可解释性分析模型: {explain_model_name}")
可解释性分析模型: Random Forest (Tuned)
# 8.1 SHAP value analysis
# Only the first rows of the test set are explained, to bound the computation.
n_shap_samples = 200
X_test_sample = X_test.iloc[:n_shap_samples].copy()
explainer = shap.TreeExplainer(explain_model)
shap_exp = explainer(X_test_sample)

# A binary classifier may produce a 3-D values array; when it does, keep only
# the slice at index 1 of the last axis (and the matching base values).
raw_values = shap_exp.values
if not (isinstance(raw_values, np.ndarray) and raw_values.ndim == 3):
    shap_values_pos = shap_exp
else:
    base = shap_exp.base_values
    if len(base.shape) > 1:
        base = base[:, 1]
    shap_values_pos = shap.Explanation(
        values=raw_values[:, :, 1],
        base_values=base,
        data=shap_exp.data,
        feature_names=feature_names,
    )

# Beeswarm plot of the 20 highest-ranked features.
print("=== SHAP Beeswarm Plot (Top 20 Features) ===")
shap.plots.beeswarm(shap_values_pos, max_display=20, show=True)
=== SHAP Beeswarm Plot (Top 20 Features) ===
# SHAP bar plot (mean absolute SHAP value), limited to 20 features.
print("=== SHAP Bar Plot (Mean |SHAP|) ===")
shap.plots.bar(shap_values_pos, max_display=20, show=True)
=== SHAP Bar Plot (Mean |SHAP|) ===
# SHAP dependence plots for the two features with the largest mean |SHAP|.
mean_abs_shap = np.abs(shap_values_pos.values).mean(axis=0)
top_features_idx = np.argsort(-mean_abs_shap)[:2]
top_feature_names = [feature_names[i] for i in top_features_idx]

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
for panel, feat in zip(axes, top_feature_names):
    # show=False keeps the figure open so both panels render together.
    shap.plots.scatter(shap_values_pos[:, feat], ax=panel, show=False)
    panel.set_title(f'SHAP Dependence: {feat}')
plt.tight_layout()
plt.show()
# 8.2 Permutation importance (a model-agnostic feature-importance estimate)
print("=== Permutation Importance (10 repeats) ===")
perm_result = permutation_importance(
    explain_model, X_test, y_test,
    n_repeats=10, random_state=42, scoring='f1', n_jobs=-1,
)
perm_columns = {
    'Feature': feature_names,
    'Importance_mean': perm_result.importances_mean,
    'Importance_std': perm_result.importances_std,
}
perm_df = pd.DataFrame(perm_columns).sort_values('Importance_mean', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features; std shown as error bars.
top_perm = perm_df.head(15)
bar_slots = range(len(top_perm))
fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(
    bar_slots,
    top_perm['Importance_mean'],
    xerr=top_perm['Importance_std'],
    color='steelblue',
    edgecolor='black',
)
ax.set_yticks(bar_slots)
ax.set_yticklabels(top_perm['Feature'])
ax.invert_yaxis()  # put the first (largest) row at the top
ax.set_xlabel('Mean Permutation Importance (F1 decrease)')
ax.set_title('Top 15 Permutation Importance')
plt.tight_layout()
plt.show()
=== Permutation Importance (10 repeats) ===
# 8.3 Cross-check: SHAP ranking versus permutation-importance ranking
print("\n=== SHAP 与 Permutation Importance Top 10 对比 ===")
mean_abs_by_feature = np.abs(shap_values_pos.values).mean(axis=0)
shap_importance = pd.Series(mean_abs_by_feature, index=feature_names).sort_values(ascending=False)
perm_importance = perm_df.set_index('Feature')['Importance_mean']

# Place the ten leading feature names of each ranking side by side.
ranks = range(1, 11)
shap_top = list(shap_importance.index[:10])
perm_top = list(perm_importance.sort_values(ascending=False).index[:10])
comparison_top10 = pd.DataFrame({
    'SHAP Rank': ranks,
    'SHAP Feature': shap_top,
    'Perm Rank': ranks,
    'Perm Feature': perm_top,
})
display(comparison_top10)
=== SHAP 与 Permutation Importance Top 10 对比 ===
| SHAP Rank | SHAP Feature | Perm Rank | Perm Feature | |
|---|---|---|---|---|
| 0 | 1 | PageValues | 1 | PageValues |
| 1 | 2 | PageValues_log | 2 | PageValues_log |
| 2 | 3 | Month_Nov | 3 | Month_Nov |
| 3 | 4 | TrafficType_2 | 4 | Administrative |
| 4 | 5 | ExitRates | 5 | Month_Mar |
| 5 | 6 | OperatingSystems_2 | 6 | ProductRelated_Duration |
| 6 | 7 | Administrative_Duration | 7 | Informational |
| 7 | 8 | Administrative_Duration_log | 8 | ProductRelated_Duration_log |
| 8 | 9 | ProductPageRatio | 9 | TotalPages |
| 9 | 10 | Administrative | 10 | Administrative_Duration_log |
# 8.4 Partial dependence plots (PDP): marginal effect of individual features
print("=== Partial Dependence Plots ===")
# Candidate continuous features; the first four of them, in SHAP-importance
# order, are plotted.
continuous_original = ['PageValues', 'ExitRates', 'BounceRates',
                       'ProductRelated_Duration', 'ProductRelated',
                       'TotalDuration', 'TotalPages', 'PageValues_log']
top_continuous_features = [
    feat for feat in shap_importance.index if feat in continuous_original
][:4]

n_panels = len(top_continuous_features)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4))
if n_panels == 1:
    # plt.subplots returns a bare Axes for a 1x1 grid; wrap it in a list.
    axes = [axes]
PartialDependenceDisplay.from_estimator(
    explain_model, X_test, top_continuous_features,
    ax=axes, grid_resolution=50,
)
plt.suptitle('Partial Dependence Plots (PDP)', fontsize=14, y=1.05)
plt.tight_layout()
plt.show()
=== Partial Dependence Plots ===
9. 高级诊断分析¶
包括 校准曲线 (Calibration Curve)、学习曲线 (Learning Curve)、 错误分析 (Error Analysis) 和 成本敏感分析 (Cost-Sensitive Analysis)。
# 9.1 Calibration curve
# Checks how reliable the predicted probabilities are.
print("=== 校准曲线 (Calibration Curve) ===")
fig, ax = plt.subplots(figsize=(8, 6))
# Diagonal reference line for a perfectly calibrated model.
ax.plot([0, 1], [0, 1], 'k--', label='Perfectly Calibrated')
for name in final_models:
    frac_positive, mean_predicted = calibration_curve(
        y_test, final_models[name]['y_proba'], n_bins=10
    )
    ax.plot(mean_predicted, frac_positive, marker='o', linewidth=2, label=name)
ax.set(
    xlabel='Mean Predicted Probability',
    ylabel='Fraction of Positives',
    title='Calibration Curve (Reliability Diagram)',
)
ax.legend(fontsize=9, loc='lower right')
plt.tight_layout()
plt.show()
=== 校准曲线 (Calibration Curve) ===
# 9.2 Learning curves: diagnose over-fitting / under-fitting
print("=== 学习曲线 (Learning Curve) ===")
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
lc_models = {
    'Random Forest (Tuned)': rf_tuned,
    'XGBoost (Tuned)': xgb_tuned,
    'LightGBM (Tuned)': lgbm_tuned
}
for idx, (name, model) in enumerate(lc_models.items()):
    # shuffle=True is needed for random_state to have any effect; without it
    # each training subset is simply a prefix of the fold in stored row order.
    # NOTE(review): X_train_res / y_train_res appear to be a resampled training
    # set. If resampled rows were appended after the original rows, unshuffled
    # prefixes would not share the class mix of the full set — confirm.
    # NOTE(review): cross-validating on already-resampled data may give
    # optimistic validation scores; consider resampling inside a pipeline.
    train_sizes, train_scores, val_scores = learning_curve(
        model, X_train_res, y_train_res,
        cv=5, scoring='f1',
        train_sizes=np.linspace(0.2, 1.0, 5),
        n_jobs=-1, shuffle=True, random_state=42
    )
    # Mean and spread across the CV folds for each training-set size.
    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)
    # Shaded bands show +/- one standard deviation around each curve.
    axes[idx].fill_between(train_sizes, train_mean - train_std,
                           train_mean + train_std, alpha=0.1, color='blue')
    axes[idx].fill_between(train_sizes, val_mean - val_std,
                           val_mean + val_std, alpha=0.1, color='orange')
    axes[idx].plot(train_sizes, train_mean, 'o-', color='blue',
                   label='Training F1')
    axes[idx].plot(train_sizes, val_mean, 'o-', color='orange',
                   label='Validation F1')
    axes[idx].set_title(f'Learning Curve: {name}')
    axes[idx].set_xlabel('Training Set Size')
    axes[idx].set_ylabel('F1 Score')
    axes[idx].legend(loc='lower right')
    axes[idx].set_ylim(0, 1.1)
plt.tight_layout()
plt.show()
=== 学习曲线 (Learning Curve) ===
# 9.3 Error analysis: which samples does the model tend to misclassify?
# Uses the overall best model selected from the final comparison table.
best_final_model = final_models[overall_best_name]
y_proba_final = best_final_model['y_proba']
# F1-optimal threshold: evaluate F1 along the precision-recall curve and take
# the threshold at the maximum (the last F1 entry is excluded from the search).
prec_c, rec_c, thr_c = precision_recall_curve(y_test, y_proba_final)
f1_c = 2 * (prec_c * rec_c) / (prec_c + rec_c + 1e-10)
final_threshold = thr_c[np.argmax(f1_c[:-1])]
y_pred_final = (y_proba_final >= final_threshold).astype(int)

# Build the error-analysis frame from the original rows of the test samples.
# y_test.index holds index *labels*, so the rows are selected with .loc;
# .iloc would treat the labels as positions and only agrees with .loc while
# df keeps a default 0..n-1 index.
error_df = df.loc[y_test.index].copy()
error_df['y_true'] = y_test.values
error_df['y_pred'] = y_pred_final
error_df['y_proba'] = y_proba_final
error_df['correct'] = (error_df['y_true'] == error_df['y_pred'])

# Label each row as Correct / False Negative / False Positive.
missed_buyer = (error_df['y_true'] == 1) & (error_df['y_pred'] == 0)
false_alarm = (error_df['y_true'] == 0) & (error_df['y_pred'] == 1)
error_df['error_type'] = 'Correct'
error_df.loc[missed_buyer, 'error_type'] = 'False Negative'
error_df.loc[false_alarm, 'error_type'] = 'False Positive'
print("=== 错误类型分布 ===")
print(error_df['error_type'].value_counts())
=== 错误类型分布 === error_type Correct 2207 False Positive 146 False Negative 113 Name: count, dtype: int64
# 9.3a Feature profile of each prediction outcome group
print("\n=== False Negative vs True Positive 特征均值对比 ===")
analysis_cols = ['PageValues', 'BounceRates', 'ExitRates',
                 'ProductRelated', 'ProductRelated_Duration']

# Split the error frame into the four confusion-matrix groups.
tp_samples = error_df[(error_df['y_true'] == 1) & (error_df['y_pred'] == 1)]
tn_samples = error_df[(error_df['y_true'] == 0) & (error_df['y_pred'] == 0)]
fn_samples = error_df[error_df['error_type'] == 'False Negative']
fp_samples = error_df[error_df['error_type'] == 'False Positive']

# One column of feature means per group, in the order listed here.
outcome_groups = {
    'True Positive (Mean)': tp_samples,
    'False Negative (Mean)': fn_samples,
    'True Negative (Mean)': tn_samples,
    'False Positive (Mean)': fp_samples,
}
error_analysis = pd.DataFrame({
    label: group[analysis_cols].mean() for label, group in outcome_groups.items()
})
display(error_analysis.round(4))
=== False Negative vs True Positive 特征均值对比 ===
| True Positive (Mean) | False Negative (Mean) | True Negative (Mean) | False Positive (Mean) | |
|---|---|---|---|---|
| PageValues | 34.2793 | 3.6043 | 0.4245 | 24.1529 |
| BounceRates | 0.0044 | 0.0110 | 0.0254 | 0.0075 |
| ExitRates | 0.0186 | 0.0293 | 0.0475 | 0.0241 |
| ProductRelated | 48.3048 | 60.0708 | 27.8839 | 49.1644 |
| ProductRelated_Duration | 1949.1567 | 2218.8107 | 1011.3520 | 2000.5623 |
# 9.3b False-negative rate broken down by month and by visitor type
def _false_negative_share(group):
    """Return the fraction of rows in *group* labelled 'False Negative'."""
    return (group['error_type'] == 'False Negative').mean()


print("\n=== 各月份误判率 (False Negative Rate among actual positives) ===")
# Restrict to rows whose true label is 1, so the share is a rate among actual positives.
actual_positive = error_df[error_df['y_true'] == 1]
monthly_fn_rate = (
    actual_positive.groupby('Month')
    .apply(_false_negative_share)
    .sort_values(ascending=False)
)
print(monthly_fn_rate.round(4))

print("\n=== 各访客类型误判率 ===")
visitor_fn_rate = (
    actual_positive.groupby('VisitorType')
    .apply(_false_negative_share)
    .sort_values(ascending=False)
)
print(visitor_fn_rate.round(4))

# Side-by-side bar charts of the two breakdowns.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_month, ax_visitor = axes
monthly_fn_rate.plot(kind='bar', ax=ax_month, color='salmon', edgecolor='black')
ax_month.set_title('False Negative Rate by Month')
ax_month.set_ylabel('FN Rate')
ax_month.tick_params(axis='x', rotation=45)
visitor_fn_rate.plot(kind='bar', ax=ax_visitor, color='lightblue', edgecolor='black')
ax_visitor.set_title('False Negative Rate by Visitor Type')
ax_visitor.set_ylabel('FN Rate')
plt.tight_layout()
plt.show()
=== 各月份误判率 (False Negative Rate among actual positives) === Month Aug 0.6000 Jul 0.5500 Nov 0.3678 Sep 0.3636 Oct 0.2353 Dec 0.2188 June 0.1667 May 0.1370 Mar 0.0909 Feb 0.0000 dtype: float64 === 各访客类型误判率 === VisitorType Returning_Visitor 0.3103 New_Visitor 0.2584 Other 0.0000 dtype: float64
# 9.4 Cost-sensitive business analysis
print("=== 成本敏感分析 ===")
# Assumed business cost model:
# - cost of sending a coupon: ¥5 per user
# - revenue from a successful conversion: ¥50 per user
# - opportunity cost of missing a real buyer (False Negative): ¥50
# - cost of a coupon sent in error (False Positive): ¥5
# NOTE(review): the coupon cost is only represented through cost_per_fp below;
# confirm whether true positives should also carry the ¥5 coupon cost.
cost_per_fp = 5 # coupon sent in error
gain_per_tp = 50 # successful conversion
loss_per_fn = 50 # missed customer
def calculate_business_value(y_true, y_pred):
    """Return confusion-matrix counts and the resulting net business value.

    Args:
        y_true: True binary labels (0 / 1).
        y_pred: Predicted binary labels (0 / 1).

    Returns:
        dict with the counts 'TP', 'FP', 'FN', 'TN' plus 'Total Gain'
        (TP * gain_per_tp), 'Total Cost' (FP * cost_per_fp + FN * loss_per_fn)
        and 'Net Business Value' (gain minus cost). The unit values are read
        from the module-level gain_per_tp, cost_per_fp and loss_per_fn.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
    # the inputs (e.g. an extreme threshold that predicts a single class);
    # without it the four-way unpacking below would fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    total_gain = tp * gain_per_tp
    total_cost = fp * cost_per_fp + fn * loss_per_fn
    net_value = total_gain - total_cost
    return {
        'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
        'Total Gain': total_gain,
        'Total Cost': total_cost,
        'Net Business Value': net_value
    }
# Compare the business value obtained at several decision thresholds.
# The candidates are de-duplicated and sorted so that the table rows and the
# line plot run in increasing threshold order, and so that each threshold is a
# unique index label (the .loc lookup at the end then returns a scalar).
thresholds_to_compare = sorted({0.3, 0.4, best_threshold, 0.5, 0.6, 0.7})
business_results = []
for thr in thresholds_to_compare:
    y_pred_thr = (y_proba_final >= thr).astype(int)
    bv = calculate_business_value(y_test, y_pred_thr)
    bv['Threshold'] = thr
    business_results.append(bv)
bv_df = pd.DataFrame(business_results).set_index('Threshold')
display(bv_df)

# Plot net business value against the threshold; the dashed vertical line
# marks best_threshold.
plt.figure(figsize=(8, 5))
plt.plot(bv_df.index, bv_df['Net Business Value'], 'o-', linewidth=2,
         markersize=8, color='green')
plt.axvline(x=best_threshold, color='red', linestyle='--',
            label=f'F1-Optimal Threshold = {best_threshold:.3f}')
plt.xlabel('Decision Threshold')
plt.ylabel('Net Business Value (¥)')
plt.title('Business Value at Different Thresholds')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Threshold with the highest net business value among the candidates.
best_bv_threshold = bv_df['Net Business Value'].idxmax()
print(f"\n>>> 商业价值最大化阈值: {best_bv_threshold}")
print(f" 净商业价值: ¥{bv_df.loc[best_bv_threshold, 'Net Business Value']}")
=== 成本敏感分析 ===
| TP | FP | FN | TN | Total Gain | Total Cost | Net Business Value | |
|---|---|---|---|---|---|---|---|
| Threshold | |||||||
| 0.300000 | 311 | 261 | 71 | 1823 | 15550 | 4855 | 10695 |
| 0.400000 | 287 | 194 | 95 | 1890 | 14350 | 5720 | 8630 |
| 0.588857 | 269 | 146 | 113 | 1938 | 13450 | 6380 | 7070 |
| 0.500000 | 278 | 167 | 104 | 1917 | 13900 | 6035 | 7865 |
| 0.600000 | 264 | 144 | 118 | 1940 | 13200 | 6620 | 6580 |
| 0.700000 | 237 | 108 | 145 | 1976 | 11850 | 7790 | 4060 |
>>> 商业价值最大化阈值: 0.3
净商业价值: ¥10695
10. 客户分群聚类分析¶
使用 肘部法则 (Elbow Method) 和 轮廓系数 (Silhouette Score) 确定最优聚类数, 进行 K-Means 聚类,使用 PCA 可视化,并做 聚类画像 (Cluster Profiling) 和命名。
# 10.1 Select the clustering features and standardize them
cluster_features = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
]
X_cluster = df.loc[:, cluster_features].copy()
# Fit the scaler on the selected columns, then transform those same columns.
scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit(X_cluster).transform(X_cluster)
print(f"聚类数据形状: {X_cluster_scaled.shape}")
聚类数据形状: (12330, 10)
# 10.2 Choose K with the elbow method and the silhouette score
k_range = range(2, 9)
inertias, sil_scores = [], []
for k in k_range:
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_temp = kmeans_temp.fit_predict(X_cluster_scaled)
    inertia_k = kmeans_temp.inertia_
    silhouette_k = silhouette_score(X_cluster_scaled, labels_temp)
    inertias.append(inertia_k)
    sil_scores.append(silhouette_k)
    print(f"K={k}: Inertia={inertia_k:.0f}, Silhouette={silhouette_k:.4f}")

# K with the highest silhouette score.
best_k = k_range[int(np.argmax(sil_scores))]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
elbow_ax, sil_ax = axes
elbow_ax.plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)')
elbow_ax.set_title('Elbow Method')
sil_ax.plot(k_range, sil_scores, 'ro-', linewidth=2, markersize=8)
sil_ax.set_xlabel('Number of Clusters (k)')
sil_ax.set_ylabel('Silhouette Score')
sil_ax.set_title('Silhouette Score Analysis')
# Mark the selected K on the silhouette panel.
sil_ax.axvline(x=best_k, color='green', linestyle='--',
               label=f'Best k = {best_k}')
sil_ax.legend()
plt.tight_layout()
plt.show()
print(f"\n>>> 基于轮廓系数的最优 K = {best_k}")
K=2: Inertia=99672, Silhouette=0.4359 K=3: Inertia=80117, Silhouette=0.4517 K=4: Inertia=70377, Silhouette=0.4304 K=5: Inertia=62615, Silhouette=0.3981 K=6: Inertia=55748, Silhouette=0.4076 K=7: Inertia=49918, Silhouette=0.4079 K=8: Inertia=46254, Silhouette=0.3727
>>> 基于轮廓系数的最优 K = 3
# 10.3 Run the final K-Means with the selected K
kmeans_final = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster_labels = kmeans_final.fit_predict(X_cluster_scaled)
# Attach the labels to a copy of the data so df itself stays unchanged.
df_clustered = df.assign(Cluster=cluster_labels)

print("=== 各聚类簇的样本数量 ===")
cluster_sizes = df_clustered['Cluster'].value_counts().sort_index()
print(cluster_sizes)

print("\n=== 各聚类簇的购买转化率 ===")
conversion_by_cluster = df_clustered.groupby('Cluster')['Revenue'].mean()
print(conversion_by_cluster.round(4))
=== 各聚类簇的样本数量 === Cluster 0 9652 1 1628 2 1050 Name: count, dtype: int64 === 各聚类簇的购买转化率 === Cluster 0 0.1502 1 0.2776 2 0.0057 Name: Revenue, dtype: float64
# 10.4 Visualize the clusters in a 2-component PCA projection
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster_scaled)
print(f"PCA 解释方差比: {pca.explained_variance_ratio_.round(4)}")
print(f"PCA 累计解释方差: {pca.explained_variance_ratio_.sum():.4f}")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Left panel: coloured by cluster label. Right panel: coloured by the actual
# Revenue flag, for comparison.
panel_specs = [
    (axes[0], cluster_labels, 'viridis',
     f'K-Means Clustering (K={best_k}) - PCA Projection', 'Cluster'),
    (axes[1], df['Revenue'].astype(int), 'coolwarm',
     'Actual Revenue Labels - PCA Projection', 'Revenue'),
]
for panel, colour_values, cmap_name, panel_title, bar_label in panel_specs:
    points = panel.scatter(X_pca[:, 0], X_pca[:, 1],
                           c=colour_values, cmap=cmap_name,
                           alpha=0.5, s=15)
    panel.set_xlabel('PCA Component 1')
    panel.set_ylabel('PCA Component 2')
    panel.set_title(panel_title)
    plt.colorbar(points, ax=panel, label=bar_label)
plt.tight_layout()
plt.show()
PCA 解释方差比: [0.34 0.1675] PCA 累计解释方差: 0.5076
# 10.5 Cluster profiling
print("=== 详细聚类画像 (各簇特征均值) ===")
profile_features = cluster_features + ['Revenue']
grouped_profile = df_clustered.groupby('Cluster')[profile_features]
cluster_profile = grouped_profile.agg(['mean', 'median'])
display(cluster_profile.round(4))
# Simplified profile: per-cluster means only.
profile_mean = grouped_profile.mean()
display(profile_mean.round(4))

# 10.5a Cluster profile heatmap
# Colours use a per-column min-max scaling; a column with no spread is set to 0.
# The cell annotations show the actual (unscaled) means.
profile_norm = profile_mean.copy()
for col in profile_norm.columns:
    lo, hi = profile_norm[col].min(), profile_norm[col].max()
    profile_norm[col] = (profile_norm[col] - lo) / (hi - lo) if hi > lo else 0

fig, ax = plt.subplots(figsize=(14, best_k + 2))
sns.heatmap(profile_norm, annot=profile_mean.round(2).values,
            cmap='YlOrRd', fmt='', linewidths=1, ax=ax)
ax.set_title('Cluster Profile Heatmap (Normalized, Values = Actual Means)')
ax.set_ylabel('Cluster')
plt.tight_layout()
plt.show()
=== 详细聚类画像 (各簇特征均值) ===
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ... | BounceRates | ExitRates | PageValues | SpecialDay | Revenue | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | median | mean | median | mean | median | mean | median | mean | median | ... | mean | median | mean | median | mean | median | mean | median | mean | median | |
| Cluster | |||||||||||||||||||||
| 0 | 1.7090 | 1.0 | 51.1829 | 4.0000 | 0.2165 | 0.0 | 8.1459 | 0.0000 | 23.8882 | 17.0 | ... | 0.0092 | 0.0000 | 0.0321 | 0.0250 | 5.8774 | 0.0000 | 0.0617 | 0.0 | 0.1502 | 0.0 |
| 1 | 7.3710 | 7.0 | 307.7312 | 197.9757 | 2.5203 | 2.0 | 212.7327 | 93.2083 | 96.7439 | 75.0 | ... | 0.0063 | 0.0041 | 0.0194 | 0.0175 | 9.7577 | 1.8372 | 0.0301 | 0.0 | 0.2776 | 0.0 |
| 2 | 0.0486 | 0.0 | 1.4189 | 0.0000 | 0.0152 | 0.0 | 0.0869 | 0.0000 | 3.0295 | 1.0 | ... | 0.1665 | 0.2000 | 0.1805 | 0.2000 | 0.0000 | 0.0000 | 0.1074 | 0.0 | 0.0057 | 0.0 |
3 rows × 22 columns
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Cluster | |||||||||||
| 0 | 1.7090 | 51.1829 | 0.2165 | 8.1459 | 23.8882 | 873.8026 | 0.0092 | 0.0321 | 5.8774 | 0.0617 | 0.1502 |
| 1 | 7.3710 | 307.7312 | 2.5203 | 212.7327 | 96.7439 | 3831.1453 | 0.0063 | 0.0194 | 9.7577 | 0.0301 | 0.2776 |
| 2 | 0.0486 | 1.4189 | 0.0152 | 0.0869 | 3.0295 | 57.3080 | 0.1665 | 0.1805 | 0.0000 | 0.1074 | 0.0057 |
# 10.5b Cluster radar (spider) chart
from matplotlib.patches import FancyBboxPatch
radar_features = ['Administrative', 'ProductRelated', 'BounceRates',
                  'ExitRates', 'PageValues', 'Revenue']

# Min-max scale each feature to [0, 1] across clusters; a column with no
# spread is set to 0.
radar_data = profile_mean[radar_features].copy()
for col in radar_data.columns:
    lo, hi = radar_data[col].min(), radar_data[col].max()
    radar_data[col] = (radar_data[col] - lo) / (hi - lo) if hi > lo else 0

# One angle per feature; the first angle is repeated so each polygon closes.
spoke_angles = np.linspace(0, 2 * np.pi, len(radar_features), endpoint=False).tolist()
angles = spoke_angles + spoke_angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
colors = plt.cm.Set2(np.linspace(0, 1, best_k))
for i, colour in zip(range(best_k), colors):
    row_values = radar_data.iloc[i].tolist()
    values = row_values + row_values[:1]
    ax.plot(angles, values, 'o-', linewidth=2, label=f'Cluster {i}', color=colour)
    ax.fill(angles, values, alpha=0.15, color=colour)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(radar_features, fontsize=10)
ax.set_title('Cluster Profile Radar Chart', fontsize=14, pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.tight_layout()
plt.show()
# 10.6 Cluster naming and business interpretation
def _segment_name_line(conv_rate, page_val, bounce, exit_r, prod_pages):
    """Return the naming line for a cluster; the first matching rule wins."""
    if conv_rate > 0.20 and page_val > 10:
        return " >>> 命名: High-Intent Purchasers (高意向购买者)"
    if bounce > 0.03 or exit_r > 0.05:
        return " >>> 命名: Low-Engagement Bouncers (低参与度跳出用户)"
    if prod_pages > 30:
        return " >>> 命名: Active Browsers (活跃浏览用户)"
    return " >>> 命名: Casual Visitors (普通访客)"


print("=== 聚类命名与业务解读 ===\n")
# Print a short description of every cluster from its mean profile.
total_users = len(df_clustered)
for cluster_id in range(best_k):
    p = profile_mean.loc[cluster_id]
    conv_rate = p['Revenue']
    page_val = p['PageValues']
    bounce = p['BounceRates']
    exit_r = p['ExitRates']
    prod_pages = p['ProductRelated']
    prod_dur = p['ProductRelated_Duration']
    count = (df_clustered['Cluster'] == cluster_id).sum()
    print(f"--- Cluster {cluster_id} ({count} users, {count/total_users*100:.1f}%) ---")
    print(f" Conversion Rate: {conv_rate:.2%}")
    print(f" Avg PageValues: {page_val:.2f}")
    print(f" Avg BounceRate: {bounce:.4f}")
    print(f" Avg ExitRate: {exit_r:.4f}")
    print(f" Avg ProductRelated Pages: {prod_pages:.1f}")
    print(f" Avg ProductRelated Duration: {prod_dur:.1f}s")
    # Rule-based automatic naming.
    print(_segment_name_line(conv_rate, page_val, bounce, exit_r, prod_pages))
    print()
=== 聚类命名与业务解读 === --- Cluster 0 (9652 users, 78.3%) --- Conversion Rate: 15.02% Avg PageValues: 5.88 Avg BounceRate: 0.0092 Avg ExitRate: 0.0321 Avg ProductRelated Pages: 23.9 Avg ProductRelated Duration: 873.8s >>> 命名: Casual Visitors (普通访客) --- Cluster 1 (1628 users, 13.2%) --- Conversion Rate: 27.76% Avg PageValues: 9.76 Avg BounceRate: 0.0063 Avg ExitRate: 0.0194 Avg ProductRelated Pages: 96.7 Avg ProductRelated Duration: 3831.1s >>> 命名: Active Browsers (活跃浏览用户) --- Cluster 2 (1050 users, 8.5%) --- Conversion Rate: 0.57% Avg PageValues: 0.00 Avg BounceRate: 0.1665 Avg ExitRate: 0.1805 Avg ProductRelated Pages: 3.0 Avg ProductRelated Duration: 57.3s >>> 命名: Low-Engagement Bouncers (低参与度跳出用户)
11. 特征工程效果验证¶
对比"原始特征"与"特征工程后"的模型性能,验证特征工程的价值。
# 11.1 Build the dataset variant without feature engineering
df_no_fe = df.copy()
for flag_col in ('Revenue', 'Weekend'):
    df_no_fe[flag_col] = df_no_fe[flag_col].astype(int)
df_no_fe = pd.get_dummies(df_no_fe, columns=categorical_cols, drop_first=True)
X_no_fe = df_no_fe.drop(columns='Revenue')
y_no_fe = df_no_fe['Revenue']
X_train_no_fe, X_test_no_fe, y_train_no_fe, y_test_no_fe = train_test_split(
    X_no_fe, y_no_fe, test_size=0.2, random_state=42, stratify=y_no_fe
)


def _make_smote_rf_pipeline():
    """Build a SMOTE + RandomForest pipeline with the settings shared by both runs."""
    return ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', RandomForestClassifier(random_state=42, n_estimators=200)),
    ])


def _mean_pm_std(row, metric):
    """Format one metric of a CV result row as 'mean ± std' with 4 decimals."""
    mean_key, std_key = f'{metric}_mean', f'{metric}_std'
    return f"{row[mean_key]:.4f} ± {row[std_key]:.4f}"


# The same model configuration is used for both feature sets.
pipe_rf_no_fe = _make_smote_rf_pipeline()
pipe_rf_with_fe = _make_smote_rf_pipeline()

# Cross-validate both variants.
print("=== 无特征工程 ===")
result_no_fe = run_cv(pipe_rf_no_fe, X_train_no_fe, y_train_no_fe, cv, "RF (No FE)")
print("=== 有特征工程 ===")
result_with_fe = run_cv(pipe_rf_with_fe, X_train, y_train, cv, "RF (With FE)")

fe_compare = pd.DataFrame([result_no_fe, result_with_fe]).set_index('Model')
display_fe = {
    metric: fe_compare.apply(_mean_pm_std, axis=1, args=(metric,))
    for metric in scoring_metrics
}
print("\n=== 特征工程前后对比 ===")
display(pd.DataFrame(display_fe, index=fe_compare.index))
=== 无特征工程 === === 有特征工程 === === 特征工程前后对比 ===
| accuracy | precision | recall | f1 | roc_auc | |
|---|---|---|---|---|---|
| Model | |||||
| RF (No FE) | 0.8969 ± 0.0056 | 0.6556 ± 0.0243 | 0.7064 ± 0.0158 | 0.6796 ± 0.0116 | 0.9240 ± 0.0030 |
| RF (With FE) | 0.8937 ± 0.0040 | 0.6338 ± 0.0134 | 0.7412 ± 0.0118 | 0.6832 ± 0.0100 | 0.9229 ± 0.0033 |
12. 结论与部署建议 (Deployment Recommendations)¶
核心结论¶
1. 最重要的特征 (Model Interpretability)
通过 SHAP 分析、Permutation Importance 和 PDP 三重验证发现:
- PageValues(页面价值) 是影响购买意向的最核心正向指标
- ExitRates/BounceRates 高则强烈暗示用户不会购买
- Month_Nov(11月份) 等季节性特征对结果影响显著
- 特征工程构造的 TotalDuration、ProductPageRatio 等特征提供了额外的预测信息
2. 模型表现 (Model Performance)
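- 集成树模型 (Random Forest, XGBoost, LightGBM) 在 ROC-AUC 和 Average Precision 上全面优于 Logistic Regression 和 Decision Tree;在各自最优阈值下的 F1-score 上,Decision Tree (0.6714) 与集成模型 (0.6701–0.6750) 基本持平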
- 经 RandomizedSearchCV 调参后模型性能进一步提升
- SMOTE 和 class_weight 策略各有优势,但通过交叉验证确认了各模型的最优策略
- 阈值优化从默认 0.5 调整后,显著提升了 F1-score
3. 模型可靠性 (Model Reliability)
- 5-fold 交叉验证确认模型结果稳健,非偶然
- 校准曲线显示树模型概率输出基本可靠
- 学习曲线表明模型无严重过拟合或欠拟合
- 错误分析揭示了模型在特定月份/用户类型上的薄弱环节
4. 用户分层 (Customer Segmentation)
- 通过 Elbow + Silhouette Score 确定了最优聚类数
- 聚类画像成功识别出不同参与度的用户群体
- 高活跃度/高 PageValues 的簇具有显著更高的购买转化率
业务部署建议¶
实时干预系统: 基于最优树模型建立实时预测 API,使用成本敏感分析确定的最优阈值进行决策。当预测概率处于犹豫区间时,触发客服弹窗或限时优惠券以促成转化。
页面优化: 深度研究高 PageValues 页面的设计特征,降低全局 ExitRates 和 BounceRates。
精准营销: 利用聚类结果对用户分层,针对不同群体采取差异化营销策略。对 High-Intent 群体推送高价值商品推荐,对 Low-Engagement 群体尝试重定向广告。
季节性策略: 在 11 月等高峰期前提前部署资源和广告预算。关注错误分析中发现的高误判月份,在这些时段适当调低阈值以减少漏判。
# Final summary banner with the recommended model, thresholds and cluster count.
banner = "=" * 60
print(banner)
print("分析完成!")
print(banner)
summary_lines = (
    f"最终推荐模型: {overall_best_name}",
    f"推荐决策阈值 (F1最优): {final_threshold:.4f}",
    f"推荐决策阈值 (商业价值最优): {best_bv_threshold}",
    f"最优聚类数: {best_k}",
)
for line in summary_lines:
    print(line)
============================================================ 分析完成! ============================================================ 最终推荐模型: XGBoost (Tuned) 推荐决策阈值 (F1最优): 0.5889 推荐决策阈值 (商业价值最优): 0.3 最优聚类数: 3