File size: 6,073 Bytes
027ce51 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# ==============================================================================
# Final script: final_ensemble_stacking.py (meta-model upgrade)
#
# Strategy: combine the predicted probabilities (meta-features) of Model A and
# Model B, and use AdaBoostClassifier (from the DM-01 slides) as the final
# classifier.
#
# Goal: improve Precision for the Fake (1) class.
# ==============================================================================
import pandas as pd
import numpy as np
import torch
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier # 引入 AdaBoost
from sklearn.tree import DecisionTreeClassifier # AdaBoost 通常使用决策树作为基分类器
from sklearn.metrics import classification_report, f1_score
from datasets import Dataset
from transformers import (
DebertaV2Tokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer
)
import os
import time
import warnings
# --- 1. Define paths and constants ---
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoid tokenizer fork warnings under Trainer
TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv"
VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv"
# Artifact paths produced by earlier training scripts (Model A = TF-IDF pipeline,
# Model B = fine-tuned DeBERTa checkpoint).
MODEL_A_DIR = "./final_model_tfidf"
VECTORIZER_PATH = os.path.join(MODEL_A_DIR, "vectorizer.joblib")
MODEL_A_PATH = os.path.join(MODEL_A_DIR, "model_A.joblib")
DEBERTA_MODEL_PATH = "./final_model_deberta_weighted"
# --- 2. Load train/validation CSVs and encode the string labels as ints ---
print("--- 方案三:集成学习 (Stacking) ---")
print("加载数据...")
train_df = pd.read_csv(TRAIN_FILE_PATH)
eval_df = pd.read_csv(VALID_FILE_PATH)
# "real" -> 0, "fake" -> 1 (fake is the positive class throughout)
label_map = {"real": 0, "fake": 1}
for _df in (train_df, eval_df):
    _df['label'] = _df['label'].map(label_map)
X_train_text, y_train = train_df['text'], train_df['label']
# y_eval holds the ground-truth labels used for the final evaluation
X_eval_text, y_eval = eval_df['text'], eval_df['label']
# --- 3. Model A (TF-IDF pipeline): load artifacts and compute P(fake) ---
print(f"\n--- 1/3:加载 Model A ({MODEL_A_DIR}) 并获取预测概率 ---")
start_time = time.time()
try:
    vectorizer = joblib.load(VECTORIZER_PATH)
    model_A = joblib.load(MODEL_A_PATH)
except FileNotFoundError:
    print(f"!!!错误:找不到 Model A 的文件 {MODEL_A_DIR} !!!")
    # Bug fix: bare exit() terminates with status 0 (success) even though
    # loading failed; exit with a non-zero status so callers see the failure.
    raise SystemExit(1)
X_train_tfidf = vectorizer.transform(X_train_text)
X_eval_tfidf = vectorizer.transform(X_eval_text)
# Column 1 of predict_proba is P(class==1) = P(fake); these probabilities
# become the stacking meta-features for Model A.
probs_A_train = model_A.predict_proba(X_train_tfidf)[:, 1]
probs_A_eval = model_A.predict_proba(X_eval_tfidf)[:, 1]
print(f"Model A 运行完毕. 耗时: {time.time() - start_time:.2f} 秒")
# --- 4. Model B (fine-tuned DeBERTa): load model + tokenizer ---
print(f"\n--- 2/3:加载 Model B ({DEBERTA_MODEL_PATH}) 并获取预测概率 ---")
start_time = time.time()
try:
    model_B = AutoModelForSequenceClassification.from_pretrained(DEBERTA_MODEL_PATH)
    tokenizer_B = DebertaV2Tokenizer.from_pretrained(DEBERTA_MODEL_PATH)
except Exception as e:
    print(f"!!!错误:无法加载 Model B。详情: {e} !!!")
    # Bug fix: bare exit() would return status 0 (success) on this failure.
    raise SystemExit(1)
# Inference-only Trainer: no training happens, so only the eval batch size
# and fp16 flag matter; output_dir is a throwaway scratch directory.
trainer_B = Trainer(
    model=model_B,
    args=TrainingArguments(output_dir="./tmp_ensemble_eval", per_device_eval_batch_size=32, eval_strategy="no", fp16=True)
)
def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating every text to 512 tokens."""
    tokenize_kwargs = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer_B(examples["text"], **tokenize_kwargs)
# Build HF datasets, tokenize both splits, and drop the raw columns so only
# model inputs (input_ids / attention_mask) remain.
train_dataset_hf = Dataset.from_pandas(train_df)
eval_dataset_hf = Dataset.from_pandas(eval_df)
raw_columns = ["id", "text", "label"]
tokenized_train_dataset = train_dataset_hf.map(tokenize_function, batched=True, num_proc=4).remove_columns(raw_columns)
tokenized_eval_dataset = eval_dataset_hf.map(tokenize_function, batched=True, num_proc=4).remove_columns(raw_columns)
# Run inference and convert logits to P(class==1) = P(fake) via softmax.
softmax = torch.nn.Softmax(dim=1)
predictions_B_train = trainer_B.predict(tokenized_train_dataset)
logits_B_train = predictions_B_train.predictions
probs_B_train = softmax(torch.from_numpy(logits_B_train))[:, 1].numpy()
predictions_B_eval = trainer_B.predict(tokenized_eval_dataset)
logits_B_eval = predictions_B_eval.predictions
probs_B_eval = softmax(torch.from_numpy(logits_B_eval))[:, 1].numpy()
print(f"Model B 运行完毕. 耗时: {time.time() - start_time:.2f} 秒")
# --- 5. Train the meta-model (AdaBoost) on the two base-model probabilities ---
print("\n--- 3/3:训练元模型 (AdaBoost Classifier) ---")
# Meta-features: one column per base model, each holding that model's P(fake).
X_train_meta = np.column_stack((probs_A_train, probs_B_train))
X_eval_meta = np.column_stack((probs_A_eval, probs_B_eval))
print(f"元训练集维度: {X_train_meta.shape}")
# (Redundant local re-imports of AdaBoostClassifier/DecisionTreeClassifier
# removed — both are already imported at the top of the file.)
# Weak learner: a depth-1 decision stump, the classic AdaBoost base estimator.
base_learner = DecisionTreeClassifier(max_depth=1, random_state=42)
# sklearn >= 1.2 renamed the `base_estimator` parameter to `estimator`.
meta_model = AdaBoostClassifier(
    estimator=base_learner,
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)
# NOTE: AdaBoost has no class_weight='balanced'; it compensates for class
# imbalance by iteratively reweighting misclassified samples, so we fit
# directly on the (possibly imbalanced) meta training set.
meta_model.fit(X_train_meta, y_train)
print("元模型 (AdaBoost) 训练完毕!")
# --- 6. Final evaluation of the stacked ensemble on the validation set ---
print("\n--- 最终评估:在【验证集】上评估“集成模型”的性能 ---")
preds_meta = meta_model.predict(X_eval_meta)
# Per-class precision/recall/F1 — the stated goal is higher precision on fake(1).
report = classification_report(
    y_eval,
    preds_meta,
    target_names=['real (0)', 'fake (1)'],
    digits=4,
    zero_division=0
)
print(report)
f1_ensemble = f1_score(y_eval, preds_meta, pos_label=1)
print(f"--- 最终集成 F1 分数 (Fake): {f1_ensemble:.4f} ---")
# Bug fix: removed a stray trailing "|" after this print call, which was a
# syntax error in the source as given.
print("--- 脚本运行结束 ---")