# Extraction metadata (not part of the script): file size 6,333 bytes, commit 027ce51.
# ==============================================================================
# Script 2: final_stacking_V2.py (enhanced stacking, full code)
#
# Purpose: run the enhanced stacking ensemble, combining the Model A/B
#          probabilities with the 5 hand-crafted features.
# Core:    meta-features = [Prob_A, Prob_B, VADER scores, lengthening,
#          punctuation]
# ==============================================================================
import pandas as pd
import numpy as np
import torch
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from datasets import Dataset
from transformers import (
    # Explicitly pin DebertaV2Tokenizer to avoid auto-tokenizer compatibility issues
DebertaV2Tokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer
)
import os
import time
import warnings
# --- 1. Paths and constants ---
warnings.filterwarnings("ignore")
# Silence the HF tokenizers fork warning triggered by Dataset.map(num_proc > 1).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Feature-augmented data produced by script 1 (extract_features.py).
TRAIN_FILE_PATH = "train_features.csv"
VALID_FILE_PATH = "val_features.csv"
# Base-model artifacts (sources of the stacked probabilities), assumed to have
# been copied into the current (method2) directory.
MODEL_A_DIR = "final_model_tfidf"
VECTORIZER_PATH = os.path.join(MODEL_A_DIR, "vectorizer.joblib")
MODEL_A_PATH = os.path.join(MODEL_A_DIR, "model_A.joblib")
DEBERTA_MODEL_PATH = "final_model_deberta_weighted"
# The 5 hand-crafted feature columns appended to the meta-features.
NEW_FEATURES = [
    'vader_compound_score', 'vader_neg_score', 'vader_pos_score',
    'lengthening_ratio', 'extreme_punctuation'
]
# --- 2. Load the feature-augmented data ---
print("--- 方案二:增强版集成学习 ---")
print("加载增强特征后的数据...")
train_df = pd.read_csv(TRAIN_FILE_PATH)
eval_df = pd.read_csv(VALID_FILE_PATH)
# Labels are already numeric (mapped upstream by extract_features.py),
# so they are used as-is here.
y_train = train_df['label']
y_eval = eval_df['label']
X_train_text = train_df['text']
X_eval_text = eval_df['text']
# (Key step) Pull the hand-crafted columns out as part of the meta-features.
X_train_new_features = train_df[NEW_FEATURES].values
X_eval_new_features = eval_df[NEW_FEATURES].values
print(f"已提取 {len(NEW_FEATURES)} 个人工特征。")
# --- 3: Model A prediction probabilities (meta-feature 1) ---
print(f"\n--- 1/4:加载 Model A ({MODEL_A_DIR}) 并获取预测概率 ---")
try:
    vectorizer = joblib.load(VECTORIZER_PATH)
    model_A = joblib.load(MODEL_A_PATH)
except FileNotFoundError:
    print("!!!错误:Model A 文件缺失。请将 Model A 文件夹复制到当前目录。!!!")
    # Exit with a non-zero status so callers/CI can detect the failure
    # (the original bare exit() reported success with status 0).
    raise SystemExit(1)
# Vectorize with the already-fitted TF-IDF vectorizer (transform only; never
# re-fit on this data).
X_train_tfidf = vectorizer.transform(X_train_text)
X_eval_tfidf = vectorizer.transform(X_eval_text)
# Keep only P(class=1) — the positive ("fake") class — as the meta-feature.
probs_A_train = model_A.predict_proba(X_train_tfidf)[:, 1]
probs_A_eval = model_A.predict_proba(X_eval_tfidf)[:, 1]
# --- 4: Model B prediction probabilities (meta-feature 2) ---
print(f"\n--- 2/4:加载 Model B ({DEBERTA_MODEL_PATH}) 并获取预测概率 ---")
try:
    model_B = AutoModelForSequenceClassification.from_pretrained(DEBERTA_MODEL_PATH)
    tokenizer_B = DebertaV2Tokenizer.from_pretrained(DEBERTA_MODEL_PATH)
except Exception as err:
    print("!!!错误:Model B 文件缺失或加载失败。请确保 Model B 文件夹已复制且依赖已安装。!!!")
    # Surface the underlying cause instead of swallowing it, and exit non-zero.
    print(f"原因: {err}")
    raise SystemExit(1)
# Inference-only Trainer: only the eval batch size matters here.
# fp16 is valid only on CUDA — enabling it unconditionally crashes on
# CPU-only hosts, so gate it on GPU availability.
trainer_B = Trainer(
    model=model_B,
    args=TrainingArguments(
        output_dir="./tmp_ensemble_eval",
        per_device_eval_batch_size=32,
        eval_strategy="no",
        fp16=torch.cuda.is_available(),
    )
)
def tokenize_function(batch):
    """Tokenize a batch of texts for Model B as fixed-length 512-token inputs."""
    return tokenizer_B(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
# Build HF datasets from the dataframes (the original `text` column is needed).
train_dataset_hf = Dataset.from_pandas(train_df)
eval_dataset_hf = Dataset.from_pandas(eval_df)
# Drop everything except `text`/`label` before tokenizing so map() stays cheap.
cols_to_remove = [col for col in train_df.columns if col not in ['text', 'label']]
tokenized_train_dataset = train_dataset_hf.remove_columns(cols_to_remove).map(tokenize_function, batched=True, num_proc=4)
tokenized_eval_dataset = eval_dataset_hf.remove_columns(cols_to_remove).map(tokenize_function, batched=True, num_proc=4)

def _positive_probs(logits):
    """Convert raw 2-class logits (numpy array) to P(class=1) via softmax.

    Uses the functional torch.softmax instead of instantiating a
    torch.nn.Softmax module per call, and de-duplicates the conversion
    that was previously repeated for train and eval.
    """
    return torch.softmax(torch.from_numpy(logits), dim=1)[:, 1].numpy()

# Predict on the train and validation sets; strip non-model columns first.
predictions_B_train = trainer_B.predict(tokenized_train_dataset.remove_columns(["text", "label"]))
probs_B_train = _positive_probs(predictions_B_train.predictions)
predictions_B_eval = trainer_B.predict(tokenized_eval_dataset.remove_columns(["text", "label"]))
probs_B_eval = _positive_probs(predictions_B_eval.predictions)
# --- 5: Train the enhanced meta-model ---
print("\n--- 3/4:创建增强版元特征并训练 Meta-Model ---")
# (Key step) Stack all meta-features: Model A probability + Model B
# probability + the 5 hand-crafted features = 7 features in total.
X_train_meta_V2 = np.column_stack((probs_A_train, probs_B_train, X_train_new_features))
X_eval_meta_V2 = np.column_stack((probs_A_eval, probs_B_eval, X_eval_new_features))
print(f"增强版元特征维度: {X_eval_meta_V2.shape} (包含 2 个概率 + 5 个人工特征 = 7 个特征)")
# Meta-model: logistic regression with balanced class weights (favors recall
# on the minority class). Same model family as V1, but with the richer
# feature set to inform its decision.
meta_model_V2 = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=500
)
meta_model_V2.fit(X_train_meta_V2, y_train)
print("增强版元模型 (LR) 训练完毕!")
# --- 6: Final evaluation with threshold tuning (Thr=0.65) ---
print("\n--- 4/4:最终评估:阈值调优 (使用 Thr=0.65) ---")
# Predicted probability of the positive ("fake") class from the meta-model.
meta_scores_V2 = meta_model_V2.predict_proba(X_eval_meta_V2)[:, 1]
# 0.65 was the slightly better threshold found in the V3 experiments.
BEST_THRESHOLD = 0.65
# Binarize the scores with the tuned threshold.
preds_meta_tuned_V2 = (meta_scores_V2 >= BEST_THRESHOLD).astype(int)
# Detailed per-class report (label encoding assumed 0=real, 1=fake,
# per the target_names below — confirm against extract_features.py).
print(f"--- 评估阈值: {BEST_THRESHOLD:.2f} ---")
report = classification_report(
    y_eval,
    preds_meta_tuned_V2,
    target_names=['real (0)', 'fake (1)'],
    digits=4,
    zero_division=0
)
print(report)
# Headline metric: F1 on the positive ("fake") class.
f1_ensemble_V2 = f1_score(y_eval, preds_meta_tuned_V2, pos_label=1)
print(f"--- 最终集成 F1 分数 (Fake): {f1_ensemble_V2:.4f} ---")
print("--- 脚本 final_stacking_V2.py 运行结束 ---")