File size: 6,333 Bytes
027ce51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# ==============================================================================
# 脚本 2: final_stacking_V2.py (增强版 Stacking 完整代码)
#
# 目的:运行增强版 Stacking,将 Model A/B 概率与 5 个人工特征结合。
# 核心: 元特征 = [Prob_A, Prob_B, VADER Scores, Lengthening, Punctuation]
# ==============================================================================

import pandas as pd
import numpy as np
import torch
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from datasets import Dataset
from transformers import (
    # 明确指定 DebertaV2Tokenizer 解决兼容性问题
    DebertaV2Tokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import os
import time
import warnings

# --- 1. Paths and constants ---
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Data with the engineered features appended (output of script 1,
# extract_features.py).
TRAIN_FILE_PATH = "train_features.csv" 
VALID_FILE_PATH = "val_features.csv"   

# Dependency paths: sources of the base models' probabilities. Assumed to have
# been copied into the current (method2) directory.
MODEL_A_DIR = "final_model_tfidf"
VECTORIZER_PATH = os.path.join(MODEL_A_DIR, "vectorizer.joblib")
MODEL_A_PATH = os.path.join(MODEL_A_DIR, "model_A.joblib")
DEBERTA_MODEL_PATH = "final_model_deberta_weighted" 

# The 5 newly added hand-crafted feature columns (3 VADER sentiment scores,
# character-lengthening ratio, extreme-punctuation indicator).
NEW_FEATURES = [
    'vader_compound_score', 'vader_neg_score', 'vader_pos_score', 
    'lengthening_ratio', 'extreme_punctuation'
]


# --- 2. Load the feature-augmented data ---
print("--- 方案二:增强版集成学习 ---")
print("加载增强特征后的数据...")
train_df = pd.read_csv(TRAIN_FILE_PATH)
eval_df = pd.read_csv(VALID_FILE_PATH)

# Labels are assumed to already be numeric (extract_features.py handled the
# mapping upstream) — NOTE(review): confirm against that script's output.
y_train = train_df['label']
y_eval = eval_df['label'] 
X_train_text = train_df['text']
X_eval_text = eval_df['text']


# (Core step) Extract the hand-crafted feature columns as a numeric matrix;
# they become part of the meta-features alongside the base-model probabilities.
X_train_new_features = train_df[NEW_FEATURES].values
X_eval_new_features = eval_df[NEW_FEATURES].values

print(f"已提取 {len(NEW_FEATURES)} 个人工特征。")

# --- 3: Obtain Model A's predicted probabilities (meta-feature 1) ---
# Model A is a TF-IDF pipeline persisted with joblib: a fitted vectorizer plus
# a fitted classifier exposing predict_proba.
print(f"\n--- 1/4:加载 Model A ({MODEL_A_DIR}) 并获取预测概率 ---")
try:
    tfidf_vectorizer = joblib.load(VECTORIZER_PATH)
    tfidf_classifier = joblib.load(MODEL_A_PATH)
except FileNotFoundError:
    print("!!!错误:Model A 文件缺失。请将 Model A 文件夹复制到当前目录。!!!")
    exit()

# Positive-class (index 1) probability for each split; the transform and the
# prediction are chained since the TF-IDF matrices are not needed elsewhere.
probs_A_train = tfidf_classifier.predict_proba(tfidf_vectorizer.transform(X_train_text))[:, 1]
probs_A_eval = tfidf_classifier.predict_proba(tfidf_vectorizer.transform(X_eval_text))[:, 1]

# --- 4: Obtain Model B's (DeBERTa) predicted probabilities (meta-feature 2) ---
print(f"\n--- 2/4:加载 Model B ({DEBERTA_MODEL_PATH}) 并获取预测概率 ---")
try:
    model_B = AutoModelForSequenceClassification.from_pretrained(DEBERTA_MODEL_PATH)
    # DebertaV2Tokenizer is named explicitly (rather than AutoTokenizer) to
    # sidestep a tokenizer-compatibility issue, per the import-section note.
    tokenizer_B = DebertaV2Tokenizer.from_pretrained(DEBERTA_MODEL_PATH)
except Exception:
    print("!!!错误:Model B 文件缺失或加载失败。请确保 Model B 文件夹已复制且依赖已安装。!!!")
    exit()

# A bare Trainer used purely for batched inference — no training happens
# (eval_strategy="no"); fp16=True assumes a CUDA device is available.
trainer_B = Trainer(
    model=model_B,
    args=TrainingArguments(output_dir="./tmp_ensemble_eval", per_device_eval_batch_size=32, eval_strategy="no", fp16=True)
)

def tokenize_function(examples):
    # Fixed-length padding/truncation to 512 tokens for every example.
    return tokenizer_B(examples["text"], padding="max_length", truncation=True, max_length=512)

# Build HF datasets from the dataframes (the original 'text' column is needed).
train_dataset_hf = Dataset.from_pandas(train_df)
eval_dataset_hf = Dataset.from_pandas(eval_df)

# Drop every column except 'text' and 'label' before tokenizing for prediction.
cols_to_remove = [col for col in train_df.columns if col not in ['text', 'label']]
tokenized_train_dataset = train_dataset_hf.remove_columns(cols_to_remove).map(tokenize_function, batched=True, num_proc=4)
tokenized_eval_dataset = eval_dataset_hf.remove_columns(cols_to_remove).map(tokenize_function, batched=True, num_proc=4)

# Predict on both splits: softmax over the raw logits, keeping the
# positive-class (index 1) probability as the meta-feature.
predictions_B_train = trainer_B.predict(tokenized_train_dataset.remove_columns(["text", "label"]))
probs_B_train = torch.nn.Softmax(dim=1)(torch.from_numpy(predictions_B_train.predictions))[:, 1].numpy()

predictions_B_eval = trainer_B.predict(tokenized_eval_dataset.remove_columns(["text", "label"]))
probs_B_eval = torch.nn.Softmax(dim=1)(torch.from_numpy(predictions_B_eval.predictions))[:, 1].numpy()

# --- 5: Train the enhanced meta-model ---
print("\n--- 3/4:创建增强版元特征并训练 Meta-Model ---")

# (Core step) Assemble the meta-feature matrix: Model A probability + Model B
# probability + the 5 hand-crafted features = 7 meta-features per sample.
stacked_train = np.hstack(
    (probs_A_train.reshape(-1, 1), probs_B_train.reshape(-1, 1), X_train_new_features)
)
X_eval_meta_V2 = np.hstack(
    (probs_A_eval.reshape(-1, 1), probs_B_eval.reshape(-1, 1), X_eval_new_features)
)

print(f"增强版元特征维度: {X_eval_meta_V2.shape} (包含 2 个概率 + 5 个人工特征 = 7 个特征)")

# Meta-model: logistic regression (as in V1) with balanced class weights for
# higher recall potential — this time it has the richer feature set to work with.
meta_model_V2 = LogisticRegression(class_weight='balanced', random_state=42, max_iter=500)
meta_model_V2.fit(stacked_train, y_train)

print("增强版元模型 (LR) 训练完毕!")


# --- 6: Final evaluation with threshold tuning (keep Thr=0.65) ---
print("\n--- 4/4:最终评估:阈值调优 (使用 Thr=0.65) ---")

# Decision threshold carried over from the V3 experiments, where it performed
# slightly better than the default 0.5.
BEST_THRESHOLD = 0.65

# Positive-class probabilities from the meta-model, binarised at the threshold.
fake_probabilities = meta_model_V2.predict_proba(X_eval_meta_V2)[:, 1]
final_predictions = np.where(fake_probabilities >= BEST_THRESHOLD, 1, 0)

# Detailed per-class report at the tuned threshold.
print(f"--- 评估阈值: {BEST_THRESHOLD:.2f} ---")
print(
    classification_report(
        y_eval,
        final_predictions,
        target_names=['real (0)', 'fake (1)'],
        digits=4,
        zero_division=0,
    )
)

# Headline metric: F1 on the fake (positive) class.
final_f1 = f1_score(y_eval, final_predictions, pos_label=1)
print(f"--- 最终集成 F1 分数 (Fake): {final_f1:.4f} ---")
print("--- 脚本 final_stacking_V2.py 运行结束 ---")