|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import torch |
|
|
import joblib |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.metrics import classification_report, f1_score |
|
|
from datasets import Dataset |
|
|
from transformers import ( |
|
|
|
|
|
DebertaV2Tokenizer, |
|
|
AutoModelForSequenceClassification, |
|
|
TrainingArguments, |
|
|
Trainer |
|
|
) |
|
|
import os |
|
|
import time |
|
|
import warnings |
|
|
|
|
|
|
|
|
# Keep console output readable: third-party libs (sklearn/transformers) are chatty.
warnings.filterwarnings("ignore")


# Disable HF tokenizers' internal thread parallelism — avoids the fork warning/
# deadlock risk when Dataset.map below runs with num_proc > 1.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
|
|
|
|
# Input CSVs produced by an upstream feature-engineering step; each must contain
# 'text', 'label' and every column listed in NEW_FEATURES below.
TRAIN_FILE_PATH = "train_features.csv"


VALID_FILE_PATH = "val_features.csv"


# Model A: a joblib-saved vectorizer + scikit-learn-style classifier
# (loaded and used via .transform / .predict_proba further down).
MODEL_A_DIR = "final_model_tfidf"


VECTORIZER_PATH = os.path.join(MODEL_A_DIR, "vectorizer.joblib")


MODEL_A_PATH = os.path.join(MODEL_A_DIR, "model_A.joblib")


# Model B: a fine-tuned DeBERTa checkpoint in HF from_pretrained() directory format.
DEBERTA_MODEL_PATH = "final_model_deberta_weighted"


# Hand-engineered per-sample features appended to the two base-model probabilities
# to form the meta-model input (VADER sentiment scores + stylistic signals).
NEW_FEATURES = [


    'vader_compound_score', 'vader_neg_score', 'vader_pos_score',


    'lengthening_ratio', 'extreme_punctuation'


]
|
|
|
|
|
|
|
|
|
|
|
print("--- 方案二:增强版集成学习 ---")


print("加载增强特征后的数据...")


# NOTE(review): pd.read_csv raises FileNotFoundError if these are absent —
# presumably an earlier feature-engineering script writes them; confirm run order.
train_df = pd.read_csv(TRAIN_FILE_PATH)


eval_df = pd.read_csv(VALID_FILE_PATH)


# Ground-truth labels (0 = real, 1 = fake per the report below) and raw text.
y_train = train_df['label']


y_eval = eval_df['label']


X_train_text = train_df['text']


X_eval_text = eval_df['text']


# Dense matrix of the engineered features; rows align with the text/labels above.
X_train_new_features = train_df[NEW_FEATURES].values


X_eval_new_features = eval_df[NEW_FEATURES].values


print(f"已提取 {len(NEW_FEATURES)} 个人工特征。")
|
|
|
|
|
|
|
|
print(f"\n--- 1/4:加载 Model A ({MODEL_A_DIR}) 并获取预测概率 ---")
try:
    # Load the fitted vectorizer and the base classifier saved by Model A's training run.
    vectorizer = joblib.load(VECTORIZER_PATH)
    model_A = joblib.load(MODEL_A_PATH)
except FileNotFoundError:
    print("!!!错误:Model A 文件缺失。请将 Model A 文件夹复制到当前目录。!!!")
    # Fix: bare exit() is the site-module REPL helper and exits with code None;
    # raising SystemExit(1) aborts the script with a proper non-zero status.
    raise SystemExit(1)

# transform (never fit_transform) here: reuse the vocabulary learned at training time.
X_train_tfidf = vectorizer.transform(X_train_text)
X_eval_tfidf = vectorizer.transform(X_eval_text)

# Column 1 of predict_proba is P(label == 1), i.e. the fake-class probability.
probs_A_train = model_A.predict_proba(X_train_tfidf)[:, 1]
probs_A_eval = model_A.predict_proba(X_eval_tfidf)[:, 1]
|
|
|
|
|
|
|
|
print(f"\n--- 2/4:加载 Model B ({DEBERTA_MODEL_PATH}) 并获取预测概率 ---")
try:
    model_B = AutoModelForSequenceClassification.from_pretrained(DEBERTA_MODEL_PATH)
    tokenizer_B = DebertaV2Tokenizer.from_pretrained(DEBERTA_MODEL_PATH)
except Exception as load_err:
    print("!!!错误:Model B 文件缺失或加载失败。请确保 Model B 文件夹已复制且依赖已安装。!!!")
    # Fix: the original swallowed the exception entirely — a missing folder, a
    # corrupt checkpoint and a library-version mismatch all need different fixes,
    # so surface the underlying error before aborting with a non-zero status.
    print(f"Underlying error: {load_err}")
    raise SystemExit(1)

# Inference-only Trainer: no training loop, just .predict() over tokenized datasets.
trainer_B = Trainer(
    model=model_B,
    args=TrainingArguments(
        output_dir="./tmp_ensemble_eval",
        per_device_eval_batch_size=32,
        eval_strategy="no",
        fp16=True,  # NOTE(review): fp16 requires a CUDA device — confirm runtime has a GPU
    ),
)
|
|
|
|
|
def tokenize_function(examples):
    """Tokenize a batch of examples for Model B, padded/truncated to 512 tokens."""
    batch_text = examples["text"]
    return tokenizer_B(
        batch_text,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
|
|
|
|
|
|
|
|
# Wrap both dataframes as HF Datasets so Trainer.predict can consume them.
train_dataset_hf = Dataset.from_pandas(train_df)
eval_dataset_hf = Dataset.from_pandas(eval_df)

# Model B only reads raw text: strip every engineered-feature column before
# tokenizing ('text' and 'label' are removed right before predict instead).
extra_cols = [c for c in train_df.columns if c not in ['text', 'label']]
tokenized_train_dataset = (
    train_dataset_hf
    .remove_columns(extra_cols)
    .map(tokenize_function, batched=True, num_proc=4)
)
tokenized_eval_dataset = (
    eval_dataset_hf
    .remove_columns(extra_cols)
    .map(tokenize_function, batched=True, num_proc=4)
)

# Run inference on both splits and keep P(label == 1) per sample.
train_logits = trainer_B.predict(
    tokenized_train_dataset.remove_columns(["text", "label"])
).predictions
probs_B_train = torch.softmax(torch.from_numpy(train_logits), dim=1)[:, 1].numpy()

eval_logits = trainer_B.predict(
    tokenized_eval_dataset.remove_columns(["text", "label"])
).predictions
probs_B_eval = torch.softmax(torch.from_numpy(eval_logits), dim=1)[:, 1].numpy()
|
|
|
|
|
|
|
|
print("\n--- 3/4:创建增强版元特征并训练 Meta-Model ---")

# Each meta-feature row is [P_A(fake), P_B(fake), engineered features...],
# i.e. 2 base-model probabilities + len(NEW_FEATURES) hand-crafted columns.
X_train_meta_V2 = np.column_stack([probs_A_train, probs_B_train, X_train_new_features])
X_eval_meta_V2 = np.column_stack([probs_A_eval, probs_B_eval, X_eval_new_features])

print(f"增强版元特征维度: {X_eval_meta_V2.shape} (包含 2 个概率 + 5 个人工特征 = 7 个特征)")

# Logistic-regression stacker; balanced class weights counter label imbalance,
# and the fixed seed keeps the fit reproducible.
meta_model_V2 = LogisticRegression(
    max_iter=500,
    random_state=42,
    class_weight='balanced',
)
meta_model_V2.fit(X_train_meta_V2, y_train)

print("增强版元模型 (LR) 训练完毕!")
|
|
|
|
|
|
|
|
|
|
|
print("\n--- 4/4:最终评估:阈值调优 (使用 Thr=0.65) ---")

# Score the validation split with the meta-model (fake-class probability).
meta_scores_V2 = meta_model_V2.predict_proba(X_eval_meta_V2)[:, 1]

# Decision cutoff — presumably tuned on validation in an earlier run; TODO confirm.
BEST_THRESHOLD = 0.65

# Binarize: 1 (fake) when the score reaches the threshold, else 0 (real).
preds_meta_tuned_V2 = np.where(meta_scores_V2 >= BEST_THRESHOLD, 1, 0)

print(f"--- 评估阈值: {BEST_THRESHOLD:.2f} ---")
report = classification_report(
    y_eval, preds_meta_tuned_V2,
    target_names=['real (0)', 'fake (1)'],
    digits=4, zero_division=0,
)
print(report)

# Headline metric: F1 on the positive (fake) class only.
f1_ensemble_V2 = f1_score(y_eval, preds_meta_tuned_V2, pos_label=1)
print(f"--- 最终集成 F1 分数 (Fake): {f1_ensemble_V2:.4f} ---")
print("--- 脚本 final_stacking_V2.py 运行结束 ---")