# File size: 8,739 Bytes
#
# 027ce51

# ==============================================================================
# 步骤 3: 训练并保存【基础模型 B】 (DeBERTa + 加权损失) (v2 - 最终修正版)
#
# 策略: (来自 DM-02 PPT 的启发 - 优化损失函数)
# 1. (!!) 关键修复: 明确使用 `DebertaV2Tokenizer` 替代 `AutoTokenizer`
# 2. 根据训练集标签比例，计算“类别权重”，为“假评论”赋予高权重。
# 3. 自定义 `CustomTrainer` 并重写 `compute_loss` 方法。
# 4. 在损失函数 `nn.CrossEntropyLoss` 中传入 `weight` 参数。
# 5. 使用 4xV100, fp16 混合精度训练。
# 6. 监控 `f1_fake` 分数，并保存 F1 最高的模型。
#
# 如何在4卡服务器上运行:
#
#    accelerate launch --num_processes=4 --mixed_precision="fp16" train_model_B.py
#
# ==============================================================================

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from datasets import Dataset
from transformers import (
    # (!!) 关键修复：不再使用 AutoTokenizer
    DebertaV2Tokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import os
import warnings

# --- 1. Configuration and constants ---
# NOTE: this suppresses every warning category for the whole process.
warnings.filterwarnings("ignore")
# NOTE(review): presumably set so the tokenizers library does not complain
# once worker processes are forked later in the script -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv"
VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv"
MODEL_NAME = "microsoft/deberta-v3-base"
NEW_MODEL_SAVE_PATH = "./final_model_deberta_weighted"  # where the new model is saved

# --- 2. Load data ---
print("--- 正在训练【基础模型 B】 (DeBERTa + 加权损失) ---")
print("加载数据...")
train_df = pd.read_csv(TRAIN_FILE_PATH)
eval_df = pd.read_csv(VALID_FILE_PATH)

# Encode the string labels as integer class ids. `Series.map` turns every
# value that is missing from `label_map` (typos, different casing, empty
# cells) into NaN, which would silently make the column float. Reject such
# rows here instead of letting them reach the weight computation and the loss.
label_map = {"real": 0, "fake": 1}
for _split_name, _split_df in (("train", train_df), ("validation", eval_df)):
    _encoded = _split_df["label"].map(label_map)
    _unmapped = _encoded.isna()
    if _unmapped.any():
        _unknown = sorted(_split_df.loc[_unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"{_split_name} 数据集包含无法识别的标签: {_unknown}；"
            f"期望取值为 {list(label_map)}"
        )
    # Same DataFrame object as train_df / eval_df, so this updates it in place.
    _split_df["label"] = _encoded.astype("int64")

print(f"训练集大小: {len(train_df)}")
print(f"验证集大小: {len(eval_df)}")

# --- 3. (Core) Compute class weights ---
print("\n--- 正在计算类别权重... ---")
label_counts = train_df['label'].value_counts().sort_index()
count_real = label_counts.get(0, 0)
count_fake = label_counts.get(1, 0)
total_samples = len(train_df)

if count_real == 0 or count_fake == 0:
    print("错误：训练数据只包含一个类别，无法计算权重。")
    # Exit with a failure status: the `exit()` helper is injected by the
    # `site` module and reports success (status 0) when called without
    # arguments, which would hide this error from the launcher.
    raise SystemExit(1)

# Weight formula: total_samples / (n_classes * class_count), with n_classes = 2.
weight_for_0 = total_samples / (2.0 * count_real)
weight_for_1 = total_samples / (2.0 * count_fake)
class_weights = torch.tensor([weight_for_0, weight_for_1], dtype=torch.float32)

print(f"训练集标签分布:\n{label_counts}")
print(f"计算出的权重: [Real (0): {weight_for_0:.4f}, Fake (1): {weight_for_1:.4f}]")
# Only claim that "fake" is up-weighted when the computed weights say so.
if weight_for_1 > weight_for_0:
    print("“Fake” 类的权重更高，将在训练中被重点关注。")


# --- 4. Tokenization ---
print(f"\n--- 正在加载 Tokenizer: {MODEL_NAME} ---")
try:
    # Load the concrete DebertaV2Tokenizer class rather than going through
    # AutoTokenizer (the file header calls this the key fix).
    tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
except Exception as e:
    # Top-level boundary: report the cause plus the usual remedies, then stop.
    print(f"加载 Tokenizer 失败: {e}")
    print("请确保你已运行: pip install sentencepiece")
    print("并且已成功运行 download_model.py 脚本。")
    # Exit with a failure status; a bare `exit()` would report status 0.
    raise SystemExit(1) from e

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: Mapping that provides the texts under the key ``"text"``.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples["text"], **encode_options)

print("正在 Tokenize 数据集 (这可能需要几分钟)...")
train_dataset_hf = Dataset.from_pandas(train_df)
eval_dataset_hf = Dataset.from_pandas(eval_df)


def _tokenize_split(raw_split):
    """Tokenize one dataset split and prepare it for the trainer.

    Args:
        raw_split: Dataset holding at least a ``"text"`` column.

    Returns:
        The tokenized dataset with the raw ``id`` / ``text`` columns removed
        (when present) and its output format set to torch tensors.
    """
    # num_proc=4 tokenizes with 4 worker processes in parallel.
    tokenized = raw_split.map(tokenize_function, batched=True, num_proc=4)
    # Drop only the columns that actually exist, so an input file without an
    # `id` column does not abort the run.
    droppable = [name for name in ("id", "text") if name in tokenized.column_names]
    tokenized = tokenized.remove_columns(droppable)
    tokenized.set_format("torch")
    return tokenized


tokenized_train_dataset = _tokenize_split(train_dataset_hf)
tokenized_eval_dataset = _tokenize_split(eval_dataset_hf)

# --- 5. (Core) Custom Trainer that applies a class-weighted loss ---
print("\n--- 正在定义 CustomTrainer (使用加权损失) ---")
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted per class.

    The weights are kept on the trainer itself instead of being registered as
    a buffer on the model. This keeps them out of the model's state_dict (and
    therefore out of saved checkpoints) and makes the loss independent of
    whatever wrapper (DistributedDataParallel, DataParallel, ...) surrounds
    the model when ``compute_loss`` is called.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        """Create the trainer.

        Args:
            *args: Positional arguments forwarded to ``Trainer``.
            class_weights: Optional 1-D tensor holding one weight per class.
                ``None`` means plain, unweighted cross-entropy.
            **kwargs: Keyword arguments forwarded to ``Trainer``.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the weighted cross-entropy.

        Args:
            model: The (possibly wrapped) model to call.
            inputs: Batch mapping; must contain ``"labels"``, which is removed
                before the remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted and ignored, so extra arguments passed by the
                caller do not break the override.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        weight = None
        if self.class_weights is not None:
            # Follow the logits: they already live on this process's device
            # and carry the dtype the loss will be computed in.
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)
        loss_fct = nn.CrossEntropyLoss(weight=weight)

        # Flatten to (N, num_classes) logits and (N,) labels.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# --- 6. Evaluation metrics ---
def compute_metrics(eval_pred):
    """Compute validation metrics focused on the "fake" class (label id 1).

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with ``f1_fake``, ``recall_fake``, ``precision_fake`` (all with
        ``pos_label=1`` and ``zero_division=0``) and overall ``accuracy``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    # Shared settings for the three scores that target the "fake" class.
    fake_class = {"pos_label": 1, "zero_division": 0}

    return {
        'f1_fake': f1_score(labels, predicted_ids, **fake_class),
        'recall_fake': recall_score(labels, predicted_ids, **fake_class),
        'precision_fake': precision_score(labels, predicted_ids, **fake_class),
        'accuracy': accuracy_score(labels, predicted_ids),
    }

# --- 7. Load the model and configure the training arguments ---
print(f"--- 正在加载模型: {MODEL_NAME} ---")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

print("--- 正在配置 TrainingArguments (4xV100, fp16) ---")
training_args = TrainingArguments(
    # Output locations.
    output_dir=NEW_MODEL_SAVE_PATH,
    logging_dir='./logs_model_B',
    # Schedule and optimisation.
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,                              # mixed-precision training
    # Batching: 16 per device, i.e. a global batch of 64 with 4 processes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    dataloader_num_workers=4,
    # Logging; external reporters (wandb / tensorboard) are switched off.
    logging_strategy="steps",
    logging_steps=100,
    report_to="none",
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the highest f1_fake when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_fake",
    greater_is_better=True,
)

# --- 8. Build the trainer ---
print("--- 正在初始化 CustomTrainer ---")
trainer = CustomTrainer(
    class_weights=class_weights,            # per-class weights for the loss
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
    args=training_args,
    model=model,
)

# --- 9. Train ---
print("\n--- 🚀 开始训练【模型 B】(使用完整数据 + 加权损失) 🚀 ---")
train_result = trainer.train()

# --- 10. Evaluate and save ---
print("\n--- 训练完成！正在评估【模型 B】... ---")
# A single prediction pass yields both the metrics and the raw logits, so the
# validation set is not run through the model twice. The "eval" prefix keeps
# the metric names in the `eval_*` form.
predictions = trainer.predict(tokenized_eval_dataset, metric_key_prefix="eval")
final_metrics = predictions.metrics
print("--- 【模型 B】最终验证集评估结果 ---")
print(final_metrics)

print("\n--- (Fake vs Real) 分类报告 ---")
final_preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(
    # Use the labels returned with the predictions so both arrays come from
    # the same pass over the dataset.
    predictions.label_ids,
    final_preds,
    # Pin both classes so the report (and `target_names`) stays valid even if
    # one class never appears among the labels or predictions.
    labels=[0, 1],
    target_names=['real (0)', 'fake (1)'],
    digits=4
))

print("\n--- 正在保存【模型 B】的最佳 checkpoint ... ---")
trainer.save_model(NEW_MODEL_SAVE_PATH)
print(f"模型已保存到: {NEW_MODEL_SAVE_PATH}")
print("--- 脚本 train_model_B.py 运行结束 ---")