bert_remark / method_1 /train_model_B.py
BaltimoreCA68's picture
Add files using upload-large-folder tool
027ce51 verified
# ==============================================================================
# 步骤 3: 训练并保存【基础模型 B】 (DeBERTa + 加权损失) (v2 - 最终修正版)
#
# 策略: (来自 DM-02 PPT 的启发 - 优化损失函数)
# 1. (!!) 关键修复: 明确使用 `DebertaV2Tokenizer` 替代 `AutoTokenizer`
# 2. 根据训练集标签比例,计算“类别权重”,为“假评论”赋予高权重。
# 3. 自定义 `CustomTrainer` 并重写 `compute_loss` 方法。
# 4. 在损失函数 `nn.CrossEntropyLoss` 中传入 `weight` 参数。
# 5. 使用 4xV100, fp16 混合精度训练。
# 6. 监控 `f1_fake` 分数,并保存 F1 最高的模型。
#
# 如何在4卡服务器上运行:
#
# accelerate launch --num_processes=4 --mixed_precision="fp16" train_model_B.py
#
# ==============================================================================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from datasets import Dataset
from transformers import (
# (!!) 关键修复:不再使用 AutoTokenizer
DebertaV2Tokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer
)
import os
import warnings
# --- 1. Configuration and constants ---

# CSV files holding the training and validation splits.
TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv"
VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv"

# Name handed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "microsoft/deberta-v3-base"

# Directory used as the Trainer output dir and for the final saved model.
NEW_MODEL_SAVE_PATH = "./final_model_deberta_weighted"

# Process-wide switches, set before any tokenization work starts.
# NOTE(review): TOKENIZERS_PARALLELISM=false presumably stops the tokenizers
# library from using its own threads alongside worker processes -- confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
# Silence every Python warning for the rest of the run.
warnings.filterwarnings(action="ignore")
# --- 2. Load data ---
print("--- 正在训练【基础模型 B】 (DeBERTa + 加权损失) ---")
print("加载数据...")
train_df = pd.read_csv(TRAIN_FILE_PATH)
eval_df = pd.read_csv(VALID_FILE_PATH)

# String label -> class index. Index 1 ("fake") is the positive class.
label_map = {"real": 0, "fake": 1}


def _encode_labels(frame, split_name):
    """Return ``frame["label"]`` encoded through ``label_map`` as int64.

    Args:
        frame: DataFrame with a ``label`` column of string labels.
        split_name: Human-readable split name used in the error message.

    Returns:
        An int64 Series of class indices aligned with ``frame``.

    Raises:
        ValueError: If any value of ``frame["label"]`` is not a key of
            ``label_map`` (this includes missing values).
    """
    encoded = frame["label"].map(label_map)
    # Series.map yields NaN for values absent from the mapping. Fail here,
    # with the offending values, instead of letting NaN labels leak into the
    # class counts and the loss computation.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = frame.loc[unmapped, "label"].unique().tolist()
        raise ValueError(
            f"{split_name} 中存在无法识别的标签: {unknown} "
            f"(期望取值: {list(label_map)})"
        )
    return encoded.astype("int64")


train_df["label"] = _encode_labels(train_df, "训练集")
eval_df["label"] = _encode_labels(eval_df, "验证集")
print(f"训练集大小: {len(train_df)}")
print(f"验证集大小: {len(eval_df)}")
# --- 3. (Core) Compute class weights ---
print("\n--- 正在计算类别权重... ---")
label_counts = train_df['label'].value_counts().sort_index()
count_real = label_counts.get(0, 0)
count_fake = label_counts.get(1, 0)
total_samples = len(train_df)
if count_real == 0 or count_fake == 0:
    print("错误:训练数据只包含一个类别,无法计算权重。")
    # Exit with a non-zero status so the launcher sees the failure; the
    # `exit()` helper without arguments would report success (status 0).
    raise SystemExit(1)

# Balanced weighting: total_samples / (n_classes * class_count), so the
# rarer class receives the larger weight.
weight_for_0 = total_samples / (2.0 * count_real)
weight_for_1 = total_samples / (2.0 * count_fake)
class_weights = torch.tensor([weight_for_0, weight_for_1], dtype=torch.float32)
print(f"训练集标签分布:\n{label_counts}")
print(f"计算出的权重: [Real (0): {weight_for_0:.4f}, Fake (1): {weight_for_1:.4f}]")
# Only claim that "fake" is up-weighted when the computed weights say so.
if weight_for_1 > weight_for_0:
    print("“Fake” 类的权重更高,将在训练中被重点关注。")
# --- 4. Tokenization ---
print(f"\n--- 正在加载 Tokenizer: {MODEL_NAME} ---")
try:
    # Key fix: load DebertaV2Tokenizer directly instead of AutoTokenizer.
    tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
except Exception as e:
    # Top-level boundary: report the failure with setup hints, then stop.
    print(f"加载 Tokenizer 失败: {e}")
    print("请确保你已运行: pip install sentencepiece")
    print("并且已成功运行 download_model.py 脚本。")
    # Non-zero status marks the run as failed; a bare `exit()` would end the
    # process with status 0.
    raise SystemExit(1) from e
def tokenize_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with the module tokenizer.

    Sequences are truncated to at most 512 tokens and padded to exactly 512
    (``padding="max_length"``).

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        The result of calling ``tokenizer`` on ``examples["text"]``.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encoding_options)
print("正在 Tokenize 数据集 (这可能需要几分钟)...")

# Wrap both DataFrames as datasets (train first, then validation).
train_dataset_hf, eval_dataset_hf = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)

# Tokenize in batches using 4 worker processes, then drop the raw "id" and
# "text" columns from the tokenized result.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_function, batched=True, num_proc=4).remove_columns(["id", "text"])
    for split in (train_dataset_hf, eval_dataset_hf)
)

# Have both datasets return torch tensors when indexed.
tokenized_train_dataset.set_format("torch")
tokenized_eval_dataset.set_format("torch")

# --- 5. (Core) Custom Trainer using a weighted loss (DDP-compatible) ---
print("\n--- 正在定义 CustomTrainer (使用加权损失) ---")
class CustomTrainer(Trainer):
    """``Trainer`` whose loss is a class-weighted cross-entropy.

    Args:
        *args: Forwarded unchanged to ``Trainer.__init__``.
        class_weights: Optional 1-D tensor with one weight per class, passed
            as ``weight`` to ``nn.CrossEntropyLoss``. When ``None`` the loss
            is an unweighted cross-entropy.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Keep the weights on the trainer rather than registering them as a
        # buffer on the model. A registered (persistent) buffer becomes part
        # of the model's state_dict, so it would be saved with the checkpoint
        # as an extra `class_weights` entry. It is also only reachable by
        # unwrapping the specific wrapper type the model happens to be in.
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss.

        Args:
            model: The model to run; it may be wrapped (e.g. for distributed
                training), since only its forward output is used.
            inputs: Batch dict containing ``"labels"`` plus the model inputs.
                ``"labels"`` is popped from the dict before the forward pass.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored, so extra keyword arguments from
                the caller do not break the override.

        Returns:
            ``loss`` or, when ``return_outputs`` is true, ``(loss, outputs)``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        weights = None
        if self.class_weights is not None:
            # Follow the logits' device so this works on any rank/device
            # without inspecting how `model` is wrapped.
            weights = self.class_weights.to(device=logits.device, dtype=torch.float32)
        loss_fct = nn.CrossEntropyLoss(weight=weights)

        # Compute the loss in float32 so the logits' dtype always matches the
        # float32 weights, even when the forward pass produced half precision.
        num_labels = logits.size(-1)
        loss = loss_fct(logits.float().reshape(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
# --- 6. 定义评估指标 ---
def compute_metrics(eval_pred):
    """Compute metrics for the "fake" class (label 1) plus overall accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``f1_fake``, ``recall_fake``, ``precision_fake`` and
        ``accuracy``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # Report F1 / recall / precision for "fake" (pos_label=1); a metric with
    # a zero denominator is reported as 0.
    positive_class_options = {"pos_label": 1, "zero_division": 0}
    return {
        'f1_fake': f1_score(references, predicted, **positive_class_options),
        'recall_fake': recall_score(references, predicted, **positive_class_options),
        'precision_fake': precision_score(references, predicted, **positive_class_options),
        'accuracy': accuracy_score(references, predicted),
    }
# --- 7. Load the model and configure the training arguments ---
print(f"--- 正在加载模型: {MODEL_NAME} ---")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

print("--- 正在配置 TrainingArguments (4xV100, fp16) ---")
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir=NEW_MODEL_SAVE_PATH,
    save_strategy="epoch",
    save_total_limit=1,
    # Optimisation schedule: 3 epochs.
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batch sizes are per device; with 4 GPUs the global train batch is 4 x 16 = 64.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    dataloader_num_workers=4,
    # Key setting: mixed-precision (fp16) training for the V100s.
    fp16=True,
    # Evaluate every epoch and keep the checkpoint with the highest f1_fake.
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_fake",
    greater_is_better=True,
    # Logging: every 100 steps; external reporting (wandb/tensorboard) is
    # disabled -- change `report_to` to enable it.
    logging_dir='./logs_model_B',
    logging_strategy="steps",
    logging_steps=100,
    report_to="none",
)
# --- 8. Initialise the Trainer ---
print("--- 正在初始化 CustomTrainer ---")
trainer = CustomTrainer(
    args=training_args,
    model=model,
    # NOTE(review): newer transformers releases appear to rename `tokenizer`
    # to `processing_class` -- confirm against the installed version.
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    # Key setting: pass in the class weights computed from the training labels.
    class_weights=class_weights,
)

# --- 9. Start training ---
print("\n--- 🚀 开始训练【模型 B】(使用完整数据 + 加权损失) 🚀 ---")
train_result = trainer.train()
# --- 10. Evaluate and save ---
print("\n--- 训练完成!正在评估【模型 B】... ---")
final_metrics = trainer.evaluate(eval_dataset=tokenized_eval_dataset)
print("--- 【模型 B】最终验证集评估结果 ---")
print(final_metrics)

# Per-class report on the validation set: run prediction, take the argmax of
# the returned predictions and compare against the encoded labels in eval_df.
print("\n--- (Fake vs Real) 分类报告 ---")
predictions = trainer.predict(tokenized_eval_dataset)
final_preds = np.argmax(predictions.predictions, axis=-1)
report_text = classification_report(
    y_true=eval_df['label'],
    y_pred=final_preds,
    target_names=['real (0)', 'fake (1)'],
    digits=4,
)
print(report_text)

# `load_best_model_at_end=True` means the trainer holds the best checkpoint
# at this point, so this writes that model to the save path.
print("\n--- 正在保存【模型 B】的最佳 checkpoint ... ---")
trainer.save_model(NEW_MODEL_SAVE_PATH)
print(f"模型已保存到: {NEW_MODEL_SAVE_PATH}")
print("--- 脚本 train_model_B.py 运行结束 ---")