# ==============================================================================
# Step 3: train and save [base model B] (DeBERTa + weighted loss) (v2 - final fix)
#
# Strategy (inspired by the DM-02 slides - optimise the loss function):
#   1. (!!) Key fix: explicitly use `DebertaV2Tokenizer` instead of `AutoTokenizer`.
#   2. Compute class weights from the training label ratio so the "fake review"
#      class receives a high weight.
#   3. Define `CustomTrainer` and override its `compute_loss` method.
#   4. Pass the `weight` argument to `nn.CrossEntropyLoss`.
#   5. Train on 4xV100 with fp16 mixed precision.
#   6. Monitor the `f1_fake` score and keep the model with the best F1.
#
# How to run on the 4-GPU server:
#
#   accelerate launch --num_processes=4 --mixed_precision="fp16" train_model_B.py
#
# ==============================================================================
import os
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from transformers import (
    # (!!) Key fix: AutoTokenizer is no longer used.
    DebertaV2Tokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# --- 1. Configuration and constants ---
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv"
VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv"
MODEL_NAME = "microsoft/deberta-v3-base"
NEW_MODEL_SAVE_PATH = "./final_model_deberta_weighted"  # where the new model is saved

# --- 2. Load data ---
print("--- 正在训练【基础模型 B】 (DeBERTa + 加权损失) ---")
print("加载数据...")
train_df = pd.read_csv(TRAIN_FILE_PATH)
eval_df = pd.read_csv(VALID_FILE_PATH)

label_map = {"real": 0, "fake": 1}


def _encode_labels(frame, source_path):
    """Return the integer-encoded ``label`` column of *frame*.

    ``Series.map`` turns every value missing from ``label_map`` (including
    NaN) into NaN without complaint, which would only surface much later as
    a confusing dtype or loss error, so unknown labels are rejected here.

    Raises:
        ValueError: if the column holds a value that is not a key of
            ``label_map``.
    """
    encoded = frame["label"].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        bad_values = frame.loc[unmapped, "label"].unique().tolist()
        raise ValueError(
            f"Unexpected label values in {source_path}: {bad_values!r}; "
            f"expected one of {sorted(label_map)}"
        )
    return encoded.astype("int64")


train_df["label"] = _encode_labels(train_df, TRAIN_FILE_PATH)
eval_df["label"] = _encode_labels(eval_df, VALID_FILE_PATH)

print(f"训练集大小: {len(train_df)}")
print(f"验证集大小: {len(eval_df)}")
# --- 3. (Core) compute class weights ---
print("\n--- 正在计算类别权重... ---")
label_counts = train_df['label'].value_counts().sort_index()
count_real = label_counts.get(0, 0)
count_fake = label_counts.get(1, 0)
total_samples = len(train_df)

if count_real == 0 or count_fake == 0:
    print("错误:训练数据只包含一个类别,无法计算权重。")
    # Exit with a non-zero status so the launcher sees the failure; the
    # `exit()` helper from `site` would have reported success (status 0).
    raise SystemExit(1)

# Weight formula: total_samples / (n_classes * class_count)
weight_for_0 = total_samples / (2.0 * count_real)
weight_for_1 = total_samples / (2.0 * count_fake)
class_weights = torch.tensor([weight_for_0, weight_for_1], dtype=torch.float32)

print(f"训练集标签分布:\n{label_counts}")
print(f"计算出的权重: [Real (0): {weight_for_0:.4f}, Fake (1): {weight_for_1:.4f}]")
if weight_for_1 > weight_for_0:
    # Only claim that "fake" is up-weighted when the numbers agree.
    print("“Fake” 类的权重更高,将在训练中被重点关注。")

# --- 4. Tokenization ---
print(f"\n--- 正在加载 Tokenizer: {MODEL_NAME} ---")
try:
    # (!!) Key fix: use DebertaV2Tokenizer directly.
    tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
except Exception as e:
    # Top-level boundary of the script: report the hint, then stop with a
    # failing exit status while keeping the original exception as the cause.
    print(f"加载 Tokenizer 失败: {e}")
    print("请确保你已运行: pip install sentencepiece")
    print("并且已成功运行 download_model.py 脚本。")
    raise SystemExit(1) from e


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


def _drop_raw_columns(dataset):
    """Remove the raw ``id`` / ``text`` columns that are present in *dataset*.

    Only existing columns are dropped, so a CSV without an ``id`` column
    does not make ``remove_columns`` raise.
    """
    present = [name for name in ("id", "text") if name in dataset.column_names]
    return dataset.remove_columns(present)


print("正在 Tokenize 数据集 (这可能需要几分钟)...")
train_dataset_hf = Dataset.from_pandas(train_df)
eval_dataset_hf = Dataset.from_pandas(eval_df)

# num_proc=4 tokenizes with 4 worker processes in parallel.
tokenized_train_dataset = train_dataset_hf.map(tokenize_function, batched=True, num_proc=4)
tokenized_eval_dataset = eval_dataset_hf.map(tokenize_function, batched=True, num_proc=4)

tokenized_train_dataset = _drop_raw_columns(tokenized_train_dataset)
tokenized_eval_dataset = _drop_raw_columns(tokenized_eval_dataset)
tokenized_train_dataset.set_format("torch")
tokenized_eval_dataset.set_format("torch")
# --- 5. (Core) custom Trainer that applies the weighted loss (DDP-compatible) ---
print("\n--- 正在定义 CustomTrainer (使用加权损失) ---")


class CustomTrainer(Trainer):
    """``Trainer`` whose loss is a class-weighted cross-entropy.

    The weights are kept on the trainer rather than registered as a buffer
    on the model. This keeps the saved ``state_dict`` free of an extra
    ``class_weights`` entry and means ``compute_loss`` never has to look
    through a wrapper (``DistributedDataParallel``, ``DataParallel``) to
    find them.

    Args:
        class_weights: optional 1-D tensor with one weight per class. When
            ``None`` the loss is a plain, unweighted cross-entropy.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the (possibly wrapped) model to run the forward pass on.
            inputs: batch mapping; its ``labels`` entry is removed before
                the forward pass so the model does not compute its own loss.
            return_outputs: when true, return ``(loss, outputs)``.
            **kwargs: extra arguments passed by the caller; accepted and
                ignored.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        weights = None
        if self.class_weights is not None:
            # Follow the logits so the loss works on whichever device the
            # batch was placed on.
            weights = self.class_weights.to(device=logits.device, dtype=torch.float32)

        loss_fct = nn.CrossEntropyLoss(weight=weights)
        # Compute the loss in float32 so its dtype matches the weights even
        # when the logits come out of the model in half precision.
        flat_logits = logits.float().view(-1, logits.size(-1))
        loss = loss_fct(flat_logits, labels.view(-1))

        return (loss, outputs) if return_outputs else loss


# --- 6. Evaluation metrics ---
def compute_metrics(eval_pred):
    """Return F1 / recall / precision for the "fake" class plus accuracy.

    Args:
        eval_pred: ``(logits, labels)`` pair; ``logits`` may also be a tuple
            whose first element holds the class scores.

    Returns:
        dict with the keys ``f1_fake``, ``recall_fake``, ``precision_fake``
        and ``accuracy``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # (Key) report the metrics of the "fake" class (pos_label=1).
    f1 = f1_score(labels, predictions, pos_label=1, zero_division=0)
    recall = recall_score(labels, predictions, pos_label=1, zero_division=0)
    precision = precision_score(labels, predictions, pos_label=1, zero_division=0)
    accuracy = accuracy_score(labels, predictions)

    return {
        'f1_fake': f1,
        'recall_fake': recall,
        'precision_fake': precision,
        'accuracy': accuracy,
    }
# --- 7. Load the model and configure the training arguments ---
print(f"--- 正在加载模型: {MODEL_NAME} ---")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

print("--- 正在配置 TrainingArguments (4xV100, fp16) ---")
training_args = TrainingArguments(
    output_dir=NEW_MODEL_SAVE_PATH,
    num_train_epochs=3,  # train for 3 epochs
    per_device_train_batch_size=16,  # 4 GPUs x 16 = global batch size of 64
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs_model_B',
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_fake",  # (key) f1_fake decides which checkpoint is "best"
    greater_is_better=True,
    # (Key) mixed precision on the V100s; requested only when CUDA is
    # present so the script can still be started on a machine without a GPU.
    fp16=torch.cuda.is_available(),
    report_to="none",  # wandb/tensorboard disabled; enable if needed
    dataloader_num_workers=4,
    save_total_limit=1,
)

# --- 8. Build the Trainer ---
print("--- 正在初始化 CustomTrainer ---")
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    # NOTE(review): newer transformers releases appear to rename this
    # argument to `processing_class` - confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=class_weights,  # (key) pass in the class weights
)

# --- 9. Train ---
print("\n--- 🚀 开始训练【模型 B】(使用完整数据 + 加权损失) 🚀 ---")
train_result = trainer.train()

# --- 10. Evaluate and save ---
print("\n--- 训练完成!正在评估【模型 B】... ---")
# One pass over the validation set yields both the metrics and the raw
# predictions; the "eval" prefix keeps the metric names of `evaluate()`.
predictions = trainer.predict(tokenized_eval_dataset, metric_key_prefix="eval")
final_metrics = predictions.metrics

print("--- 【模型 B】最终验证集评估结果 ---")
print(final_metrics)

print("\n--- (Fake vs Real) 分类报告 ---")
final_preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(
    predictions.label_ids,
    final_preds,
    # Fix the label set so the report still works when only one class
    # appears among the labels and predictions.
    labels=[0, 1],
    target_names=['real (0)', 'fake (1)'],
    digits=4,
    zero_division=0,
))

print("\n--- 正在保存【模型 B】的最佳 checkpoint ... ---")
trainer.save_model(NEW_MODEL_SAVE_PATH)
print(f"模型已保存到: {NEW_MODEL_SAVE_PATH}")
print("--- 脚本 train_model_B.py 运行结束 ---")