bert_remark / method_1 /train_model_B.py

Add files using upload-large-folder tool

027ce51 verified 3 months ago

8.74 kB

	# ==============================================================================
	# 步骤 3: 训练并保存【基础模型 B】 (DeBERTa + 加权损失) (v2 - 最终修正版)
	#
	# 策略: (来自 DM-02 PPT 的启发 - 优化损失函数)
	# 1. (!!) 关键修复: 明确使用 `DebertaV2Tokenizer` 替代 `AutoTokenizer`
	# 2. 根据训练集标签比例，计算“类别权重”，为“假评论”赋予高权重。
	# 3. 自定义 `CustomTrainer` 并重写 `compute_loss` 方法。
	# 4. 在损失函数 `nn.CrossEntropyLoss` 中传入 `weight` 参数。
	# 5. 使用 4xV100, fp16 混合精度训练。
	# 6. 监控 `f1_fake` 分数，并保存 F1 最高的模型。
	#
	# 如何在4卡服务器上运行:
	#
	# accelerate launch --num_processes=4 --mixed_precision="fp16" train_model_B.py
	#
	# ==============================================================================

	import pandas as pd
	import numpy as np
	import torch
	import torch.nn as nn
	from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
	from datasets import Dataset
	from transformers import (
	# (!!) 关键修复：不再使用 AutoTokenizer
	DebertaV2Tokenizer,
	AutoModelForSequenceClassification,
	TrainingArguments,
	Trainer
	)
	import os
	import warnings

	# --- 1. Configuration and constants ---
	# Silence every Python warning for the duration of the run.
	warnings.filterwarnings("ignore")
	# Disable the tokenizers library's own thread pool (presumably to avoid
	# fork-related warnings once worker processes are spawned -- confirm).
	os.environ["TOKENIZERS_PARALLELISM"] = "false"

	TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv"
	VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv"
	MODEL_NAME = "microsoft/deberta-v3-base"
	NEW_MODEL_SAVE_PATH = "./final_model_deberta_weighted"  # where the new model is saved

	# --- 2. Load the data ---
	print("--- 正在训练【基础模型 B】 (DeBERTa + 加权损失) ---")
	print("加载数据...")
	train_df = pd.read_csv(TRAIN_FILE_PATH)
	eval_df = pd.read_csv(VALID_FILE_PATH)

	# Convert the textual labels to class ids. `Series.map` yields NaN for any
	# value missing from `label_map`; fail immediately with the offending values
	# instead of letting NaN labels skew the class weights or crash training later.
	label_map = {"real": 0, "fake": 1}
	for _split_name, _split_df in (("train", train_df), ("validation", eval_df)):
	    _mapped = _split_df["label"].map(label_map)
	    _unmapped = _mapped.isna()
	    if _unmapped.any():
	        _bad_values = sorted(_split_df.loc[_unmapped, "label"].astype(str).unique())
	        raise ValueError(
	            f"错误：{_split_name} 数据中存在无法识别的标签 {_bad_values}，"
	            f"仅支持 {sorted(label_map)}。"
	        )
	    # All values mapped, so the column can be stored as integer class ids.
	    _split_df["label"] = _mapped.astype("int64")

	print(f"训练集大小: {len(train_df)}")
	print(f"验证集大小: {len(eval_df)}")

	# --- 3. (core) Compute the class weights ---
	print("\n--- 正在计算类别权重... ---")
	label_counts = train_df['label'].value_counts().sort_index()
	count_real = label_counts.get(0, 0)
	count_fake = label_counts.get(1, 0)
	total_samples = len(train_df)

	if count_real == 0 or count_fake == 0:
	    print("错误：训练数据只包含一个类别，无法计算权重。")
	    # Exit with a non-zero status so a launcher or shell sees the failure;
	    # the `exit()` helper from `site` would report success (status 0).
	    raise SystemExit(1)

	# Balanced weighting: total_samples / (n_classes * class_count), so the
	# less frequent class receives the larger weight.
	weight_for_0 = total_samples / (2.0 * count_real)
	weight_for_1 = total_samples / (2.0 * count_fake)
	class_weights = torch.tensor([weight_for_0, weight_for_1], dtype=torch.float32)

	print(f"训练集标签分布:\n{label_counts}")
	print(f"计算出的权重: [Real (0): {weight_for_0:.4f}, Fake (1): {weight_for_1:.4f}]")
	print("“Fake” 类的权重更高，将在训练中被重点关注。")


	# --- 4. Tokenization ---
	print(f"\n--- 正在加载 Tokenizer: {MODEL_NAME} ---")
	try:
	    # Use DebertaV2Tokenizer explicitly rather than AutoTokenizer.
	    tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
	except Exception as e:
	    # Top-level boundary: report the failure with setup hints, then stop.
	    print(f"加载 Tokenizer 失败: {e}")
	    print("请确保你已运行: pip install sentencepiece")
	    print("并且已成功运行 download_model.py 脚本。")
	    # Non-zero status so the failure is visible to the launcher; the
	    # `exit()` helper from `site` would have exited with status 0.
	    raise SystemExit(1) from e

	def tokenize_function(examples):
	    """Tokenize the ``"text"`` entries of a batch.

	    Every sequence is truncated and padded to exactly 512 tokens. The
	    module-level ``tokenizer`` is used and its output is returned unchanged.
	    """
	    texts = examples["text"]
	    return tokenizer(
	        texts,
	        truncation=True,
	        padding="max_length",
	        max_length=512,
	    )

	print("正在 Tokenize 数据集 (这可能需要几分钟)...")
	# Wrap both DataFrames as Hugging Face datasets.
	train_dataset_hf = Dataset.from_pandas(train_df)
	eval_dataset_hf = Dataset.from_pandas(eval_df)

	# Tokenize in batches using 4 worker processes.
	tokenized_train_dataset = train_dataset_hf.map(
	    tokenize_function,
	    batched=True,
	    num_proc=4,
	)
	tokenized_eval_dataset = eval_dataset_hf.map(
	    tokenize_function,
	    batched=True,
	    num_proc=4,
	)

	# Drop the raw "id" and "text" columns, then have both datasets return
	# torch tensors.
	tokenized_train_dataset = tokenized_train_dataset.remove_columns(["id", "text"])
	tokenized_train_dataset.set_format(type="torch")
	tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(["id", "text"])
	tokenized_eval_dataset.set_format(type="torch")

	# --- 5. (core) Custom Trainer that applies a class-weighted loss ---
	print("\n--- 正在定义 CustomTrainer (使用加权损失) ---")
	class CustomTrainer(Trainer):
	    """``Trainer`` whose training loss is class-weighted cross-entropy.

	    Args:
	        *args: Positional arguments forwarded unchanged to ``Trainer``.
	        class_weights: Optional per-class weights (tensor or sequence of
	            floats, one entry per label). ``None`` gives an unweighted
	            cross-entropy loss.
	        **kwargs: Keyword arguments forwarded unchanged to ``Trainer``
	            (``model``, ``args``, ``train_dataset``, ...).
	    """

	    def __init__(self, *args, class_weights=None, **kwargs):
	        super().__init__(*args, **kwargs)
	        # Keep the weights on the trainer rather than registering them as a
	        # model buffer: they then stay out of the saved state_dict, and the
	        # loss does not need to unwrap a DistributedDataParallel wrapper.
	        if class_weights is None:
	            self.class_weights = None
	        else:
	            self.class_weights = torch.as_tensor(class_weights, dtype=torch.float32)

	    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
	        """Compute the (optionally class-weighted) cross-entropy loss.

	        Args:
	            model: The model to run; it may be wrapped (e.g. by DDP).
	            inputs: Batch mapping; its ``"labels"`` entry is removed here
	                before the remaining entries are passed to the model.
	            return_outputs: When true, return ``(loss, outputs)``.
	            **kwargs: Extra arguments supplied by the caller; accepted and
	                ignored.

	        Returns:
	            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
	        """
	        labels = inputs.pop("labels")
	        outputs = model(**inputs)
	        logits = outputs.get("logits")

	        weights = self.class_weights
	        if weights is not None:
	            # Follow the logits so the weights match their device and dtype.
	            weights = weights.to(device=logits.device, dtype=logits.dtype)
	        loss_fct = nn.CrossEntropyLoss(weight=weights)

	        # Flatten to (N, num_labels) vs (N,); the label count is taken from
	        # the logits, so no access to the unwrapped model is required.
	        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

	        return (loss, outputs) if return_outputs else loss

	# --- 6. Evaluation metrics ---
	def compute_metrics(eval_pred):
	    """Score predictions, reporting the "fake" class (label 1) as positive.

	    Args:
	        eval_pred: A ``(logits, labels)`` pair.

	    Returns:
	        Dict with ``f1_fake``, ``recall_fake``, ``precision_fake`` and
	        ``accuracy``.
	    """
	    logits, labels = eval_pred
	    predicted = np.argmax(logits, axis=-1)

	    # Shared settings: treat label 1 as the positive class and score 0
	    # instead of warning when a denominator is zero.
	    fake_as_positive = {"pos_label": 1, "zero_division": 0}

	    return {
	        'f1_fake': f1_score(labels, predicted, **fake_as_positive),
	        'recall_fake': recall_score(labels, predicted, **fake_as_positive),
	        'precision_fake': precision_score(labels, predicted, **fake_as_positive),
	        'accuracy': accuracy_score(labels, predicted),
	    }

	# --- 7. Load the model and configure the training arguments ---
	print(f"--- 正在加载模型: {MODEL_NAME} ---")
	model = AutoModelForSequenceClassification.from_pretrained(
	    MODEL_NAME,
	    num_labels=2,
	)

	print("--- 正在配置 TrainingArguments (4xV100, fp16) ---")
	training_args = TrainingArguments(
	    # Output and logging.
	    output_dir=NEW_MODEL_SAVE_PATH,
	    logging_dir='./logs_model_B',
	    logging_strategy="steps",
	    logging_steps=100,
	    report_to="none",  # wandb/tensorboard reporting off; enable if needed
	    # Schedule and batch sizes.
	    num_train_epochs=3,
	    per_device_train_batch_size=16,  # 16 per device; 64 globally on 4 devices
	    per_device_eval_batch_size=32,
	    warmup_ratio=0.1,
	    weight_decay=0.01,
	    fp16=True,  # mixed-precision training
	    dataloader_num_workers=4,
	    # Evaluate and checkpoint every epoch; keep the checkpoint with the
	    # highest f1_fake and reload it when training ends.
	    eval_strategy="epoch",
	    save_strategy="epoch",
	    save_total_limit=1,
	    load_best_model_at_end=True,
	    metric_for_best_model="f1_fake",
	    greater_is_better=True,
	)

	# --- 8. Build the trainer ---
	print("--- 正在初始化 CustomTrainer ---")
	trainer = CustomTrainer(
	    class_weights=class_weights,  # per-class loss weights computed earlier
	    model=model,
	    tokenizer=tokenizer,
	    args=training_args,
	    compute_metrics=compute_metrics,
	    train_dataset=tokenized_train_dataset,
	    eval_dataset=tokenized_eval_dataset,
	)

	# --- 9. Train ---
	print("\n--- 🚀 开始训练【模型 B】(使用完整数据 + 加权损失) 🚀 ---")
	train_result = trainer.train()

	# --- 10. Evaluate and save ---
	print("\n--- 训练完成！正在评估【模型 B】... ---")
	final_metrics = trainer.evaluate(eval_dataset=tokenized_eval_dataset)
	print("--- 【模型 B】最终验证集评估结果 ---")
	print(final_metrics)

	# Per-class report on the validation set, scored against the labels held
	# in eval_df.
	print("\n--- (Fake vs Real) 分类报告 ---")
	predictions = trainer.predict(tokenized_eval_dataset)
	final_preds = np.argmax(predictions.predictions, axis=-1)
	print(
	    classification_report(
	        eval_df['label'],
	        final_preds,
	        digits=4,
	        target_names=['real (0)', 'fake (1)'],
	    )
	)

	# Write the trainer's current model to the final save path.
	print("\n--- 正在保存【模型 B】的最佳 checkpoint ... ---")
	trainer.save_model(NEW_MODEL_SAVE_PATH)
	print(f"模型已保存到: {NEW_MODEL_SAVE_PATH}")
	print("--- 脚本 train_model_B.py 运行结束 ---")