|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report |
|
|
from datasets import Dataset |
|
|
from transformers import ( |
|
|
|
|
|
DebertaV2Tokenizer, |
|
|
AutoModelForSequenceClassification, |
|
|
TrainingArguments, |
|
|
Trainer |
|
|
) |
|
|
import os |
|
|
import warnings |
|
|
|
|
|
|
|
|
warnings.filterwarnings("ignore") |
|
|
os.environ["TOKENIZERS_PARALLELISM"] = "false" |
|
|
|
|
|
TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv" |
|
|
VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv" |
|
|
MODEL_NAME = "microsoft/deberta-v3-base" |
|
|
NEW_MODEL_SAVE_PATH = "./final_model_deberta_weighted" |
|
|
|
|
|
|
|
|
print(f"--- 正在训练【基础模型 B】 (DeBERTa + 加权损失) ---") |
|
|
print("加载数据...") |
|
|
train_df = pd.read_csv(TRAIN_FILE_PATH) |
|
|
eval_df = pd.read_csv(VALID_FILE_PATH) |
|
|
|
|
|
label_map = {"real": 0, "fake": 1} |
|
|
train_df['label'] = train_df['label'].map(label_map) |
|
|
eval_df['label'] = eval_df['label'].map(label_map) |
|
|
|
|
|
print(f"训练集大小: {len(train_df)}") |
|
|
print(f"验证集大小: {len(eval_df)}") |
|
|
|
|
|
|
|
|
print("\n--- 正在计算类别权重... ---") |
|
|
label_counts = train_df['label'].value_counts().sort_index() |
|
|
count_real = label_counts.get(0, 0) |
|
|
count_fake = label_counts.get(1, 0) |
|
|
total_samples = len(train_df) |
|
|
|
|
|
if count_real == 0 or count_fake == 0: |
|
|
print("错误:训练数据只包含一个类别,无法计算权重。") |
|
|
exit() |
|
|
|
|
|
|
|
|
weight_for_0 = total_samples / (2.0 * count_real) |
|
|
weight_for_1 = total_samples / (2.0 * count_fake) |
|
|
class_weights = torch.tensor([weight_for_0, weight_for_1], dtype=torch.float32) |
|
|
|
|
|
print(f"训练集标签分布:\n{label_counts}") |
|
|
print(f"计算出的权重: [Real (0): {weight_for_0:.4f}, Fake (1): {weight_for_1:.4f}]") |
|
|
print("“Fake” 类的权重更高,将在训练中被重点关注。") |
|
|
|
|
|
|
|
|
|
|
|
print(f"\n--- 正在加载 Tokenizer: {MODEL_NAME} ---") |
|
|
try: |
|
|
|
|
|
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME) |
|
|
except Exception as e: |
|
|
print(f"加载 Tokenizer 失败: {e}") |
|
|
print("请确保你已运行: pip install sentencepiece") |
|
|
print("并且已成功运行 download_model.py 脚本。") |
|
|
exit() |
|
|
|
|
|
def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating every text to 512 tokens."""
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return encoded
|
|
|
|
|
print("正在 Tokenize 数据集 (这可能需要几分钟)...") |
|
|
train_dataset_hf = Dataset.from_pandas(train_df) |
|
|
eval_dataset_hf = Dataset.from_pandas(eval_df) |
|
|
|
|
|
|
|
|
tokenized_train_dataset = train_dataset_hf.map(tokenize_function, batched=True, num_proc=4) |
|
|
tokenized_eval_dataset = eval_dataset_hf.map(tokenize_function, batched=True, num_proc=4) |
|
|
|
|
|
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["id", "text"]) |
|
|
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(["id", "text"]) |
|
|
tokenized_train_dataset.set_format("torch") |
|
|
tokenized_eval_dataset.set_format("torch") |
|
|
|
|
|
|
|
|
print("\n--- 正在定义 CustomTrainer (使用加权损失) ---") |
|
|
class CustomTrainer(Trainer):
    """Trainer subclass that applies class-weighted cross-entropy loss.

    The weight tensor is registered as a buffer on the model so it follows the
    model across device moves and wrapper replication.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        """Accept an optional 1-D ``class_weights`` tensor (one weight per class)."""
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # register_buffer keeps the weights attached to the model (moved
            # with .to(device), replicated by parallel wrappers).
            self.model.register_buffer("class_weights", class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute (optionally weighted) cross-entropy loss for a batch.

        Falls back to unweighted CrossEntropyLoss when no ``class_weights``
        buffer was registered.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Unwrap BOTH DistributedDataParallel and DataParallel: each stores
        # the real model under `.module`. The original only special-cased DDP,
        # so a plain multi-GPU run (nn.DataParallel) could not reach the
        # `class_weights` buffer on the wrapped model.
        core_model = getattr(model, "module", model)

        weights = getattr(core_model, "class_weights", None)
        if weights is not None:
            # Move the weights to the device the logits were gathered on.
            loss_fct = nn.CrossEntropyLoss(weight=weights.to(logits.device))
        else:
            loss_fct = nn.CrossEntropyLoss()

        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss
|
|
|
|
|
|
|
|
def compute_metrics(eval_pred):
    """Compute fake-class (positive label 1) F1/recall/precision plus accuracy."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # All three class-level scorers share the same call signature, so
    # dispatch them from a table; key order matches the original output.
    scorers = {
        'f1_fake': f1_score,
        'recall_fake': recall_score,
        'precision_fake': precision_score,
    }
    metrics = {
        name: scorer(labels, predictions, pos_label=1, zero_division=0)
        for name, scorer in scorers.items()
    }
    metrics['accuracy'] = accuracy_score(labels, predictions)
    return metrics
|
|
|
|
|
|
|
|
print(f"--- 正在加载模型: {MODEL_NAME} ---") |
|
|
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2) |
|
|
|
|
|
print("--- 正在配置 TrainingArguments (4xV100, fp16) ---") |
|
|
training_args = TrainingArguments( |
|
|
output_dir=NEW_MODEL_SAVE_PATH, |
|
|
num_train_epochs=3, |
|
|
per_device_train_batch_size=16, |
|
|
per_device_eval_batch_size=32, |
|
|
warmup_ratio=0.1, |
|
|
weight_decay=0.01, |
|
|
logging_dir='./logs_model_B', |
|
|
logging_strategy="steps", |
|
|
logging_steps=100, |
|
|
eval_strategy="epoch", |
|
|
save_strategy="epoch", |
|
|
load_best_model_at_end=True, |
|
|
metric_for_best_model="f1_fake", |
|
|
greater_is_better=True, |
|
|
fp16=True, |
|
|
report_to="none", |
|
|
dataloader_num_workers=4, |
|
|
save_total_limit=1, |
|
|
) |
|
|
|
|
|
|
|
|
print("--- 正在初始化 CustomTrainer ---") |
|
|
trainer = CustomTrainer( |
|
|
model=model, |
|
|
args=training_args, |
|
|
train_dataset=tokenized_train_dataset, |
|
|
eval_dataset=tokenized_eval_dataset, |
|
|
tokenizer=tokenizer, |
|
|
compute_metrics=compute_metrics, |
|
|
class_weights=class_weights |
|
|
) |
|
|
|
|
|
|
|
|
print("\n--- 🚀 开始训练【模型 B】(使用完整数据 + 加权损失) 🚀 ---") |
|
|
train_result = trainer.train() |
|
|
|
|
|
|
|
|
print("\n--- 训练完成!正在评估【模型 B】... ---") |
|
|
final_metrics = trainer.evaluate(eval_dataset=tokenized_eval_dataset) |
|
|
print("--- 【模型 B】最终验证集评估结果 ---") |
|
|
print(final_metrics) |
|
|
|
|
|
print("\n--- (Fake vs Real) 分类报告 ---") |
|
|
predictions = trainer.predict(tokenized_eval_dataset) |
|
|
final_preds = np.argmax(predictions.predictions, axis=-1) |
|
|
print(classification_report( |
|
|
eval_df['label'], |
|
|
final_preds, |
|
|
target_names=['real (0)', 'fake (1)'], |
|
|
digits=4 |
|
|
)) |
|
|
|
|
|
print("\n--- 正在保存【模型 B】的最佳 checkpoint ... ---") |
|
|
trainer.save_model(NEW_MODEL_SAVE_PATH) |
|
|
print(f"模型已保存到: {NEW_MODEL_SAVE_PATH}") |
|
|
print("--- 脚本 train_model_B.py 运行结束 ---") |