# BertAndDeberta / Training Code / DeBERTaFakeNews.py
# kaisex's picture
# Upload 3 files
# 16ba90b verified
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
from transformers import (
DebertaTokenizer,
DebertaForSequenceClassification,
TrainingArguments,
Trainer,
DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# Release unused cached GPU memory before training starts.
torch.cuda.empty_cache()

# Load the dataset.
# The path uses forward slashes: the original used backslash separators on a
# POSIX-style "/home/..." path, which POSIX systems treat as literal filename
# characters. Forward slashes are accepted on Windows as well.
df = pd.read_csv("/home/kaisex/Desktop/Deb/Proper_Dataset.csv")

# Encode labels as class ids: FAKE -> 0, REAL -> 1 (case-insensitive).
# Any other label value becomes NaN and is dropped together with rows that
# have no text.
df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
df.dropna(subset=['text', 'label'], inplace=True)
# If any label was unmapped, `map` left the column as float (NaN forces a
# float dtype). Cast back so the labels are integer class ids.
df['label'] = df['label'].astype(int)

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
# Tokenization with shorter sequences
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")


def tokenize_function(example):
    """Tokenize the ``text`` field of a batch of examples.

    Intended for use with ``Dataset.map(..., batched=True)``, which is how it
    is called below.

    Args:
        example: A batch from the dataset; only ``example["text"]`` is read.

    Returns:
        The output of the module-level ``tokenizer`` for the batch, with each
        sequence truncated to at most 128 tokens and no padding applied.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=128,  # Reduced to 128 to prevent overflow
        padding=False,  # padding is left to the data collator at batch time
    )


train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
# Load the pretrained DeBERTa base checkpoint with a 2-class classification
# head. Weights are requested explicitly in FP32 (the original author's
# choice, to prevent overflow).
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base", num_labels=2, torch_dtype=torch.float32
)

# Gradient checkpointing: recompute activations during the backward pass
# instead of storing them, trading extra compute for lower memory use.
model.gradient_checkpointing_enable()
# Training configuration (full precision; FP16 is deliberately off).
training_args = TrainingArguments(
    # --- Output and logging ---
    output_dir="./deberta_fake_news",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    # --- Batching: 2 samples per step x 4 accumulation steps per device ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # --- Optimisation ---
    optim="adamw_torch",  # Standard AdamW instead of Adafactor
    learning_rate=2e-5,
    weight_decay=0.01,
    max_grad_norm=1.0,
    num_train_epochs=3,
    fp16=False,  # Disabled FP16 to prevent overflow
    # --- Evaluation and checkpointing, both every 500 steps ---
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    # --- Best-checkpoint selection: highest F1 wins ---
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
# Batch-time (dynamic) padding: sequences were tokenized without padding, so
# the collator pads each batch when it is assembled.
# NOTE(review): with padding=True the max_length argument probably has no
# effect; it is kept exactly as in the original. Confirm before removing.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    max_length=128,
    padding=True,
)
# Metrics calculation
def compute_metrics(pred):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over axis 1 is
            taken as the predicted class).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use scikit-learn's default binary averaging,
        i.e. they are reported for class 1 (REAL in this script's label
        mapping).
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    # zero_division=0 yields the same 0.0 as the default when a metric is
    # undefined (e.g. the model predicts a single class early in training),
    # but without emitting an UndefinedMetricWarning on every evaluation.
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
    }
# Wire model, configuration, data and metrics into the Trainer.
# NOTE(review): the held-out test split doubles as the evaluation set used
# for best-checkpoint selection, so the final test scores may be optimistic.
# Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning
print("Starting training...")
trainer.train()
print("Training completed!")
# Final evaluation on the held-out test split
print("\nEvaluating model...")
predictions = trainer.predict(test_dataset)

# True class ids, and predicted class = arg-max over the per-class scores
y_true, y_pred = (
    predictions.label_ids,
    np.argmax(predictions.predictions, axis=1),
)

# Per-class precision / recall / F1 table (class 0 = FAKE, class 1 = REAL)
report = classification_report(y_true, y_pred, target_names=["FAKE", "REAL"])
print(report)
# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded later with `from_pretrained(save_path)`.
# The path uses forward slashes: the original used backslash separators on a
# POSIX-style "/home/..." path, which POSIX systems treat as literal filename
# characters. Forward slashes are accepted on Windows as well.
save_path = "/home/kaisex/Desktop/Deb/deberta_fake_news_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")
# NOTE: The commented-out code below was used to produce the model's evaluation
# results. It was run separately after training because training took a long time.
# import torch
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer
# from datasets import Dataset
# from sklearn.metrics import (
# classification_report,
# confusion_matrix,
# ConfusionMatrixDisplay,
# roc_curve,
# auc
# )
# # Paths
# model_path = "deberta_fake_news_model"
# data_path = "C:\\Users\\student\\Downloads\\Proper_Dataset.csv"
# # Load model and tokenizer
# model = DebertaForSequenceClassification.from_pretrained(model_path)
# tokenizer = DebertaTokenizer.from_pretrained(model_path)
# # Load dataset and fix labels
# df = pd.read_csv(data_path)
# df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
# df.dropna(subset=['text', 'label'], inplace=True)
# # Use 20% as test set
# from sklearn.model_selection import train_test_split
# _, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
# # Create Hugging Face Dataset
# test_dataset = Dataset.from_pandas(test_df)
# # Tokenization
# def tokenize_function(example):
# return tokenizer(
# example["text"],
# truncation=True,
# max_length=128,
# padding="max_length"
# )
# test_dataset = test_dataset.map(tokenize_function, batched=True)
# # Set format for PyTorch
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# # Inference using Trainer
# trainer = Trainer(model=model)
# predictions = trainer.predict(test_dataset)
# # Predictions
# y_true = predictions.label_ids
# y_pred = np.argmax(predictions.predictions, axis=1)
# y_probs = predictions.predictions[:, 1]
# # Ensure no None
# if y_true is None or y_pred is None:
# raise ValueError("Prediction failed: y_true or y_pred is None.")
# # Classification Report
# print("\nClassification Report:\n")
# print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))
# # Confusion Matrix
# cm = confusion_matrix(y_true, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["FAKE", "REAL"])
# disp.plot(cmap=plt.cm.Purples)
# plt.title("Confusion Matrix")
# plt.savefig("confusion_matrix.png")
# plt.show()
# # ROC Curve
# fpr, tpr, _ = roc_curve(y_true, y_probs)
# roc_auc = auc(fpr, tpr)
# plt.figure()
# plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve")
# plt.legend(loc="lower right")
# plt.savefig("roc_curve.png")
# plt.show()