BertAndDeberta / Training Code /DeBERTaFakeNews.py

Upload 3 files

16ba90b verified 7 months ago

6.38 kB

	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from datasets import Dataset
	import torch
	from transformers import (
	DebertaTokenizer,
	DebertaForSequenceClassification,
	TrainingArguments,
	Trainer,
	DataCollatorWithPadding
	)
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

	# Clear PyTorch's cached GPU memory before anything is loaded.
	torch.cuda.empty_cache()

	# Load the dataset.
	# The path is written with forward slashes on purpose: Windows resolves it
	# exactly like the backslash-separated form, while on POSIX systems it is a
	# real absolute path rather than one file name containing literal backslashes.
	df = pd.read_csv("/home/kaisex/Desktop/Deb/Proper_Dataset.csv")

	# Encode the string labels as class ids: FAKE -> 0, REAL -> 1 (case-insensitive).
	# Any other value, including a missing one, maps to NaN and is dropped below
	# together with rows that have no text.
	df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
	df.dropna(subset=['text', 'label'], inplace=True)
	# A single NaN above makes pandas store the column as float; cast back so the
	# labels reach the model and the metrics as integer class ids.
	df['label'] = df['label'].astype("int64")

	# Stratified 80/20 train/test split with a fixed seed, then wrap both parts
	# as Hugging Face datasets.
	train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
	train_dataset = Dataset.from_pandas(train_df)
	test_dataset = Dataset.from_pandas(test_df)

	# Tokenization with short sequences (the original author reduced the limit to
	# 128 tokens to prevent overflow).
	tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")


	def tokenize_function(example):
	    """Tokenize the ``text`` field of *example* with the module-level tokenizer.

	    Inputs longer than 128 tokens are truncated; no padding is applied here.
	    """
	    texts = example["text"]
	    return tokenizer(texts, truncation=True, max_length=128, padding=False)


	# Run the tokenizer over both splits in batched mode (train first, then test).
	train_dataset, test_dataset = (
	    split.map(tokenize_function, batched=True)
	    for split in (train_dataset, test_dataset)
	)

	# Load DeBERTa-base with two output labels. FP32 weights are requested
	# explicitly (the original author's choice, to prevent overflow).
	model = DebertaForSequenceClassification.from_pretrained(
	    "microsoft/deberta-base", num_labels=2, torch_dtype=torch.float32
	)
	# Enable gradient checkpointing on the loaded model.
	model.gradient_checkpointing_enable()

	# Training configuration (FP16 is deliberately not used).
	training_args = TrainingArguments(
	    output_dir="./deberta_fake_news",
	    # --- optimisation ---
	    optim="adamw_torch",  # standard AdamW instead of Adafactor
	    learning_rate=2e-5,
	    weight_decay=0.01,
	    max_grad_norm=1.0,
	    num_train_epochs=3,
	    fp16=False,  # FP16 disabled to prevent overflow
	    # --- batching: 2 samples per device, accumulated over 4 steps ---
	    per_device_train_batch_size=2,
	    per_device_eval_batch_size=2,
	    gradient_accumulation_steps=4,
	    # --- evaluate and checkpoint on the same 500-step schedule ---
	    eval_strategy="steps",
	    eval_steps=500,
	    save_strategy="steps",
	    save_steps=500,
	    # --- reload the checkpoint with the highest F1 when training ends ---
	    load_best_model_at_end=True,
	    metric_for_best_model="f1",
	    greater_is_better=True,
	    # --- logging ---
	    logging_dir='./logs',
	    logging_steps=100,
	    report_to="none",
	)

	# Collator that pads each batch dynamically, using the same tokenizer that
	# produced the inputs; padded lengths are rounded up to a multiple of 8.
	data_collator = DataCollatorWithPadding(
	    tokenizer=tokenizer,
	    pad_to_multiple_of=8,
	    max_length=128,
	    padding=True,
	)

	# Metrics reported at every evaluation
	def compute_metrics(pred):
	    """Return accuracy, precision, recall and F1 for one evaluation pass.

	    ``pred`` must expose ``predictions`` (per-class scores; the arg-max over
	    axis 1 is taken as the predicted class) and ``label_ids`` (the reference
	    labels). The result is a dict keyed by metric name.
	    """
	    y_true = pred.label_ids
	    y_hat = np.argmax(pred.predictions, axis=1)
	    scorers = (
	        ("accuracy", accuracy_score),
	        ("precision", precision_score),
	        ("recall", recall_score),
	        ("f1", f1_score),
	    )
	    return {name: scorer(y_true, y_hat) for name, scorer in scorers}

	# Wire model, configuration, data and metrics into a single Trainer.
	# NOTE(review): newer transformers releases appear to prefer
	# `processing_class=` over `tokenizer=` - confirm against the installed version.
	trainer = Trainer(
	    model=model,
	    args=training_args,
	    data_collator=data_collator,
	    tokenizer=tokenizer,
	    train_dataset=train_dataset,
	    eval_dataset=test_dataset,
	    compute_metrics=compute_metrics,
	)

	# Start the training run.
	print("Starting training...")
	trainer.train()
	print("Training completed!")

	# Evaluate on test_dataset and print a per-class report.
	# Class id 0 is reported as FAKE and class id 1 as REAL.
	print("\nEvaluating model...")
	predictions = trainer.predict(test_dataset)
	y_true = predictions.label_ids
	y_pred = np.argmax(predictions.predictions, axis=1)
	print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))

	# Save the model and the tokenizer into the same directory.
	# The path is written with forward slashes on purpose: Windows resolves it
	# exactly like the backslash-separated form, while on POSIX systems it is a
	# real absolute path rather than one directory name containing literal
	# backslashes.
	save_path = "/home/kaisex/Desktop/Deb/deberta_fake_news_model"
	trainer.save_model(save_path)
	tokenizer.save_pretrained(save_path)
	print(f"Model saved to {save_path}")


	# NOTE: The commented-out code below was used to produce the model's results.
	# It was run separately after training because training took a long time.

	# import torch
	# import numpy as np
	# import pandas as pd
	# import matplotlib.pyplot as plt
	# from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer
	# from datasets import Dataset
	# from sklearn.metrics import (
	# classification_report,
	# confusion_matrix,
	# ConfusionMatrixDisplay,
	# roc_curve,
	# auc
	# )

	# # Paths
	# model_path = "deberta_fake_news_model"
	# data_path = "C:\\Users\\student\\Downloads\\Proper_Dataset.csv"

	# # Load model and tokenizer
	# model = DebertaForSequenceClassification.from_pretrained(model_path)
	# tokenizer = DebertaTokenizer.from_pretrained(model_path)

	# # Load dataset and fix labels
	# df = pd.read_csv(data_path)
	# df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
	# df.dropna(subset=['text', 'label'], inplace=True)

	# # Use 20% as test set
	# from sklearn.model_selection import train_test_split
	# _, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

	# # Create Hugging Face Dataset
	# test_dataset = Dataset.from_pandas(test_df)

	# # Tokenization
	# def tokenize_function(example):
	# return tokenizer(
	# example["text"],
	# truncation=True,
	# max_length=128,
	# padding="max_length"
	# )

	# test_dataset = test_dataset.map(tokenize_function, batched=True)

	# # Set format for PyTorch
	# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

	# # Inference using Trainer
	# trainer = Trainer(model=model)
	# predictions = trainer.predict(test_dataset)

	# # Predictions
	# y_true = predictions.label_ids
	# y_pred = np.argmax(predictions.predictions, axis=1)
	# y_probs = predictions.predictions[:, 1]

	# # Ensure no None
	# if y_true is None or y_pred is None:
	# raise ValueError("Prediction failed: y_true or y_pred is None.")

	# # Classification Report
	# print("\nClassification Report:\n")
	# print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))

	# # Confusion Matrix
	# cm = confusion_matrix(y_true, y_pred)
	# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["FAKE", "REAL"])
	# disp.plot(cmap=plt.cm.Purples)
	# plt.title("Confusion Matrix")
	# plt.savefig("confusion_matrix.png")
	# plt.show()

	# # ROC Curve
	# fpr, tpr, _ = roc_curve(y_true, y_probs)
	# roc_auc = auc(fpr, tpr)

	# plt.figure()
	# plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
	# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
	# plt.xlabel("False Positive Rate")
	# plt.ylabel("True Positive Rate")
	# plt.title("ROC Curve")
	# plt.legend(loc="lower right")
	# plt.savefig("roc_curve.png")
	# plt.show()