File size: 6,380 Bytes

16ba90b

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# clears memory in gpu
torch.cuda.empty_cache()

# Loadin the dataset

df = pd.read_csv("\\home\\kaisex\\Desktop\\Deb\\Proper_Dataset.csv")
df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
df.dropna(subset=['text', 'label'], inplace=True)

# Splittin into train and test
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenization with shorter sequences
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=128,  # Reduced to 128 to prevent overflow
        padding=False
    )
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Loadin model with gradient checkpointing (FP32 precision)
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base",
    num_labels=2,
    torch_dtype=torch.float32  # Explicitly use FP32 to prevent overflow
)
model.gradient_checkpointing_enable()

# Optimized training arguments (without FP16)
training_args = TrainingArguments(
    output_dir="./deberta_fake_news",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    fp16=False,  # Disabled FP16 to prevent overflow
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    optim="adamw_torch"  # Using standard AdamW instead of Adafactor
)

# Data collator with dynamic padding
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    max_length=128,
    pad_to_multiple_of=8
)

# Metrics calculation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# Trainer with optimizations
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Startin the training
print("Starting training...")
trainer.train()
print("Training completed!")

# Evaluatin
print("\nEvaluating model...")
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))

# Save model and tokenizer
save_path = "\\home\\kaisex\\Desktop\\Deb\\deberta_fake_news_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}") 


# we USED BELOW CODE TO GET THE RESULTS OF THE MODEL (WE RAN IT SEPARATELY AFTER TRAINING COZ OF TIME IT TOOK TO TRAIN THE MODEL)

# import torch
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer
# from datasets import Dataset
# from sklearn.metrics import (
#     classification_report,
#     confusion_matrix,
#     ConfusionMatrixDisplay,
#     roc_curve,
#     auc
# )

# # Paths
# model_path = "deberta_fake_news_model"
# data_path = "C:\\Users\\student\\Downloads\\Proper_Dataset.csv"

# # Load model and tokenizer
# model = DebertaForSequenceClassification.from_pretrained(model_path)
# tokenizer = DebertaTokenizer.from_pretrained(model_path)

# # Load dataset and fix labels
# df = pd.read_csv(data_path)
# df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
# df.dropna(subset=['text', 'label'], inplace=True)

# # Use 20% as test set
# from sklearn.model_selection import train_test_split
# _, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# # Create Hugging Face Dataset
# test_dataset = Dataset.from_pandas(test_df)

# # Tokenization
# def tokenize_function(example):
#     return tokenizer(
#         example["text"],
#         truncation=True,
#         max_length=128,
#         padding="max_length"
#     )

# test_dataset = test_dataset.map(tokenize_function, batched=True)

# # Set format for PyTorch
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# # Inference using Trainer
# trainer = Trainer(model=model)
# predictions = trainer.predict(test_dataset)

# # Predictions
# y_true = predictions.label_ids
# y_pred = np.argmax(predictions.predictions, axis=1)
# y_probs = predictions.predictions[:, 1]

# # Ensure no None
# if y_true is None or y_pred is None:
#     raise ValueError("Prediction failed: y_true or y_pred is None.")

# # Classification Report
# print("\nClassification Report:\n")
# print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))

# # Confusion Matrix
# cm = confusion_matrix(y_true, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["FAKE", "REAL"])
# disp.plot(cmap=plt.cm.Purples)
# plt.title("Confusion Matrix")
# plt.savefig("confusion_matrix.png")
# plt.show()

# # ROC Curve
# fpr, tpr, _ = roc_curve(y_true, y_probs)
# roc_auc = auc(fpr, tpr)

# plt.figure()
# plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve")
# plt.legend(loc="lower right")
# plt.savefig("roc_curve.png")
# plt.show()