Upload 3 files

Browse files

Files changed (3) hide show

Training Code/DeBERTaFakeNews.py +210 -0
Training Code/bertFakeNewsPart2.ipynb +0 -0
Training Code/vitModelFakeNews.ipynb +0 -0

Training Code/DeBERTaFakeNews.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from datasets import Dataset
+import torch
+from transformers import (
+    DebertaTokenizer,
+    DebertaForSequenceClassification,
+    TrainingArguments,
+    Trainer,
+    DataCollatorWithPadding
+)
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
+# clears memory in gpu
+torch.cuda.empty_cache()
+# Loadin the dataset
+df = pd.read_csv("\\home\\kaisex\\Desktop\\Deb\\Proper_Dataset.csv")
+df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
+df.dropna(subset=['text', 'label'], inplace=True)
+# Splittin into train and test
+train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
+train_dataset = Dataset.from_pandas(train_df)
+test_dataset = Dataset.from_pandas(test_df)
+# Load the pretrained DeBERTa-base tokenizer (inputs are truncated to 128 tokens in tokenize_function)
+tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
+def tokenize_function(example):
+    return tokenizer(
+        example["text"],
+        truncation=True,
+        max_length=128,  # Reduced to 128 to prevent overflow
+        padding=False
+    )
+train_dataset = train_dataset.map(tokenize_function, batched=True)  # tokenize in batches; tokenize_function does not pad
+test_dataset = test_dataset.map(tokenize_function, batched=True)  # same preprocessing for the test split
+# Load the model with gradient checkpointing (FP32 precision)
+model = DebertaForSequenceClassification.from_pretrained(
+    "microsoft/deberta-base",
+    num_labels=2,  # two classes: FAKE (0) and REAL (1)
+    torch_dtype=torch.float32  # Explicitly use FP32 to prevent overflow
+)
+model.gradient_checkpointing_enable()  # presumably trades extra compute for lower activation memory
+# Optimized training arguments (without FP16)
+training_args = TrainingArguments(
+    output_dir="./deberta_fake_news",
+    learning_rate=2e-5,
+    per_device_train_batch_size=2,
+    per_device_eval_batch_size=2,
+    gradient_accumulation_steps=4,  # 2 samples x 4 accumulation steps = 8 samples per optimizer step per device
+    num_train_epochs=3,
+    weight_decay=0.01,
+    eval_strategy="steps",
+    eval_steps=500,
+    save_strategy="steps",
+    save_steps=500,  # same schedule as eval_steps; presumably needed so the best checkpoint can be reloaded
+    logging_dir='./logs',
+    logging_steps=100,
+    fp16=False,  # Disabled FP16 to prevent overflow
+    max_grad_norm=1.0,
+    load_best_model_at_end=True,
+    metric_for_best_model="f1",  # matches the "f1" key returned by compute_metrics
+    greater_is_better=True,
+    report_to="none",
+    optim="adamw_torch"  # Using standard AdamW instead of Adafactor
+)
+# Data collator with dynamic padding
+data_collator = DataCollatorWithPadding(
+    tokenizer=tokenizer,
+    padding=True,  # pad at collation time (tokenize_function leaves sequences unpadded)
+    max_length=128,  # NOTE(review): may have no effect when padding=True — confirm against the tokenizer padding docs
+    pad_to_multiple_of=8
+)
+# Metrics calculation
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = np.argmax(pred.predictions, axis=1)
+    return {
+        "accuracy": accuracy_score(labels, preds),
+        "precision": precision_score(labels, preds),
+        "recall": recall_score(labels, preds),
+        "f1": f1_score(labels, preds)
+    }
+# Trainer wiring together the model, datasets, collator and metrics
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=test_dataset,  # NOTE: the test split is also the evaluation set used for best-checkpoint selection
+    tokenizer=tokenizer,  # NOTE(review): newer transformers releases reportedly prefer `processing_class` — confirm for the installed version
+    data_collator=data_collator,
+    compute_metrics=compute_metrics
+)
+# Start the training
+print("Starting training...")
+trainer.train()
+print("Training completed!")
+# Evaluate on the test split
+print("\nEvaluating model...")
+predictions = trainer.predict(test_dataset)
+y_true = predictions.label_ids
+y_pred = np.argmax(predictions.predictions, axis=1)  # predicted class = index of the highest score
+print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))  # names ordered to match label ids 0 and 1
+# Save model and tokenizer
+save_path = "\\home\\kaisex\\Desktop\\Deb\\deberta_fake_news_model"
+trainer.save_model(save_path)
+tokenizer.save_pretrained(save_path)
+print(f"Model saved to {save_path}")
+# The code below was used to produce the model's results (we ran it separately after training because training took a long time).
+# import torch
+# import numpy as np
+# import pandas as pd
+# import matplotlib.pyplot as plt
+# from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer
+# from datasets import Dataset
+# from sklearn.metrics import (
+#     classification_report,
+#     confusion_matrix,
+#     ConfusionMatrixDisplay,
+#     roc_curve,
+#     auc
+# )
+# # Paths
+# model_path = "deberta_fake_news_model"
+# data_path = "C:\\Users\\student\\Downloads\\Proper_Dataset.csv"
+# # Load model and tokenizer
+# model = DebertaForSequenceClassification.from_pretrained(model_path)
+# tokenizer = DebertaTokenizer.from_pretrained(model_path)
+# # Load dataset and fix labels
+# df = pd.read_csv(data_path)
+# df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
+# df.dropna(subset=['text', 'label'], inplace=True)
+# # Use 20% as test set
+# from sklearn.model_selection import train_test_split
+# _, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
+# # Create Hugging Face Dataset
+# test_dataset = Dataset.from_pandas(test_df)
+# # Tokenization
+# def tokenize_function(example):
+#     return tokenizer(
+#         example["text"],
+#         truncation=True,
+#         max_length=128,
+#         padding="max_length"
+#     )
+# test_dataset = test_dataset.map(tokenize_function, batched=True)
+# # Set format for PyTorch
+# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+# # Inference using Trainer
+# trainer = Trainer(model=model)
+# predictions = trainer.predict(test_dataset)
+# # Predictions
+# y_true = predictions.label_ids
+# y_pred = np.argmax(predictions.predictions, axis=1)
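+# y_probs = predictions.predictions[:, 1]  # NOTE(review): presumably a raw class-1 logit, not a probability — consider applying softmax before the ROC curve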
+# # Ensure no None
+# if y_true is None or y_pred is None:
+#     raise ValueError("Prediction failed: y_true or y_pred is None.")
+# # Classification Report
+# print("\nClassification Report:\n")
+# print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))
+# # Confusion Matrix
+# cm = confusion_matrix(y_true, y_pred)
+# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["FAKE", "REAL"])
+# disp.plot(cmap=plt.cm.Purples)
+# plt.title("Confusion Matrix")
+# plt.savefig("confusion_matrix.png")
+# plt.show()
+# # ROC Curve
+# fpr, tpr, _ = roc_curve(y_true, y_probs)
+# roc_auc = auc(fpr, tpr)
+# plt.figure()
+# plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
+# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
+# plt.xlabel("False Positive Rate")
+# plt.ylabel("True Positive Rate")
+# plt.title("ROC Curve")
+# plt.legend(loc="lower right")
+# plt.savefig("roc_curve.png")
+# plt.show()

Training Code/bertFakeNewsPart2.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

Training Code/vitModelFakeNews.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff