from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
# Release any cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

# Load the dataset.
# The path is written with forward slashes: the original literal used
# backslashes ("\\home\\kaisex\\..."), which POSIX systems treat as ordinary
# filename characters rather than directory separators. Forward slashes are
# accepted on Windows as well.
dataset_path = Path("/home/kaisex/Desktop/Deb/Proper_Dataset.csv")
df = pd.read_csv(dataset_path)

# Encode the string labels as class ids (FAKE -> 0, REAL -> 1). Values that are
# not in the mapping become NaN and are dropped together with rows lacking text.
df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
df.dropna(subset=['text', 'label'], inplace=True)
# map() yields a float column whenever it produced a NaN; cast back to int so
# the labels reach the model and the metrics as integer class ids.
df['label'] = df['label'].astype(int)

# Split into train and test (80/20), stratified on the label with a fixed seed.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer used by tokenize_function and the data collator.
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with the module-level tokenizer.

    Sequences are truncated to 128 tokens and left unpadded here; padding is
    applied later, per batch, by the data collator.

    Args:
        example: Mapping with a ``"text"`` entry. It is called through
            ``Dataset.map(..., batched=True)``, so this is presumably a batch
            of texts rather than a single string.

    Returns:
        Whatever the tokenizer returns for the given text(s).
    """
    encode_options = {
        "truncation": True,
        "max_length": 128,  # Reduced to 128 to prevent overflow
        "padding": False,
    }
    return tokenizer(example["text"], **encode_options)
# Tokenize both splits in batches.
map_options = {"batched": True}
train_dataset = train_dataset.map(tokenize_function, **map_options)
test_dataset = test_dataset.map(tokenize_function, **map_options)

# Load the two-class DeBERTa classifier in FP32 and turn on gradient
# checkpointing.
model_options = {
    "num_labels": 2,
    "torch_dtype": torch.float32,  # Explicitly use FP32 to prevent overflow
}
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base",
    **model_options,
)
model.gradient_checkpointing_enable()

# Training arguments, grouped by concern (FP16 stays off).
optimizer_options = {
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "fp16": False,  # Disabled FP16 to prevent overflow
    "optim": "adamw_torch",  # Using standard AdamW instead of Adafactor
}
# Batches of 2 with 4 accumulation steps: 8 samples per optimizer step per device.
batching_options = {
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 3,
}
# Evaluation and checkpointing share the same 500-step cadence; the best
# checkpoint by F1 is reloaded when training ends.
checkpoint_options = {
    "eval_strategy": "steps",
    "eval_steps": 500,
    "save_strategy": "steps",
    "save_steps": 500,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    "greater_is_better": True,
}
logging_options = {
    "logging_dir": "./logs",
    "logging_steps": 100,
    "report_to": "none",
}
training_args = TrainingArguments(
    output_dir="./deberta_fake_news",
    **optimizer_options,
    **batching_options,
    **checkpoint_options,
    **logging_options,
)

# Data collator with dynamic (per-batch) padding.
collator_options = {
    "padding": True,
    "max_length": 128,
    "pad_to_multiple_of": 8,
}
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, **collator_options)
# Metrics calculation
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (the model outputs, with one score per class along
            axis 1).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    labels = pred.label_ids
    logits = pred.predictions
    # When a model returns extra outputs besides the logits, the predictions
    # arrive as a tuple whose first element holds the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)
    # zero_division=0 yields the same 0.0 score as the default when a class is
    # never predicted, but without emitting a warning on every evaluation.
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
    }
# Build the Trainer from the model, arguments, datasets, collator and metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start the training.
print("Starting training...")
trainer.train()
print("Training completed!")

# Evaluate on the held-out test split.
print("\nEvaluating model...")
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
# labels is given explicitly so that target_names always lines up with the
# class ids 0 (FAKE) and 1 (REAL), whichever classes appear in the split.
print(classification_report(y_true, y_pred, labels=[0, 1], target_names=["FAKE", "REAL"]))

# Save the model and tokenizer.
# The path is written with forward slashes: the original backslash-separated
# literal would, on POSIX systems, create a directory with a literal
# backslash-laden name inside the current working directory.
save_path = Path("/home/kaisex/Desktop/Deb/deberta_fake_news_model")
trainer.save_model(str(save_path))
tokenizer.save_pretrained(str(save_path))
print(f"Model saved to {save_path}")
# The code below was used to produce the model's results. It was run
# separately, after training, because training the model took a long time.
# import torch
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer
# from datasets import Dataset
# from sklearn.metrics import (
#     classification_report,
#     confusion_matrix,
#     ConfusionMatrixDisplay,
#     roc_curve,
#     auc
# )
# # Paths
# model_path = "deberta_fake_news_model"
# data_path = "C:\\Users\\student\\Downloads\\Proper_Dataset.csv"
# # Load model and tokenizer
# model = DebertaForSequenceClassification.from_pretrained(model_path)
# tokenizer = DebertaTokenizer.from_pretrained(model_path)
# # Load dataset and fix labels
# df = pd.read_csv(data_path)
# df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
# df.dropna(subset=['text', 'label'], inplace=True)
# # Use 20% as test set
# from sklearn.model_selection import train_test_split
# _, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
# # Create Hugging Face Dataset
# test_dataset = Dataset.from_pandas(test_df)
# # Tokenization
# def tokenize_function(example):
#     return tokenizer(
#         example["text"],
#         truncation=True,
#         max_length=128,
#         padding="max_length"
#     )
# test_dataset = test_dataset.map(tokenize_function, batched=True)
# # Set format for PyTorch
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# # Inference using Trainer
# trainer = Trainer(model=model)
# predictions = trainer.predict(test_dataset)
# # Predictions
# y_true = predictions.label_ids
# y_pred = np.argmax(predictions.predictions, axis=1)
# y_probs = predictions.predictions[:, 1]
# # NOTE(review): y_probs holds the raw class-1 logits, not probabilities;
# # consider applying a softmax before computing the ROC curve.
# # Ensure no None
# if y_true is None or y_pred is None:
#     raise ValueError("Prediction failed: y_true or y_pred is None.")
# # Classification Report
# print("\nClassification Report:\n")
# print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))
# # Confusion Matrix
# cm = confusion_matrix(y_true, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["FAKE", "REAL"])
# disp.plot(cmap=plt.cm.Purples)
# plt.title("Confusion Matrix")
# plt.savefig("confusion_matrix.png")
# plt.show()
# # ROC Curve
# fpr, tpr, _ = roc_curve(y_true, y_probs)
# roc_auc = auc(fpr, tpr)
# plt.figure()
# plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve")
# plt.legend(loc="lower right")
# plt.savefig("roc_curve.png")
# plt.show()