kaisex commited on
Commit
16ba90b
·
verified ·
1 Parent(s): ad6d812

Upload 3 files

Browse files
Training Code/DeBERTaFakeNews.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ from datasets import Dataset
5
+ import torch
6
+ from transformers import (
7
+ DebertaTokenizer,
8
+ DebertaForSequenceClassification,
9
+ TrainingArguments,
10
+ Trainer,
11
+ DataCollatorWithPadding
12
+ )
13
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
14
+
15
# Free any cached GPU memory left over from previous runs.
torch.cuda.empty_cache()

# Load the dataset.
# Fixed: the original used Windows-style backslashes ("\\home\\kaisex\\...")
# for what is clearly a Linux home directory; the literal string
# "\home\kaisex\..." cannot resolve on Linux.
df = pd.read_csv("/home/kaisex/Desktop/Deb/Proper_Dataset.csv")

# Normalise labels to upper case and map to integers: FAKE -> 0, REAL -> 1.
# Any row whose label is not FAKE/REAL maps to NaN and is dropped below,
# along with rows missing text.
df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
df.dropna(subset=['text', 'label'], inplace=True)

# Split into train and test (80/20), stratified so both splits keep the
# same FAKE/REAL ratio; fixed seed for reproducibility.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
28
+
29
# Tokenisation: truncate to short sequences and leave padding to the data
# collator, which pads dynamically per batch.
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

def tokenize_function(example):
    """Tokenize a batch of examples; truncation only (max 128 tokens), no padding."""
    encode_kwargs = {
        "truncation": True,
        "max_length": 128,  # reduced to 128 to prevent overflow
        "padding": False,
    }
    return tokenizer(example["text"], **encode_kwargs)

# Apply the tokenizer to both splits in batched mode.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)
40
+
41
# Load the pretrained model with gradient checkpointing (FP32 precision).
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base",
    num_labels=2,  # binary classification head: FAKE (0) vs REAL (1)
    torch_dtype=torch.float32  # explicitly use FP32 to prevent overflow
)
# Trade compute for memory: recompute activations during the backward pass
# instead of storing them, so larger models fit in limited GPU memory.
model.gradient_checkpointing_enable()
48
+
49
# Training arguments tuned for a small GPU: tiny per-device batches with
# gradient accumulation, FP16 disabled (without FP16 to avoid overflow).
training_args = TrainingArguments(
    output_dir="./deberta_fake_news",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size = 2 * 4 = 8
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="steps",  # evaluate every eval_steps
    eval_steps=500,
    save_strategy="steps",  # checkpoint cadence matches eval, required by load_best_model_at_end
    save_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    fp16=False,  # disabled FP16 to prevent overflow
    max_grad_norm=1.0,  # gradient clipping
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # pick best checkpoint by F1 (see compute_metrics)
    greater_is_better=True,  # higher F1 is better
    report_to="none",  # no external experiment trackers
    optim="adamw_torch"  # using standard AdamW instead of Adafactor
)
72
+
73
# Data collator with dynamic padding: pads each batch to its longest sequence.
# NOTE(review): with padding=True (i.e. "longest"), DataCollatorWithPadding
# ignores max_length — it only applies with padding="max_length"; confirm
# against the installed transformers version.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    max_length=128,
    pad_to_multiple_of=8  # pad lengths to multiples of 8 (tensor-core friendly)
)
80
+
81
def compute_metrics(pred):
    """Compute evaluation metrics from a Trainer prediction output.

    Args:
        pred: object exposing `label_ids` (true integer labels) and
            `predictions` (raw logits, shape (n_samples, 2)).

    Returns:
        dict with accuracy, precision, recall and f1 for the positive
        class (REAL = 1, sklearn's default binary average).
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        # zero_division=0 keeps the metrics well-defined (and warning-free)
        # when one class is never predicted during early evaluation steps.
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0)
    }
91
+
92
# Trainer wiring together the model, data, collator and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # test split doubles as eval set during training
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start training.
print("Starting training...")
trainer.train()
print("Training completed!")
107
+
108
# Evaluate on the held-out test split.
print("\nEvaluating model...")
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))

# Save the model and tokenizer together so the directory can later be
# reloaded via from_pretrained().
# Fixed: the original used Windows-style backslashes ("\\home\\kaisex\\...")
# for a Linux home directory; the literal string cannot resolve on Linux.
save_path = "/home/kaisex/Desktop/Deb/deberta_fake_news_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")
120
+
121
+
122
+ # The commented-out code below was used to generate the model's evaluation results. It was run separately after training because of how long training took.
123
+
124
+ # import torch
125
+ # import numpy as np
126
+ # import pandas as pd
127
+ # import matplotlib.pyplot as plt
128
+ # from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer
129
+ # from datasets import Dataset
130
+ # from sklearn.metrics import (
131
+ # classification_report,
132
+ # confusion_matrix,
133
+ # ConfusionMatrixDisplay,
134
+ # roc_curve,
135
+ # auc
136
+ # )
137
+
138
+ # # Paths
139
+ # model_path = "deberta_fake_news_model"
140
+ # data_path = "C:\\Users\\student\\Downloads\\Proper_Dataset.csv"
141
+
142
+ # # Load model and tokenizer
143
+ # model = DebertaForSequenceClassification.from_pretrained(model_path)
144
+ # tokenizer = DebertaTokenizer.from_pretrained(model_path)
145
+
146
+ # # Load dataset and fix labels
147
+ # df = pd.read_csv(data_path)
148
+ # df['label'] = df['label'].str.upper().map({'FAKE': 0, 'REAL': 1})
149
+ # df.dropna(subset=['text', 'label'], inplace=True)
150
+
151
+ # # Use 20% as test set
152
+ # from sklearn.model_selection import train_test_split
153
+ # _, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
154
+
155
+ # # Create Hugging Face Dataset
156
+ # test_dataset = Dataset.from_pandas(test_df)
157
+
158
+ # # Tokenization
159
+ # def tokenize_function(example):
160
+ # return tokenizer(
161
+ # example["text"],
162
+ # truncation=True,
163
+ # max_length=128,
164
+ # padding="max_length"
165
+ # )
166
+
167
+ # test_dataset = test_dataset.map(tokenize_function, batched=True)
168
+
169
+ # # Set format for PyTorch
170
+ # test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
171
+
172
+ # # Inference using Trainer
173
+ # trainer = Trainer(model=model)
174
+ # predictions = trainer.predict(test_dataset)
175
+
176
+ # # Predictions
177
+ # y_true = predictions.label_ids
178
+ # y_pred = np.argmax(predictions.predictions, axis=1)
179
+ # y_probs = predictions.predictions[:, 1]
180
+
181
+ # # Ensure no None
182
+ # if y_true is None or y_pred is None:
183
+ # raise ValueError("Prediction failed: y_true or y_pred is None.")
184
+
185
+ # # Classification Report
186
+ # print("\nClassification Report:\n")
187
+ # print(classification_report(y_true, y_pred, target_names=["FAKE", "REAL"]))
188
+
189
+ # # Confusion Matrix
190
+ # cm = confusion_matrix(y_true, y_pred)
191
+ # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["FAKE", "REAL"])
192
+ # disp.plot(cmap=plt.cm.Purples)
193
+ # plt.title("Confusion Matrix")
194
+ # plt.savefig("confusion_matrix.png")
195
+ # plt.show()
196
+
197
+ # # ROC Curve
198
+ # fpr, tpr, _ = roc_curve(y_true, y_probs)
199
+ # roc_auc = auc(fpr, tpr)
200
+
201
+ # plt.figure()
202
+ # plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
203
+ # plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
204
+ # plt.xlabel("False Positive Rate")
205
+ # plt.ylabel("True Positive Rate")
206
+ # plt.title("ROC Curve")
207
+ # plt.legend(loc="lower right")
208
+ # plt.savefig("roc_curve.png")
209
+ # plt.show()
210
+
Training Code/bertFakeNewsPart2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Training Code/vitModelFakeNews.ipynb ADDED
The diff for this file is too large to render. See raw diff