| import pandas as pd |
| import torch |
| from torch.utils.data import Dataset, DataLoader |
| from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup |
| from torch.optim import AdamW |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import classification_report, confusion_matrix, accuracy_score |
| import numpy as np |
| import time |
| import os |
|
|
| |
# Load the raw CSV. The first row of the file is skipped and the two columns
# are given fixed names, so the rest of the script does not depend on how the
# file spells its header.
df = pd.read_csv(
    'mail_data.csv',
    names=['Category', 'Message'],
    header=None,
    skiprows=1,
)

# Encode the class names as integers. Any category other than 'ham'/'spam'
# maps to NaN, which would silently turn the whole column into floats and
# later break the stratified split and the conversion to torch.long.
df['label'] = df['Category'].map({'ham': 0, 'spam': 1})

# Drop rows that cannot be used for training (unknown category or missing
# message) and say so, instead of letting NaN flow into the model inputs.
_rows_before = len(df)
df = df.dropna(subset=['label', 'Message']).reset_index(drop=True)
if len(df) != _rows_before:
    print(f"Dropped {_rows_before - len(df)} rows with a missing message or unknown category")
df['label'] = df['label'].astype('int64')

# 80/20 split, stratified on the label so both parts keep the class ratio.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Message'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values,
)
|
|
| |
class EmailDataset(Dataset):
    """Map-style dataset that tokenizes one message per requested item.

    Args:
        texts: Indexable collection of messages; each item is converted with
            ``str`` before tokenization.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, ...)``; it must return
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of messages."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label for ``item``."""
        text = str(self.texts[item])

        # Use the tokenizer's public call interface. The private
        # ``_encode_plus`` takes internal strategy arguments, not the
        # user-facing ``padding=`` / ``truncation=`` options passed here.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten it away so
            # the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[item], dtype=torch.long),
        }
|
|
| |
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Checkpoint shared by the tokenizer and the classifier.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Only the training loader is shuffled; the test loader keeps file order.
train_data_loader = DataLoader(
    EmailDataset(train_texts, train_labels, tokenizer),
    batch_size=16,
    shuffle=True,
)
test_data_loader = DataLoader(
    EmailDataset(test_texts, test_labels, tokenizer),
    batch_size=16,
    shuffle=False,
)

# Two output labels: ham (0) and spam (1).
model = DistilBertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels=2,
).to(device)

# Optimisation schedule: linear decay over every training step, no warm-up.
EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
loss_fn = torch.nn.CrossEntropyLoss().to(device)
|
|
| |
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            the result must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: Unused; the loss comes from the model output. Kept so existing
            call sites keep working.
        optimizer: Stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor equal
        to correct predictions divided by ``n_examples``; ``mean_loss`` is the
        mean of the per-batch losses, or NaN when the loader yields no batches.
    """
    model = model.train()
    batch_losses = []
    # Start from a tensor rather than the int 0 so the final ``.double()`` also
    # works when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        preds = outputs.logits.max(dim=1).indices

        correct_predictions += (preds == labels).sum()
        batch_losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # np.mean([]) would warn; return NaN explicitly for an empty epoch.
    mean_loss = np.mean(batch_losses) if batch_losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss
|
|
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            the result must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: Unused; the loss comes from the model output. Kept so existing
            call sites keep working.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor equal
        to correct predictions divided by ``n_examples``; ``mean_loss`` is the
        mean of the per-batch losses, or NaN when the loader yields no batches.
    """
    model = model.eval()
    batch_losses = []
    # Start from a tensor rather than the int 0 so the final ``.double()`` also
    # works when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            preds = outputs.logits.max(dim=1).indices

            correct_predictions += (preds == labels).sum()
            batch_losses.append(outputs.loss.item())

    # np.mean([]) would warn; return NaN explicitly for an empty loader.
    mean_loss = np.mean(batch_losses) if batch_losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss
|
|
# Fine-tune for EPOCHS passes, reporting train and held-out metrics after
# each pass. The test split doubles as the validation set here.
print("Starting training...")
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')

    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(train_texts),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model=model,
        data_loader=test_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(test_texts),
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
|
|
| |
def get_predictions(model, data_loader, device=None):
    """Collect the model's predicted class for every item in ``data_loader``.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=...)``; the
            result must expose ``.logits``.
        data_loader: Iterable of batches, each a mapping with ``"text"``,
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        device: Device the inputs are moved to. When omitted, the device of the
            model's first parameter is used, so the function does not depend on
            a module-level ``device`` variable. If the model has no parameters
            the inputs are left where they are.

    Returns:
        ``(messages, predictions, real_values)``: the list of raw texts, and
        two 1-D CPU tensors holding the predicted and the true class ids. Both
        tensors are empty when the loader yields no batches.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    messages = []
    pred_batches = []
    label_batches = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.max(dim=1).indices

            messages.extend(batch["text"])
            # Move each batch to the CPU right away so results do not pile up
            # on the accelerator.
            pred_batches.append(preds.cpu())
            label_batches.append(batch["labels"].cpu())

    # torch.cat / torch.stack reject an empty list, so handle that explicitly.
    if not pred_batches:
        return messages, torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)

    return messages, torch.cat(pred_batches), torch.cat(label_batches)
|
|
# Final evaluation on the test split.
y_review_texts, y_pred, y_test = get_predictions(model, test_data_loader)

# Display names for class ids 0 and 1, in that order.
class_names = ['ham', 'spam']
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=class_names))

# Persist accuracy, the per-class report and the confusion matrix, in that
# order, to a plain-text file in the working directory.
with open('results.txt', 'w') as f:
    report_sections = [
        f"Accuracy: {accuracy_score(y_test, y_pred)}\n",
        "\nClassification Report:\n",
        classification_report(y_test, y_pred, target_names=class_names),
        "\nConfusion Matrix:\n",
        str(confusion_matrix(y_test, y_pred)),
    ]
    f.writelines(report_sections)

print("Training complete. Results saved to results.txt")
|
|