"""Fine-tune DistilBERT as a ham/spam classifier on ``mail_data.csv``.

The script loads the CSV, splits it 80/20 (stratified), fine-tunes
``distilbert-base-uncased`` for a few epochs, and writes accuracy, a
classification report and a confusion matrix to ``results.txt``.
"""

import os
import time

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    get_linear_schedule_with_warmup,
)

# 1. Load and Preprocess Data
df = pd.read_csv('mail_data.csv', names=['Category', 'Message'], header=None, skiprows=1)
df['label'] = df['Category'].map({'ham': 0, 'spam': 1})
# Categories other than 'ham'/'spam' map to NaN; drop those rows so they can
# neither break stratification nor become garbage integer labels, then restore
# the integer dtype that the NaN-capable column would otherwise lose.
df = df.dropna(subset=['label']).astype({'label': 'int64'})

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Message'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values,
)


# 2. Dataset Class
class EmailDataset(Dataset):
    """Map-style dataset that tokenizes one message per item.

    Args:
        texts: Indexable collection of raw messages.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Hugging Face tokenizer used to encode each message.
        max_len: Every encoded sequence is padded/truncated to this length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label for ``item``."""
        text = str(self.texts[item])
        label = self.labels[item]

        # Use the tokenizer's public call interface. The private
        # ``_encode_plus`` helper is an internal detail and is not guaranteed
        # to honour the public ``padding=`` / ``truncation=`` keywords.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten it away so
            # the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }


# 3. Setup Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

train_data_loader = DataLoader(
    EmailDataset(train_texts, train_labels, tokenizer), batch_size=16, shuffle=True
)
test_data_loader = DataLoader(
    EmailDataset(test_texts, test_labels, tokenizer), batch_size=16, shuffle=False
)

model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=2)
model = model.to(device)

EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)
loss_fn = torch.nn.CrossEntropyLoss().to(device)


# 4. Training Loop
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Sequence-classification model; called with ``labels`` so it
            returns its own loss.
        data_loader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``.
        loss_fn: Unused; the loss comes from ``outputs.loss``. Kept so the
            signature stays stable for callers.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 tensor,
        mean_loss the NumPy mean of the per-batch losses.
    """
    model = model.train()
    losses = []
    # Start from a tensor (not a Python int) so ``.double()`` below also works
    # when the loader yields no batches.
    correct_predictions = torch.tensor(0, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        preds = torch.argmax(outputs.logits, dim=1)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Clip before stepping to keep the update magnitude bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Args:
        model: Sequence-classification model; called with ``labels`` so it
            returns its own loss.
        data_loader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``.
        loss_fn: Unused; the loss comes from ``outputs.loss``. Kept so the
            signature stays stable for callers.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 tensor,
        mean_loss the NumPy mean of the per-batch losses.
    """
    model = model.eval()
    losses = []
    # Tensor accumulator so ``.double()`` is valid even for an empty loader.
    correct_predictions = torch.tensor(0, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            preds = torch.argmax(outputs.logits, dim=1)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)


print("Starting training...")
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_texts)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    # NOTE: the held-out test split doubles as the per-epoch validation set.
    val_acc, val_loss = eval_model(model, test_data_loader, loss_fn, device, len(test_texts))
    print(f'Val loss {val_loss} accuracy {val_acc}')


# 5. Final Evaluation
def get_predictions(model, data_loader):
    """Collect predictions for every example in ``data_loader``.

    Uses the module-level ``device`` to place the input tensors.

    Args:
        model: Sequence-classification model to run in eval mode.
        data_loader: Yields dict batches with ``text``, ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        Tuple ``(messages, predictions, real_values)``: the raw texts as a
        list, and two 1-D CPU tensors holding predicted and true class ids.
    """
    model = model.eval()
    messages = []
    batch_predictions = []
    batch_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            messages.extend(d["text"])
            # Keep one tensor per batch on the CPU; labels are only compared
            # after the loop, so they never need to visit the device.
            batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())
            batch_labels.append(d["labels"].cpu())

    predictions = torch.cat(batch_predictions)
    real_values = torch.cat(batch_labels)
    return messages, predictions, real_values


y_review_texts, y_pred, y_test = get_predictions(model, test_data_loader)

# scikit-learn works on array-likes; hand it plain NumPy arrays and build the
# report once so the console and the results file show the same text.
y_true_np = y_test.numpy()
y_pred_np = y_pred.numpy()
report = classification_report(y_true_np, y_pred_np, target_names=['ham', 'spam'])

print("\nClassification Report:\n", report)

# Save results for report
with open('results.txt', 'w', encoding='utf-8') as f:
    f.write(f"Accuracy: {accuracy_score(y_true_np, y_pred_np)}\n")
    f.write("\nClassification Report:\n")
    f.write(report)
    f.write("\nConfusion Matrix:\n")
    f.write(str(confusion_matrix(y_true_np, y_pred_np)))

print("Training complete. Results saved to results.txt")