#!/usr/bin/env python3
"""
Train a CodeBERT-based model for web attack detection.

Dataset: /c1/web-attack-detection/dataset.csv
Output:  /c1/new-models/
"""

import os
import json
import random
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW  # transformers' AdamW is deprecated; use the torch implementation
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)
from tqdm import tqdm


# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(42)


# Configuration
class Config:
    # Paths
    data_path = "/c1/web-attack-detection/dataset.csv"
    model_base_path = "/c1/huggingface/codebert-base"
    output_dir = "/c1/new-models"

    # Training parameters
    max_length = 256                 # Reduced from 512
    batch_size = 8                   # Reduced from 32
    gradient_accumulation_steps = 4  # Effective batch size = 8 * 4 = 32
    epochs = 3
    learning_rate = 2e-5
    warmup_steps = 500
    weight_decay = 0.01

    # Data split
    train_size = 0.8
    test_size = 0.2

    # Sampling strategy
    use_sampling = True             # Enable sampling
    sampling_strategy = "balanced"  # Options: "balanced", "oversample", "undersample", "none"

    # GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Early stopping
    early_stopping_patience = 2


config = Config()

print("=" * 80)
print("Web Attack Detection Model Training")
print("=" * 80)
print(f"Device: {config.device}")
print(f"Data path: {config.data_path}")
print(f"Model base: {config.model_base_path}")
print(f"Output dir: {config.output_dir}")
print(f"Sampling strategy: {config.sampling_strategy}")
print("=" * 80)

# Create output directory
os.makedirs(config.output_dir, exist_ok=True)

# Load data
print("\n1. Loading dataset...")
df = pd.read_csv(config.data_path)
print(f"Total samples: {len(df)}")
print("\nLabel distribution:")
print(df['Label'].value_counts())
print("\nLabel proportions:")
print(df['Label'].value_counts(normalize=True))

# Clean data
print("\n2. Cleaning data...")
df = df.dropna(subset=['Sentence', 'Label'])
df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)
print(f"Samples after cleaning: {len(df)}")
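# NOTE (added check): everything below assumes a binary label column encoded as {0, 1}
# (0 = benign, 1 = malicious). This check just surfaces any other encoding early;
# it is an assumption about dataset.csv, not something the original pipeline enforces.
unexpected_labels = set(df['Label'].unique()) - {0, 1}
if unexpected_labels:
    raise ValueError(f"Expected binary labels {{0, 1}}, found: {sorted(unexpected_labels)}")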
# Split data
print("\n3. Splitting data (80% train, 20% test)...")
train_df, test_df = train_test_split(
    df,
    test_size=config.test_size,
    random_state=42,
    stratify=df['Label']
)
print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print("\nTrain label distribution:")
print(train_df['Label'].value_counts())
print("\nTest label distribution:")
print(test_df['Label'].value_counts())


# Apply sampling strategy
def apply_sampling(df, strategy="balanced"):
    """Apply a sampling strategy to balance the dataset."""
    if strategy == "none":
        return df

    label_counts = df['Label'].value_counts()
    print(f"\nOriginal distribution: {dict(label_counts)}")

    if strategy == "balanced":
        # Balanced: make both classes equal to the average class count
        target_count = int(label_counts.mean())
        print(f"Target count per class: {target_count}")
    elif strategy == "oversample":
        # Oversample the minority class to match the majority
        target_count = label_counts.max()
        print(f"Target count per class (oversample): {target_count}")
    elif strategy == "undersample":
        # Undersample the majority class to match the minority
        target_count = label_counts.min()
        print(f"Target count per class (undersample): {target_count}")
    else:
        raise ValueError(f"Unknown sampling strategy: {strategy}")

    balanced_dfs = []
    for label in [0, 1]:
        label_df = df[df['Label'] == label]
        current_count = len(label_df)

        if current_count < target_count:
            # Oversample with replacement
            sampled = label_df.sample(n=target_count, replace=True, random_state=42)
        elif current_count > target_count:
            # Undersample without replacement
            sampled = label_df.sample(n=target_count, replace=False, random_state=42)
        else:
            sampled = label_df

        balanced_dfs.append(sampled)

    balanced_df = pd.concat(balanced_dfs, ignore_index=True)
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle
    print(f"After sampling: {dict(balanced_df['Label'].value_counts())}")
    return balanced_df


if config.use_sampling:
    print(f"\n4. Applying sampling strategy: {config.sampling_strategy}...")
    train_df = apply_sampling(train_df, config.sampling_strategy)
    print(f"Final train samples: {len(train_df)}")
else:
    print("\n4. Skipping sampling (using original distribution)...")

# Load tokenizer
print("\n5. Loading CodeBERT tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(config.model_base_path)
print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")


# Dataset class
class WebAttackDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.loc[idx, 'Sentence'])
        label = int(self.data.loc[idx, 'Label'])

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }
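# Illustrative sanity check (added): encode one training sentence and confirm the padded/
# truncated shape matches config.max_length. Purely diagnostic and safe to remove.
_probe = tokenizer(
    str(train_df['Sentence'].iloc[0]),
    add_special_tokens=True,
    max_length=config.max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)
print(f"Probe encoding shape: {tuple(_probe['input_ids'].shape)}")  # expected (1, max_length)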
# Create datasets
print("\n6. Creating datasets...")
train_dataset = WebAttackDataset(train_df, tokenizer, config.max_length)
test_dataset = WebAttackDataset(test_df, tokenizer, config.max_length)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")


# Model class
class CodeBERTClassifier(nn.Module):
    def __init__(self, model_path, num_labels=2, dropout=0.1):
        super(CodeBERTClassifier, self).__init__()
        self.codebert = RobertaModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.codebert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


# Load model
print("\n7. Loading CodeBERT model...")
model = CodeBERTClassifier(config.model_base_path)
model.to(config.device)
print(f"Model loaded and moved to {config.device}")

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)

# The scheduler is stepped once per optimizer update, and an update happens every
# gradient_accumulation_steps batches, so size the schedule in optimizer steps.
updates_per_epoch = (len(train_loader) + config.gradient_accumulation_steps - 1) // config.gradient_accumulation_steps
total_steps = updates_per_epoch * config.epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.warmup_steps,
    num_training_steps=total_steps
)

criterion = nn.CrossEntropyLoss()


# Training function
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, gradient_accumulation_steps=4):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []

    optimizer.zero_grad()
    progress_bar = tqdm(dataloader, desc="Training")

    for idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss = loss / gradient_accumulation_steps  # Normalize loss for accumulation
        loss.backward()

        if (idx + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item() * gradient_accumulation_steps
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        progress_bar.set_postfix({'loss': loss.item() * gradient_accumulation_steps})

    # Flush leftover gradients when the batch count is not divisible by the accumulation steps
    if len(dataloader) % gradient_accumulation_steps != 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    return avg_loss, accuracy


# Evaluation function
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='binary'
    )
    return avg_loss, accuracy, precision, recall, f1, predictions, true_labels
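# Illustrative smoke test (added): one forward pass on a single test batch to confirm shapes and
# device placement before committing to a full training run. Diagnostic only; it relies solely on
# the loaders and model defined above.
with torch.no_grad():
    _batch = next(iter(test_loader))
    _logits = model(_batch['input_ids'].to(config.device),
                    _batch['attention_mask'].to(config.device))
print(f"Smoke-test logits shape: {tuple(_logits.shape)}")  # expected (batch_size, 2)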
# Training loop
print("\n8. Starting training...")
print("=" * 80)

best_accuracy = 0
best_f1 = 0
patience_counter = 0
training_history = []

for epoch in range(config.epochs):
    print(f"\nEpoch {epoch + 1}/{config.epochs}")
    print("-" * 80)

    # Train
    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, scheduler, criterion,
        config.device, config.gradient_accumulation_steps
    )

    # Evaluate
    test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
        model, test_loader, criterion, config.device
    )

    # Log results
    print(f"\nTrain Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
    print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")

    # Save history (cast numpy scalars to plain floats so the history stays JSON-serializable)
    history = {
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'train_acc': float(train_acc),
        'test_loss': test_loss,
        'test_acc': float(test_acc),
        'precision': float(test_precision),
        'recall': float(test_recall),
        'f1': float(test_f1)
    }
    training_history.append(history)

    # Save best model (selected by F1 on the test split)
    if test_f1 > best_f1:
        best_f1 = test_f1
        best_accuracy = test_acc
        patience_counter = 0

        # Save PyTorch checkpoint
        model_save_path = os.path.join(config.output_dir, 'best_model.pt')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'test_acc': float(test_acc),
            'test_f1': float(test_f1),
            # vars(config) on the instance would be empty because Config only defines class
            # attributes, so snapshot the class attributes instead (the device handle is omitted).
            'config': {k: v for k, v in vars(Config).items()
                       if not k.startswith('__') and k != 'device'}
        }, model_save_path)
        print(f"\n✓ Best model saved! (F1: {test_f1:.4f})")
    else:
        patience_counter += 1
        print(f"\nNo improvement. Patience: {patience_counter}/{config.early_stopping_patience}")

    # Early stopping
    if patience_counter >= config.early_stopping_patience:
        print(f"\nEarly stopping triggered after {epoch + 1} epochs")
        break

print("\n" + "=" * 80)
print("Training completed!")
print("=" * 80)
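# Added step (hedged): training may stop after a non-best epoch, so the in-memory weights are not
# necessarily the checkpointed best. Reloading best_model.pt here makes the final metrics match the
# saved model; this assumes the checkpoint written above exists and uses the keys defined there.
best_ckpt_path = os.path.join(config.output_dir, 'best_model.pt')
if os.path.exists(best_ckpt_path):
    checkpoint = torch.load(best_ckpt_path, map_location=config.device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Reloaded best checkpoint (epoch {checkpoint['epoch']}) for final evaluation")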
# Final evaluation
print("\n9. Final evaluation on test set...")
test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
    model, test_loader, criterion, config.device
)

print("\nFinal Test Results:")
print(f"Accuracy: {test_acc:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['Benign', 'Malicious']))

# Confusion matrix
cm = confusion_matrix(true_labels, predictions)
print("\nConfusion Matrix:")
print(cm)
print(f"True Negatives: {cm[0][0]}")
print(f"False Positives: {cm[0][1]}")
print(f"False Negatives: {cm[1][0]}")
print(f"True Positives: {cm[1][1]}")

# Save results
results = {
    'final_metrics': {
        'accuracy': float(test_acc),
        'precision': float(test_precision),
        'recall': float(test_recall),
        'f1_score': float(test_f1)
    },
    'confusion_matrix': cm.tolist(),
    'training_history': training_history,
    'config': {
        'epochs': config.epochs,
        'batch_size': config.batch_size,
        'learning_rate': config.learning_rate,
        'max_length': config.max_length,
        'sampling_strategy': config.sampling_strategy,
        'train_samples': len(train_df),
        'test_samples': len(test_df)
    }
}

results_path = os.path.join(config.output_dir, 'training_results.json')
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)
print(f"\nResults saved to: {results_path}")

# Save tokenizer config
tokenizer_config = {
    'model_name': config.model_base_path,
    'max_length': config.max_length
}
tokenizer_config_path = os.path.join(config.output_dir, 'tokenizer_config.json')
with open(tokenizer_config_path, 'w') as f:
    json.dump(tokenizer_config, f, indent=2)
print(f"Tokenizer config saved to: {tokenizer_config_path}")

print("\n" + "=" * 80)
print("Training script completed successfully!")
print(f"Best F1 Score: {best_f1:.4f}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Model saved to: {config.output_dir}")
print("=" * 80)
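# Illustrative inference sketch (added): how the trained classifier could score a single request
# string. predict_sentence and the commented example call are additions, not part of the original
# pipeline; they assume only the model/tokenizer objects defined above.
def predict_sentence(sentence, model, tokenizer, device, max_length=256):
    """Return the predicted label and class probabilities for one input string."""
    model.eval()
    encoding = tokenizer(
        sentence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        logits = model(encoding['input_ids'].to(device), encoding['attention_mask'].to(device))
        probs = torch.softmax(logits, dim=1).squeeze(0)
    return {
        'label': int(torch.argmax(probs).item()),
        'benign_prob': float(probs[0]),
        'malicious_prob': float(probs[1]),
    }

# Example usage (commented out so the training script's behavior is unchanged):
# print(predict_sentence("GET /index.php?id=1", model, tokenizer, config.device, config.max_length))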