#!/usr/bin/env python3
"""
Train CodeBERT-based model for web attack detection
Dataset: /c1/web-attack-detection/dataset.csv
Output: /c1/new-models/
"""

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # use torch's AdamW; the transformers version is deprecated
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from tqdm import tqdm
import json
import random

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
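    # Note: bit-exact GPU reproducibility would additionally require setting
    # torch.backends.cudnn.deterministic = True (at some speed cost).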

set_seed(42)

# Configuration
class Config:
    # Paths
    data_path = "/c1/web-attack-detection/dataset.csv"
    model_base_path = "/c1/huggingface/codebert-base"
    output_dir = "/c1/new-models"
    
    # Training parameters
    max_length = 256  # Reduced from 512
    batch_size = 8    # Reduced from 32
    gradient_accumulation_steps = 4  # Effective batch size = 8 * 4 = 32
    epochs = 3
    learning_rate = 2e-5
    warmup_steps = 500
    weight_decay = 0.01
    
    # Data split (train fraction is 1 - test_size)
    test_size = 0.2
    
    # Sampling strategy
    use_sampling = True  # Enable sampling
    sampling_strategy = "balanced"  # Options: "balanced", "oversample", "undersample", "none"
    
    # GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Early stopping: epochs without F1 improvement before training halts
    early_stopping_patience = 2

config = Config()

print("="*80)
print("Web Attack Detection Model Training")
print("="*80)
print(f"Device: {config.device}")
print(f"Data path: {config.data_path}")
print(f"Model base: {config.model_base_path}")
print(f"Output dir: {config.output_dir}")
print(f"Sampling strategy: {config.sampling_strategy}")
print("="*80)

# Create output directory
os.makedirs(config.output_dir, exist_ok=True)

# Load data
print("\n1. Loading dataset...")
df = pd.read_csv(config.data_path)
print(f"Total samples: {len(df)}")
print(f"\nLabel distribution:")
print(df['Label'].value_counts())
print(f"\nLabel proportions:")
print(df['Label'].value_counts(normalize=True))

# Clean data
print("\n2. Cleaning data...")
df = df.dropna(subset=['Sentence', 'Label'])
df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)
print(f"Samples after cleaning: {len(df)}")

# Split data
print("\n3. Splitting data (80% train, 20% test)...")
train_df, test_df = train_test_split(
    df, 
    test_size=config.test_size, 
    random_state=42,
    stratify=df['Label']
)

print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"\nTrain label distribution:")
print(train_df['Label'].value_counts())
print(f"\nTest label distribution:")
print(test_df['Label'].value_counts())

# Apply sampling strategy
def apply_sampling(df, strategy="balanced"):
    """Apply sampling strategy to balance dataset"""
    if strategy == "none":
        return df
    
    label_counts = df['Label'].value_counts()
    print(f"\nOriginal distribution: {dict(label_counts)}")
    
    if strategy == "balanced":
        # Balanced: make both classes equal to average
        target_count = int(label_counts.mean())
        print(f"Target count per class: {target_count}")
        
    elif strategy == "oversample":
        # Oversample minority to match majority
        target_count = label_counts.max()
        print(f"Target count per class (oversample): {target_count}")
        
    elif strategy == "undersample":
        # Undersample majority to match minority
        target_count = label_counts.min()
        print(f"Target count per class (undersample): {target_count}")
    else:
        raise ValueError(f"Unknown sampling strategy: {strategy}")
    
    balanced_dfs = []
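    # Assumes binary labels {0, 1}: classes below the target count are
    # oversampled with replacement, classes above it are undersampled.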
    for label in [0, 1]:
        label_df = df[df['Label'] == label]
        current_count = len(label_df)
        
        if current_count < target_count:
            # Oversample
            sampled = label_df.sample(n=target_count, replace=True, random_state=42)
        elif current_count > target_count:
            # Undersample
            sampled = label_df.sample(n=target_count, replace=False, random_state=42)
        else:
            sampled = label_df
        
        balanced_dfs.append(sampled)
    
    balanced_df = pd.concat(balanced_dfs, ignore_index=True)
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle
    
    print(f"After sampling: {dict(balanced_df['Label'].value_counts())}")
    return balanced_df

if config.use_sampling:
    print(f"\n4. Applying sampling strategy: {config.sampling_strategy}...")
    # Resample the training split only; the test split keeps its original
    # label distribution so evaluation reflects real-world prevalence.
    train_df = apply_sampling(train_df, config.sampling_strategy)
    print(f"Final train samples: {len(train_df)}")
else:
    print("\n4. Skipping sampling (using original distribution)...")

# Load tokenizer
print("\n5. Loading CodeBERT tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(config.model_base_path)
print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

# Dataset class
class WebAttackDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = str(self.data.loc[idx, 'Sentence'])
        label = int(self.data.loc[idx, 'Label'])
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # The tokenizer returns tensors of shape (1, max_length); flatten()
        # drops the leading batch dim so the DataLoader can stack samples.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
print("\n6. Creating datasets...")
train_dataset = WebAttackDataset(train_df, tokenizer, config.max_length)
test_dataset = WebAttackDataset(test_df, tokenizer, config.max_length)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)
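# num_workers and pin_memory are left at their defaults here; tuning them can
# speed up data loading on GPU machines.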

print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

# Model class
class CodeBERTClassifier(nn.Module):
    def __init__(self, model_path, num_labels=2, dropout=0.1):
        super(CodeBERTClassifier, self).__init__()
        self.codebert = RobertaModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.codebert.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
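        # pooler_output is the final-layer [CLS] hidden state passed through
        # RoBERTa's pretrained dense + tanh pooling head.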
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Load model
print("\n7. Loading CodeBERT model...")
model = CodeBERTClassifier(config.model_base_path)
model.to(config.device)
print(f"Model loaded and moved to {config.device}")

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
# The scheduler steps once per optimizer update, not once per batch, so account
# for gradient accumulation (ceil division matches the end-of-epoch flush below).
steps_per_epoch = (len(train_loader) + config.gradient_accumulation_steps - 1) // config.gradient_accumulation_steps
total_steps = steps_per_epoch * config.epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.warmup_steps,
    num_training_steps=total_steps
)

criterion = nn.CrossEntropyLoss()
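# Unweighted cross-entropy is reasonable here because the train split is
# resampled; with use_sampling=False, nn.CrossEntropyLoss(weight=...) using
# inverse class frequencies would be a common alternative.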

# Training function
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, gradient_accumulation_steps=4):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []
    
    optimizer.zero_grad()
    
    progress_bar = tqdm(dataloader, desc="Training")
    for idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss = loss / gradient_accumulation_steps  # Normalize loss
        
        loss.backward()
        
        # Update on every full accumulation window, and also flush any leftover
        # gradients at the end of the epoch so no batches are discarded.
        if (idx + 1) % gradient_accumulation_steps == 0 or (idx + 1) == len(dataloader):
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        total_loss += loss.item() * gradient_accumulation_steps
        
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        
        progress_bar.set_postfix({'loss': loss.item() * gradient_accumulation_steps})
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    
    return avg_loss, accuracy

# Evaluation function
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            
            total_loss += loss.item()
            
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='binary'
    )
    
    return avg_loss, accuracy, precision, recall, f1, predictions, true_labels

# Training loop
print("\n8. Starting training...")
print("="*80)

best_accuracy = 0
best_f1 = 0
patience_counter = 0
training_history = []

for epoch in range(config.epochs):
    print(f"\nEpoch {epoch + 1}/{config.epochs}")
    print("-" * 80)
    
    # Train
    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, scheduler, criterion, config.device, config.gradient_accumulation_steps
    )
    
    # Evaluate
    test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
        model, test_loader, criterion, config.device
    )
    
    # Log results
    print(f"\nTrain Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
    print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")
    
    # Save history
    history = {
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'train_acc': train_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
        'precision': test_precision,
        'recall': test_recall,
        'f1': test_f1
    }
    training_history.append(history)
    
    # Save best model
    if test_f1 > best_f1:
        best_f1 = test_f1
        best_accuracy = test_acc
        patience_counter = 0
        
        # Save PyTorch model
        model_save_path = os.path.join(config.output_dir, 'best_model.pt')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'test_acc': test_acc,
            'test_f1': test_f1,
            # vars(config) is empty because Config attributes live on the class;
            # collect the JSON-friendly ones from the class instead.
            'config': {k: v for k, v in vars(Config).items()
                       if not k.startswith('_') and isinstance(v, (int, float, str, bool))}
        }, model_save_path)
        print(f"\n✓ Best model saved! (F1: {test_f1:.4f})")
    else:
        patience_counter += 1
        print(f"\nNo improvement. Patience: {patience_counter}/{config.early_stopping_patience}")
    
    # Early stopping
    if patience_counter >= config.early_stopping_patience:
        print(f"\nEarly stopping triggered after {epoch + 1} epochs")
        break

print("\n" + "="*80)
print("Training completed!")
print("="*80)

# Final evaluation
print("\n9. Final evaluation on test set...")
# Reload the best checkpoint so final metrics describe the saved model, not the last-epoch state
best_path = os.path.join(config.output_dir, 'best_model.pt')
if os.path.exists(best_path):
    checkpoint = torch.load(best_path, map_location=config.device)
    model.load_state_dict(checkpoint['model_state_dict'])
test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
    model, test_loader, criterion, config.device
)

print(f"\nFinal Test Results:")
print(f"Accuracy: {test_acc:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['Benign', 'Malicious']))

# Confusion matrix
cm = confusion_matrix(true_labels, predictions)
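# sklearn convention: rows are true labels, columns are predicted labels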
print("\nConfusion Matrix:")
print(cm)
print(f"True Negatives: {cm[0][0]}")
print(f"False Positives: {cm[0][1]}")
print(f"False Negatives: {cm[1][0]}")
print(f"True Positives: {cm[1][1]}")

# Save results
results = {
    'final_metrics': {
        'accuracy': float(test_acc),
        'precision': float(test_precision),
        'recall': float(test_recall),
        'f1_score': float(test_f1)
    },
    'confusion_matrix': cm.tolist(),
    'training_history': training_history,
    'config': {
        'epochs': config.epochs,
        'batch_size': config.batch_size,
        'learning_rate': config.learning_rate,
        'max_length': config.max_length,
        'sampling_strategy': config.sampling_strategy,
        'train_samples': len(train_df),
        'test_samples': len(test_df)
    }
}

results_path = os.path.join(config.output_dir, 'training_results.json')
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)
print(f"\nResults saved to: {results_path}")

# Save tokenizer config
tokenizer_config = {
    'model_name': config.model_base_path,
    'max_length': config.max_length
}
tokenizer_config_path = os.path.join(config.output_dir, 'tokenizer_config.json')
with open(tokenizer_config_path, 'w') as f:
    json.dump(tokenizer_config, f, indent=2)
print(f"Tokenizer config saved to: {tokenizer_config_path}")

print("\n" + "="*80)
print("Training script completed successfully!")
print(f"Best F1 Score: {best_f1:.4f}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Model saved to: {config.output_dir}")
print("="*80)