redauzhang
Upload model training script for web attack payload classification; model based on codebert-base; dataset used is open source
62c3b33
#!/usr/bin/env python3
"""
Train CodeBERT-based model for web attack detection
Dataset: /c1/web-attack-detection/dataset.csv
Output: /c1/new-models/
"""
import os
import json
import random

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW  # transformers.AdamW is deprecated/removed in recent versions
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from tqdm import tqdm

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)
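
# Bit-for-bit GPU reproducibility also needs the cuDNN flags below; this
# trades some speed for determinism, so treat it as an optional extra.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False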

# Configuration
class Config:
    # Paths
    data_path = "/c1/web-attack-detection/dataset.csv"
    model_base_path = "/c1/huggingface/codebert-base"
    output_dir = "/c1/new-models"
    # Training parameters
    max_length = 256  # Reduced from 512
    batch_size = 8    # Reduced from 32
    gradient_accumulation_steps = 4  # Effective batch size = 8 * 4 = 32
    epochs = 3
    learning_rate = 2e-5
    warmup_steps = 500
    weight_decay = 0.01
    # Data split
    train_size = 0.8
    test_size = 0.2
    # Sampling strategy
    use_sampling = True
    sampling_strategy = "balanced"  # Options: "balanced", "oversample", "undersample", "none"
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Early stopping
    early_stopping_patience = 2

config = Config()
| print("="*80) | |
| print("Web Attack Detection Model Training") | |
| print("="*80) | |
| print(f"Device: {config.device}") | |
| print(f"Data path: {config.data_path}") | |
| print(f"Model base: {config.model_base_path}") | |
| print(f"Output dir: {config.output_dir}") | |
| print(f"Sampling strategy: {config.sampling_strategy}") | |
| print("="*80) | |
| # Create output directory | |
| os.makedirs(config.output_dir, exist_ok=True) | |

# Load data
print("\n1. Loading dataset...")
df = pd.read_csv(config.data_path)
print(f"Total samples: {len(df)}")
print("\nLabel distribution:")
print(df['Label'].value_counts())
print("\nLabel proportions:")
print(df['Label'].value_counts(normalize=True))

# Clean data
print("\n2. Cleaning data...")
df = df.dropna(subset=['Sentence', 'Label'])
df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)
print(f"Samples after cleaning: {len(df)}")

# Split data
print("\n3. Splitting data (80% train, 20% test)...")
train_df, test_df = train_test_split(
    df,
    test_size=config.test_size,
    random_state=42,
    stratify=df['Label']
)
print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print("\nTrain label distribution:")
print(train_df['Label'].value_counts())
print("\nTest label distribution:")
print(test_df['Label'].value_counts())

# Apply sampling strategy
def apply_sampling(df, strategy="balanced"):
    """Apply a sampling strategy to balance the dataset."""
    if strategy == "none":
        return df
    label_counts = df['Label'].value_counts()
    print(f"\nOriginal distribution: {dict(label_counts)}")
    if strategy == "balanced":
        # Balanced: resample both classes to the mean class count
        target_count = int(label_counts.mean())
        print(f"Target count per class: {target_count}")
    elif strategy == "oversample":
        # Oversample the minority class to match the majority
        target_count = label_counts.max()
        print(f"Target count per class (oversample): {target_count}")
    elif strategy == "undersample":
        # Undersample the majority class to match the minority
        target_count = label_counts.min()
        print(f"Target count per class (undersample): {target_count}")
    else:
        raise ValueError(f"Unknown sampling strategy: {strategy}")
    balanced_dfs = []
    for label in [0, 1]:
        label_df = df[df['Label'] == label]
        current_count = len(label_df)
        if current_count < target_count:
            # Oversample with replacement
            sampled = label_df.sample(n=target_count, replace=True, random_state=42)
        elif current_count > target_count:
            # Undersample without replacement
            sampled = label_df.sample(n=target_count, replace=False, random_state=42)
        else:
            sampled = label_df
        balanced_dfs.append(sampled)
    balanced_df = pd.concat(balanced_dfs, ignore_index=True)
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle
    print(f"After sampling: {dict(balanced_df['Label'].value_counts())}")
    return balanced_df
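
# Illustrative only (toy data, not part of the training flow): with nine
# benign rows and one malicious row, "oversample" duplicates the minority
# class until both classes reach the majority count.
#   toy = pd.DataFrame({'Sentence': ['x'] * 9 + ['y'], 'Label': [0] * 9 + [1]})
#   apply_sampling(toy, 'oversample')  # -> {0: 9, 1: 9}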

if config.use_sampling:
    print(f"\n4. Applying sampling strategy: {config.sampling_strategy}...")
    train_df = apply_sampling(train_df, config.sampling_strategy)
    print(f"Final train samples: {len(train_df)}")
else:
    print("\n4. Skipping sampling (using original distribution)...")

# Load tokenizer
print("\n5. Loading CodeBERT tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(config.model_base_path)
print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

# Dataset class
class WebAttackDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.loc[idx, 'Sentence'])
        label = int(self.data.loc[idx, 'Label'])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
print("\n6. Creating datasets...")
train_dataset = WebAttackDataset(train_df, tokenizer, config.max_length)
test_dataset = WebAttackDataset(test_df, tokenizer, config.max_length)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)
print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

# Model class
class CodeBERTClassifier(nn.Module):
    def __init__(self, model_path, num_labels=2, dropout=0.1):
        super().__init__()
        self.codebert = RobertaModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.codebert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
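
# Note: pooler_output is the [CLS] hidden state passed through an extra
# tanh-activated dense layer. A common alternative is classifying on the raw
# [CLS] token (outputs.last_hidden_state[:, 0, :]); either works for
# fine-tuning, and this script keeps the pooler head.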

# Load model
print("\n7. Loading CodeBERT model...")
model = CodeBERTClassifier(config.model_base_path)
model.to(config.device)
print(f"Model loaded and moved to {config.device}")

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
# scheduler.step() runs once per optimizer step, not per batch, so the
# schedule length must account for gradient accumulation
total_steps = (len(train_loader) // config.gradient_accumulation_steps) * config.epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.warmup_steps,
    num_training_steps=total_steps
)
criterion = nn.CrossEntropyLoss()
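
# Alternative to resampling (sketch, not enabled here): keep the original
# class distribution and weight the loss instead, e.g. inverse-frequency
# weights computed from the training labels:
#   counts = train_df['Label'].value_counts().sort_index()
#   weights = torch.tensor((counts.sum() / counts).values, dtype=torch.float32).to(config.device)
#   criterion = nn.CrossEntropyLoss(weight=weights)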

# Training function
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, gradient_accumulation_steps=4):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []
    optimizer.zero_grad()
    progress_bar = tqdm(dataloader, desc="Training")
    for idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss = loss / gradient_accumulation_steps  # Normalize for accumulation
        loss.backward()
        # Step every `gradient_accumulation_steps` batches, and also on the
        # final batch so a trailing partial accumulation is not dropped
        if (idx + 1) % gradient_accumulation_steps == 0 or (idx + 1) == len(dataloader):
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        total_loss += loss.item() * gradient_accumulation_steps
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        progress_bar.set_postfix({'loss': loss.item() * gradient_accumulation_steps})
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    return avg_loss, accuracy
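
# If GPU memory is still tight, mixed precision is the usual next step; a
# minimal sketch of the loop body under torch.cuda.amp (not enabled here):
#   scaler = torch.cuda.amp.GradScaler()
#   with torch.cuda.amp.autocast():
#       logits = model(input_ids, attention_mask)
#       loss = criterion(logits, labels) / gradient_accumulation_steps
#   scaler.scale(loss).backward()
#   ...then scaler.step(optimizer) and scaler.update() at each optimizer step.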

# Evaluation function
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='binary'
    )
    return avg_loss, accuracy, precision, recall, f1, predictions, true_labels

# Training loop
print("\n8. Starting training...")
print("=" * 80)
best_accuracy = 0
best_f1 = 0
patience_counter = 0
training_history = []
for epoch in range(config.epochs):
    print(f"\nEpoch {epoch + 1}/{config.epochs}")
    print("-" * 80)
    # Train
    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, scheduler, criterion, config.device, config.gradient_accumulation_steps
    )
    # Evaluate
    test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
        model, test_loader, criterion, config.device
    )
    # Log results
    print(f"\nTrain Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
    print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")
    # Save history
    training_history.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'train_acc': train_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
        'precision': test_precision,
        'recall': test_recall,
        'f1': test_f1
    })
    # Save best model (selected by F1)
    if test_f1 > best_f1:
        best_f1 = test_f1
        best_accuracy = test_acc
        patience_counter = 0
        # Save PyTorch checkpoint
        model_save_path = os.path.join(config.output_dir, 'best_model.pt')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'test_acc': test_acc,
            'test_f1': test_f1,
            # vars(config) is empty because Config only has class-level
            # attributes; snapshot the class instead (stringified so every
            # value, including torch.device, serializes cleanly)
            'config': {k: str(v) for k, v in vars(Config).items() if not k.startswith('__')}
        }, model_save_path)
        print(f"\n✓ Best model saved! (F1: {test_f1:.4f})")
    else:
        patience_counter += 1
        print(f"\nNo improvement. Patience: {patience_counter}/{config.early_stopping_patience}")
    # Early stopping
    if patience_counter >= config.early_stopping_patience:
        print(f"\nEarly stopping triggered after {epoch + 1} epochs")
        break
| print("\n" + "="*80) | |
| print("Training completed!") | |
| print("="*80) | |

# Final evaluation
print("\n9. Final evaluation on test set...")
test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
    model, test_loader, criterion, config.device
)
print("\nFinal Test Results:")
print(f"Accuracy: {test_acc:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['Benign', 'Malicious']))

# Confusion matrix
cm = confusion_matrix(true_labels, predictions)
print("\nConfusion Matrix:")
print(cm)
print(f"True Negatives: {cm[0][0]}")
print(f"False Positives: {cm[0][1]}")
print(f"False Negatives: {cm[1][0]}")
print(f"True Positives: {cm[1][1]}")

# Save results
results = {
    'final_metrics': {
        'accuracy': float(test_acc),
        'precision': float(test_precision),
        'recall': float(test_recall),
        'f1_score': float(test_f1)
    },
    'confusion_matrix': cm.tolist(),
    'training_history': training_history,
    'config': {
        'epochs': config.epochs,
        'batch_size': config.batch_size,
        'learning_rate': config.learning_rate,
        'max_length': config.max_length,
        'sampling_strategy': config.sampling_strategy,
        'train_samples': len(train_df),
        'test_samples': len(test_df)
    }
}
results_path = os.path.join(config.output_dir, 'training_results.json')
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)
print(f"\nResults saved to: {results_path}")

# Save tokenizer config
tokenizer_config = {
    'model_name': config.model_base_path,
    'max_length': config.max_length
}
tokenizer_config_path = os.path.join(config.output_dir, 'tokenizer_config.json')
with open(tokenizer_config_path, 'w') as f:
    json.dump(tokenizer_config, f, indent=2)
print(f"Tokenizer config saved to: {tokenizer_config_path}")
| print("\n" + "="*80) | |
| print("Training script completed successfully!") | |
| print(f"Best F1 Score: {best_f1:.4f}") | |
| print(f"Best Accuracy: {best_accuracy:.4f}") | |
| print(f"Model saved to: {config.output_dir}") | |
| print("="*80) | |