redauzhang
Upload model for web attack payload classification, based on codebert-base; trained on an open-source dataset
62c3b33
#!/usr/bin/env python3
"""
Train CodeBERT-based model for web attack detection
Dataset: /c1/web-attack-detection/dataset.csv
Output: /c1/new-models/
"""
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # transformers.AdamW was deprecated and later removed; torch's AdamW is the drop-in replacement
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from tqdm import tqdm
import json
import random
# Set random seeds for reproducibility
def set_seed(seed=42):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
set_seed(42)
# Configuration
class Config:
# Paths
data_path = "/c1/web-attack-detection/dataset.csv"
model_base_path = "/c1/huggingface/codebert-base"
output_dir = "/c1/new-models"
# Training parameters
max_length = 256 # Reduced from 512
batch_size = 8 # Reduced from 32
gradient_accumulation_steps = 4 # Effective batch size = 8 * 4 = 32
epochs = 3
learning_rate = 2e-5
warmup_steps = 500
weight_decay = 0.01
# Data split
train_size = 0.8
test_size = 0.2
# Sampling strategy
use_sampling = True # Enable sampling
sampling_strategy = "balanced" # Options: "balanced", "oversample", "undersample", "none"
# GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Early stopping
early_stopping_patience = 2
config = Config()
print("="*80)
print("Web Attack Detection Model Training")
print("="*80)
print(f"Device: {config.device}")
print(f"Data path: {config.data_path}")
print(f"Model base: {config.model_base_path}")
print(f"Output dir: {config.output_dir}")
print(f"Sampling strategy: {config.sampling_strategy}")
print("="*80)
# Create output directory
os.makedirs(config.output_dir, exist_ok=True)
# Load data
print("\n1. Loading dataset...")
df = pd.read_csv(config.data_path)
print(f"Total samples: {len(df)}")
print(f"\nLabel distribution:")
print(df['Label'].value_counts())
print(f"\nLabel proportions:")
print(df['Label'].value_counts(normalize=True))
# Clean data
print("\n2. Cleaning data...")
df = df.dropna(subset=['Sentence', 'Label'])
df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)
print(f"Samples after cleaning: {len(df)}")
# Split data
print("\n3. Splitting data (80% train, 20% test)...")
train_df, test_df = train_test_split(
df,
test_size=config.test_size,
random_state=42,
stratify=df['Label']
)
print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"\nTrain label distribution:")
print(train_df['Label'].value_counts())
print(f"\nTest label distribution:")
print(test_df['Label'].value_counts())
# Apply sampling strategy
def apply_sampling(df, strategy="balanced"):
"""Apply sampling strategy to balance dataset"""
if strategy == "none":
return df
label_counts = df['Label'].value_counts()
print(f"\nOriginal distribution: {dict(label_counts)}")
if strategy == "balanced":
# Balanced: make both classes equal to average
target_count = int(label_counts.mean())
print(f"Target count per class: {target_count}")
elif strategy == "oversample":
# Oversample minority to match majority
target_count = label_counts.max()
print(f"Target count per class (oversample): {target_count}")
    elif strategy == "undersample":
        # Undersample majority to match minority
        target_count = label_counts.min()
        print(f"Target count per class (undersample): {target_count}")
    else:
        # Guard against a silent NameError on target_count for unknown strategies
        raise ValueError(f"Unknown sampling strategy: {strategy}")
balanced_dfs = []
for label in [0, 1]:
label_df = df[df['Label'] == label]
current_count = len(label_df)
if current_count < target_count:
# Oversample
sampled = label_df.sample(n=target_count, replace=True, random_state=42)
elif current_count > target_count:
# Undersample
sampled = label_df.sample(n=target_count, replace=False, random_state=42)
else:
sampled = label_df
balanced_dfs.append(sampled)
balanced_df = pd.concat(balanced_dfs, ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle
print(f"After sampling: {dict(balanced_df['Label'].value_counts())}")
return balanced_df
if config.use_sampling:
print(f"\n4. Applying sampling strategy: {config.sampling_strategy}...")
train_df = apply_sampling(train_df, config.sampling_strategy)
print(f"Final train samples: {len(train_df)}")
else:
print("\n4. Skipping sampling (using original distribution)...")
# Load tokenizer
print("\n5. Loading CodeBERT tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(config.model_base_path)
print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")
# Dataset class
class WebAttackDataset(Dataset):
def __init__(self, dataframe, tokenizer, max_length):
self.data = dataframe.reset_index(drop=True)
self.tokenizer = tokenizer
self.max_length = max_length
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
text = str(self.data.loc[idx, 'Sentence'])
label = int(self.data.loc[idx, 'Label'])
encoding = self.tokenizer(
text,
add_special_tokens=True,
max_length=self.max_length,
padding='max_length',
truncation=True,
return_tensors='pt'
)
return {
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'label': torch.tensor(label, dtype=torch.long)
}
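# Optional sanity check (a sketch, not required for training): inspect one encoded
# item to confirm the tensors have the expected shapes before building the loaders.
#   sample = WebAttackDataset(train_df.head(1), tokenizer, config.max_length)[0]
#   print(sample['input_ids'].shape)       # expected: torch.Size([256])
#   print(sample['attention_mask'].shape)  # expected: torch.Size([256])
#   print(sample['label'])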
# Create datasets
print("\n6. Creating datasets...")
train_dataset = WebAttackDataset(train_df, tokenizer, config.max_length)
test_dataset = WebAttackDataset(test_df, tokenizer, config.max_length)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)
print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")
# Model class
class CodeBERTClassifier(nn.Module):
def __init__(self, model_path, num_labels=2, dropout=0.1):
super(CodeBERTClassifier, self).__init__()
self.codebert = RobertaModel.from_pretrained(model_path)
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(self.codebert.config.hidden_size, num_labels)
def forward(self, input_ids, attention_mask):
outputs = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
pooled_output = outputs.pooler_output
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return logits
# Load model
print("\n7. Loading CodeBERT model...")
model = CodeBERTClassifier(config.model_base_path)
model.to(config.device)
print(f"Model loaded and moved to {config.device}")
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
# The scheduler advances once per optimizer step, not once per batch, so account for accumulation
total_steps = (len(train_loader) // config.gradient_accumulation_steps) * config.epochs
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=config.warmup_steps,
num_training_steps=total_steps
)
criterion = nn.CrossEntropyLoss()
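# Alternative to resampling (a sketch, not used here): keep the original class
# distribution and weight the loss instead, which avoids duplicating minority rows.
#   class_counts = train_df['Label'].value_counts().sort_index()
#   class_weights = torch.tensor((class_counts.sum() / class_counts).values,
#                                dtype=torch.float32).to(config.device)
#   criterion = nn.CrossEntropyLoss(weight=class_weights)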
# Training function
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, gradient_accumulation_steps=4):
model.train()
total_loss = 0
predictions = []
true_labels = []
optimizer.zero_grad()
progress_bar = tqdm(dataloader, desc="Training")
for idx, batch in enumerate(progress_bar):
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['label'].to(device)
logits = model(input_ids, attention_mask)
loss = criterion(logits, labels)
loss = loss / gradient_accumulation_steps # Normalize loss
loss.backward()
        # Step on every accumulation boundary, and flush any remainder at the end of the epoch
        if (idx + 1) % gradient_accumulation_steps == 0 or (idx + 1) == len(dataloader):
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
total_loss += loss.item() * gradient_accumulation_steps
preds = torch.argmax(logits, dim=1)
predictions.extend(preds.cpu().numpy())
true_labels.extend(labels.cpu().numpy())
progress_bar.set_postfix({'loss': loss.item() * gradient_accumulation_steps})
avg_loss = total_loss / len(dataloader)
accuracy = accuracy_score(true_labels, predictions)
return avg_loss, accuracy
# Evaluation function
def evaluate(model, dataloader, criterion, device):
model.eval()
total_loss = 0
predictions = []
true_labels = []
with torch.no_grad():
for batch in tqdm(dataloader, desc="Evaluating"):
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['label'].to(device)
logits = model(input_ids, attention_mask)
loss = criterion(logits, labels)
total_loss += loss.item()
preds = torch.argmax(logits, dim=1)
predictions.extend(preds.cpu().numpy())
true_labels.extend(labels.cpu().numpy())
avg_loss = total_loss / len(dataloader)
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
true_labels, predictions, average='binary'
)
return avg_loss, accuracy, precision, recall, f1, predictions, true_labels
# Training loop
print("\n8. Starting training...")
print("="*80)
best_accuracy = 0
best_f1 = 0
patience_counter = 0
training_history = []
for epoch in range(config.epochs):
print(f"\nEpoch {epoch + 1}/{config.epochs}")
print("-" * 80)
# Train
train_loss, train_acc = train_epoch(
model, train_loader, optimizer, scheduler, criterion, config.device, config.gradient_accumulation_steps
)
# Evaluate
test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
model, test_loader, criterion, config.device
)
# Log results
print(f"\nTrain Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")
# Save history
history = {
'epoch': epoch + 1,
'train_loss': train_loss,
'train_acc': train_acc,
'test_loss': test_loss,
'test_acc': test_acc,
'precision': test_precision,
'recall': test_recall,
'f1': test_f1
}
training_history.append(history)
# Save best model
if test_f1 > best_f1:
best_f1 = test_f1
best_accuracy = test_acc
patience_counter = 0
# Save PyTorch model
model_save_path = os.path.join(config.output_dir, 'best_model.pt')
torch.save({
'epoch': epoch + 1,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'test_acc': test_acc,
'test_f1': test_f1,
            # vars() on the instance is empty (all settings live on the class), so serialize the class dict
            'config': {k: v for k, v in vars(Config).items() if not k.startswith('__')}
}, model_save_path)
print(f"\n✓ Best model saved! (F1: {test_f1:.4f})")
else:
patience_counter += 1
print(f"\nNo improvement. Patience: {patience_counter}/{config.early_stopping_patience}")
# Early stopping
if patience_counter >= config.early_stopping_patience:
print(f"\nEarly stopping triggered after {epoch + 1} epochs")
break
print("\n" + "="*80)
print("Training completed!")
print("="*80)
# Final evaluation
print("\n9. Final evaluation on test set...")
test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
model, test_loader, criterion, config.device
)
print(f"\nFinal Test Results:")
print(f"Accuracy: {test_acc:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")
# Classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['Benign', 'Malicious']))
# Confusion matrix
cm = confusion_matrix(true_labels, predictions)
print("\nConfusion Matrix:")
print(cm)
print(f"True Negatives: {cm[0][0]}")
print(f"False Positives: {cm[0][1]}")
print(f"False Negatives: {cm[1][0]}")
print(f"True Positives: {cm[1][1]}")
# Save results
results = {
'final_metrics': {
'accuracy': float(test_acc),
'precision': float(test_precision),
'recall': float(test_recall),
'f1_score': float(test_f1)
},
'confusion_matrix': cm.tolist(),
'training_history': training_history,
'config': {
'epochs': config.epochs,
'batch_size': config.batch_size,
'learning_rate': config.learning_rate,
'max_length': config.max_length,
'sampling_strategy': config.sampling_strategy,
'train_samples': len(train_df),
'test_samples': len(test_df)
}
}
results_path = os.path.join(config.output_dir, 'training_results.json')
with open(results_path, 'w') as f:
json.dump(results, f, indent=2)
print(f"\nResults saved to: {results_path}")
# Save tokenizer config
tokenizer_config = {
'model_name': config.model_base_path,
'max_length': config.max_length
}
tokenizer_config_path = os.path.join(config.output_dir, 'tokenizer_config.json')
with open(tokenizer_config_path, 'w') as f:
json.dump(tokenizer_config, f, indent=2)
print(f"Tokenizer config saved to: {tokenizer_config_path}")
print("\n" + "="*80)
print("Training script completed successfully!")
print(f"Best F1 Score: {best_f1:.4f}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Model saved to: {config.output_dir}")
print("="*80)