redauzhang
Upload model for web attack payload classification, based on codebert-base; trained on an open-source dataset
62c3b33
#!/usr/bin/env python3
"""
Train CodeBERT-based model for web attack detection
Dataset: /c1/web-attack-detection/dataset.csv
Output: /c1/new-models/
"""
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # transformers.AdamW was deprecated and later removed; torch's AdamW is the drop-in replacement
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from tqdm import tqdm
import json
import random
# Set random seeds for reproducibility
def set_seed(seed=42):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
set_seed(42)
# Configuration
class Config:
# Paths
data_path = "/c1/web-attack-detection/dataset.csv"
model_base_path = "/c1/huggingface/codebert-base"
output_dir = "/c1/new-models"
# Training parameters
max_length = 256 # Reduced from 512
batch_size = 8 # Reduced from 32
gradient_accumulation_steps = 4 # Effective batch size = 8 * 4 = 32
epochs = 3
learning_rate = 2e-5
warmup_steps = 500
weight_decay = 0.01
# Data split
train_size = 0.8
test_size = 0.2
# Sampling strategy
use_sampling = True # Enable sampling
sampling_strategy = "balanced" # Options: "balanced", "oversample", "undersample", "none"
# GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Early stopping
early_stopping_patience = 2
config = Config()
print("="*80)
print("Web Attack Detection Model Training")
print("="*80)
print(f"Device: {config.device}")
print(f"Data path: {config.data_path}")
print(f"Model base: {config.model_base_path}")
print(f"Output dir: {config.output_dir}")
print(f"Sampling strategy: {config.sampling_strategy}")
print("="*80)
# Create output directory
os.makedirs(config.output_dir, exist_ok=True)
# Load data
print("\n1. Loading dataset...")
df = pd.read_csv(config.data_path)
print(f"Total samples: {len(df)}")
print(f"\nLabel distribution:")
print(df['Label'].value_counts())
print(f"\nLabel proportions:")
print(df['Label'].value_counts(normalize=True))
# Clean data
print("\n2. Cleaning data...")
df = df.dropna(subset=['Sentence', 'Label'])
df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)
print(f"Samples after cleaning: {len(df)}")
# Split data
print("\n3. Splitting data (80% train, 20% test)...")
train_df, test_df = train_test_split(
df,
test_size=config.test_size,
random_state=42,
stratify=df['Label']
)
print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"\nTrain label distribution:")
print(train_df['Label'].value_counts())
print(f"\nTest label distribution:")
print(test_df['Label'].value_counts())
# Apply sampling strategy
def apply_sampling(df, strategy="balanced"):
"""Apply sampling strategy to balance dataset"""
if strategy == "none":
return df
label_counts = df['Label'].value_counts()
print(f"\nOriginal distribution: {dict(label_counts)}")
if strategy == "balanced":
# Balanced: make both classes equal to average
target_count = int(label_counts.mean())
print(f"Target count per class: {target_count}")
elif strategy == "oversample":
# Oversample minority to match majority
target_count = label_counts.max()
print(f"Target count per class (oversample): {target_count}")
    elif strategy == "undersample":
        # Undersample majority to match minority
        target_count = label_counts.min()
        print(f"Target count per class (undersample): {target_count}")
    else:
        # Guard against a silent NameError on target_count for unknown strategies
        raise ValueError(f"Unknown sampling strategy: {strategy}")
balanced_dfs = []
for label in [0, 1]:
label_df = df[df['Label'] == label]
current_count = len(label_df)
if current_count < target_count:
# Oversample
sampled = label_df.sample(n=target_count, replace=True, random_state=42)
elif current_count > target_count:
# Undersample
sampled = label_df.sample(n=target_count, replace=False, random_state=42)
else:
sampled = label_df
balanced_dfs.append(sampled)
balanced_df = pd.concat(balanced_dfs, ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle
print(f"After sampling: {dict(balanced_df['Label'].value_counts())}")
return balanced_df
if config.use_sampling:
print(f"\n4. Applying sampling strategy: {config.sampling_strategy}...")
train_df = apply_sampling(train_df, config.sampling_strategy)
print(f"Final train samples: {len(train_df)}")
else:
print("\n4. Skipping sampling (using original distribution)...")
# Load tokenizer
print("\n5. Loading CodeBERT tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(config.model_base_path)
print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")
# Dataset class
class WebAttackDataset(Dataset):
def __init__(self, dataframe, tokenizer, max_length):
self.data = dataframe.reset_index(drop=True)
self.tokenizer = tokenizer
self.max_length = max_length
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
text = str(self.data.loc[idx, 'Sentence'])
label = int(self.data.loc[idx, 'Label'])
encoding = self.tokenizer(
text,
add_special_tokens=True,
max_length=self.max_length,
padding='max_length',
truncation=True,
return_tensors='pt'
)
return {
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'label': torch.tensor(label, dtype=torch.long)
}
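# Optional sanity check (a sketch, not required for training): inspect one encoded
# item to confirm the tensors have the expected shapes before building the loaders.
#   sample = WebAttackDataset(train_df.head(1), tokenizer, config.max_length)[0]
#   print(sample['input_ids'].shape)       # expected: torch.Size([256])
#   print(sample['attention_mask'].shape)  # expected: torch.Size([256])
#   print(sample['label'])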
# Create datasets
print("\n6. Creating datasets...")
train_dataset = WebAttackDataset(train_df, tokenizer, config.max_length)
test_dataset = WebAttackDataset(test_df, tokenizer, config.max_length)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)
print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")
# Model class
class CodeBERTClassifier(nn.Module):
def __init__(self, model_path, num_labels=2, dropout=0.1):
super(CodeBERTClassifier, self).__init__()
self.codebert = RobertaModel.from_pretrained(model_path)
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(self.codebert.config.hidden_size, num_labels)
def forward(self, input_ids, attention_mask):
outputs = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
pooled_output = outputs.pooler_output
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return logits
# Load model
print("\n7. Loading CodeBERT model...")
model = CodeBERTClassifier(config.model_base_path)
model.to(config.device)
print(f"Model loaded and moved to {config.device}")
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
# The scheduler advances once per optimizer step, not once per batch, so account for accumulation
total_steps = (len(train_loader) // config.gradient_accumulation_steps) * config.epochs
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=config.warmup_steps,
num_training_steps=total_steps
)
criterion = nn.CrossEntropyLoss()
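# Alternative to resampling (a sketch, not used here): keep the original class
# distribution and weight the loss instead, which avoids duplicating minority rows.
#   class_counts = train_df['Label'].value_counts().sort_index()
#   class_weights = torch.tensor((class_counts.sum() / class_counts).values,
#                                dtype=torch.float32).to(config.device)
#   criterion = nn.CrossEntropyLoss(weight=class_weights)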
# Training function
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, gradient_accumulation_steps=4):
model.train()
total_loss = 0
predictions = []
true_labels = []
optimizer.zero_grad()
progress_bar = tqdm(dataloader, desc="Training")
for idx, batch in enumerate(progress_bar):
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['label'].to(device)
logits = model(input_ids, attention_mask)
loss = criterion(logits, labels)
loss = loss / gradient_accumulation_steps # Normalize loss
loss.backward()
        # Step on every accumulation boundary, and flush any remainder at the end of the epoch
        if (idx + 1) % gradient_accumulation_steps == 0 or (idx + 1) == len(dataloader):
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
total_loss += loss.item() * gradient_accumulation_steps
preds = torch.argmax(logits, dim=1)
predictions.extend(preds.cpu().numpy())
true_labels.extend(labels.cpu().numpy())
progress_bar.set_postfix({'loss': loss.item() * gradient_accumulation_steps})
avg_loss = total_loss / len(dataloader)
accuracy = accuracy_score(true_labels, predictions)
return avg_loss, accuracy
# Evaluation function
def evaluate(model, dataloader, criterion, device):
model.eval()
total_loss = 0
predictions = []
true_labels = []
with torch.no_grad():
for batch in tqdm(dataloader, desc="Evaluating"):
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['label'].to(device)
logits = model(input_ids, attention_mask)
loss = criterion(logits, labels)
total_loss += loss.item()
preds = torch.argmax(logits, dim=1)
predictions.extend(preds.cpu().numpy())
true_labels.extend(labels.cpu().numpy())
avg_loss = total_loss / len(dataloader)
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
true_labels, predictions, average='binary'
)
return avg_loss, accuracy, precision, recall, f1, predictions, true_labels
# Training loop
print("\n8. Starting training...")
print("="*80)
best_accuracy = 0
best_f1 = 0
patience_counter = 0
training_history = []
for epoch in range(config.epochs):
print(f"\nEpoch {epoch + 1}/{config.epochs}")
print("-" * 80)
# Train
train_loss, train_acc = train_epoch(
model, train_loader, optimizer, scheduler, criterion, config.device, config.gradient_accumulation_steps
)
# Evaluate
test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
model, test_loader, criterion, config.device
)
# Log results
print(f"\nTrain Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")
# Save history
history = {
'epoch': epoch + 1,
'train_loss': train_loss,
'train_acc': train_acc,
'test_loss': test_loss,
'test_acc': test_acc,
'precision': test_precision,
'recall': test_recall,
'f1': test_f1
}
training_history.append(history)
# Save best model
if test_f1 > best_f1:
best_f1 = test_f1
best_accuracy = test_acc
patience_counter = 0
# Save PyTorch model
model_save_path = os.path.join(config.output_dir, 'best_model.pt')
torch.save({
'epoch': epoch + 1,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'test_acc': test_acc,
'test_f1': test_f1,
            # vars() on the instance is empty (all settings live on the class), so serialize the class dict
            'config': {k: v for k, v in vars(Config).items() if not k.startswith('__')}
}, model_save_path)
print(f"\n✓ Best model saved! (F1: {test_f1:.4f})")
else:
patience_counter += 1
print(f"\nNo improvement. Patience: {patience_counter}/{config.early_stopping_patience}")
# Early stopping
if patience_counter >= config.early_stopping_patience:
print(f"\nEarly stopping triggered after {epoch + 1} epochs")
break
print("\n" + "="*80)
print("Training completed!")
print("="*80)
# Final evaluation
print("\n9. Final evaluation on test set...")
test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = evaluate(
model, test_loader, criterion, config.device
)
print(f"\nFinal Test Results:")
print(f"Accuracy: {test_acc:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")
# Classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['Benign', 'Malicious']))
# Confusion matrix
cm = confusion_matrix(true_labels, predictions)
print("\nConfusion Matrix:")
print(cm)
print(f"True Negatives: {cm[0][0]}")
print(f"False Positives: {cm[0][1]}")
print(f"False Negatives: {cm[1][0]}")
print(f"True Positives: {cm[1][1]}")
# Save results
results = {
'final_metrics': {
'accuracy': float(test_acc),
'precision': float(test_precision),
'recall': float(test_recall),
'f1_score': float(test_f1)
},
'confusion_matrix': cm.tolist(),
'training_history': training_history,
'config': {
'epochs': config.epochs,
'batch_size': config.batch_size,
'learning_rate': config.learning_rate,
'max_length': config.max_length,
'sampling_strategy': config.sampling_strategy,
'train_samples': len(train_df),
'test_samples': len(test_df)
}
}
results_path = os.path.join(config.output_dir, 'training_results.json')
with open(results_path, 'w') as f:
json.dump(results, f, indent=2)
print(f"\nResults saved to: {results_path}")
# Save tokenizer config
tokenizer_config = {
'model_name': config.model_base_path,
'max_length': config.max_length
}
tokenizer_config_path = os.path.join(config.output_dir, 'tokenizer_config.json')
with open(tokenizer_config_path, 'w') as f:
json.dump(tokenizer_config, f, indent=2)
print(f"Tokenizer config saved to: {tokenizer_config_path}")
print("\n" + "="*80)
print("Training script completed successfully!")
print(f"Best F1 Score: {best_f1:.4f}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Model saved to: {config.output_dir}")
print("="*80)