Upload model.py with huggingface_hub

75241d1 verified about 1 month ago

41.4 kB

	"""
	CyberHybridNet: A Hybrid Transformer with Multi-Scale Attention for Cybersecurity Anomaly Detection
	==============================================================================================================
	Architecture:
	1. Multi-Scale CNN Feature Extractor (local pattern capture at 3 scales)
	2. Rotary Position Embeddings for temporal awareness
	3. Hybrid Attention Block:
	   - Multi-Head Self-Attention (global flow dependencies)
	   - Gated Cross-Attention (cross-feature interaction)
	   - Feed-Forward with SwiGLU activation
	4. Mixture-of-Experts Classifier with uncertainty estimation

	Datasets: CICIDS2017 (lacg030175) + UNSW-NB15 (Mouwiya)
	"""

	import os
	import sys
	import math
	import time
	import json
	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
	from sklearn.preprocessing import StandardScaler, LabelEncoder
	from sklearn.metrics import (
	classification_report, confusion_matrix, f1_score,
	precision_score, recall_score, accuracy_score, roc_auc_score
	)
	import warnings
	warnings.filterwarnings('ignore')

	# Optional experiment tracking: everything below still runs when trackio
	# is not installed; HAS_TRACKIO gates every trackio call.
	try:
	    import trackio
	    HAS_TRACKIO = True
	except ImportError:
	    HAS_TRACKIO = False

	# Report the runtime environment once, at import time.
	print(f"PyTorch version: {torch.__version__}")
	print(f"CUDA available: {torch.cuda.is_available()}")
	if torch.cuda.is_available():
	    print(f"GPU: {torch.cuda.get_device_name(0)}")
	    # The device-properties attribute is `total_memory` (bytes). The former
	    # `total_mem` does not exist and raised AttributeError on CUDA machines.
	    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

	# Module-wide device: CUDA when available, otherwise CPU.
	DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	print(f"Using device: {DEVICE}")

	# ============================================================
	# ARCHITECTURE COMPONENTS
	# ============================================================

	class RotaryPositionEmbedding(nn.Module):
	    """Produce the cos/sin tables used for rotary position embeddings (RoPE).

	    Args:
	        dim: Size of the feature axis the rotation is applied to.
	        max_seq_len: Stored on the module; not consulted by ``forward``.
	    """

	    def __init__(self, dim, max_seq_len=512):
	        super().__init__()
	        # One inverse frequency per pair of feature channels.
	        exponents = torch.arange(0, dim, 2).float() / dim
	        self.register_buffer('inv_freq', 1.0 / (10000 ** exponents))
	        self.max_seq_len = max_seq_len

	    def forward(self, x):
	        """Return ``(cos, sin)``, each shaped ``(1, x.shape[1], 1, 2 * len(inv_freq))``.

	        Only ``x.shape[1]`` (sequence length) and ``x.device`` are read from ``x``.
	        """
	        positions = torch.arange(x.shape[1], device=x.device, dtype=self.inv_freq.dtype)
	        # Outer product position x frequency -> rotation angle per (step, channel pair).
	        angles = positions[:, None] * self.inv_freq[None, :]
	        # Duplicate so both halves of the feature axis share the same angles.
	        angles = torch.cat((angles, angles), dim=-1)
	        # Add singleton batch and head axes for broadcasting.
	        angles = angles[None, :, None, :]
	        return angles.cos(), angles.sin()


	def rotate_half(x):
	    """Split the last axis into halves ``(a, b)`` and return ``(-b, a)`` concatenated."""
	    front, back = torch.chunk(x, 2, dim=-1)
	    return torch.cat([back.neg(), front], dim=-1)


	def apply_rotary_pos_emb(q, k, cos, sin):
	    """Rotate ``q`` and ``k`` with the given cos/sin tables and return both.

	    Each tensor ``t`` becomes ``t * cos + rotate_half(t) * sin``; ``cos`` and
	    ``sin`` must broadcast against ``q`` and ``k``.
	    """
	    def _rotate(t):
	        return (t * cos) + (rotate_half(t) * sin)

	    return _rotate(q), _rotate(k)


	class MultiScaleCNNExtractor(nn.Module):
	    """Parallel 1D-conv branches with growing kernel sizes, fused by a linear layer.

	    Branch ``i`` uses kernel size ``2 * i + 1`` with padding ``i`` (so the
	    sequence length is preserved). Branch widths sum exactly to ``hidden_dim``;
	    the last branch takes whatever the integer division leaves over.

	    Args:
	        input_dim: Channels of the incoming features.
	        hidden_dim: Total channels across branches and the output width.
	        num_scales: Number of parallel branches.
	    """

	    def __init__(self, input_dim, hidden_dim, num_scales=3):
	        super().__init__()
	        share = hidden_dim // num_scales
	        # All branches get an equal share except the last, which absorbs the remainder.
	        widths = [share] * (num_scales - 1) + [hidden_dim - share * (num_scales - 1)]
	        self.total_channels = sum(widths)

	        def _branch(scale_idx, width):
	            kernel = 2 * scale_idx + 1  # 1, 3, 5, ...
	            return nn.Sequential(
	                nn.Conv1d(input_dim, width, kernel, padding=scale_idx),
	                nn.BatchNorm1d(width),
	                nn.GELU(),
	                nn.Conv1d(width, width, kernel, padding=scale_idx),
	                nn.BatchNorm1d(width),
	                nn.GELU(),
	            )

	        self.scales = nn.ModuleList([_branch(i, w) for i, w in enumerate(widths)])
	        self.fusion = nn.Sequential(
	            nn.Linear(self.total_channels, hidden_dim),
	            nn.LayerNorm(hidden_dim),
	            nn.GELU(),
	        )

	    def forward(self, x):
	        """Map ``(batch, seq_len, input_dim)`` to ``(batch, seq_len, hidden_dim)``."""
	        # Conv1d wants channels before the sequence axis.
	        channels_first = x.transpose(1, 2)
	        # Stack every branch along the channel axis, then go back to channels-last.
	        stacked = torch.cat([branch(channels_first) for branch in self.scales], dim=1)
	        return self.fusion(stacked.transpose(1, 2))


	class SwiGLU(nn.Module):
	    """Gated feed-forward layer: ``dropout(w2(silu(w1(x)) * w3(x)))``.

	    Args:
	        dim: Input and output width.
	        hidden_dim: Width of the gated inner projection.
	        dropout: Dropout probability applied to the output.
	    """

	    def __init__(self, dim, hidden_dim, dropout=0.1):
	        super().__init__()
	        # All three projections are bias-free.
	        self.w1 = nn.Linear(dim, hidden_dim, bias=False)  # gate branch
	        self.w2 = nn.Linear(hidden_dim, dim, bias=False)  # output projection
	        self.w3 = nn.Linear(dim, hidden_dim, bias=False)  # value branch
	        self.dropout = nn.Dropout(dropout)

	    def forward(self, x):
	        gate = F.silu(self.w1(x))
	        value = self.w3(x)
	        projected = self.w2(gate * value)
	        return self.dropout(projected)


	class MultiHeadSelfAttention(nn.Module):
	    """Multi-head self-attention with rotary embeddings applied to queries and keys.

	    Args:
	        dim: Model width; split evenly across ``num_heads``.
	        num_heads: Number of attention heads.
	        dropout: Dropout probability on the attention weights.
	    """

	    def __init__(self, dim, num_heads=8, dropout=0.1):
	        super().__init__()
	        self.num_heads = num_heads
	        self.head_dim = dim // num_heads
	        self.scale = self.head_dim ** -0.5

	        # One fused projection produces queries, keys and values together.
	        self.qkv = nn.Linear(dim, 3 * dim, bias=False)
	        self.out_proj = nn.Linear(dim, dim, bias=False)
	        self.attn_dropout = nn.Dropout(dropout)
	        self.rope = RotaryPositionEmbedding(self.head_dim)

	    def forward(self, x, mask=None):
	        """Attend over ``x`` of shape ``(batch, seq, dim)`` and return the same shape.

	        When ``mask`` is given, score positions where ``mask == 0`` are set to
	        ``-inf`` before the softmax.
	        """
	        batch, seq, width = x.shape
	        heads = self.num_heads

	        # (batch, seq, 3*dim) -> (3, batch, heads, seq, head_dim)
	        fused = self.qkv(x).reshape(batch, seq, 3, heads, self.head_dim)
	        fused = fused.permute(2, 0, 3, 1, 4)
	        q, k, v = fused[0], fused[1], fused[2]

	        # RoPE tables arrive as (1, seq, 1, head_dim); move the sequence axis
	        # next to the feature axis so they broadcast over batch and heads.
	        cos, sin = self.rope(x)
	        cos = cos.permute(0, 2, 1, 3)
	        sin = sin.permute(0, 2, 1, 3)
	        q, k = apply_rotary_pos_emb(q, k, cos, sin)

	        # Scaled dot-product attention.
	        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
	        if mask is not None:
	            scores = scores.masked_fill(mask == 0, float('-inf'))
	        weights = self.attn_dropout(F.softmax(scores, dim=-1))

	        # Merge heads back into the model width.
	        merged = torch.matmul(weights, v).transpose(1, 2).reshape(batch, seq, width)
	        return self.out_proj(merged)


	class GatedCrossAttention(nn.Module):
	    """Cross-attention whose output is scaled by a sigmoid gate computed from the query.

	    Args:
	        dim: Model width; split evenly across ``num_heads``.
	        num_heads: Number of attention heads.
	        dropout: Dropout probability on the attention weights.
	    """

	    def __init__(self, dim, num_heads=8, dropout=0.1):
	        super().__init__()
	        self.num_heads = num_heads
	        self.head_dim = dim // num_heads
	        self.scale = self.head_dim ** -0.5

	        self.q_proj = nn.Linear(dim, dim, bias=False)
	        self.k_proj = nn.Linear(dim, dim, bias=False)
	        self.v_proj = nn.Linear(dim, dim, bias=False)
	        self.out_proj = nn.Linear(dim, dim, bias=False)
	        # Per-channel gate in (0, 1), driven by the query only.
	        self.gate = nn.Sequential(
	            nn.Linear(dim, dim),
	            nn.Sigmoid()
	        )
	        self.attn_dropout = nn.Dropout(dropout)

	    def forward(self, query, context):
	        """Attend from ``query`` ``(batch, N, dim)`` to ``context`` ``(batch, M, dim)``.

	        Returns a tensor shaped like ``query``.
	        """
	        batch, q_len, width = query.shape
	        _, kv_len, _ = context.shape

	        def _split_heads(tensor, length):
	            # (batch, length, dim) -> (batch, heads, length, head_dim)
	            return tensor.reshape(batch, length, self.num_heads, self.head_dim).transpose(1, 2)

	        q = _split_heads(self.q_proj(query), q_len)
	        k = _split_heads(self.k_proj(context), kv_len)
	        v = _split_heads(self.v_proj(context), kv_len)

	        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
	        weights = self.attn_dropout(F.softmax(scores, dim=-1))

	        attended = torch.matmul(weights, v).transpose(1, 2).reshape(batch, q_len, width)
	        # Gate before the output projection.
	        return self.out_proj(attended * self.gate(query))


	class HybridAttentionBlock(nn.Module):
	    """
	    Pre-norm residual block with three sub-layers applied in order:
	    1. Multi-head self-attention
	    2. Gated cross-attention (skipped when no context is given)
	    3. SwiGLU feed-forward network
	    """

	    def __init__(self, dim, num_heads=8, ffn_mult=4, dropout=0.1):
	        super().__init__()
	        # Each sub-layer has its own LayerNorm applied to its input.
	        self.norm1 = nn.LayerNorm(dim)
	        self.self_attn = MultiHeadSelfAttention(dim, num_heads, dropout)

	        self.norm2 = nn.LayerNorm(dim)
	        self.cross_attn = GatedCrossAttention(dim, num_heads, dropout)

	        self.norm3 = nn.LayerNorm(dim)
	        self.ffn = SwiGLU(dim, dim * ffn_mult, dropout)

	        # Shared dropout applied to every sub-layer output before the residual add.
	        self.dropout = nn.Dropout(dropout)

	    def forward(self, x, context=None):
	        """Run the block on ``x``; ``context`` feeds the cross-attention keys/values."""
	        def _residual(stream, branch_out):
	            return stream + self.dropout(branch_out)

	        x = _residual(x, self.self_attn(self.norm1(x)))

	        # Only the query side is normalised; context is passed through as given.
	        if context is not None:
	            x = _residual(x, self.cross_attn(self.norm2(x), context))

	        return _residual(x, self.ffn(self.norm3(x)))


	class MixtureOfExpertsClassifier(nn.Module):
	    """Dense mixture-of-experts head: every expert runs, outputs are gate-weighted.

	    Args:
	        dim: Width of the pooled input representation.
	        num_classes: Number of output logits.
	        num_experts: Number of parallel expert MLPs.
	        dropout: Dropout probability inside each expert.
	    """

	    def __init__(self, dim, num_classes, num_experts=4, dropout=0.1):
	        super().__init__()
	        self.num_experts = num_experts
	        bottleneck = dim // 2

	        # Router producing one logit per expert.
	        self.gate = nn.Sequential(
	            nn.Linear(dim, bottleneck),
	            nn.GELU(),
	            nn.Linear(bottleneck, num_experts),
	        )

	        def _make_expert():
	            return nn.Sequential(
	                nn.Linear(dim, bottleneck),
	                nn.GELU(),
	                nn.Dropout(dropout),
	                nn.Linear(bottleneck, num_classes),
	            )

	        self.experts = nn.ModuleList([_make_expert() for _ in range(num_experts)])

	    def forward(self, x):
	        """Return ``(logits, gate_probs)`` for ``x`` of shape ``(batch, dim)``.

	        ``logits`` is ``(batch, num_classes)``; ``gate_probs`` is the softmax
	        routing distribution of shape ``(batch, num_experts)``.
	        """
	        routing = F.softmax(self.gate(x), dim=-1)
	        # (batch, num_experts, num_classes)
	        per_expert = torch.stack([expert(x) for expert in self.experts], dim=1)
	        # Weighted sum over the expert axis.
	        mixed = torch.einsum('be,bec->bc', routing, per_expert)
	        return mixed, routing


	class CyberHybridNet(nn.Module):
	    """
	    Hybrid CNN/transformer classifier.

	    Pipeline (see ``forward``):
	    - linear input projection to ``hidden_dim``
	    - multi-scale CNN features, reused as cross-attention context in every block
	    - ``num_layers`` hybrid attention blocks
	    - learned-query attention pooling over the sequence axis
	    - mixture-of-experts classification head
	    """

	    def __init__(self, input_dim, num_classes, hidden_dim=128, num_layers=4,
	                 num_heads=8, num_experts=4, ffn_mult=4, dropout=0.1, seq_len=1):
	        super().__init__()
	        self.hidden_dim = hidden_dim
	        self.seq_len = seq_len  # stored only; not read by this class

	        # Lift raw features to the model width.
	        self.input_proj = nn.Sequential(
	            nn.Linear(input_dim, hidden_dim),
	            nn.LayerNorm(hidden_dim),
	            nn.GELU(),
	            nn.Dropout(dropout),
	        )

	        # Operates on the projected features, hence hidden_dim in and out.
	        self.cnn_extractor = MultiScaleCNNExtractor(hidden_dim, hidden_dim, num_scales=3)

	        blocks = [
	            HybridAttentionBlock(hidden_dim, num_heads, ffn_mult, dropout)
	            for _ in range(num_layers)
	        ]
	        self.attention_blocks = nn.ModuleList(blocks)

	        self.final_norm = nn.LayerNorm(hidden_dim)

	        # A single learned query attends over the sequence to pool it.
	        self.pool_query = nn.Parameter(torch.randn(1, 1, hidden_dim) * 0.02)
	        self.pool_attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)

	        self.classifier = MixtureOfExpertsClassifier(hidden_dim, num_classes, num_experts, dropout)

	        # Re-initialise every Linear / LayerNorm submodule.
	        self.apply(self._init_weights)

	    def _init_weights(self, module):
	        """Truncated-normal Linear weights (std 0.02), zero biases, unit LayerNorm scale."""
	        if isinstance(module, nn.LayerNorm):
	            nn.init.ones_(module.weight)
	            nn.init.zeros_(module.bias)
	            return
	        if not isinstance(module, nn.Linear):
	            return
	        nn.init.trunc_normal_(module.weight, std=0.02)
	        if module.bias is not None:
	            nn.init.zeros_(module.bias)

	    def forward(self, x):
	        """
	        Classify ``x`` and return ``(logits, gate_probs)``.

	        x: (batch_size, input_dim) for single-step input, or
	           (batch_size, seq_len, input_dim) for a sequence.
	        """
	        if x.dim() == 2:
	            # Treat a flat feature vector as a length-1 sequence.
	            x = x.unsqueeze(1)

	        batch_size, _, _ = x.shape

	        hidden = self.input_proj(x)            # (B, S, hidden_dim)
	        context = self.cnn_extractor(hidden)   # (B, S, hidden_dim)

	        for block in self.attention_blocks:
	            hidden = block(hidden, context=context)
	        hidden = self.final_norm(hidden)

	        # Attention pooling: one query per sample collapses the sequence axis.
	        query = self.pool_query.expand(batch_size, -1, -1)
	        pooled = self.pool_attn(query, hidden, hidden)[0].squeeze(1)  # (B, hidden_dim)

	        return self.classifier(pooled)

	    def count_parameters(self):
	        """Return the number of trainable scalar parameters."""
	        trainable = (p for p in self.parameters() if p.requires_grad)
	        return sum(p.numel() for p in trainable)


	# ============================================================
	# DATA LOADING & PREPROCESSING
	# ============================================================

	class CyberSecurityDataset(Dataset):
	    """In-memory dataset pairing a float feature row with an integer label.

	    Args:
	        features: Array-like converted with ``torch.FloatTensor``.
	        labels: Array-like converted with ``torch.LongTensor``.
	        seq_len: Stored on the instance; not used when indexing.
	    """

	    def __init__(self, features, labels, seq_len=1):
	        self.seq_len = seq_len
	        self.features = torch.FloatTensor(features)
	        self.labels = torch.LongTensor(labels)

	    def __len__(self):
	        # One sample per label.
	        return len(self.labels)

	    def __getitem__(self, idx):
	        sample = self.features[idx]
	        target = self.labels[idx]
	        return sample, target


	def load_and_preprocess_cicids2017(max_samples=None):
	    """Load the CICIDS2017 ``temporal_3way`` splits from the Hub and preprocess them.

	    Steps: optional stratified subsampling, inf/NaN replacement with 0,
	    standardisation fitted on the training split only, clipping to [-10, 10].

	    Args:
	        max_samples: When truthy, cap the training split at this many rows and
	            the validation/test splits at ``max_samples // 4`` rows each.

	    Returns:
	        ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler,
	        feature_cols, num_classes, 'CICIDS2017')``.
	    """
	    from datasets import load_dataset
	    # Imported unconditionally: the val/test subsampling needs it even when the
	    # training split is already small enough. It used to be imported only inside
	    # the train branch, which raised UnboundLocalError in that case.
	    from sklearn.model_selection import train_test_split

	    print("Loading CICIDS2017 dataset...")
	    ds = load_dataset("lacg030175/CICIDS2017", "temporal_3way")

	    train_df = ds['train'].to_pandas()
	    val_df = ds['validation'].to_pandas()
	    test_df = ds['test'].to_pandas()

	    print(f"CICIDS2017 - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
	    print(f"Label distribution (train):\n{train_df['label'].value_counts()}")

	    # Stratified subsample if max_samples is set
	    if max_samples:
	        if len(train_df) > max_samples:
	            train_df, _ = train_test_split(train_df, train_size=max_samples,
	                                           random_state=42, stratify=train_df['label'])
	            print(f"Subsampled train to {len(train_df)}")

	        val_size = min(len(val_df), max_samples // 4)
	        test_size = min(len(test_df), max_samples // 4)
	        if len(val_df) > val_size:
	            val_df, _ = train_test_split(val_df, train_size=val_size,
	                                         random_state=42, stratify=val_df['label'])
	        if len(test_df) > test_size:
	            test_df, _ = train_test_split(test_df, train_size=test_size,
	                                          random_state=42, stratify=test_df['label'])

	    # Get feature columns (exclude labels)
	    exclude_cols = ['Label', 'label']
	    feature_cols = [c for c in train_df.columns if c not in exclude_cols]

	    def _feature_matrix(frame):
	        # Replace +/-inf and NaN with 0 on a fresh frame, so the split-derived
	        # DataFrames are never written to in place.
	        cleaned = frame[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
	        return cleaned.values.astype(np.float32)

	    X_train, y_train = _feature_matrix(train_df), train_df['label'].values
	    X_val, y_val = _feature_matrix(val_df), val_df['label'].values
	    X_test, y_test = _feature_matrix(test_df), test_df['label'].values

	    # Standardize: statistics come from the training split only.
	    scaler = StandardScaler()
	    X_train = scaler.fit_transform(X_train)
	    X_val = scaler.transform(X_val)
	    X_test = scaler.transform(X_test)

	    # Clip extreme values
	    X_train = np.clip(X_train, -10, 10)
	    X_val = np.clip(X_val, -10, 10)
	    X_test = np.clip(X_test, -10, 10)

	    num_classes = len(np.unique(y_train))
	    print(f"Number of classes: {num_classes}")
	    print(f"Feature dimension: {X_train.shape[1]}")

	    return X_train, y_train, X_val, y_val, X_test, y_test, scaler, feature_cols, num_classes, 'CICIDS2017'


	def load_and_preprocess_unsw_nb15(max_samples=None):
	    """Load UNSW-NB15 from the Hub, split it 70/15/15 and preprocess it.

	    Steps: optional stratified subsampling, label-encoding of remaining
	    object-typed columns, inf/NaN replacement with 0, stratified split,
	    standardisation fitted on the training part, clipping to [-10, 10].

	    Args:
	        max_samples: When truthy and smaller than the dataset, subsample to
	            this many rows before anything else.

	    Returns:
	        ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler,
	        feature_cols, num_classes, 'UNSW-NB15')``.
	    """
	    from datasets import load_dataset
	    from sklearn.model_selection import train_test_split

	    print("Loading UNSW-NB15 dataset...")
	    df = load_dataset("Mouwiya/UNSW-NB15")['train'].to_pandas()
	    print(f"UNSW-NB15 total samples: {len(df)}")
	    print(f"Label distribution:\n{df['label'].value_counts()}")

	    # Subsample if needed
	    if max_samples and len(df) > max_samples:
	        df, _ = train_test_split(df, train_size=max_samples, random_state=42, stratify=df['label'])
	        print(f"Subsampled to {len(df)}")

	    # Identifier, categorical-protocol and target columns are not used as features.
	    excluded = {'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service',
	                'attack_cat', 'label', 'ct_ftp_cmd'}
	    feature_cols = [c for c in df.columns if c not in excluded]

	    # Integer-encode whatever object-typed feature columns are left.
	    for col in [c for c in feature_cols if df[c].dtype == 'object']:
	        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

	    # Replace +/-inf and NaN with 0.
	    cleaned = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
	    X = cleaned.values.astype(np.float32)
	    y = df['label'].values

	    # Stratified split: 70% train, then the remaining 30% halved into val/test.
	    X_train, X_holdout, y_train, y_holdout = train_test_split(
	        X, y, test_size=0.3, random_state=42, stratify=y)
	    X_val, X_test, y_val, y_test = train_test_split(
	        X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout)

	    # Standardize with training statistics, then clip.
	    scaler = StandardScaler()
	    X_train = np.clip(scaler.fit_transform(X_train), -10, 10)
	    X_val = np.clip(scaler.transform(X_val), -10, 10)
	    X_test = np.clip(scaler.transform(X_test), -10, 10)

	    num_classes = len(np.unique(y_train))
	    print(f"Number of classes: {num_classes}")
	    print(f"Feature dimension: {X_train.shape[1]}")

	    return X_train, y_train, X_val, y_val, X_test, y_test, scaler, feature_cols, num_classes, 'UNSW-NB15'


	# ============================================================
	# TRAINING ENGINE
	# ============================================================

	class FocalLoss(nn.Module):
	    """Focal loss: ``alpha[y] * (1 - p_y) ** gamma * -log(p_y)``.

	    Args:
	        alpha: Optional 1-D tensor of per-class weights, indexed by target class.
	        gamma: Focusing exponent; 0 reduces to (weighted) cross-entropy.
	        reduction: 'mean' or 'sum'; any other value returns per-sample losses.
	    """

	    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
	        super().__init__()
	        self.alpha = alpha
	        self.gamma = gamma
	        self.reduction = reduction

	    def forward(self, inputs, targets):
	        """Compute the loss for ``inputs`` (N, C) logits and ``targets`` (N,) class ids."""
	        # p_y must come from the UNWEIGHTED cross-entropy. Passing alpha as
	        # `weight` here would give pt = p_y ** alpha[y], which distorts the
	        # focusing term (and nearly zeroes the loss when weights are small).
	        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
	        pt = torch.exp(-ce_loss)
	        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

	        # Class weighting is applied after the focusing term.
	        if self.alpha is not None:
	            alpha_t = self.alpha.to(device=focal_loss.device, dtype=focal_loss.dtype)[targets]
	            focal_loss = alpha_t * focal_loss

	        if self.reduction == 'mean':
	            return focal_loss.mean()
	        elif self.reduction == 'sum':
	            return focal_loss.sum()
	        return focal_loss


	class CosineWarmupScheduler:
	    """Linear warmup followed by cosine decay, applied per optimizer step.

	    Args:
	        optimizer: Object exposing ``param_groups`` (a list of dicts with 'lr').
	        warmup_steps: Steps over which the LR ramps linearly up to its base value.
	        total_steps: Step at which the cosine decay reaches its minimum.
	        min_lr: Floor applied to every param group's learning rate.
	    """

	    def __init__(self, optimizer, warmup_steps, total_steps, min_lr=1e-7):
	        self.optimizer = optimizer
	        self.warmup_steps = warmup_steps
	        self.total_steps = total_steps
	        self.min_lr = min_lr
	        # Base rates are captured once; every step rescales from these.
	        self.base_lrs = [pg['lr'] for pg in optimizer.param_groups]
	        self.step_count = 0

	    def step(self):
	        """Advance one step, update every param group, return the first group's LR."""
	        self.step_count += 1
	        if self.step_count <= self.warmup_steps:
	            lr_mult = self.step_count / max(1, self.warmup_steps)
	        else:
	            decay_steps = max(1, self.total_steps - self.warmup_steps)
	            # Clamp at 1.0: without it the cosine turns around after
	            # total_steps and the learning rate climbs back up.
	            progress = min(1.0, (self.step_count - self.warmup_steps) / decay_steps)
	            lr_mult = 0.5 * (1 + math.cos(math.pi * progress))

	        for base_lr, pg in zip(self.base_lrs, self.optimizer.param_groups):
	            pg['lr'] = max(self.min_lr, base_lr * lr_mult)

	        return self.optimizer.param_groups[0]['lr']


	def train_one_epoch(model, dataloader, optimizer, criterion, scheduler, device, epoch):
	    """Run one optimisation pass over ``dataloader``.

	    The model must return ``(logits, gate_probs)``. The scheduler is stepped
	    after every optimizer step and its return value is logged as the LR.

	    Args:
	        model: Network to train; put into train mode here.
	        dataloader: Sized iterable of ``(features, labels)`` batches.
	        optimizer: Optimizer updating ``model``'s parameters.
	        criterion: Callable ``(logits, labels) -> scalar loss``.
	        scheduler: Object whose ``step()`` returns the current learning rate.
	        device: Device the batches are moved to.
	        epoch: Epoch number, used only in progress logs.

	    Returns:
	        ``(avg_loss, accuracy)``: mean per-batch loss and accuracy in percent.
	        Both are 0.0 when the dataloader yields no batches.
	    """
	    model.train()
	    total_loss = 0.0
	    correct = 0
	    total = 0
	    num_batches = len(dataloader)

	    for batch_idx, (features, labels) in enumerate(dataloader):
	        features, labels = features.to(device), labels.to(device)

	        optimizer.zero_grad()
	        logits, gate_probs = model(features)

	        # Main loss
	        loss = criterion(logits, labels)

	        # Load balancing for the MoE head: this term is the negative entropy of
	        # the batch-mean gate distribution, so minimising it pushes expert usage
	        # towards uniform.
	        expert_usage = gate_probs.mean(0)
	        lb_loss = (expert_usage * torch.log(expert_usage + 1e-8)).sum() * 0.01
	        loss = loss + lb_loss

	        loss.backward()
	        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
	        optimizer.step()
	        lr = scheduler.step()

	        batch_loss = loss.item()
	        total_loss += batch_loss
	        _, predicted = logits.max(1)
	        total += labels.size(0)
	        correct += predicted.eq(labels).sum().item()

	        if batch_idx % 100 == 0:
	            print(f" Epoch {epoch} | Batch {batch_idx}/{num_batches} | "
	                  f"Loss: {batch_loss:.4f} | Acc: {100.*correct/total:.2f}% | LR: {lr:.2e}")

	    # An empty loader (e.g. drop_last=True with fewer samples than one batch)
	    # previously raised ZeroDivisionError here.
	    if num_batches == 0 or total == 0:
	        return 0.0, 0.0

	    avg_loss = total_loss / num_batches
	    accuracy = 100. * correct / total
	    return avg_loss, accuracy


	@torch.no_grad()
	def evaluate(model, dataloader, criterion, device):
	    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

	    Args:
	        model: Network returning ``(logits, gate_probs)``; put into eval mode here.
	        dataloader: Sized iterable of ``(features, labels)`` batches.
	        criterion: Callable ``(logits, labels) -> scalar loss``.
	        device: Device the batches are moved to.

	    Returns:
	        ``(metrics, all_preds, all_labels)``. ``metrics`` holds the mean
	        per-batch 'loss' plus 'accuracy', 'f1_macro', 'f1_weighted',
	        'precision', 'recall' and 'auc', all in percent. 'auc' is 0.0 when it
	        cannot be computed.
	    """
	    model.eval()
	    total_loss = 0.0
	    all_preds = []
	    all_labels = []
	    all_probs = []

	    for features, labels in dataloader:
	        features, labels = features.to(device), labels.to(device)
	        logits, _ = model(features)
	        loss = criterion(logits, labels)
	        total_loss += loss.item()

	        probs = F.softmax(logits, dim=-1)
	        _, predicted = logits.max(1)

	        all_preds.extend(predicted.cpu().numpy())
	        all_labels.extend(labels.cpu().numpy())
	        all_probs.extend(probs.cpu().numpy())

	    # max(1, ...) avoids ZeroDivisionError on an empty loader.
	    avg_loss = total_loss / max(1, len(dataloader))
	    all_preds = np.array(all_preds)
	    all_labels = np.array(all_labels)
	    all_probs = np.array(all_probs)

	    accuracy = accuracy_score(all_labels, all_preds) * 100
	    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0) * 100
	    f1_weighted = f1_score(all_labels, all_preds, average='weighted', zero_division=0) * 100
	    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0) * 100
	    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0) * 100

	    # AUC is best-effort (it fails when a class is missing from the labels, for
	    # instance). Catch Exception rather than using a bare except so that
	    # KeyboardInterrupt and SystemExit still propagate.
	    try:
	        if all_probs.shape[1] == 2:
	            auc = roc_auc_score(all_labels, all_probs[:, 1]) * 100
	        else:
	            auc = roc_auc_score(all_labels, all_probs, multi_class='ovr', average='macro') * 100
	    except Exception:
	        auc = 0.0

	    metrics = {
	        'loss': avg_loss,
	        'accuracy': accuracy,
	        'f1_macro': f1_macro,
	        'f1_weighted': f1_weighted,
	        'precision': precision,
	        'recall': recall,
	        'auc': auc,
	    }

	    return metrics, all_preds, all_labels


	def train_model(dataset_name='CICIDS2017', config=None, max_samples=None):
	    """Train CyberHybridNet on one dataset and evaluate the best checkpoint.

	    Args:
	        dataset_name: 'CICIDS2017' selects the CICIDS2017 loader; any other
	            value selects the UNSW-NB15 loader.
	        config: Hyperparameter dict. Keys that are missing fall back to the
	            defaults defined below; ``None`` uses the defaults unchanged.
	        max_samples: Forwarded to the dataset loader as its subsampling cap.

	    Returns:
	        ``(model, test_metrics, config, scaler, feature_cols, num_classes,
	        input_dim, training_history)``. ``model`` carries the weights with the
	        best validation macro-F1, or the last-epoch weights if no epoch scored
	        above 0.
	    """
	    default_config = {
	        'hidden_dim': 128,
	        'num_layers': 4,
	        'num_heads': 8,
	        'num_experts': 4,
	        'ffn_mult': 4,
	        'dropout': 0.15,
	        'batch_size': 512,
	        'lr': 3e-4,
	        'weight_decay': 1e-4,
	        'epochs': 30,
	        'patience': 7,
	        'focal_gamma': 2.0,
	    }
	    # Merge instead of replace so a partial config no longer raises KeyError.
	    config = {**default_config, **(config or {})}

	    print(f"\n{'='*70}")
	    print(f"Training CyberHybridNet on {dataset_name}")
	    print(f"{'='*70}")
	    print(f"Config: {json.dumps(config, indent=2)}")

	    # Load data
	    if dataset_name == 'CICIDS2017':
	        load_fn = load_and_preprocess_cicids2017
	    else:
	        load_fn = load_and_preprocess_unsw_nb15
	    (X_train, y_train, X_val, y_val, X_test, y_test,
	     scaler, feature_cols, num_classes, _) = load_fn(max_samples)

	    input_dim = X_train.shape[1]

	    # Create datasets
	    train_dataset = CyberSecurityDataset(X_train, y_train)
	    val_dataset = CyberSecurityDataset(X_val, y_val)
	    test_dataset = CyberSecurityDataset(X_test, y_test)

	    # Inverse-frequency class weights, normalised to sum to 1. They drive both
	    # the balanced sampler and the focal-loss alpha.
	    class_counts = np.bincount(y_train)
	    class_weights = 1.0 / (class_counts + 1e-8)
	    class_weights = class_weights / class_weights.sum()
	    sample_weights = class_weights[y_train]
	    sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

	    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], sampler=sampler,
	                              num_workers=2, pin_memory=True, drop_last=True)
	    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'] * 2, shuffle=False,
	                            num_workers=2, pin_memory=True)
	    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'] * 2, shuffle=False,
	                             num_workers=2, pin_memory=True)

	    # Create model
	    model = CyberHybridNet(
	        input_dim=input_dim,
	        num_classes=num_classes,
	        hidden_dim=config['hidden_dim'],
	        num_layers=config['num_layers'],
	        num_heads=config['num_heads'],
	        num_experts=config['num_experts'],
	        ffn_mult=config['ffn_mult'],
	        dropout=config['dropout'],
	    ).to(DEVICE)

	    num_params = model.count_parameters()
	    print(f"\nModel Parameters: {num_params:,} ({num_params/1e6:.2f}M)")
	    print(f"Input dim: {input_dim}, Classes: {num_classes}")

	    # Loss function with focal loss
	    alpha = torch.FloatTensor(class_weights).to(DEVICE)
	    criterion = FocalLoss(alpha=alpha, gamma=config['focal_gamma'])

	    # Optimizer
	    optimizer = torch.optim.AdamW(
	        model.parameters(),
	        lr=config['lr'],
	        weight_decay=config['weight_decay'],
	        betas=(0.9, 0.999)
	    )

	    # Scheduler: stepped once per batch, so lengths are counted in batches.
	    total_steps = len(train_loader) * config['epochs']
	    warmup_steps = len(train_loader) * 2  # 2 epochs warmup
	    scheduler = CosineWarmupScheduler(optimizer, warmup_steps, total_steps)

	    # Initialize tracking
	    if HAS_TRACKIO:
	        try:
	            trackio.init(project="cyberhybridnet", name=f"{dataset_name.lower()}-training")
	            print("Trackio monitoring initialized")
	        except Exception as e:
	            print(f"Trackio init failed: {e}")

	    # Training loop
	    best_val_f1 = 0
	    best_model_state = None
	    patience_counter = 0
	    training_history = []

	    for epoch in range(1, config['epochs'] + 1):
	        epoch_start = time.time()

	        # Train
	        train_loss, train_acc = train_one_epoch(
	            model, train_loader, optimizer, criterion, scheduler, DEVICE, epoch
	        )

	        # Validate
	        val_metrics, _, _ = evaluate(model, val_loader, criterion, DEVICE)

	        epoch_time = time.time() - epoch_start

	        print(f"\nEpoch {epoch}/{config['epochs']} ({epoch_time:.1f}s)")
	        print(f" Train - Loss: {train_loss:.4f} | Acc: {train_acc:.2f}%")
	        print(f" Val - Loss: {val_metrics['loss']:.4f} | Acc: {val_metrics['accuracy']:.2f}% | "
	              f"F1-Macro: {val_metrics['f1_macro']:.2f}% | F1-Wt: {val_metrics['f1_weighted']:.2f}% | "
	              f"AUC: {val_metrics['auc']:.2f}%")

	        # Log to trackio. Logging stays best-effort, but failures are reported
	        # instead of being swallowed by a bare `except: pass`.
	        if HAS_TRACKIO:
	            try:
	                trackio.log({
	                    'train/loss': train_loss,
	                    'train/accuracy': train_acc,
	                    'val/loss': val_metrics['loss'],
	                    'val/accuracy': val_metrics['accuracy'],
	                    'val/f1_macro': val_metrics['f1_macro'],
	                    'val/f1_weighted': val_metrics['f1_weighted'],
	                    'val/precision': val_metrics['precision'],
	                    'val/recall': val_metrics['recall'],
	                    'val/auc': val_metrics['auc'],
	                    'lr': optimizer.param_groups[0]['lr'],
	                    'epoch': epoch,
	                })
	            except Exception as e:
	                print(f"Trackio log failed: {e}")

	        training_history.append({
	            'epoch': epoch,
	            'train_loss': train_loss,
	            'train_acc': train_acc,
	            **{f'val_{k}': v for k, v in val_metrics.items()}
	        })

	        # Early stopping on validation macro-F1
	        if val_metrics['f1_macro'] > best_val_f1:
	            best_val_f1 = val_metrics['f1_macro']
	            # Snapshot on CPU so the checkpoint does not hold device memory.
	            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
	            patience_counter = 0
	            print(f" ★ New best F1-Macro: {best_val_f1:.2f}%")
	        else:
	            patience_counter += 1
	            if patience_counter >= config['patience']:
	                print(f"\nEarly stopping at epoch {epoch} (patience={config['patience']})")
	                break

	    # Load best model
	    if best_model_state is not None:
	        model.load_state_dict(best_model_state)
	        model = model.to(DEVICE)

	    # Final evaluation on test set
	    print(f"\n{'='*70}")
	    print(f"FINAL TEST EVALUATION ({dataset_name})")
	    print(f"{'='*70}")

	    test_metrics, test_preds, test_labels = evaluate(model, test_loader, criterion, DEVICE)

	    print(f"\nTest Results:")
	    print(f" Accuracy: {test_metrics['accuracy']:.2f}%")
	    print(f" F1-Macro: {test_metrics['f1_macro']:.2f}%")
	    print(f" F1-Weighted: {test_metrics['f1_weighted']:.2f}%")
	    print(f" Precision: {test_metrics['precision']:.2f}%")
	    print(f" Recall: {test_metrics['recall']:.2f}%")
	    print(f" AUC-ROC: {test_metrics['auc']:.2f}%")

	    print(f"\nClassification Report:")
	    print(classification_report(test_labels, test_preds, zero_division=0))

	    return model, test_metrics, config, scaler, feature_cols, num_classes, input_dim, training_history


	def push_model_to_hub(model, config, metrics_cicids, metrics_unsw, input_dim, num_classes_cicids,
	                      num_classes_unsw, feature_cols_cicids, feature_cols_unsw):
	    """Upload weights, a config JSON, a generated README and this script to the Hub.

	    Best-effort: any failure is printed with its traceback and not re-raised.

	    Args:
	        model: Trained model whose ``state_dict`` is uploaded as ``model.pt``.
	        config: Training hyperparameters; stored in ``config.json`` and used to
	            fill in the README usage snippet.
	        metrics_cicids: Test metrics dict for CICIDS2017.
	        metrics_unsw: Test metrics dict for UNSW-NB15.
	        input_dim: Input feature count of ``model`` (shown in the README snippet).
	        num_classes_cicids: Class count for CICIDS2017.
	        num_classes_unsw: Class count for UNSW-NB15.
	        feature_cols_cicids: CICIDS2017 feature names (or their count as int).
	        feature_cols_unsw: UNSW-NB15 feature names (or their count as int).
	    """
	    from huggingface_hub import HfApi
	    import tempfile

	    repo_id = "ha5eeb001/CyberHybridNet-anomaly-detector"

	    def _feature_count(cols):
	        # Accept either a column list or an already-computed count.
	        return int(cols) if isinstance(cols, int) else len(cols)

	    # Architecture values for the README snippet, so the documented constructor
	    # call matches the uploaded weights rather than hard-coded numbers.
	    hidden_dim = config.get('hidden_dim', 128)
	    num_layers = config.get('num_layers', 4)
	    num_heads = config.get('num_heads', 8)
	    num_experts = config.get('num_experts', 4)
	    ffn_mult = config.get('ffn_mult', 4)

	    try:
	        api = HfApi()
	        try:
	            api.create_repo(repo_id, exist_ok=True, private=False)
	        except Exception as e:
	            print(f"Repo creation note: {e}")

	        with tempfile.TemporaryDirectory() as tmpdir:
	            # Save model weights
	            model_path = os.path.join(tmpdir, "model.pt")
	            torch.save(model.state_dict(), model_path)

	            # Save config
	            model_config = {
	                'architecture': 'CyberHybridNet',
	                'description': 'Hybrid Transformer with Multi-Scale CNN + Gated Cross-Attention + MoE for Cybersecurity Anomaly Detection',
	                'training_config': config,
	                'input_dim_cicids': _feature_count(feature_cols_cicids),
	                'input_dim_unsw': _feature_count(feature_cols_unsw),
	                'num_classes_cicids': num_classes_cicids,
	                'num_classes_unsw': num_classes_unsw,
	                'metrics_cicids2017': {k: float(v) for k, v in metrics_cicids.items()},
	                'metrics_unsw_nb15': {k: float(v) for k, v in metrics_unsw.items()},
	                'components': [
	                    'Multi-Scale 1D CNN Feature Extractor (3 scales)',
	                    'Rotary Position Embeddings',
	                    'Multi-Head Self-Attention',
	                    'Gated Cross-Attention',
	                    'SwiGLU Feed-Forward Networks',
	                    'Mixture-of-Experts Classifier (4 experts)',
	                    'Focal Loss for class imbalance',
	                    'Cosine LR with warmup',
	                ],
	                'datasets': [
	                    'lacg030175/CICIDS2017 (temporal_3way split)',
	                    'Mouwiya/UNSW-NB15',
	                ]
	            }
	            config_path = os.path.join(tmpdir, "config.json")
	            with open(config_path, 'w', encoding='utf-8') as f:
	                json.dump(model_config, f, indent=2)

	            # Create README. The literal starts at column 0 on purpose: the YAML
	            # front matter and the Markdown tables must not be indented.
	            readme = f"""---
	tags:
	- cybersecurity
	- anomaly-detection
	- intrusion-detection
	- transformer
	- hybrid-attention
	- pytorch
	license: apache-2.0
	datasets:
	- lacg030175/CICIDS2017
	- Mouwiya/UNSW-NB15
	metrics:
	- accuracy
	- f1
	- precision
	- recall
	---

	# 🛡️ CyberHybridNet: Hybrid Transformer for Cybersecurity Anomaly Detection

	## Architecture

	CyberHybridNet is a cutting-edge hybrid transformer architecture designed specifically for network intrusion / anomaly detection in cybersecurity. It combines multiple advanced components:

	### Key Components:
	1. Multi-Scale 1D CNN Feature Extractor - Captures local patterns at 3 different granularities (kernel sizes 1, 3, 5)
	2. Rotary Position Embeddings (RoPE) - Temporal awareness for network flow sequences
	3. Multi-Head Self-Attention - Global dependency modeling across flow features
	4. Gated Cross-Attention - Cross-feature interaction between CNN and transformer pathways with learned gating
	5. SwiGLU Feed-Forward Networks - Advanced activation function from PaLM/LLaMA
	6. Mixture-of-Experts (MoE) Classifier - 4-expert ensemble with load balancing for robust classification
	7. Focal Loss - Handles severe class imbalance common in cybersecurity datasets
	8. Attention Pooling - Learnable query-based pooling instead of naive mean pooling

	### Architecture Diagram:
	```
	Input Features
	    │
	┌───▼───┐
	│ Input │
	│Project│
	└───┬───┘
	    │
	┌───▼───────────┐     ┌──────────────────┐
	│ Multi-Scale   │────▶│ CNN Context      │
	│ CNN Extractor │     │ (3 scales: 1,3,5)│
	└───┬───────────┘     └──────┬───────────┘
	    │                        │
	    │    ┌───────────────────┘
	    │    │
	┌───▼────▼───────────┐
	│ Hybrid Attention   │ × N layers
	│ ┌─────────────────┐│
	│ │Self-Attn + RoPE ││
	│ ├─────────────────┤│
	│ │Gated Cross-Attn ││
	│ ├─────────────────┤│
	│ │SwiGLU FFN       ││
	│ └─────────────────┘│
	└────────┬───────────┘
	         │
	┌────────▼───────────┐
	│ Attention Pooling  │
	└────────┬───────────┘
	         │
	┌────────▼───────────┐
	│ MoE Classifier     │
	│ (4 experts + gate) │
	└────────┬───────────┘
	         │
	    Predictions
	```

	## Performance

	### CICIDS2017 (Temporal Split)
	| Metric | Score |
	|--------|-------|
	| Accuracy | {metrics_cicids.get('accuracy', 0):.2f}% |
	| F1-Macro | {metrics_cicids.get('f1_macro', 0):.2f}% |
	| F1-Weighted | {metrics_cicids.get('f1_weighted', 0):.2f}% |
	| Precision | {metrics_cicids.get('precision', 0):.2f}% |
	| Recall | {metrics_cicids.get('recall', 0):.2f}% |
	| AUC-ROC | {metrics_cicids.get('auc', 0):.2f}% |

	### UNSW-NB15
	| Metric | Score |
	|--------|-------|
	| Accuracy | {metrics_unsw.get('accuracy', 0):.2f}% |
	| F1-Macro | {metrics_unsw.get('f1_macro', 0):.2f}% |
	| F1-Weighted | {metrics_unsw.get('f1_weighted', 0):.2f}% |
	| Precision | {metrics_unsw.get('precision', 0):.2f}% |
	| Recall | {metrics_unsw.get('recall', 0):.2f}% |
	| AUC-ROC | {metrics_unsw.get('auc', 0):.2f}% |

	## Training Details

	- Optimizer: AdamW (lr=3e-4, weight_decay=1e-4)
	- Scheduler: Cosine with linear warmup (2 epochs)
	- Loss: Focal Loss (γ=2.0) with class-weighted sampling
	- Regularization: Dropout (0.15), gradient clipping (max_norm=1.0), MoE load balancing
	- Early Stopping: Patience=7 on validation F1-Macro

	## Usage

	```python
	import torch
	from model import CyberHybridNet

	# Load model
	model = CyberHybridNet(
	    input_dim={input_dim},  # CICIDS2017 features
	    num_classes={num_classes_cicids},  # CICIDS2017 classes
	    hidden_dim={hidden_dim},
	    num_layers={num_layers},
	    num_heads={num_heads},
	    num_experts={num_experts},
	    ffn_mult={ffn_mult},
	)
	model.load_state_dict(torch.load("model.pt"))
	model.eval()

	# Predict
	with torch.no_grad():
	    features = torch.randn(1, {input_dim})  # Your preprocessed features
	    logits, gate_probs = model(features)
	    prediction = logits.argmax(dim=-1)
	```

	## Datasets
	- [CICIDS2017](https://huggingface.co/datasets/lacg030175/CICIDS2017) - Canadian Institute for Cybersecurity IDS 2017
	- [UNSW-NB15](https://huggingface.co/datasets/Mouwiya/UNSW-NB15) - Australian Centre for Cyber Security
	"""
	            readme_path = os.path.join(tmpdir, "README.md")
	            # Explicit UTF-8: the README contains emoji and box-drawing
	            # characters, which the locale default encoding may not support.
	            with open(readme_path, 'w', encoding='utf-8') as f:
	                f.write(readme)

	            # Upload all files
	            api.upload_file(path_or_fileobj=model_path, path_in_repo="model.pt", repo_id=repo_id)
	            api.upload_file(path_or_fileobj=config_path, path_in_repo="config.json", repo_id=repo_id)
	            api.upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id)

	            # Upload the model architecture code (this very file)
	            script_path = os.path.abspath(__file__)
	            api.upload_file(path_or_fileobj=script_path, path_in_repo="model.py", repo_id=repo_id)

	            print(f"\n✅ Model pushed to: https://huggingface.co/{repo_id}")

	    except Exception as e:
	        print(f"Error pushing to hub: {e}")
	        import traceback
	        traceback.print_exc()


	# ============================================================
	# MAIN
	# ============================================================

	if __name__ == "__main__":
	    import tempfile

	    # Detect if GPU is available and set config accordingly
	    is_gpu = torch.cuda.is_available()
	    max_samples = None if is_gpu else 500000  # Use full data on GPU, subsample on CPU

	    repo_id = "ha5eeb001/CyberHybridNet-anomaly-detector"

	    config = {
	        'hidden_dim': 128,
	        'num_layers': 4,
	        'num_heads': 8,
	        'num_experts': 4,
	        'ffn_mult': 4,
	        'dropout': 0.15,
	        'batch_size': 1024 if is_gpu else 512,
	        'lr': 3e-4,
	        'weight_decay': 1e-4,
	        'epochs': 30 if is_gpu else 20,
	        'patience': 7 if is_gpu else 5,
	        'focal_gamma': 2.0,
	    }

	    # ---- Train on CICIDS2017 ----
	    print("\n" + "="*80)
	    print("PHASE 1: Training on CICIDS2017")
	    print("="*80)
	    (model_cicids, metrics_cicids, _, scaler_cicids, fcols_cicids,
	     nclasses_cicids, input_dim_cicids, hist_cicids) = train_model('CICIDS2017', config, max_samples)

	    # ---- Train on UNSW-NB15 ----
	    print("\n" + "="*80)
	    print("PHASE 2: Training on UNSW-NB15")
	    print("="*80)
	    (model_unsw, metrics_unsw, _, scaler_unsw, fcols_unsw,
	     nclasses_unsw, input_dim_unsw, hist_unsw) = train_model('UNSW-NB15', config, max_samples)

	    # ---- Push best model to Hub ----
	    print("\n" + "="*80)
	    print("PHASE 3: Pushing models to Hub")
	    print("="*80)

	    # Push the CICIDS model (typically more challenging dataset)
	    push_model_to_hub(
	        model_cicids, config, metrics_cicids, metrics_unsw,
	        input_dim_cicids, nclasses_cicids, nclasses_unsw,
	        fcols_cicids, fcols_unsw
	    )

	    # Also save the UNSW model. Use the platform temp directory instead of a
	    # hard-coded /tmp, which does not exist on every OS. The file is kept on
	    # disk so the weights survive a failed upload.
	    unsw_path = os.path.join(tempfile.gettempdir(), 'model_unsw.pt')
	    torch.save(model_unsw.state_dict(), unsw_path)
	    try:
	        from huggingface_hub import HfApi
	        api = HfApi()
	        api.upload_file(
	            path_or_fileobj=unsw_path,
	            path_in_repo="model_unsw_nb15.pt",
	            repo_id=repo_id
	        )
	        print("UNSW-NB15 model weights uploaded")
	    except Exception as e:
	        print(f"Upload error: {e}")

	    print("\n" + "="*80)
	    print("TRAINING COMPLETE!")
	    print("="*80)
	    print(f"\nCICIDS2017 - Acc: {metrics_cicids['accuracy']:.2f}% | F1: {metrics_cicids['f1_macro']:.2f}% | AUC: {metrics_cicids['auc']:.2f}%")
	    print(f"UNSW-NB15 - Acc: {metrics_unsw['accuracy']:.2f}% | F1: {metrics_unsw['f1_macro']:.2f}% | AUC: {metrics_unsw['auc']:.2f}%")
	    print(f"\nModel: https://huggingface.co/{repo_id}")