"""
CyberHybridNet: A Hybrid Transformer with Multi-Scale Attention for Cybersecurity Anomaly Detection
==============================================================================================================
Architecture:
  1. Multi-Scale CNN Feature Extractor (local pattern capture at 3 scales)
  2. Rotary Position Embeddings for temporal awareness
  3. Hybrid Attention Block:
     - Multi-Head Self-Attention (global flow dependencies)
     - Gated Cross-Attention (cross-feature interaction)
     - Feed-Forward with SwiGLU activation
  4. Mixture-of-Experts Classifier with uncertainty estimation
  
Datasets: CICIDS2017 (lacg030175) + UNSW-NB15 (Mouwiya)
"""

import os
import sys
import math
import time
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score, 
    precision_score, recall_score, accuracy_score, roc_auc_score
)
import warnings
warnings.filterwarnings('ignore')

# Try importing optional monitoring
try:
    import trackio
    HAS_TRACKIO = True
except ImportError:
    HAS_TRACKIO = False

# Report the runtime environment once at import time so the training log
# records which PyTorch build and accelerator were used.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties field is `total_memory` (bytes); `total_mem` does
    # not exist and raised AttributeError as soon as a GPU was present.
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Single device handle used by the training code below.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# ============================================================
# ARCHITECTURE COMPONENTS
# ============================================================

class RotaryPositionEmbedding(nn.Module):
    """Build the cosine/sine tables used by rotary position embeddings (RoPE).

    Args:
        dim: Feature width the tables are built for; one inverse frequency is
            created per even index in ``range(0, dim, 2)``.
        max_seq_len: Stored on the module. ``forward`` does not consult it and
            sizes the tables from its input instead.
    """

    def __init__(self, dim, max_seq_len=512):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float() / dim
        # Kept as a buffer so it follows the module across devices.
        self.register_buffer('inv_freq', 1.0 / (10000 ** exponents))
        self.max_seq_len = max_seq_len

    def forward(self, x):
        """Return ``(cos, sin)`` tables for the sequence length ``x.shape[1]``.

        Only the second dimension and the device of ``x`` are used. Both
        returned tensors have shape ``(1, seq_len, 1, 2 * len(inv_freq))``.
        """
        positions = torch.arange(
            x.shape[1], device=x.device, dtype=self.inv_freq.dtype
        )
        # Outer product: one rotation angle per (position, frequency) pair.
        angles = torch.einsum('i,j->ij', positions, self.inv_freq)
        # Duplicate the angles so the table spans both halves of the feature dim.
        table = torch.cat((angles, angles), dim=-1)[None, :, None, :]
        return table.cos(), table.sin()


def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    For ``x = [a, b]`` split along the last axis, the result is ``[-b, a]``.
    """
    front, back = torch.chunk(x, 2, dim=-1)
    return torch.cat([back.neg(), front], dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin):
    """Apply the same rotary rotation to the query and key tensors.

    Each tensor ``t`` is mapped to ``t * cos + rotate_half(t) * sin``; ``cos``
    and ``sin`` must broadcast against ``q`` and ``k``.

    Returns:
        Tuple ``(q_rotated, k_rotated)``.
    """
    def _rotate(t):
        return (t * cos) + (rotate_half(t) * sin)

    return _rotate(q), _rotate(k)


class MultiScaleCNNExtractor(nn.Module):
    """Parallel 1D convolution branches with growing kernel sizes, fused by a linear layer.

    Branch ``i`` uses kernel size ``2 * i + 1`` with padding ``i``, so every
    branch preserves the sequence length. The branch widths sum exactly to
    ``hidden_dim``; the last branch takes whatever the integer division leaves
    over.

    Args:
        input_dim: Number of input channels (size of the last axis of the input).
        hidden_dim: Total channels across branches and width of the fused output.
        num_scales: Number of parallel branches.
    """

    def __init__(self, input_dim, hidden_dim, num_scales=3):
        super().__init__()
        self.scales = nn.ModuleList()

        share = hidden_dim // num_scales
        widths = [share for _ in range(num_scales)]
        # The final branch absorbs the remainder of the integer division.
        widths[-1] = hidden_dim - share * (num_scales - 1)
        self.total_channels = sum(widths)

        for level, width in enumerate(widths):
            # Kernel sizes 1, 3, 5, ... with matching "same" padding.
            self.scales.append(
                self._make_branch(input_dim, width, kernel_size=2 * level + 1, padding=level)
            )

        self.fusion = nn.Sequential(
            nn.Linear(self.total_channels, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
        )

    @staticmethod
    def _make_branch(in_channels, width, kernel_size, padding):
        """Two Conv1d -> BatchNorm1d -> GELU stages sharing one kernel size."""
        return nn.Sequential(
            nn.Conv1d(in_channels, width, kernel_size, padding=padding),
            nn.BatchNorm1d(width),
            nn.GELU(),
            nn.Conv1d(width, width, kernel_size, padding=padding),
            nn.BatchNorm1d(width),
            nn.GELU(),
        )

    def forward(self, x):
        """Map ``(batch, seq_len, features)`` to ``(batch, seq_len, hidden_dim)``."""
        # Conv1d wants channels before the sequence axis.
        channels_first = x.transpose(1, 2)
        branch_maps = [branch(channels_first) for branch in self.scales]
        # Stack branch outputs along channels, then return to channels-last.
        merged = torch.cat(branch_maps, dim=1).transpose(1, 2)
        return self.fusion(merged)


class SwiGLU(nn.Module):
    """Gated feed-forward layer: ``dropout(w2(silu(w1(x)) * w3(x)))``.

    Args:
        dim: Input and output width.
        hidden_dim: Width of the gated intermediate representation.
        dropout: Dropout probability applied to the projected output.
    """

    def __init__(self, dim, hidden_dim, dropout=0.1):
        super().__init__()
        # All three projections are bias-free.
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)  # gate branch
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)  # output projection
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)  # value branch
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        gate = F.silu(self.w1(x))
        value = self.w3(x)
        projected = self.w2(gate * value)
        return self.dropout(projected)


class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention with rotary position embeddings on queries and keys.

    Args:
        dim: Channel width of the input and output.
        num_heads: Number of attention heads; each gets ``dim // num_heads`` channels.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5

        # One fused projection produces queries, keys and values together.
        self.qkv = nn.Linear(dim, 3 * dim, bias=False)
        self.out_proj = nn.Linear(dim, dim, bias=False)
        self.attn_dropout = nn.Dropout(dropout)
        self.rope = RotaryPositionEmbedding(self.head_dim)

    def forward(self, x, mask=None):
        """Attend over the sequence axis of ``x`` (``(batch, seq_len, dim)``).

        Args:
            x: Input tensor of shape ``(batch, seq_len, dim)``.
            mask: Optional tensor broadcastable to the attention scores;
                positions where ``mask == 0`` are excluded from the softmax.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch, tokens, channels = x.shape
        heads, head_dim = self.num_heads, self.head_dim

        packed = self.qkv(x).reshape(batch, tokens, 3, heads, head_dim)
        # After the permute each of q, k, v is (batch, heads, tokens, head_dim).
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        # Bring the RoPE tables into the same (batch, heads, tokens, head_dim) layout.
        cos, sin = self.rope(x)
        cos = cos.expand(batch, -1, heads, -1).transpose(1, 2)
        sin = sin.expand(batch, -1, heads, -1).transpose(1, 2)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # Merge the heads back into a single channel axis.
        merged = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.out_proj(merged)


class GatedCrossAttention(nn.Module):
    """Multi-head cross-attention whose output is modulated by a sigmoid gate.

    Queries are projected from ``query``; keys and values are projected from
    ``context``. The attended values are multiplied element-wise by
    ``sigmoid(Linear(query))`` before the final output projection.

    Args:
        dim: Channel width of ``query``, ``context`` and the output.
        num_heads: Number of attention heads; each gets ``dim // num_heads`` channels.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        self.out_proj = nn.Linear(dim, dim, bias=False)
        self.gate = nn.Sequential(nn.Linear(dim, dim), nn.Sigmoid())
        self.attn_dropout = nn.Dropout(dropout)

    def _split_heads(self, tensor, batch, length):
        """Reshape ``(batch, length, dim)`` into ``(batch, heads, length, head_dim)``."""
        return tensor.reshape(batch, length, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, context):
        """Attend from ``query`` (``(B, N, dim)``) over ``context`` (``(B, M, dim)``).

        Returns:
            Tensor of shape ``(B, N, dim)``.
        """
        batch, num_queries, channels = query.shape
        _, num_keys, _ = context.shape

        q = self._split_heads(self.q_proj(query), batch, num_queries)
        k = self._split_heads(self.k_proj(context), batch, num_keys)
        v = self._split_heads(self.v_proj(context), batch, num_keys)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        attended = (weights @ v).transpose(1, 2).reshape(batch, num_queries, channels)
        # The gate is computed from the query, not from the attended values.
        gated = attended * self.gate(query)
        return self.out_proj(gated)


class HybridAttentionBlock(nn.Module):
    """Pre-norm residual block: self-attention, optional gated cross-attention, SwiGLU FFN.

    Each sub-layer is applied to a LayerNorm-ed copy of the running tensor and
    its (dropped-out) output is added back as a residual.

    Args:
        dim: Channel width.
        num_heads: Head count for both attention sub-layers.
        ffn_mult: The FFN hidden width is ``dim * ffn_mult``.
        dropout: Dropout probability for the sub-layers and the residual branches.
    """

    def __init__(self, dim, num_heads=8, ffn_mult=4, dropout=0.1):
        super().__init__()
        # Sub-layer 1: global self-attention.
        self.norm1 = nn.LayerNorm(dim)
        self.self_attn = MultiHeadSelfAttention(dim, num_heads, dropout)

        # Sub-layer 2: gated cross-attention against an external context.
        self.norm2 = nn.LayerNorm(dim)
        self.cross_attn = GatedCrossAttention(dim, num_heads, dropout)

        # Sub-layer 3: position-wise feed-forward network.
        self.norm3 = nn.LayerNorm(dim)
        self.ffn = SwiGLU(dim, dim * ffn_mult, dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context=None):
        """Run the block; the cross-attention step is skipped when ``context`` is None."""
        attended = self.self_attn(self.norm1(x))
        x = x + self.dropout(attended)

        if context is not None:
            crossed = self.cross_attn(self.norm2(x), context)
            x = x + self.dropout(crossed)

        transformed = self.ffn(self.norm3(x))
        return x + self.dropout(transformed)


class MixtureOfExpertsClassifier(nn.Module):
    """Dense mixture-of-experts head: every expert runs and the gate blends their logits.

    Args:
        dim: Width of the input feature vector.
        num_classes: Number of output logits.
        num_experts: Number of expert MLPs.
        dropout: Dropout probability inside each expert.
    """

    def __init__(self, dim, num_classes, num_experts=4, dropout=0.1):
        super().__init__()
        self.num_experts = num_experts
        bottleneck = dim // 2

        # Gating network: one unnormalised score per expert.
        self.gate = nn.Sequential(
            nn.Linear(dim, bottleneck),
            nn.GELU(),
            nn.Linear(bottleneck, num_experts),
        )

        self.experts = nn.ModuleList(
            [self._make_expert(dim, bottleneck, num_classes, dropout) for _ in range(num_experts)]
        )

    @staticmethod
    def _make_expert(dim, bottleneck, num_classes, dropout):
        """Two-layer MLP mapping features to class logits."""
        return nn.Sequential(
            nn.Linear(dim, bottleneck),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(bottleneck, num_classes),
        )

    def forward(self, x):
        """Return ``(logits, gate_probs)`` for a batch ``x`` of shape ``(batch, dim)``.

        ``gate_probs`` has shape ``(batch, num_experts)`` and sums to one per
        row; ``logits`` is the gate-weighted sum of the per-expert logits.
        """
        gate_probs = F.softmax(self.gate(x), dim=-1)
        # (batch, num_experts, num_classes)
        per_expert = torch.stack(tuple(expert(x) for expert in self.experts), dim=1)
        blended = torch.einsum('be,bec->bc', gate_probs, per_expert)
        return blended, gate_probs


class CyberHybridNet(nn.Module):
    """Hybrid CNN/Transformer classifier with a mixture-of-experts head.

    Pipeline:
      1. Linear input projection to ``hidden_dim``.
      2. Multi-scale CNN over the projected sequence; its output is the
         cross-attention context for every attention block.
      3. ``num_layers`` hybrid attention blocks.
      4. LayerNorm followed by attention pooling with one learned query.
      5. Mixture-of-experts classifier.

    Args:
        input_dim: Number of raw input features.
        num_classes: Number of output classes.
        hidden_dim: Model width.
        num_layers: Number of hybrid attention blocks.
        num_heads: Attention heads per block and in the pooling attention.
        num_experts: Experts in the classifier head.
        ffn_mult: FFN expansion factor inside each block.
        dropout: Dropout probability used throughout.
        seq_len: Stored on the module; ``forward`` takes the sequence length
            from its input.
    """

    def __init__(
        self,
        input_dim,
        num_classes,
        hidden_dim=128,
        num_layers=4,
        num_heads=8,
        num_experts=4,
        ffn_mult=4,
        dropout=0.1,
        seq_len=1,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len

        # Raw features -> model width.
        self.input_proj = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        # Produces the context tensor consumed by cross-attention.
        self.cnn_extractor = MultiScaleCNNExtractor(hidden_dim, hidden_dim, num_scales=3)

        blocks = [
            HybridAttentionBlock(hidden_dim, num_heads, ffn_mult, dropout)
            for _ in range(num_layers)
        ]
        self.attention_blocks = nn.ModuleList(blocks)

        self.final_norm = nn.LayerNorm(hidden_dim)

        # A single learned query attends over the sequence to pool it.
        self.pool_query = nn.Parameter(torch.randn(1, 1, hidden_dim) * 0.02)
        self.pool_attn = nn.MultiheadAttention(
            hidden_dim, num_heads, dropout=dropout, batch_first=True
        )

        self.classifier = MixtureOfExpertsClassifier(hidden_dim, num_classes, num_experts, dropout)

        # Re-initialise every Linear / LayerNorm submodule.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Truncated-normal init for Linear layers; unit scale / zero shift for LayerNorm."""
        if isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
            return
        if not isinstance(module, nn.Linear):
            return
        nn.init.trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, x):
        """Classify a batch of flows.

        Args:
            x: ``(batch_size, input_dim)`` for single-step input, or
                ``(batch_size, seq_len, input_dim)`` for sequences.

        Returns:
            Tuple ``(logits, gate_probs)`` from the mixture-of-experts head.
        """
        if x.dim() == 2:
            # Treat a flat feature vector as a length-1 sequence.
            x = x.unsqueeze(1)

        batch_size, _, _ = x.shape

        hidden = self.input_proj(x)  # (B, S, hidden_dim)
        # The context is computed once and shared by every block.
        context = self.cnn_extractor(hidden)  # (B, S, hidden_dim)

        for layer in self.attention_blocks:
            hidden = layer(hidden, context=context)
        hidden = self.final_norm(hidden)

        # Attention pooling: (B, 1, hidden_dim) -> (B, hidden_dim)
        query = self.pool_query.expand(batch_size, -1, -1)
        summary = self.pool_attn(query, hidden, hidden)[0].squeeze(1)

        return self.classifier(summary)

    def count_parameters(self):
        """Return the number of trainable parameters."""
        trainable_sizes = (p.numel() for p in self.parameters() if p.requires_grad)
        return sum(trainable_sizes)


# ============================================================
# DATA LOADING & PREPROCESSING
# ============================================================

class CyberSecurityDataset(Dataset):
    """In-memory dataset of ``(feature_vector, label)`` pairs.

    Args:
        features: Array-like of shape ``(num_samples, num_features)``; stored
            as a float32 tensor.
        labels: Array-like of ``num_samples`` integer class ids; stored as an
            int64 tensor.
        seq_len: Stored on the instance; indexing does not use it.

    Raises:
        ValueError: If ``features`` and ``labels`` disagree on the number of
            samples. Without this check the mismatch would only surface as an
            IndexError mid-epoch, or extra feature rows would be ignored.
    """

    def __init__(self, features, labels, seq_len=1):
        # torch.as_tensor replaces the legacy FloatTensor/LongTensor
        # constructors. It converts the dtype when needed and avoids a copy
        # when the input already matches.
        self.features = torch.as_tensor(features, dtype=torch.float32)
        self.labels = torch.as_tensor(labels, dtype=torch.long)
        if self.features.shape[0] != self.labels.shape[0]:
            raise ValueError(
                f"features and labels must have the same number of samples, "
                f"got {self.features.shape[0]} and {self.labels.shape[0]}"
            )
        self.seq_len = seq_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


def load_and_preprocess_cicids2017(max_samples=None):
    """Load CICIDS2017 from the Hugging Face Hub and return scaled splits.

    The dataset's own ``train`` / ``validation`` / ``test`` splits are used.
    Features are every column except ``Label`` / ``label``. Infinite and
    missing values are replaced by 0, then features are standardised with
    statistics fitted on the training split only and clipped to ``[-10, 10]``.

    Args:
        max_samples: Optional cap on the training split size. When set, the
            validation and test splits are capped at ``max_samples // 4``.
            All subsampling is stratified on ``label``.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler,
        feature_cols, num_classes, 'CICIDS2017')``.
    """
    from datasets import load_dataset
    # Imported unconditionally: the validation/test subsampling below needs it
    # even when the training split is small enough to skip its own subsample.
    # Importing it only inside that first branch left the name unbound there
    # and raised UnboundLocalError.
    from sklearn.model_selection import train_test_split

    print("Loading CICIDS2017 dataset...")
    ds = load_dataset("lacg030175/CICIDS2017", "temporal_3way")

    train_df = ds['train'].to_pandas()
    val_df = ds['validation'].to_pandas()
    test_df = ds['test'].to_pandas()

    print(f"CICIDS2017 - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
    print(f"Label distribution (train):\n{train_df['label'].value_counts()}")

    # Stratified subsample if max_samples is set
    if max_samples and len(train_df) > max_samples:
        train_df, _ = train_test_split(train_df, train_size=max_samples,
                                        random_state=42, stratify=train_df['label'])
        print(f"Subsampled train to {len(train_df)}")
    if max_samples:
        val_size = min(len(val_df), max_samples // 4)
        test_size = min(len(test_df), max_samples // 4)
        if len(val_df) > val_size:
            val_df, _ = train_test_split(val_df, train_size=val_size,
                                          random_state=42, stratify=val_df['label'])
        if len(test_df) > test_size:
            test_df, _ = train_test_split(test_df, train_size=test_size,
                                           random_state=42, stratify=test_df['label'])

    # Get feature columns (exclude labels)
    exclude_cols = ['Label', 'label']
    feature_cols = [c for c in train_df.columns if c not in exclude_cols]

    # Clean data: map +/-inf to NaN, then fill every NaN with 0 (rows are kept)
    for df in [train_df, val_df, test_df]:
        df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan)
        df[feature_cols] = df[feature_cols].fillna(0)

    X_train = train_df[feature_cols].values.astype(np.float32)
    y_train = train_df['label'].values
    X_val = val_df[feature_cols].values.astype(np.float32)
    y_val = val_df['label'].values
    X_test = test_df[feature_cols].values.astype(np.float32)
    y_test = test_df['label'].values

    # Standardize: fit on train only so no validation/test statistics leak in
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Clip extreme values (in units of training standard deviations)
    X_train = np.clip(X_train, -10, 10)
    X_val = np.clip(X_val, -10, 10)
    X_test = np.clip(X_test, -10, 10)

    num_classes = len(np.unique(y_train))
    print(f"Number of classes: {num_classes}")
    print(f"Feature dimension: {X_train.shape[1]}")

    return X_train, y_train, X_val, y_val, X_test, y_test, scaler, feature_cols, num_classes, 'CICIDS2017'


def load_and_preprocess_unsw_nb15(max_samples=None):
    """Load UNSW-NB15 from the Hugging Face Hub and return scaled 70/15/15 splits.

    Only the dataset's ``train`` split is read. Identifier and categorical
    columns listed in ``excluded`` are dropped, any remaining object-typed
    column is label-encoded, and infinite or missing values are replaced by 0.
    The data is then split with stratification on ``label``, standardised
    with statistics fitted on the training part, and clipped to ``[-10, 10]``.

    Args:
        max_samples: Optional cap on the total number of rows, applied as a
            stratified subsample before splitting.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler,
        feature_cols, num_classes, 'UNSW-NB15')``.
    """
    from datasets import load_dataset
    from sklearn.model_selection import train_test_split

    print("Loading UNSW-NB15 dataset...")
    frame = load_dataset("Mouwiya/UNSW-NB15")['train'].to_pandas()
    print(f"UNSW-NB15 total samples: {len(frame)}")
    print(f"Label distribution:\n{frame['label'].value_counts()}")

    # Optional stratified subsample of the whole table.
    if max_samples and len(frame) > max_samples:
        frame, _ = train_test_split(
            frame, train_size=max_samples, random_state=42, stratify=frame['label']
        )
        print(f"Subsampled to {len(frame)}")

    # Columns that are identifiers, categorical, or the targets themselves.
    excluded = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service',
                'attack_cat', 'label', 'ct_ftp_cmd']
    feature_cols = [name for name in frame.columns if name not in excluded]

    # Integer-encode any surviving text column.
    text_columns = [name for name in feature_cols if frame[name].dtype == 'object']
    for name in text_columns:
        frame[name] = LabelEncoder().fit_transform(frame[name].astype(str))

    # +/-inf -> NaN -> 0
    frame[feature_cols] = (
        frame[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
    )

    X = frame[feature_cols].values.astype(np.float32)
    y = frame['label'].values

    # 70% train, then the remaining 30% is halved into validation and test.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=42, stratify=y_rest
    )

    # Fit scaling statistics on the training part only, then scale and clip all parts.
    scaler = StandardScaler().fit(X_train)
    X_train, X_val, X_test = (
        np.clip(scaler.transform(part), -10, 10) for part in (X_train, X_val, X_test)
    )

    num_classes = len(np.unique(y_train))
    print(f"Number of classes: {num_classes}")
    print(f"Feature dimension: {X_train.shape[1]}")

    return (X_train, y_train, X_val, y_val, X_test, y_test,
            scaler, feature_cols, num_classes, 'UNSW-NB15')


# ============================================================
# TRAINING ENGINE
# ============================================================

class FocalLoss(nn.Module):
    """Focal loss: ``alpha[y] * (1 - p_y) ** gamma * (-log p_y)``.

    Args:
        alpha: Optional 1-D tensor of per-class weights, indexed by target id.
        gamma: Focusing exponent; ``0`` reduces to (weighted) cross-entropy.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample loss.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the loss for ``inputs`` (logits) against integer ``targets``."""
        # p_y must come from the *unweighted* cross-entropy. Passing
        # `weight=alpha` here made exp(-ce) equal p_y ** alpha[y], which
        # distorts the focusing term, and with small normalised weights it
        # pushes the whole loss towards zero.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.alpha is not None:
            # Class weighting is applied once, outside the focusing term.
            alpha = torch.as_tensor(
                self.alpha, dtype=focal_loss.dtype, device=focal_loss.device
            )
            focal_loss = alpha[targets] * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss


class CosineWarmupScheduler:
    """Per-step learning-rate schedule: linear warmup, then cosine decay.

    Args:
        optimizer: Object exposing ``param_groups`` (a list of dicts with an
            ``'lr'`` entry). The learning rates present at construction time
            are taken as the peak values.
        warmup_steps: Steps over which the LR ramps linearly up to its peak.
        total_steps: Step at which the cosine decay reaches its minimum.
        min_lr: Lower bound applied to every learning rate.
    """

    def __init__(self, optimizer, warmup_steps, total_steps, min_lr=1e-7):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.min_lr = min_lr
        self.base_lrs = [pg['lr'] for pg in optimizer.param_groups]
        self.step_count = 0

    def step(self):
        """Advance one step, update every param group and return the first group's LR."""
        self.step_count += 1
        if self.step_count <= self.warmup_steps:
            lr_mult = self.step_count / max(1, self.warmup_steps)
        else:
            decay_steps = max(1, self.total_steps - self.warmup_steps)
            # Clamp to 1.0 so stepping past total_steps holds the LR at its
            # floor. Unclamped, the cosine turns around and the LR climbs
            # back towards its peak.
            progress = min(1.0, (self.step_count - self.warmup_steps) / decay_steps)
            lr_mult = 0.5 * (1 + math.cos(math.pi * progress))

        for i, pg in enumerate(self.optimizer.param_groups):
            pg['lr'] = max(self.min_lr, self.base_lrs[i] * lr_mult)

        return self.optimizer.param_groups[0]['lr']


def train_one_epoch(model, dataloader, optimizer, criterion, scheduler, device, epoch):
    """Run one optimisation pass over ``dataloader``.

    For every batch the classification loss is combined with a small
    expert-balancing term (negative entropy of the batch-mean gate
    distribution, scaled by 0.01). Gradients are clipped to a global norm
    of 1.0 and the scheduler is stepped once per batch.

    Args:
        model: Module returning ``(logits, gate_probs)``.
        dataloader: Sized iterable of ``(features, labels)`` batches.
        optimizer: Optimizer updating ``model``.
        criterion: Callable ``(logits, labels) -> scalar loss``.
        scheduler: Object whose ``step()`` returns the current learning rate.
        device: Device the batches are moved to.
        epoch: Epoch number, used only in the progress output.

    Returns:
        Tuple ``(mean loss per batch, accuracy in percent)``.
    """
    model.train()
    running_loss = 0
    num_correct = 0
    num_seen = 0

    for step, batch in enumerate(dataloader):
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits, gate_probs = model(features)

        # Minimising sum(p * log p) maximises the entropy of the average
        # expert usage, which pushes the gate towards using all experts.
        mean_usage = gate_probs.mean(0)
        balance_term = (mean_usage * torch.log(mean_usage + 1e-8)).sum() * 0.01
        loss = criterion(logits, labels) + balance_term

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        current_lr = scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        predicted = logits.max(1).indices
        num_seen += labels.size(0)
        num_correct += predicted.eq(labels).sum().item()

        if step % 100 == 0:
            print(f"  Epoch {epoch} | Batch {step}/{len(dataloader)} | "
                  f"Loss: {batch_loss:.4f} | Acc: {100.*num_correct/num_seen:.2f}% | LR: {current_lr:.2e}")

    mean_loss = running_loss / len(dataloader)
    accuracy = 100. * num_correct / num_seen
    return mean_loss, accuracy


@torch.no_grad()
def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module returning ``(logits, gate_probs)``.
        dataloader: Sized iterable of ``(features, labels)`` batches.
        criterion: Callable ``(logits, labels) -> scalar loss``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(metrics, all_preds, all_labels)``. ``metrics`` maps
        ``'loss'`` to the mean per-batch loss and ``'accuracy'``,
        ``'f1_macro'``, ``'f1_weighted'``, ``'precision'``, ``'recall'`` and
        ``'auc'`` to percentages. ``'auc'`` is ``0.0`` when ROC-AUC cannot be
        computed for the given labels and scores.
    """
    model.eval()
    total_loss = 0
    # Collect one array per batch and concatenate once at the end; extending
    # Python lists element by element is much slower on large splits.
    pred_chunks = []
    label_chunks = []
    prob_chunks = []

    for features, labels in dataloader:
        features, labels = features.to(device), labels.to(device)
        logits, _ = model(features)
        loss = criterion(logits, labels)
        total_loss += loss.item()

        probs = F.softmax(logits, dim=-1)
        _, predicted = logits.max(1)

        pred_chunks.append(predicted.cpu().numpy())
        label_chunks.append(labels.cpu().numpy())
        prob_chunks.append(probs.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    all_preds = np.concatenate(pred_chunks)
    all_labels = np.concatenate(label_chunks)
    all_probs = np.concatenate(prob_chunks)

    accuracy = accuracy_score(all_labels, all_preds) * 100
    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0) * 100
    f1_weighted = f1_score(all_labels, all_preds, average='weighted', zero_division=0) * 100
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0) * 100
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0) * 100

    try:
        if all_probs.shape[1] == 2:
            # Binary case: score with the positive-class probability only.
            auc = roc_auc_score(all_labels, all_probs[:, 1]) * 100
        else:
            auc = roc_auc_score(all_labels, all_probs, multi_class='ovr', average='macro') * 100
    except ValueError:
        # roc_auc_score rejects inputs it cannot score (e.g. a class missing
        # from the labels). Catching only ValueError keeps the best-effort
        # fallback without also swallowing KeyboardInterrupt or SystemExit.
        auc = 0.0

    metrics = {
        'loss': avg_loss,
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'precision': precision,
        'recall': recall,
        'auc': auc,
    }

    return metrics, all_preds, all_labels


def train_model(dataset_name='CICIDS2017', config=None, max_samples=None):
    """Train CyberHybridNet on one dataset and evaluate it on the test split.

    Steps: load and preprocess the data, build class-balanced sampling and
    focal-loss weights from the training labels, train with AdamW and a
    warmup + cosine schedule, early-stop on validation macro-F1, restore the
    best weights, and report test metrics.

    Args:
        dataset_name: ``'CICIDS2017'`` selects that loader; any other value
            selects the UNSW-NB15 loader.
        config: Hyper-parameter dict. When ``None`` the defaults below are used.
        max_samples: Optional subsampling cap forwarded to the data loader.

    Returns:
        Tuple ``(model, test_metrics, config, scaler, feature_cols,
        num_classes, input_dim, training_history)``.
    """

    if config is None:
        config = {
            'hidden_dim': 128,
            'num_layers': 4,
            'num_heads': 8,
            'num_experts': 4,
            'ffn_mult': 4,
            'dropout': 0.15,
            'batch_size': 512,
            'lr': 3e-4,
            'weight_decay': 1e-4,
            'epochs': 30,
            'patience': 7,
            'focal_gamma': 2.0,
        }

    print(f"\n{'='*70}")
    print(f"Training CyberHybridNet on {dataset_name}")
    print(f"{'='*70}")
    print(f"Config: {json.dumps(config, indent=2)}")

    # Load data
    if dataset_name == 'CICIDS2017':
        X_train, y_train, X_val, y_val, X_test, y_test, scaler, feature_cols, num_classes, name = load_and_preprocess_cicids2017(max_samples)
    else:
        X_train, y_train, X_val, y_val, X_test, y_test, scaler, feature_cols, num_classes, name = load_and_preprocess_unsw_nb15(max_samples)

    input_dim = X_train.shape[1]

    # Create datasets
    train_dataset = CyberSecurityDataset(X_train, y_train)
    val_dataset = CyberSecurityDataset(X_val, y_val)
    test_dataset = CyberSecurityDataset(X_test, y_test)

    # Compute class weights for balanced sampling: inverse class frequency,
    # normalised to sum to 1. The same weights feed the sampler (per sample)
    # and the focal loss (per class).
    class_counts = np.bincount(y_train)
    class_weights = 1.0 / (class_counts + 1e-8)
    class_weights = class_weights / class_weights.sum()
    sample_weights = class_weights[y_train]
    sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], sampler=sampler,
                              num_workers=2, pin_memory=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'] * 2, shuffle=False,
                            num_workers=2, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'] * 2, shuffle=False,
                             num_workers=2, pin_memory=True)

    # Create model
    model = CyberHybridNet(
        input_dim=input_dim,
        num_classes=num_classes,
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        num_heads=config['num_heads'],
        num_experts=config['num_experts'],
        ffn_mult=config['ffn_mult'],
        dropout=config['dropout'],
    ).to(DEVICE)

    num_params = model.count_parameters()
    print(f"\nModel Parameters: {num_params:,} ({num_params/1e6:.2f}M)")
    print(f"Input dim: {input_dim}, Classes: {num_classes}")

    # Loss function with focal loss
    alpha = torch.FloatTensor(class_weights).to(DEVICE)
    criterion = FocalLoss(alpha=alpha, gamma=config['focal_gamma'])

    # Optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config['lr'],
        weight_decay=config['weight_decay'],
        betas=(0.9, 0.999)
    )

    # Scheduler (stepped once per batch inside train_one_epoch)
    total_steps = len(train_loader) * config['epochs']
    warmup_steps = len(train_loader) * 2  # 2 epochs warmup
    scheduler = CosineWarmupScheduler(optimizer, warmup_steps, total_steps)

    # Initialize tracking
    if HAS_TRACKIO:
        try:
            trackio.init(project="cyberhybridnet", name=f"{dataset_name.lower()}-training")
            print("Trackio monitoring initialized")
        except Exception as e:
            print(f"Trackio init failed: {e}")

    # Training loop
    best_val_f1 = 0
    best_model_state = None
    patience_counter = 0
    training_history = []

    for epoch in range(1, config['epochs'] + 1):
        epoch_start = time.time()

        # Train
        train_loss, train_acc = train_one_epoch(
            model, train_loader, optimizer, criterion, scheduler, DEVICE, epoch
        )

        # Validate
        val_metrics, _, _ = evaluate(model, val_loader, criterion, DEVICE)

        epoch_time = time.time() - epoch_start

        print(f"\nEpoch {epoch}/{config['epochs']} ({epoch_time:.1f}s)")
        print(f"  Train - Loss: {train_loss:.4f} | Acc: {train_acc:.2f}%")
        print(f"  Val   - Loss: {val_metrics['loss']:.4f} | Acc: {val_metrics['accuracy']:.2f}% | "
              f"F1-Macro: {val_metrics['f1_macro']:.2f}% | F1-Wt: {val_metrics['f1_weighted']:.2f}% | "
              f"AUC: {val_metrics['auc']:.2f}%")

        # Log to trackio
        if HAS_TRACKIO:
            try:
                trackio.log({
                    'train/loss': train_loss,
                    'train/accuracy': train_acc,
                    'val/loss': val_metrics['loss'],
                    'val/accuracy': val_metrics['accuracy'],
                    'val/f1_macro': val_metrics['f1_macro'],
                    'val/f1_weighted': val_metrics['f1_weighted'],
                    'val/precision': val_metrics['precision'],
                    'val/recall': val_metrics['recall'],
                    'val/auc': val_metrics['auc'],
                    'lr': optimizer.param_groups[0]['lr'],
                    'epoch': epoch,
                })
            except Exception as e:
                # Monitoring stays best-effort, but the failure is reported
                # the same way as for trackio.init above. A bare `except:
                # pass` also swallowed KeyboardInterrupt.
                print(f"Trackio log failed: {e}")

        training_history.append({
            'epoch': epoch,
            'train_loss': train_loss,
            'train_acc': train_acc,
            **{f'val_{k}': v for k, v in val_metrics.items()}
        })

        # Early stopping on validation macro-F1
        if val_metrics['f1_macro'] > best_val_f1:
            best_val_f1 = val_metrics['f1_macro']
            # Snapshot on CPU so the best weights survive later updates.
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience_counter = 0
            print(f"  ★ New best F1-Macro: {best_val_f1:.2f}%")
        else:
            patience_counter += 1
            if patience_counter >= config['patience']:
                print(f"\nEarly stopping at epoch {epoch} (patience={config['patience']})")
                break

    # Load best model (stays None if validation macro-F1 never rose above 0)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        model = model.to(DEVICE)

    # Final evaluation on test set
    print(f"\n{'='*70}")
    print(f"FINAL TEST EVALUATION ({dataset_name})")
    print(f"{'='*70}")

    test_metrics, test_preds, test_labels = evaluate(model, test_loader, criterion, DEVICE)

    print(f"\nTest Results:")
    print(f"  Accuracy:    {test_metrics['accuracy']:.2f}%")
    print(f"  F1-Macro:    {test_metrics['f1_macro']:.2f}%")
    print(f"  F1-Weighted: {test_metrics['f1_weighted']:.2f}%")
    print(f"  Precision:   {test_metrics['precision']:.2f}%")
    print(f"  Recall:      {test_metrics['recall']:.2f}%")
    print(f"  AUC-ROC:     {test_metrics['auc']:.2f}%")

    print(f"\nClassification Report:")
    print(classification_report(test_labels, test_preds, zero_division=0))

    return model, test_metrics, config, scaler, feature_cols, num_classes, input_dim, training_history


def _build_model_card(config, metrics_cicids, metrics_unsw, input_dim, num_classes_cicids):
    """Render the README.md (model card) text that is uploaded next to the weights.

    Hyperparameters shown in the "Training Details" and "Usage" sections are read
    from ``config`` / ``input_dim`` / ``num_classes_cicids`` so the card reflects
    the run that produced the weights; the literal fallbacks are the values the
    card previously hard-coded.

    Args:
        config: Training configuration dict (keys such as 'lr', 'patience', ...).
        metrics_cicids: Mapping of metric name -> numeric score for CICIDS2017.
        metrics_unsw: Mapping of metric name -> numeric score for UNSW-NB15.
        input_dim: Input feature count shown in the usage example.
        num_classes_cicids: Class count shown in the usage example.

    Returns:
        The full README text, including the YAML front matter.
    """
    return f"""---
tags:
- cybersecurity
- anomaly-detection
- intrusion-detection
- transformer
- hybrid-attention
- pytorch
license: apache-2.0
datasets:
- lacg030175/CICIDS2017
- Mouwiya/UNSW-NB15
metrics:
- accuracy
- f1
- precision
- recall
---

# 🛡️ CyberHybridNet: Hybrid Transformer for Cybersecurity Anomaly Detection

## Architecture

**CyberHybridNet** is a cutting-edge hybrid transformer architecture designed specifically for network intrusion / anomaly detection in cybersecurity. It combines multiple advanced components:

### Key Components:
1. **Multi-Scale 1D CNN Feature Extractor** - Captures local patterns at 3 different granularities (kernel sizes 1, 3, 5)
2. **Rotary Position Embeddings (RoPE)** - Temporal awareness for network flow sequences
3. **Multi-Head Self-Attention** - Global dependency modeling across flow features
4. **Gated Cross-Attention** - Cross-feature interaction between CNN and transformer pathways with learned gating
5. **SwiGLU Feed-Forward Networks** - Advanced activation function from PaLM/LLaMA
6. **Mixture-of-Experts (MoE) Classifier** - 4-expert ensemble with load balancing for robust classification
7. **Focal Loss** - Handles severe class imbalance common in cybersecurity datasets
8. **Attention Pooling** - Learnable query-based pooling instead of naive mean pooling

### Architecture Diagram:
```
Input Features
      │
  ┌───▼───┐
  │ Input  │
  │Project │
  └───┬───┘
      │
  ┌───▼───────────┐     ┌──────────────────┐
  │ Multi-Scale    │────▶│ CNN Context       │
  │ CNN Extractor  │     │ (3 scales: 1,3,5) │
  └───┬───────────┘     └──────┬───────────┘
      │                        │
      │    ┌───────────────────┘
      │    │
  ┌───▼────▼───────────┐
  │ Hybrid Attention    │ × N layers
  │ ┌─────────────────┐│
  │ │Self-Attn + RoPE ││
  │ ├─────────────────┤│
  │ │Gated Cross-Attn ││
  │ ├─────────────────┤│
  │ │SwiGLU FFN       ││
  │ └─────────────────┘│
  └────────┬───────────┘
           │
  ┌────────▼───────────┐
  │ Attention Pooling   │
  └────────┬───────────┘
           │
  ┌────────▼───────────┐
  │ MoE Classifier     │
  │ (4 experts + gate) │
  └────────┬───────────┘
           │
       Predictions
```

## Performance

### CICIDS2017 (Temporal Split)
| Metric | Score |
|--------|-------|
| Accuracy | {metrics_cicids.get('accuracy', 0):.2f}% |
| F1-Macro | {metrics_cicids.get('f1_macro', 0):.2f}% |
| F1-Weighted | {metrics_cicids.get('f1_weighted', 0):.2f}% |
| Precision | {metrics_cicids.get('precision', 0):.2f}% |
| Recall | {metrics_cicids.get('recall', 0):.2f}% |
| AUC-ROC | {metrics_cicids.get('auc', 0):.2f}% |

### UNSW-NB15
| Metric | Score |
|--------|-------|
| Accuracy | {metrics_unsw.get('accuracy', 0):.2f}% |
| F1-Macro | {metrics_unsw.get('f1_macro', 0):.2f}% |
| F1-Weighted | {metrics_unsw.get('f1_weighted', 0):.2f}% |
| Precision | {metrics_unsw.get('precision', 0):.2f}% |
| Recall | {metrics_unsw.get('recall', 0):.2f}% |
| AUC-ROC | {metrics_unsw.get('auc', 0):.2f}% |

## Training Details

- **Optimizer**: AdamW (lr={config.get('lr', 3e-4):g}, weight_decay={config.get('weight_decay', 1e-4):g})
- **Scheduler**: Cosine with linear warmup (2 epochs)
- **Loss**: Focal Loss (γ={config.get('focal_gamma', 2.0)}) with class-weighted sampling
- **Regularization**: Dropout ({config.get('dropout', 0.15)}), gradient clipping (max_norm=1.0), MoE load balancing
- **Early Stopping**: Patience={config.get('patience', 7)} on validation F1-Macro

## Usage

```python
import torch
from model import CyberHybridNet

# Load model
model = CyberHybridNet(
    input_dim={input_dim},  # CICIDS2017 features
    num_classes={num_classes_cicids},  # CICIDS2017 label classes
    hidden_dim={config.get('hidden_dim', 128)},
    num_layers={config.get('num_layers', 4)},
    num_heads={config.get('num_heads', 8)},
    num_experts={config.get('num_experts', 4)},
)
model.load_state_dict(torch.load("model.pt"))
model.eval()

# Predict
with torch.no_grad():
    features = torch.randn(1, {input_dim})  # Your preprocessed features
    logits, gate_probs = model(features)
    prediction = logits.argmax(dim=-1)
```

## Datasets
- [CICIDS2017](https://huggingface.co/datasets/lacg030175/CICIDS2017) - Canadian Institute for Cybersecurity IDS 2017
- [UNSW-NB15](https://huggingface.co/datasets/Mouwiya/UNSW-NB15) - Australian Centre for Cyber Security
"""


def push_model_to_hub(model, config, metrics_cicids, metrics_unsw, input_dim, num_classes_cicids, 
                       num_classes_unsw, feature_cols_cicids, feature_cols_unsw):
    """Push trained model to Hugging Face Hub.

    Writes ``model.pt`` (state dict), ``config.json`` and ``README.md`` into a
    temporary directory and uploads them, together with this script as
    ``model.py``, to the hard-coded repository below. Any failure during the
    push is printed with a traceback and swallowed, so a failed upload does not
    abort the caller.

    Args:
        model: Trained module whose ``state_dict()`` is uploaded.
        config: Training configuration dict; stored in ``config.json`` and used
            to fill in the README's hyperparameters.
        metrics_cicids: Mapping of metric name -> numeric score for CICIDS2017.
        metrics_unsw: Mapping of metric name -> numeric score for UNSW-NB15.
        input_dim: Input feature count of ``model``; shown in the README usage
            example.
        num_classes_cicids: Number of classes for the CICIDS2017 model.
        num_classes_unsw: Number of classes for the UNSW-NB15 model.
        feature_cols_cicids: Either an int feature count or a sized collection
            of feature names for CICIDS2017.
        feature_cols_unsw: Same as above, for UNSW-NB15.

    Returns:
        None.
    """
    from huggingface_hub import HfApi
    import tempfile
    
    repo_id = "ha5eeb001/CyberHybridNet-anomaly-detector"
    
    try:
        api = HfApi()
        try:
            api.create_repo(repo_id, exist_ok=True, private=False)
        except Exception as e:
            # Best effort: the repo may already exist or be managed elsewhere;
            # the uploads below will surface any real access problem.
            print(f"Repo creation note: {e}")
        
        with tempfile.TemporaryDirectory() as tmpdir:
            # Save model weights
            model_path = os.path.join(tmpdir, "model.pt")
            torch.save(model.state_dict(), model_path)
            
            # Save config
            model_config = {
                'architecture': 'CyberHybridNet',
                'description': 'Hybrid Transformer with Multi-Scale CNN + Gated Cross-Attention + MoE for Cybersecurity Anomaly Detection',
                'training_config': config,
                # feature_cols_* may be passed as a plain count or as a list of names.
                'input_dim_cicids': int(feature_cols_cicids) if isinstance(feature_cols_cicids, int) else len(feature_cols_cicids),
                'input_dim_unsw': int(feature_cols_unsw) if isinstance(feature_cols_unsw, int) else len(feature_cols_unsw),
                'num_classes_cicids': num_classes_cicids,
                'num_classes_unsw': num_classes_unsw,
                # float() so numpy scalars serialize cleanly to JSON.
                'metrics_cicids2017': {k: float(v) for k, v in metrics_cicids.items()},
                'metrics_unsw_nb15': {k: float(v) for k, v in metrics_unsw.items()},
                'components': [
                    'Multi-Scale 1D CNN Feature Extractor (3 scales)',
                    'Rotary Position Embeddings',
                    'Multi-Head Self-Attention', 
                    'Gated Cross-Attention',
                    'SwiGLU Feed-Forward Networks',
                    'Mixture-of-Experts Classifier (4 experts)',
                    'Focal Loss for class imbalance',
                    'Cosine LR with warmup',
                ],
                'datasets': [
                    'lacg030175/CICIDS2017 (temporal_3way split)',
                    'Mouwiya/UNSW-NB15',
                ]
            }
            config_path = os.path.join(tmpdir, "config.json")
            with open(config_path, 'w', encoding='utf-8') as f:
                json.dump(model_config, f, indent=2)
            
            # Create README. Explicit UTF-8: the card contains an emoji and
            # box-drawing characters that a non-UTF-8 locale default cannot encode.
            readme = _build_model_card(
                config, metrics_cicids, metrics_unsw, input_dim, num_classes_cicids
            )
            readme_path = os.path.join(tmpdir, "README.md")
            with open(readme_path, 'w', encoding='utf-8') as f:
                f.write(readme)
            
            # Upload all files
            api.upload_file(path_or_fileobj=model_path, path_in_repo="model.pt", repo_id=repo_id)
            api.upload_file(path_or_fileobj=config_path, path_in_repo="config.json", repo_id=repo_id)
            api.upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id)
            
            # Upload the model architecture code
            script_path = os.path.abspath(__file__)
            api.upload_file(path_or_fileobj=script_path, path_in_repo="model.py", repo_id=repo_id)
            
            print(f"\n✅ Model pushed to: https://huggingface.co/{repo_id}")
            
    except Exception as e:
        # Publishing is best effort: report the failure but let the caller continue.
        print(f"Error pushing to hub: {e}")
        import traceback
        traceback.print_exc()


# ============================================================
# MAIN
# ============================================================

if __name__ == "__main__":
    # Script entry point: train one model per dataset, publish the CICIDS2017
    # model (plus card/config) to the Hub, then upload the UNSW-NB15 weights to
    # the same repository and print a summary.
    import tempfile

    # Repository that receives the extra UNSW-NB15 weights; must match the
    # repo_id used inside push_model_to_hub().
    hub_repo_id = "ha5eeb001/CyberHybridNet-anomaly-detector"

    # Detect if GPU is available and set config accordingly
    is_gpu = torch.cuda.is_available()
    max_samples = None if is_gpu else 500000  # Use full data on GPU, subsample on CPU
    
    config = {
        'hidden_dim': 128,
        'num_layers': 4,
        'num_heads': 8,
        'num_experts': 4,
        'ffn_mult': 4,
        'dropout': 0.15,
        'batch_size': 1024 if is_gpu else 512,
        'lr': 3e-4,
        'weight_decay': 1e-4,
        'epochs': 30 if is_gpu else 20,
        'patience': 7 if is_gpu else 5,
        'focal_gamma': 2.0,
    }
    
    # ---- Train on CICIDS2017 ----
    print("\n" + "="*80)
    print("PHASE 1: Training on CICIDS2017")
    print("="*80)
    model_cicids, metrics_cicids, _, scaler_cicids, fcols_cicids, nclasses_cicids, input_dim_cicids, hist_cicids = \
        train_model('CICIDS2017', config, max_samples)
    
    # ---- Train on UNSW-NB15 ----
    print("\n" + "="*80) 
    print("PHASE 2: Training on UNSW-NB15")
    print("="*80)
    model_unsw, metrics_unsw, _, scaler_unsw, fcols_unsw, nclasses_unsw, input_dim_unsw, hist_unsw = \
        train_model('UNSW-NB15', config, max_samples)
    
    # ---- Push best model to Hub ----
    print("\n" + "="*80)
    print("PHASE 3: Pushing models to Hub")
    print("="*80)
    
    # Push the CICIDS model (typically more challenging dataset)
    push_model_to_hub(
        model_cicids, config, metrics_cicids, metrics_unsw,
        input_dim_cicids, nclasses_cicids, nclasses_unsw,
        fcols_cicids, fcols_unsw
    )
    
    # Also save UNSW model. Use the platform temp directory rather than a
    # hard-coded "/tmp" so the script also works where "/tmp" does not exist;
    # the file is intentionally left in place after the upload.
    unsw_weights_path = os.path.join(tempfile.gettempdir(), 'model_unsw.pt')
    torch.save(model_unsw.state_dict(), unsw_weights_path)
    from huggingface_hub import HfApi
    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=unsw_weights_path,
            path_in_repo="model_unsw_nb15.pt",
            repo_id=hub_repo_id
        )
        print("UNSW-NB15 model weights uploaded")
    except Exception as e:
        # Best effort: a failed upload should not hide the training summary below.
        print(f"Upload error: {e}")
    
    print("\n" + "="*80)
    print("TRAINING COMPLETE!")
    print("="*80)
    print(f"\nCICIDS2017 - Acc: {metrics_cicids['accuracy']:.2f}% | F1: {metrics_cicids['f1_macro']:.2f}% | AUC: {metrics_cicids['auc']:.2f}%")
    print(f"UNSW-NB15  - Acc: {metrics_unsw['accuracy']:.2f}% | F1: {metrics_unsw['f1_macro']:.2f}% | AUC: {metrics_unsw['auc']:.2f}%")
    print(f"\nModel: https://huggingface.co/{hub_repo_id}")