# File size: 8,451 Bytes
#
# 97c54b3

"""
Data preprocessing pipeline for NSL-KDD dataset.
Handles loading, encoding, scaling, and splitting.
"""

import os
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from datasets import load_dataset
import pickle
import json

# Fixed seed for reproducibility.
# NOTE: the seeding call below runs at import time and sets NumPy's global
# legacy RNG state (np.random.*). SEED is also written to metadata.json by
# save_preprocessed().
SEED = 42
np.random.seed(SEED)

# NSL-KDD attack names grouped by category. The order of the groups and of the
# names inside each group fixes the insertion order of ATTACK_MAP below.
_ATTACKS_BY_CATEGORY = {
    'Normal': (
        'normal',
    ),
    'DoS': (
        'back', 'land', 'neptune', 'pod',
        'smurf', 'teardrop', 'mailbomb', 'apache2',
        'processtable', 'udpstorm',
    ),
    'Probe': (
        'ipsweep', 'nmap', 'portsweep', 'satan',
        'mscan', 'saint',
    ),
    'R2L': (
        'ftp_write', 'guess_passwd', 'imap', 'multihop',
        'phf', 'spy', 'warezclient', 'warezmaster',
        'sendmail', 'named', 'snmpgetattack', 'snmpguess',
        'xlock', 'xsnoop', 'worm',
    ),
    'U2R': (
        'buffer_overflow', 'loadmodule', 'perl', 'rootkit',
        'httptunnel', 'ps', 'sqlattack', 'xterm',
    ),
}

# NSL-KDD attack type to category mapping (attack name -> category name).
ATTACK_MAP = {
    attack: category
    for category, attacks in _ATTACKS_BY_CATEGORY.items()
    for attack in attacks
}

# The 41 feature columns of NSL-KDD. preprocess() selects DataFrame columns in
# exactly this order, so the list also defines the column order of the feature
# matrices; save_preprocessed() records it in metadata.json.
FEATURE_NAMES = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'
]

# Feature columns that preprocess() integer-encodes with a per-column
# LabelEncoder; every other entry of FEATURE_NAMES is treated as numeric.
CATEGORICAL_COLS = ['protocol_type', 'service', 'flag']

# Category names that preprocess() returns as `class_names` when it is called
# with binary=False (5-class classification).
CLASS_LABELS = ['Normal', 'DoS', 'Probe', 'R2L', 'U2R']


def load_nsl_kdd():
    """Fetch NSL-KDD through ``datasets.load_dataset`` and return both splits.

    Loads the ``Mireu-Lab/NSL-KDD`` dataset, converts its ``'train'`` and
    ``'test'`` splits to pandas DataFrames and prints the size of each.

    Returns:
        Tuple ``(df_train, df_test)`` of pandas DataFrames.
    """
    print("Loading NSL-KDD dataset from HuggingFace Hub...")
    splits = load_dataset("Mireu-Lab/NSL-KDD")

    # Convert in train-then-test order so the returned tuple matches the
    # (train, test) convention used by the rest of the pipeline.
    df_train, df_test = (splits[name].to_pandas() for name in ('train', 'test'))

    print(f"Train: {len(df_train)} samples")
    print(f"Test:  {len(df_test)} samples")

    return df_train, df_test


def analyze_dataset(df_train, df_test):
    """Print a summary report of both splits and return their class counts.

    The report covers split sizes, the distribution of the ``class`` column,
    the number of distinct values of each categorical feature (with a warning
    for values that occur only in the test split) and the min/max of the
    first ten numeric features of the training split.

    Args:
        df_train: Training DataFrame.
        df_test: Test DataFrame.

    Returns:
        Tuple ``(train_dist, test_dist)`` holding the ``value_counts()`` of
        the ``class`` column for the training and test split respectively.
    """
    def _show_class_counts(header, frame):
        # One line per class: absolute count and share of the split's rows.
        print(header)
        counts = frame['class'].value_counts()
        n_rows = len(frame)
        for label, n in counts.items():
            share = 100 * n / n_rows
            print(f"  {label:10s}: {n:6d} ({share:.1f}%)")
        return counts

    rule = "=" * 60
    print("\n" + rule)
    print("DATASET ANALYSIS")
    print(rule)

    print(f"\nTraining set: {len(df_train)} samples")
    print(f"Test set:     {len(df_test)} samples")

    train_dist = _show_class_counts("\n--- Class Distribution (Training) ---", df_train)
    test_dist = _show_class_counts("\n--- Class Distribution (Test) ---", df_test)

    print("\n--- Categorical Features ---")
    for col in CATEGORICAL_COLS:
        train_col = df_train[col]
        test_col = df_test[col]
        print(f"  {col:15s}: {train_col.nunique()} train / {test_col.nunique()} test unique values")

        # Values present only in the test split cannot be learned by an
        # encoder fitted on the training split, so surface them here.
        seen_in_train = set(train_col.unique())
        seen_in_test = set(test_col.unique())
        unseen = seen_in_test - seen_in_train
        if unseen:
            print(f"    Warning: {len(unseen)} unseen test values: {unseen}")

    print("\n--- Feature Ranges (numeric) ---")
    numeric_cols = [name for name in FEATURE_NAMES if name not in CATEGORICAL_COLS]
    for col in numeric_cols[:10]:
        values = df_train[col]
        lowest = values.min()
        highest = values.max()
        print(f"  {col:35s}: [{lowest:.2f}, {highest:.2f}]")
    print(f"  ... and {len(numeric_cols)-10} more numeric features")

    return train_dist, test_dist


def _encode_target(labels, binary):
    """Turn raw ``class`` labels into integer ids aligned with class names.

    Labels are compared case-insensitively after stripping whitespace.

    Args:
        labels: 1-D array-like of raw label values.
        binary: If True, ``'normal'`` becomes 1 and every other label becomes
            0 (``class_names == ['anomaly', 'normal']``). If False, labels are
            mapped to the categories in CLASS_LABELS, either via ATTACK_MAP
            (attack names) or directly (category names); the id of a category
            is its index in CLASS_LABELS.

    Returns:
        Tuple ``(ids, class_names)`` where ``ids`` is an int64 array and
        ``class_names[i]`` is the name of id ``i``.

    Raises:
        ValueError: In 5-class mode, if a label is neither an attack name
            known to ATTACK_MAP nor one of CLASS_LABELS.
    """
    normalized = pd.Series(labels).astype(str).str.strip().str.lower()

    if binary:
        ids = (normalized == 'normal').to_numpy().astype(np.int64)
        return ids, ['anomaly', 'normal']

    # Accept both fine-grained attack names and already-collapsed categories.
    lookup = {attack.lower(): category for attack, category in ATTACK_MAP.items()}
    lookup.update({category.lower(): category for category in CLASS_LABELS})

    categories = normalized.map(lookup)
    unmapped = categories.isna()
    if unmapped.any():
        unknown = sorted(set(normalized[unmapped]))
        raise ValueError(
            f"Cannot map labels {unknown} to the 5-class scheme {CLASS_LABELS}; "
            "the 'class' column must contain NSL-KDD attack names or category names."
        )

    class_to_id = {name: idx for idx, name in enumerate(CLASS_LABELS)}
    ids = categories.map(class_to_id).to_numpy(dtype=np.int64)
    return ids, list(CLASS_LABELS)


def preprocess(df_train, df_test, binary=True):
    """
    Full preprocessing pipeline.

    Encodes the target, integer-encodes the categorical feature columns with
    encoders fitted on the training split, extracts FEATURE_NAMES as float32
    matrices and min-max scales them with a scaler fitted on the training
    split. The input DataFrames are not modified.

    Args:
        df_train: Training DataFrame containing FEATURE_NAMES and 'class'.
        df_test: Test DataFrame with the same columns.
        binary: If True, binary classification (normal vs anomaly);
            otherwise 5-class classification over CLASS_LABELS.

    Returns:
        X_train, X_test, y_train, y_test, label_encoders, scaler, class_names

        ``class_names[i]`` is the name of target id ``i`` in ``y_train`` and
        ``y_test``. ``label_encoders`` maps each categorical column name to
        its fitted LabelEncoder.

    Raises:
        ValueError: If ``binary`` is False and the 'class' column contains
            labels that cannot be mapped to CLASS_LABELS.
    """
    print(f"\nPreprocessing ({'binary' if binary else '5-class'} classification)...")

    df_tr = df_train.copy()
    df_te = df_test.copy()

    # --- Encode target ---
    # The ids come from an explicit mapping rather than a fitted LabelEncoder,
    # so they always line up with class_names and do not depend on which
    # labels happen to occur in the training split.
    y_train, class_names = _encode_target(df_tr['class'].values, binary)
    y_test, _ = _encode_target(df_te['class'].values, binary)

    # --- Encode categorical features ---
    label_encoders = {}
    for col in CATEGORICAL_COLS:
        le = LabelEncoder()
        le.fit(df_tr[col])

        # Handle unseen test labels: the encoder cannot transform values it
        # was not fitted on, so they are replaced by the first known category.
        unseen_mask = ~df_te[col].isin(le.classes_)
        if unseen_mask.any():
            print(f"  Warning: {int(unseen_mask.sum())} test rows with unseen {col} "
                  f"values mapped to {le.classes_[0]!r}")
            df_te.loc[unseen_mask, col] = le.classes_[0]

        df_tr[col] = le.transform(df_tr[col])
        df_te[col] = le.transform(df_te[col])
        label_encoders[col] = le
        print(f"  Encoded {col}: {len(le.classes_)} categories")

    # --- Extract features ---
    X_train = df_tr[FEATURE_NAMES].values.astype(np.float32)
    X_test = df_te[FEATURE_NAMES].values.astype(np.float32)

    # --- Scale features ---
    # The scaler is fitted on the training split only; test values outside
    # the training range therefore fall outside [0, 1].
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # minlength keeps one count per entry of class_names even when a class
    # does not occur in a split.
    n_classes = len(class_names)
    print(f"  X_train shape: {X_train.shape}")
    print(f"  X_test shape:  {X_test.shape}")
    print(f"  y_train distribution: {np.bincount(y_train, minlength=n_classes)}")
    print(f"  y_test distribution:  {np.bincount(y_test, minlength=n_classes)}")

    return X_train, X_test, y_train, y_test, label_encoders, scaler, class_names


def save_preprocessed(X_train, X_test, y_train, y_test, label_encoders, scaler,
                      class_names, output_dir='data/processed'):
    """Write the preprocessed arrays, fitted transformers and metadata to disk.

    Creates ``output_dir`` if needed and writes:
      * ``X_train.npy``, ``X_test.npy``, ``y_train.npy``, ``y_test.npy``
      * ``encoders.pkl`` - pickled dict with keys ``'label_encoders'`` and
        ``'scaler'``
      * ``metadata.json`` - feature/class names, split sizes, feature count
        and the seed

    Args:
        X_train, X_test: Feature matrices.
        y_train, y_test: Target arrays.
        label_encoders: Fitted encoders for the categorical columns.
        scaler: Fitted feature scaler.
        class_names: Names of the target classes.
        output_dir: Destination directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _target(filename):
        # Absolute/relative location of one output file inside output_dir.
        return os.path.join(output_dir, filename)

    array_files = (
        ('X_train.npy', X_train),
        ('X_test.npy', X_test),
        ('y_train.npy', y_train),
        ('y_test.npy', y_test),
    )
    for filename, array in array_files:
        np.save(_target(filename), array)

    with open(_target('encoders.pkl'), 'wb') as pkl_file:
        fitted = {'label_encoders': label_encoders, 'scaler': scaler}
        pickle.dump(fitted, pkl_file)

    with open(_target('metadata.json'), 'w') as meta_file:
        metadata = {
            'feature_names': FEATURE_NAMES,
            'categorical_cols': CATEGORICAL_COLS,
            'class_names': class_names,
            'n_train': len(X_train),
            'n_test': len(X_test),
            'n_features': X_train.shape[1],
            'seed': SEED,
        }
        json.dump(metadata, meta_file, indent=2)

    print(f"\nSaved preprocessed data to {output_dir}/")


def load_preprocessed(data_dir='data/processed'):
    """Read back the files written by the preprocessing step.

    Loads the four ``.npy`` arrays, the pickled ``encoders.pkl`` dict and
    ``metadata.json`` from ``data_dir``.

    NOTE: ``encoders.pkl`` is read with ``pickle.load``, which can execute
    arbitrary code; only point this at directories you trust.

    Args:
        data_dir: Directory containing the preprocessed files.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, label_encoders, scaler,
        meta)`` where ``meta`` is the parsed content of ``metadata.json``.
    """
    def _source(filename):
        return os.path.join(data_dir, filename)

    array_files = ('X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy')
    arrays = [np.load(_source(filename)) for filename in array_files]

    with open(_source('encoders.pkl'), 'rb') as pkl_file:
        fitted = pickle.load(pkl_file)

    with open(_source('metadata.json')) as meta_file:
        meta = json.load(meta_file)

    return (*arrays, fitted['label_encoders'], fitted['scaler'], meta)


if __name__ == '__main__':
    # Script entry point: load, report, preprocess for binary classification
    # and persist the result to the default output directory.
    train_frame, test_frame = load_nsl_kdd()
    analyze_dataset(train_frame, test_frame)

    # preprocess() returns its seven outputs in exactly the positional order
    # that save_preprocessed() expects, so the tuple is forwarded as-is.
    artifacts = preprocess(train_frame, test_frame, binary=True)
    save_preprocessed(*artifacts)

    print("\nPreprocessing complete!")