| """ |
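| Data preprocessing pipeline for the NSL-KDD dataset. |
| Handles loading, label/feature encoding, scaling, and saving/reloading the processed arrays; the train/test split used is the one shipped with the dataset. |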
| """ |
|
|
| import os |
| import sys |
| import numpy as np |
| import pandas as pd |
| from sklearn.preprocessing import LabelEncoder, MinMaxScaler |
| from datasets import load_dataset |
| import pickle |
| import json |
|
|
| |
# Seed NumPy's global RNG once, at import time, so repeated runs are repeatable.
SEED = 42
np.random.seed(SEED)


# Raw attack labels grouped under the category they are reported as.
# Insertion order (category by category) is what ATTACK_MAP inherits below.
_ATTACKS_BY_CATEGORY = {
    'Normal': (
        'normal',
    ),
    'DoS': (
        'back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
        'mailbomb', 'apache2', 'processtable', 'udpstorm',
    ),
    'Probe': (
        'ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint',
    ),
    'R2L': (
        'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'sendmail', 'named',
        'snmpgetattack', 'snmpguess', 'xlock', 'xsnoop', 'worm',
    ),
    'U2R': (
        'buffer_overflow', 'loadmodule', 'perl', 'rootkit',
        'httptunnel', 'ps', 'sqlattack', 'xterm',
    ),
}

# Flat lookup: raw attack label -> category name.
ATTACK_MAP = {
    attack: category
    for category, attacks in _ATTACKS_BY_CATEGORY.items()
    for attack in attacks
}


# Feature columns, in the order they are fed to the model.  The four groups
# below exist only to make the 41 names easier to scan; FEATURE_NAMES is the
# plain concatenation of them.
_CONNECTION_COLS = (
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
)
_ACTIVITY_COLS = (
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
)
_COUNT_AND_RATE_COLS = (
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
)
_DST_HOST_COLS = (
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
)

FEATURE_NAMES = [
    *_CONNECTION_COLS,
    *_ACTIVITY_COLS,
    *_COUNT_AND_RATE_COLS,
    *_DST_HOST_COLS,
]


# Columns holding string categories; these are label-encoded before scaling.
CATEGORICAL_COLS = ['protocol_type', 'service', 'flag']


# Category names used for 5-class classification.
CLASS_LABELS = ['Normal', 'DoS', 'Probe', 'R2L', 'U2R']
|
|
|
|
def load_nsl_kdd():
    """Load the NSL-KDD dataset from the HuggingFace Hub.

    Returns:
        A ``(df_train, df_test)`` pair of pandas DataFrames built from the
        ``'train'`` and ``'test'`` splits of ``Mireu-Lab/NSL-KDD``.
    """
    print("Loading NSL-KDD dataset from HuggingFace Hub...")
    splits = load_dataset("Mireu-Lab/NSL-KDD")

    # Convert both splits up front (train first, then test) before reporting.
    frames = {name: splits[name].to_pandas() for name in ('train', 'test')}

    print(f"Train: {len(frames['train'])} samples")
    print(f"Test: {len(frames['test'])} samples")

    return frames['train'], frames['test']
|
|
|
|
def analyze_dataset(df_train, df_test):
    """Print summary statistics of both splits for documentation purposes.

    Reports split sizes, the ``class`` distribution of each split, the number
    of distinct values per categorical column (flagging test values that never
    occur in training), and the training min/max of the first ten numeric
    features.

    Args:
        df_train: Training DataFrame.
        df_test: Test DataFrame.

    Returns:
        ``(train_dist, test_dist)``: the ``value_counts()`` of the ``class``
        column for the training and test frames respectively.
    """

    def report_distribution(title, frame):
        # Shared printer for the per-split class histogram.
        print(f"\n--- Class Distribution ({title}) ---")
        counts = frame['class'].value_counts()
        for label, n_rows in counts.items():
            share = 100 * n_rows / len(frame)
            print(f" {label:10s}: {n_rows:6d} ({share:.1f}%)")
        return counts

    rule = "=" * 60
    print("\n" + rule)
    print("DATASET ANALYSIS")
    print(rule)

    print(f"\nTraining set: {len(df_train)} samples")
    print(f"Test set: {len(df_test)} samples")

    train_dist = report_distribution("Training", df_train)
    test_dist = report_distribution("Test", df_test)

    print("\n--- Categorical Features ---")
    for col in CATEGORICAL_COLS:
        train_col = df_train[col]
        test_col = df_test[col]
        print(f" {col:15s}: {train_col.nunique()} train / {test_col.nunique()} test unique values")

        # Categories present only in the test split cannot be learned by an
        # encoder fitted on the training split, so call them out.
        train_vals = set(train_col.unique())
        test_vals = set(test_col.unique())
        unseen = test_vals - train_vals
        if unseen:
            print(f" Warning: {len(unseen)} unseen test values: {unseen}")

    print("\n--- Feature Ranges (numeric) ---")
    numeric_cols = [name for name in FEATURE_NAMES if name not in CATEGORICAL_COLS]
    for col in numeric_cols[:10]:
        lowest = df_train[col].min()
        highest = df_train[col].max()
        print(f" {col:35s}: [{lowest:.2f}, {highest:.2f}]")
    remaining = len(numeric_cols) - 10
    print(f" ... and {remaining} more numeric features")

    return train_dist, test_dist
|
|
|
|
def _encode_targets(labels, binary):
    """Map raw ``class`` labels to integer targets in a fixed class order.

    Each label is first normalised through ``ATTACK_MAP`` (attack name ->
    category); labels that are not attack names are kept as they are.

    Args:
        labels: Iterable of raw labels (the ``class`` column).
        binary: If True, return 1 for normal traffic and 0 for everything
            else. If False, return the position of the label's category in
            ``CLASS_LABELS``.

    Returns:
        1-D ``np.int64`` array with one code per input label.

    Raises:
        ValueError: If a label is missing, or (5-class mode only) if a label
            cannot be mapped to one of ``CLASS_LABELS``.
    """
    labels = pd.Series(labels, dtype=object)
    if labels.isna().any():
        raise ValueError("'class' column contains missing labels")

    def to_category(label):
        # ATTACK_MAP keys are lower-case; tolerate case/whitespace differences.
        key = label.strip().lower() if isinstance(label, str) else label
        return ATTACK_MAP.get(key, label)

    categories = [to_category(label) for label in labels]

    if binary:
        # anomaly -> 0, normal -> 1: the order of ['anomaly', 'normal'].
        return np.array([cat == 'Normal' for cat in categories], dtype=np.int64)

    index_of = {name: position for position, name in enumerate(CLASS_LABELS)}
    unknown = sorted({str(cat) for cat in categories if cat not in index_of})
    if unknown:
        raise ValueError(
            f"Cannot map labels to the 5 attack categories: {unknown}"
        )
    return np.array([index_of[cat] for cat in categories], dtype=np.int64)


def preprocess(df_train, df_test, binary=True):
    """
    Full preprocessing pipeline.

    Steps: encode the ``class`` column into integer targets, label-encode the
    categorical feature columns (encoders fitted on the training split only),
    then min-max scale all features (scaler fitted on the training split only).
    The input frames are not modified.

    Target codes always index into the returned ``class_names``:
    binary mode gives anomaly=0 / normal=1, 5-class mode follows the order of
    ``CLASS_LABELS``. Attack names are collapsed through ``ATTACK_MAP``, so
    attack types that appear only in the test split are still encodable.

    NOTE(review): 5-class mode needs labels that are attack names or category
    names. If the loaded dataset only carries normal/anomaly labels it raises
    ValueError instead of silently producing two classes -- confirm the label
    granularity of the dataset in use.

    Args:
        df_train: Training DataFrame
        df_test: Test DataFrame
        binary: If True, binary classification (normal vs anomaly)

    Returns:
        X_train, X_test, y_train, y_test, label_encoders, scaler, class_names

    Raises:
        ValueError: If the ``class`` column has missing labels, or labels that
            cannot be mapped to a category in 5-class mode.
    """
    print(f"\nPreprocessing ({'binary' if binary else '5-class'} classification)...")

    df_tr = df_train.copy()
    df_te = df_test.copy()

    # Targets. A LabelEncoder is deliberately not used here: fitting one would
    # order the classes alphabetically, which need not match class_names.
    class_names = ['anomaly', 'normal'] if binary else list(CLASS_LABELS)
    y_train = _encode_targets(df_tr['class'], binary)
    y_test = _encode_targets(df_te['class'], binary)

    # Categorical features -> integer codes, learned from the training split.
    label_encoders = {}
    for col in CATEGORICAL_COLS:
        le = LabelEncoder()
        df_tr[col] = le.fit_transform(df_tr[col])

        # Test categories never seen in training cannot be transformed, so
        # they fall back to the encoder's first class.
        fallback = le.classes_[0]
        test_values = df_te[col].astype(object)
        seen = test_values.isin(le.classes_)
        df_te[col] = le.transform(test_values.where(seen, fallback))

        label_encoders[col] = le
        print(f" Encoded {col}: {len(le.classes_)} categories")
        n_unseen = int((~seen).sum())
        if n_unseen:
            print(f" {col}: {n_unseen} unseen test values mapped to {fallback!r}")

    X_train = df_tr[FEATURE_NAMES].values.astype(np.float32)
    X_test = df_te[FEATURE_NAMES].values.astype(np.float32)

    # Scale using training statistics only; test values outside the training
    # range therefore land outside [0, 1].
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print(f" X_train shape: {X_train.shape}")
    print(f" X_test shape: {X_test.shape}")
    # minlength keeps one count per class even when a class is absent.
    n_classes = len(class_names)
    print(f" y_train distribution: {np.bincount(y_train, minlength=n_classes)}")
    print(f" y_test distribution: {np.bincount(y_test, minlength=n_classes)}")

    return X_train, X_test, y_train, y_test, label_encoders, scaler, class_names
|
|
|
|
def save_preprocessed(X_train, X_test, y_train, y_test, label_encoders, scaler,
                      class_names, output_dir='data/processed'):
    """Write the preprocessed arrays, fitted transformers and metadata to disk.

    Creates ``output_dir`` if needed and writes:
      * ``X_train.npy``, ``X_test.npy``, ``y_train.npy``, ``y_test.npy``
      * ``encoders.pkl`` -- pickle of ``{'label_encoders': ..., 'scaler': ...}``
      * ``metadata.json`` -- feature/class names, sizes and the seed

    Args:
        X_train, X_test: Feature matrices (``X_train`` must be 2-D, since its
            second dimension is recorded as ``n_features``).
        y_train, y_test: Target arrays.
        label_encoders: Fitted per-column encoders.
        scaler: Fitted feature scaler.
        class_names: Class names; must be JSON-serialisable.
        output_dir: Destination directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    def target(filename):
        return os.path.join(output_dir, filename)

    arrays = (
        ('X_train.npy', X_train),
        ('X_test.npy', X_test),
        ('y_train.npy', y_train),
        ('y_test.npy', y_test),
    )
    for filename, array in arrays:
        np.save(target(filename), array)

    with open(target('encoders.pkl'), 'wb') as handle:
        fitted = {'label_encoders': label_encoders, 'scaler': scaler}
        pickle.dump(fitted, handle)

    with open(target('metadata.json'), 'w') as handle:
        metadata = {
            'feature_names': FEATURE_NAMES,
            'categorical_cols': CATEGORICAL_COLS,
            'class_names': class_names,
            'n_train': len(X_train),
            'n_test': len(X_test),
            'n_features': X_train.shape[1],
            'seed': SEED,
        }
        json.dump(metadata, handle, indent=2)

    print(f"\nSaved preprocessed data to {output_dir}/")
|
|
|
|
def load_preprocessed(data_dir='data/processed'):
    """Read back everything stored in a preprocessed-data directory.

    Args:
        data_dir: Directory holding the four ``.npy`` arrays, ``encoders.pkl``
            and ``metadata.json``.

    Returns:
        ``(X_train, X_test, y_train, y_test, label_encoders, scaler, meta)``
        where ``meta`` is the parsed content of ``metadata.json``.
    """

    def source(filename):
        return os.path.join(data_dir, filename)

    array_files = ('X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy')
    X_train, X_test, y_train, y_test = [np.load(source(name)) for name in array_files]

    # pickle.load can execute arbitrary code from the file, so only point this
    # at directories you trust.
    with open(source('encoders.pkl'), 'rb') as handle:
        fitted = pickle.load(handle)

    with open(source('metadata.json')) as handle:
        meta = json.load(handle)

    return (
        X_train,
        X_test,
        y_train,
        y_test,
        fitted['label_encoders'],
        fitted['scaler'],
        meta,
    )
|
|
|
|
if __name__ == '__main__':
    # Script entry point: load, report, preprocess for binary classification,
    # then persist the results to the default output directory.
    train_frame, test_frame = load_nsl_kdd()
    analyze_dataset(train_frame, test_frame)

    # preprocess() returns its values in exactly the positional order that
    # save_preprocessed() expects, so the tuple is forwarded as-is.
    artifacts = preprocess(train_frame, test_frame, binary=True)
    save_preprocessed(*artifacts)

    print("\nPreprocessing complete!")
|
|