# deep-learning-project / data /preprocess.py
# cathrica's picture
# Add data preprocessing pipeline
# 97c54b3 verified
"""
Data preprocessing pipeline for NSL-KDD dataset.
Handles loading, encoding, scaling, and splitting.
"""
import os
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from datasets import load_dataset
import pickle
import json
# Fixed seed for reproducibility. It is applied to NumPy's global RNG as soon
# as this module is imported, and save_preprocessed() records it in
# metadata.json alongside the saved arrays.
SEED = 42
np.random.seed(SEED)
# NSL-KDD attack type to category mapping.
# The attacks are declared once per category and flattened into ATTACK_MAP
# below; insertion order is Normal, DoS, Probe, R2L, U2R.
_ATTACKS_BY_CATEGORY = (
    ('Normal', (
        'normal',
    )),
    ('DoS', (
        'back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
        'mailbomb', 'apache2', 'processtable', 'udpstorm',
    )),
    ('Probe', (
        'ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint',
    )),
    ('R2L', (
        'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'sendmail', 'named',
        'snmpgetattack', 'snmpguess', 'xlock', 'xsnoop', 'worm',
    )),
    ('U2R', (
        'buffer_overflow', 'loadmodule', 'perl', 'rootkit',
        'httptunnel', 'ps', 'sqlattack', 'xterm',
    )),
)

# Flat lookup: raw attack name -> category name.
ATTACK_MAP = {
    attack: category
    for category, attacks in _ATTACKS_BY_CATEGORY
    for attack in attacks
}
# The 41 NSL-KDD feature columns, in the order they are fed to the model.
FEATURE_NAMES = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
]

# Columns of FEATURE_NAMES that hold string categories and must be
# label-encoded before scaling.
CATEGORICAL_COLS = [
    'protocol_type',
    'service',
    'flag',
]

# Class label mapping for 5-class classification.
CLASS_LABELS = [
    'Normal',
    'DoS',
    'Probe',
    'R2L',
    'U2R',
]
def load_nsl_kdd():
    """Fetch NSL-KDD from the HuggingFace Hub as two pandas DataFrames.

    Returns:
        (df_train, df_test): the 'train' and 'test' splits of the
        "Mireu-Lab/NSL-KDD" dataset, each converted with ``to_pandas()``.
    """
    print("Loading NSL-KDD dataset from HuggingFace Hub...")
    splits = load_dataset("Mireu-Lab/NSL-KDD")

    # Convert train first, then test, before printing the sizes.
    frames = {name: splits[name].to_pandas() for name in ('train', 'test')}
    for title, name in (('Train', 'train'), ('Test', 'test')):
        print(f"{title}: {len(frames[name])} samples")

    return frames['train'], frames['test']
def analyze_dataset(df_train, df_test):
    """Print a textual summary of both splits for documentation purposes.

    The report covers split sizes, the distribution of the 'class' column,
    the cardinality of every categorical feature (with a warning for values
    that only occur in the test split) and the min/max of the first ten
    numeric features.

    Args:
        df_train: Training DataFrame
        df_test: Test DataFrame
    Returns:
        (train_dist, test_dist): ``value_counts()`` of the 'class' column
        for the training and the test split.
    """
    def _report_classes(split_name, frame):
        # Print one class-distribution section and hand back the raw counts.
        print(f"\n--- Class Distribution ({split_name}) ---")
        counts = frame['class'].value_counts()
        total = len(frame)
        for label, n in counts.items():
            share = 100 * n / total
            print(f" {label:10s}: {n:6d} ({share:.1f}%)")
        return counts

    rule = "=" * 60
    print("\n" + rule)
    print("DATASET ANALYSIS")
    print(rule)
    print(f"\nTraining set: {len(df_train)} samples")
    print(f"Test set: {len(df_test)} samples")

    train_dist = _report_classes("Training", df_train)
    test_dist = _report_classes("Test", df_test)

    print("\n--- Categorical Features ---")
    for col in CATEGORICAL_COLS:
        train_col = df_train[col]
        test_col = df_test[col]
        print(f" {col:15s}: {train_col.nunique()} train / {test_col.nunique()} test unique values")
        # Values present only in the test split cannot be learned by an
        # encoder fitted on the training split, so call them out.
        unseen = set(test_col.unique()) - set(train_col.unique())
        if unseen:
            print(f" Warning: {len(unseen)} unseen test values: {unseen}")

    print("\n--- Feature Ranges (numeric) ---")
    numeric_cols = [name for name in FEATURE_NAMES if name not in CATEGORICAL_COLS]
    for col in numeric_cols[:10]:
        series = df_train[col]
        print(f" {col:35s}: [{series.min():.2f}, {series.max():.2f}]")
    remaining = len(numeric_cols) - 10
    print(f" ... and {remaining} more numeric features")

    return train_dist, test_dist
def _encode_target(labels, binary):
    """Convert raw 'class' labels into integer targets aligned with class_names.

    Labels are compared case-insensitively after stripping whitespace.

    Args:
        labels: Sequence/Series of raw string labels.
        binary: If True, 'normal' -> 1 and every other label -> 0.
            If False, labels are mapped to CLASS_LABELS through ATTACK_MAP
            (labels that already are category names are accepted too).
    Returns:
        (y, class_names): int64 array and the list such that
        ``class_names[y[i]]`` is the class of sample i.
    Raises:
        ValueError: in 5-class mode, if a label cannot be mapped to one of
            CLASS_LABELS (e.g. the data only carries 'normal'/'anomaly').
    """
    normalized = pd.Series(labels).astype(str).str.strip().str.lower()

    if binary:
        # Same encoding an alphabetically sorted encoder produces for
        # {'anomaly', 'normal'}: anomaly=0, normal=1.
        y = (normalized == 'normal').to_numpy().astype(np.int64)
        return y, ['anomaly', 'normal']

    category_by_lower = {name.lower(): name for name in CLASS_LABELS}
    index_of = {name: idx for idx, name in enumerate(CLASS_LABELS)}

    lookup = {}
    unknown = []
    for raw in normalized.unique():
        category = ATTACK_MAP.get(raw) or category_by_lower.get(raw)
        if category is None:
            unknown.append(raw)
        else:
            lookup[raw] = index_of[category]
    if unknown:
        raise ValueError(
            f"Cannot map labels {sorted(unknown)} to the 5 classes {CLASS_LABELS}; "
            "5-class mode requires attack-type labels listed in ATTACK_MAP"
        )

    y = normalized.map(lookup).to_numpy(dtype=np.int64)
    return y, list(CLASS_LABELS)


def preprocess(df_train, df_test, binary=True):
    """
    Full preprocessing pipeline.

    Encodes the 'class' column into integer targets, label-encodes the
    categorical feature columns, extracts FEATURE_NAMES as float32 and
    min-max scales them. Every encoder and the scaler are fitted on the
    training split only, so scaled test values are not guaranteed to lie
    inside the training range.

    Args:
        df_train: Training DataFrame
        df_test: Test DataFrame
        binary: If True, binary classification (normal vs anomaly);
            otherwise 5-class classification in CLASS_LABELS order.
    Returns:
        X_train, X_test, y_train, y_test, label_encoders, scaler, class_names

        ``class_names[k]`` is the name of integer target ``k`` in both
        modes; ``label_encoders`` maps each categorical column to its
        fitted LabelEncoder.
    Raises:
        ValueError: in 5-class mode, if the 'class' column contains labels
            that cannot be mapped to CLASS_LABELS.
    """
    print(f"\nPreprocessing ({'binary' if binary else '5-class'} classification)...")
    df_tr = df_train.copy()
    df_te = df_test.copy()

    # --- Encode target ---
    # Both splits go through the same fixed mapping, so attack types that
    # only occur in the test split cannot break the encoding.
    y_train, class_names = _encode_target(df_tr['class'], binary)
    y_test, _ = _encode_target(df_te['class'], binary)

    # --- Encode categorical features ---
    label_encoders = {}
    for col in CATEGORICAL_COLS:
        le = LabelEncoder()
        le.fit(df_tr[col])
        # Handle unseen test labels: the encoder cannot transform values it
        # was not fitted on, so fall back to its first class and report it.
        fallback = le.classes_[0]
        unseen = ~df_te[col].isin(le.classes_)
        if unseen.any():
            print(f" Replaced {int(unseen.sum())} unseen {col} test values with {fallback!r}")
            df_te[col] = df_te[col].mask(unseen, fallback)
        df_tr[col] = le.transform(df_tr[col])
        df_te[col] = le.transform(df_te[col])
        label_encoders[col] = le
        print(f" Encoded {col}: {len(le.classes_)} categories")

    # --- Extract features ---
    X_train = df_tr[FEATURE_NAMES].values.astype(np.float32)
    X_test = df_te[FEATURE_NAMES].values.astype(np.float32)

    # --- Scale features ---
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    n_classes = len(class_names)
    print(f" X_train shape: {X_train.shape}")
    print(f" X_test shape: {X_test.shape}")
    # minlength keeps one slot per class even when a class is absent.
    print(f" y_train distribution: {np.bincount(y_train, minlength=n_classes)}")
    print(f" y_test distribution: {np.bincount(y_test, minlength=n_classes)}")
    return X_train, X_test, y_train, y_test, label_encoders, scaler, class_names
def save_preprocessed(X_train, X_test, y_train, y_test, label_encoders, scaler,
                      class_names, output_dir='data/processed'):
    """Write the preprocessed arrays, fitted objects and metadata to disk.

    Creates ``output_dir`` if needed and writes four ``.npy`` arrays,
    ``encoders.pkl`` (label encoders + scaler) and ``metadata.json``.

    Args:
        X_train, X_test: Feature arrays.
        y_train, y_test: Target arrays.
        label_encoders: Per-column encoders, pickled as-is.
        scaler: Fitted scaler, pickled as-is.
        class_names: Class names stored in the metadata (JSON-serialisable).
        output_dir: Destination directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _target(filename):
        # Absolute/relative location of one artefact inside output_dir.
        return os.path.join(output_dir, filename)

    arrays = (
        ('X_train.npy', X_train),
        ('X_test.npy', X_test),
        ('y_train.npy', y_train),
        ('y_test.npy', y_test),
    )
    for filename, array in arrays:
        np.save(_target(filename), array)

    with open(_target('encoders.pkl'), 'wb') as handle:
        fitted = {'label_encoders': label_encoders, 'scaler': scaler}
        pickle.dump(fitted, handle)

    with open(_target('metadata.json'), 'w') as handle:
        metadata = {
            'feature_names': FEATURE_NAMES,
            'categorical_cols': CATEGORICAL_COLS,
            'class_names': class_names,
            'n_train': len(X_train),
            'n_test': len(X_test),
            'n_features': X_train.shape[1],
            'seed': SEED,
        }
        json.dump(metadata, handle, indent=2)

    print(f"\nSaved preprocessed data to {output_dir}/")
def load_preprocessed(data_dir='data/processed'):
    """Read back the artefacts of a preprocessing run from ``data_dir``.

    Args:
        data_dir: Directory holding the four ``.npy`` arrays,
            ``encoders.pkl`` and ``metadata.json``.
    Returns:
        X_train, X_test, y_train, y_test, label_encoders, scaler, meta
        where ``meta`` is the parsed content of ``metadata.json``.
    """
    def _array(stem):
        # Load '<stem>.npy' from data_dir.
        return np.load(os.path.join(data_dir, f'{stem}.npy'))

    X_train, X_test, y_train, y_test = (
        _array(stem) for stem in ('X_train', 'X_test', 'y_train', 'y_test')
    )

    # NOTE: unpickling can execute arbitrary code; only point data_dir at
    # files produced by a trusted run of this pipeline.
    with open(os.path.join(data_dir, 'encoders.pkl'), 'rb') as handle:
        fitted = pickle.load(handle)

    with open(os.path.join(data_dir, 'metadata.json')) as handle:
        meta = json.load(handle)

    return (X_train, X_test, y_train, y_test,
            fitted['label_encoders'], fitted['scaler'], meta)
if __name__ == '__main__':
    # Script entry point: download the data, print the analysis report,
    # run the binary preprocessing and persist the result to the default
    # output directory.
    train_df, test_df = load_nsl_kdd()
    analyze_dataset(train_df, test_df)

    # preprocess() returns its values in exactly the positional order that
    # save_preprocessed() expects, so the tuple can be forwarded unchanged.
    artefacts = preprocess(train_df, test_df, binary=True)
    save_preprocessed(*artefacts)

    print("\nPreprocessing complete!")