Add data preprocessing pipeline

97c54b3 verified about 1 month ago

8.45 kB

	"""
	Data preprocessing pipeline for NSL-KDD dataset.
	Handles loading, dataset analysis, label/feature encoding, scaling, and
	saving/reloading; the dataset's own train/test split is used as-is.
	"""

	import os
	import sys
	import numpy as np
	import pandas as pd
	from sklearn.preprocessing import LabelEncoder, MinMaxScaler
	from datasets import load_dataset
	import pickle
	import json

	# Seed NumPy's global RNG once at import time so repeated runs match.
	SEED = 42
	np.random.seed(SEED)

	# NSL-KDD attack type -> category lookup, assembled one category at a time.
	# The groups are merged in order, so key order is Normal, DoS, Probe, R2L, U2R.
	ATTACK_MAP = {
	    **dict.fromkeys(["normal"], "Normal"),
	    **dict.fromkeys(
	        [
	            "back", "land", "neptune", "pod", "smurf", "teardrop",
	            "mailbomb", "apache2", "processtable", "udpstorm",
	        ],
	        "DoS",
	    ),
	    **dict.fromkeys(
	        ["ipsweep", "nmap", "portsweep", "satan", "mscan", "saint"],
	        "Probe",
	    ),
	    **dict.fromkeys(
	        [
	            "ftp_write", "guess_passwd", "imap", "multihop", "phf", "spy",
	            "warezclient", "warezmaster", "sendmail", "named",
	            "snmpgetattack", "snmpguess", "xlock", "xsnoop", "worm",
	        ],
	        "R2L",
	    ),
	    **dict.fromkeys(
	        [
	            "buffer_overflow", "loadmodule", "perl", "rootkit",
	            "httptunnel", "ps", "sqlattack", "xterm",
	        ],
	        "U2R",
	    ),
	}

	# The 41 NSL-KDD feature columns, in the order used to build the feature matrix.
	FEATURE_NAMES = [
	    "duration",
	    "protocol_type",
	    "service",
	    "flag",
	    "src_bytes",
	    "dst_bytes",
	    "land",
	    "wrong_fragment",
	    "urgent",
	    "hot",
	    "num_failed_logins",
	    "logged_in",
	    "num_compromised",
	    "root_shell",
	    "su_attempted",
	    "num_root",
	    "num_file_creations",
	    "num_shells",
	    "num_access_files",
	    "num_outbound_cmds",
	    "is_host_login",
	    "is_guest_login",
	    "count",
	    "srv_count",
	    "serror_rate",
	    "srv_serror_rate",
	    "rerror_rate",
	    "srv_rerror_rate",
	    "same_srv_rate",
	    "diff_srv_rate",
	    "srv_diff_host_rate",
	    "dst_host_count",
	    "dst_host_srv_count",
	    "dst_host_same_srv_rate",
	    "dst_host_diff_srv_rate",
	    "dst_host_same_src_port_rate",
	    "dst_host_srv_diff_host_rate",
	    "dst_host_serror_rate",
	    "dst_host_srv_serror_rate",
	    "dst_host_rerror_rate",
	    "dst_host_srv_rerror_rate",
	]

	# Feature columns holding string categories rather than numbers.
	CATEGORICAL_COLS = ["protocol_type", "service", "flag"]

	# Category names for the 5-class task.
	CLASS_LABELS = ["Normal", "DoS", "Probe", "R2L", "U2R"]


	def load_nsl_kdd():
	    """Fetch the NSL-KDD train and test splits from the HuggingFace Hub.

	    Returns:
	        Tuple ``(df_train, df_test)`` of pandas DataFrames, one per split.
	    """
	    print("Loading NSL-KDD dataset from HuggingFace Hub...")
	    splits = load_dataset("Mireu-Lab/NSL-KDD")

	    # Convert both splits (train first, then test) before reporting sizes.
	    df_train, df_test = (
	        splits[split_name].to_pandas() for split_name in ('train', 'test')
	    )

	    print(f"Train: {len(df_train)} samples")
	    print(f"Test: {len(df_test)} samples")
	    return df_train, df_test


	def analyze_dataset(df_train, df_test):
	    """Print a summary of both splits and return their class counts.

	    Reports split sizes, the per-class distribution of the ``class`` column,
	    the number of distinct values of each categorical feature (flagging test
	    values never seen in training), and the training min/max of the first ten
	    numeric features.

	    Args:
	        df_train: Training DataFrame.
	        df_test: Test DataFrame.

	    Returns:
	        Tuple ``(train_dist, test_dist)`` holding the ``value_counts()`` of the
	        ``class`` column for each split.
	    """
	    rule = "="*60
	    print("\n" + rule)
	    print("DATASET ANALYSIS")
	    print(rule)

	    print(f"\nTraining set: {len(df_train)} samples")
	    print(f"Test set: {len(df_test)} samples")

	    def report_class_distribution(heading, frame):
	        # Print one "label: count (percent)" row per class and hand back the counts.
	        print(heading)
	        dist = frame['class'].value_counts()
	        for cls, count in dist.items():
	            pct = 100 * count / len(frame)
	            print(f" {cls:10s}: {count:6d} ({pct:.1f}%)")
	        return dist

	    train_dist = report_class_distribution(
	        "\n--- Class Distribution (Training) ---", df_train
	    )
	    test_dist = report_class_distribution(
	        "\n--- Class Distribution (Test) ---", df_test
	    )

	    print("\n--- Categorical Features ---")
	    for col in CATEGORICAL_COLS:
	        train_column = df_train[col]
	        test_column = df_test[col]
	        n_train = train_column.nunique()
	        n_test = test_column.nunique()
	        print(f" {col:15s}: {n_train} train / {n_test} test unique values")

	        # Categories that occur in the test split but never in training.
	        unseen = set(test_column.unique()).difference(train_column.unique())
	        if unseen:
	            print(f" Warning: {len(unseen)} unseen test values: {unseen}")

	    print("\n--- Feature Ranges (numeric) ---")
	    numeric_cols = [c for c in FEATURE_NAMES if c not in CATEGORICAL_COLS]
	    # Only the first ten ranges are listed; the remainder is summarised in one line.
	    for col in numeric_cols[:10]:
	        print(f" {col:35s}: [{df_train[col].min():.2f}, {df_train[col].max():.2f}]")
	    print(f" ... and {len(numeric_cols)-10} more numeric features")

	    return train_dist, test_dist


	def _encode_multiclass_target(labels):
	    """Encode raw NSL-KDD labels as indices into ``CLASS_LABELS``.

	    Each label is first translated through ``ATTACK_MAP`` (attack name ->
	    category); labels that already are category names pass through unchanged.

	    Args:
	        labels: pandas Series of raw class labels.

	    Returns:
	        int64 NumPy array in which value ``i`` stands for ``CLASS_LABELS[i]``.

	    Raises:
	        ValueError: If a label is neither an ``ATTACK_MAP`` key nor a
	            ``CLASS_LABELS`` entry.
	    """
	    categories = labels.map(lambda raw: ATTACK_MAP.get(raw, raw))

	    unmapped = sorted(set(categories.unique()) - set(CLASS_LABELS), key=str)
	    if unmapped:
	        raise ValueError(
	            "5-class mode needs attack-type labels covered by ATTACK_MAP; "
	            f"unmapped labels: {unmapped}"
	        )

	    # Fixed name -> index table so the encoding never depends on which
	    # categories happen to be present in a given split.
	    index_of = {name: idx for idx, name in enumerate(CLASS_LABELS)}
	    return categories.map(index_of).to_numpy(dtype=np.int64)


	def preprocess(df_train, df_test, binary=True):
	    """
	    Full preprocessing pipeline.

	    Encodes the ``class`` column, label-encodes the categorical features,
	    extracts the ``FEATURE_NAMES`` columns as float32 and min-max scales them.
	    All encoders and the scaler are fitted on the training split only. The
	    input DataFrames are not modified.

	    Args:
	        df_train: Training DataFrame
	        df_test: Test DataFrame
	        binary: If True, binary classification (normal vs anomaly) using the
	            labels exactly as found in the ``class`` column. If False, labels
	            are mapped through ``ATTACK_MAP`` to the five ``CLASS_LABELS``
	            categories.

	    Returns:
	        X_train, X_test, y_train, y_test, label_encoders, scaler, class_names

	        ``class_names[i]`` is the name of the class encoded as ``i``.

	    Raises:
	        ValueError: In 5-class mode, if a label cannot be mapped to one of
	            ``CLASS_LABELS``.
	    """
	    print(f"\nPreprocessing ({'binary' if binary else '5-class'} classification)...")

	    df_tr = df_train.copy()
	    df_te = df_test.copy()

	    # --- Encode target ---
	    if binary:
	        le_y = LabelEncoder()
	        y_train = le_y.fit_transform(df_tr['class'].values)
	        y_test = le_y.transform(df_te['class'].values)
	        # Take the names from the fitted encoder so they always line up with
	        # the integer codes (['anomaly', 'normal'] for anomaly/normal labels).
	        class_names = [str(name) for name in le_y.classes_]
	    else:
	        # A LabelEncoder is not used here: fitting it would replace any preset
	        # class order with the sorted raw labels, and transforming the test
	        # labels would fail on attack names that never occur in training.
	        y_train = _encode_multiclass_target(df_tr['class'])
	        y_test = _encode_multiclass_target(df_te['class'])
	        class_names = list(CLASS_LABELS)  # copy: callers must not alias the constant

	    # --- Encode categorical features ---
	    label_encoders = {}
	    for col in CATEGORICAL_COLS:
	        le = LabelEncoder()
	        le.fit(df_tr[col])

	        # Handle unseen test labels: replace them with the first training
	        # category so that transform() below cannot fail on them.
	        known = set(le.classes_)
	        fallback = le.classes_[0]
	        df_te[col] = df_te[col].apply(
	            lambda value: value if value in known else fallback
	        )

	        df_tr[col] = le.transform(df_tr[col])
	        df_te[col] = le.transform(df_te[col])
	        label_encoders[col] = le
	        print(f" Encoded {col}: {len(le.classes_)} categories")

	    # --- Extract features ---
	    X_train = df_tr[FEATURE_NAMES].values.astype(np.float32)
	    X_test = df_te[FEATURE_NAMES].values.astype(np.float32)

	    # --- Scale features ---
	    # Min/max come from the training split only, so test values outside the
	    # training range are scaled outside [0, 1] (the scaler does not clip).
	    scaler = MinMaxScaler()
	    X_train = scaler.fit_transform(X_train)
	    X_test = scaler.transform(X_test)

	    print(f" X_train shape: {X_train.shape}")
	    print(f" X_test shape: {X_test.shape}")
	    print(f" y_train distribution: {np.bincount(y_train)}")
	    print(f" y_test distribution: {np.bincount(y_test)}")

	    return X_train, X_test, y_train, y_test, label_encoders, scaler, class_names


	def save_preprocessed(X_train, X_test, y_train, y_test, label_encoders, scaler,
	                      class_names, output_dir='data/processed'):
	    """Write the preprocessed arrays, fitted transformers and metadata to disk.

	    Creates ``output_dir`` if needed and writes four ``.npy`` arrays,
	    ``encoders.pkl`` (label encoders and scaler) and ``metadata.json``.

	    Args:
	        X_train, X_test: Feature matrices.
	        y_train, y_test: Encoded targets.
	        label_encoders: Fitted encoders for the categorical columns.
	        scaler: Fitted feature scaler.
	        class_names: Class names stored in the metadata file.
	        output_dir: Destination directory.
	    """
	    os.makedirs(output_dir, exist_ok=True)

	    named_arrays = (
	        ('X_train.npy', X_train),
	        ('X_test.npy', X_test),
	        ('y_train.npy', y_train),
	        ('y_test.npy', y_test),
	    )
	    for filename, array in named_arrays:
	        np.save(os.path.join(output_dir, filename), array)

	    # The fitted transformers are pickled together in a single file.
	    fitted = {'label_encoders': label_encoders, 'scaler': scaler}
	    with open(os.path.join(output_dir, 'encoders.pkl'), 'wb') as f:
	        pickle.dump(fitted, f)

	    with open(os.path.join(output_dir, 'metadata.json'), 'w') as f:
	        metadata = {
	            'feature_names': FEATURE_NAMES,
	            'categorical_cols': CATEGORICAL_COLS,
	            'class_names': class_names,
	            'n_train': len(X_train),
	            'n_test': len(X_test),
	            'n_features': X_train.shape[1],
	            'seed': SEED,
	        }
	        json.dump(metadata, f, indent=2)

	    print(f"\nSaved preprocessed data to {output_dir}/")


	def load_preprocessed(data_dir='data/processed'):
	    """Read back the files written by the preprocessing step.

	    Args:
	        data_dir: Directory containing the ``.npy`` arrays, ``encoders.pkl``
	            and ``metadata.json``.

	    Returns:
	        X_train, X_test, y_train, y_test, label_encoders, scaler, meta
	    """
	    def in_data_dir(filename):
	        return os.path.join(data_dir, filename)

	    # Arrays are read in the same order as they are returned.
	    X_train, X_test, y_train, y_test = (
	        np.load(in_data_dir(filename))
	        for filename in ('X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy')
	    )

	    # NOTE: unpickling can execute arbitrary code; only load trusted directories.
	    with open(in_data_dir('encoders.pkl'), 'rb') as f:
	        objs = pickle.load(f)

	    with open(in_data_dir('metadata.json')) as f:
	        meta = json.load(f)

	    return X_train, X_test, y_train, y_test, objs['label_encoders'], objs['scaler'], meta


	if __name__ == '__main__':
	    # Script entry point: fetch, summarise, preprocess (binary labels), persist.
	    train_frame, test_frame = load_nsl_kdd()
	    analyze_dataset(train_frame, test_frame)

	    # preprocess() returns its results in the positional order that
	    # save_preprocessed() expects, so the tuple is forwarded as-is.
	    artifacts = preprocess(train_frame, test_frame, binary=True)
	    save_preprocessed(*artifacts)

	    print("\nPreprocessing complete!")