{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "7ctVNsZvi8OE"
   },
   "source": [
    "# 🛡️ SentinelNet — AI-Powered Network Intrusion Detection System (NIDS)\n",
    "\n",
    "**Dataset:** NSL-KDD  \n",
    "**Goal:** Classify network traffic as Normal / DoS / Probe / R2L / U2R using machine learning\n",
    "\n",
    "---\n",
    "\n",
    "## Project Modules\n",
    "| Week | Module |\n",
    "|------|--------|\n",
    "| 1 | Dataset Acquisition & Exploration |\n",
    "| 2 | Data Cleaning & Preprocessing |\n",
    "| 3 | Feature Engineering & Selection |\n",
    "| 4 | Supervised Model Training |\n",
    "| 5 | Anomaly Detection (Unsupervised) |\n",
    "| 6 | Model Evaluation & Fine-Tuning |\n",
    "| 7 | Alert Generation & Logging |\n",
    "| 8 | Documentation & Summary |"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "E5lqsSr3i8OI"
   },
   "source": [
    "---\n",
    "## 📦 Install Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "sh1Jsy_9i8OK"
   },
   "outputs": [],
   "source": [
    "# Install required packages (run once).\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel,\n",
    "# not into whichever `pip` executable happens to be first on PATH.\n",
    "%pip install xgboost imbalanced-learn --quiet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Rip8w4_Vi8OL"
   },
   "source": [
    "---\n",
    "## 🔧 Global Imports & Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "JrzAutU4i8OM",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "e28d3e59-b702-45b8-f20c-4e5f68642a55"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "XGBoost available.\n",
      "ā All libraries imported and directories created.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "import joblib\n",
    "import warnings\n",
    "import requests\n",
    "from datetime import datetime\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.preprocessing import (\n",
    "    LabelEncoder, StandardScaler, RobustScaler,\n",
    "    OneHotEncoder, label_binarize\n",
    ")\n",
    "from sklearn.model_selection import (\n",
    "    train_test_split, GridSearchCV,\n",
    "    StratifiedKFold, cross_val_score\n",
    ")\n",
    "from sklearn.ensemble import (\n",
    "    RandomForestClassifier, GradientBoostingClassifier, IsolationForest\n",
    ")\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.neighbors import LocalOutlierFactor\n",
    "from sklearn.svm import OneClassSVM\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.metrics import (\n",
    "    classification_report, confusion_matrix,\n",
    "    accuracy_score, f1_score,\n",
    "    precision_score, recall_score,\n",
    "    roc_curve, auc\n",
    ")\n",
    "from imblearn.over_sampling import SMOTE\n",
    "\n",
    "# XGBoost is optional: XGBOOST_AVAILABLE records whether the import worked.\n",
    "try:\n",
    "    from xgboost import XGBClassifier\n",
    "    XGBOOST_AVAILABLE = True\n",
    "    print('XGBoost available.')\n",
    "except ImportError:\n",
    "    XGBOOST_AVAILABLE = False\n",
    "    print('XGBoost not installed ā will skip.')\n",
    "\n",
    "# NOTE: with no category/module filter this silences every warning for the\n",
    "# rest of the session, deprecation and convergence warnings included.\n",
    "warnings.filterwarnings('ignore')\n",
    "plt.style.use('ggplot')\n",
    "sns.set_palette('husl')\n",
    "\n",
    "# ── Reproducibility ──\n",
    "# Seeds NumPy's global random generator.\n",
    "RANDOM_STATE = 42\n",
    "np.random.seed(RANDOM_STATE)\n",
    "\n",
    "# ── Directories ──\n",
    "# All paths are relative to the notebook's working directory.\n",
    "DATA_DIR = 'data'\n",
    "MODELS_DIR = 'models'\n",
    "ALERTS_DIR = 'alerts'\n",
    "PROC_DIR = os.path.join(DATA_DIR, 'processed')\n",
    "\n",
    "# exist_ok=True keeps this cell safe to re-run.\n",
    "for d in [DATA_DIR, MODELS_DIR, ALERTS_DIR, PROC_DIR]:\n",
    "    os.makedirs(d, exist_ok=True)\n",
    "\n",
    "print('ā All libraries imported and directories created.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "rC3swRnSi8OM"
   },
   "source": [
    "---\n",
    "## š NSL-KDD Column Schema & Attack Mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "62mlko4gi8ON",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "517f0699-f8fb-4aaf-954d-1690efada188"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "ā Schema and mappings defined.\n"
     ]
    }
   ],
   "source": [
    "# ── Column names for the NSL-KDD 
dataset\n",
    "# One name per field of a record, in file order; the last two entries are the\n",
    "# attack label and the difficulty score rather than traffic features.\n",
    "COLUMNS = \"\"\"\n",
    "    duration protocol_type service flag src_bytes dst_bytes\n",
    "    land wrong_fragment urgent hot num_failed_logins\n",
    "    logged_in num_compromised root_shell su_attempted num_root\n",
    "    num_file_creations num_shells num_access_files num_outbound_cmds\n",
    "    is_host_login is_guest_login count srv_count serror_rate\n",
    "    srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate\n",
    "    diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count\n",
    "    dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate\n",
    "    dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate\n",
    "    dst_host_rerror_rate dst_host_srv_rerror_rate label difficulty_level\n",
    "\"\"\".split()\n",
    "\n",
    "# ── Attack → category mapping ──\n",
    "# Listed category-first so the members of each family sit together, then\n",
    "# inverted into the attack-name -> category lookup used for labelling.\n",
    "_ATTACKS_BY_CATEGORY = {\n",
    "    'normal': ['normal'],\n",
    "    'DoS': [\n",
    "        'back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',\n",
    "        'mailbomb', 'apache2', 'processtable', 'udpstorm',\n",
    "    ],\n",
    "    'Probe': [\n",
    "        'satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint',\n",
    "    ],\n",
    "    'R2L': [\n",
    "        'guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop',\n",
    "        'warezmaster', 'warezclient', 'spy', 'xlock', 'xsnoop',\n",
    "        'snmpguess', 'snmpgetattack', 'httptunnel', 'sendmail', 'named',\n",
    "    ],\n",
    "    'U2R': [\n",
    "        'buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps',\n",
    "        'xterm', 'sqlattack',\n",
    "    ],\n",
    "}\n",
    "ATTACK_MAPPING = {\n",
    "    attack: category\n",
    "    for category, attacks in _ATTACKS_BY_CATEGORY.items()\n",
    "    for attack in attacks\n",
    "}\n",
    "\n",
    "# ── Severity levels for alert generation ──\n",
    "SEVERITY_MAP 
= {\n",
    "    'normal': 'None',\n",
    "    'DoS': 'Critical',\n",
    "    'Probe': 'Medium',\n",
    "    'R2L': 'High',\n",
    "    'U2R': 'Critical',\n",
    "}\n",
    "# Display colour (hex) for each severity level above.\n",
    "SEVERITY_COLOR = {\n",
    "    'None': '#4CAF50',\n",
    "    'Medium': '#FF9800',\n",
    "    'High': '#F44336',\n",
    "    'Critical': '#9C27B0',\n",
    "}\n",
    "\n",
    "print('ā Schema and mappings defined.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nMBiXy_Ci8OO"
   },
   "source": [
    "---\n",
    "# š Week 1 — Dataset Acquisition & Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "i-86VAt9i8OO",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "cbc41910-ab6b-459f-92d5-71deba868a55"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Downloading NSL-KDD dataset...\n",
      " ⬠Downloading KDDTrain+.txt...\n",
      " ā Saved ā data/KDDTrain+.txt\n",
      " ⬠Downloading KDDTest+.txt...\n",
      " ā Saved ā data/KDDTest+.txt\n",
      "Done.\n"
     ]
    }
   ],
   "source": [
    "# ── Download NSL-KDD dataset ──\n",
    "# Target file name (inside DATA_DIR) -> source URL.\n",
    "URLS = {\n",
    "    'KDDTrain+.txt': 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt',\n",
    "    'KDDTest+.txt' : 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest+.txt'\n",
    "}\n",
    "\n",
    "def download_file(url, filename, timeout=30):\n",
    "    \"\"\"Download `url` to DATA_DIR/`filename` unless that file already exists.\n",
    "\n",
    "    The body is streamed into a temporary `.part` file that is renamed to\n",
    "    the final name only after the transfer finished, so an interrupted\n",
    "    download never leaves a truncated file that the next run would report\n",
    "    as already present. Failures are printed, not raised.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the file to fetch.\n",
    "        filename: Name of the target file inside DATA_DIR.\n",
    "        timeout: Seconds to wait on the server before giving up.\n",
    "\n",
    "    Returns:\n",
    "        None.\n",
    "    \"\"\"\n",
    "    path = os.path.join(DATA_DIR, filename)\n",
    "    if os.path.exists(path):\n",
    "        print(f' ā Already exists: {filename}')\n",
    "        return\n",
    "    print(f' ⬠Downloading {filename}...')\n",
    "    tmp_path = path + '.part'\n",
    "    try:\n",
    "        # The context manager releases the streamed connection on every exit path.\n",
    "        with requests.get(url, stream=True, timeout=timeout) as r:\n",
    "            r.raise_for_status()\n",
    "            with open(tmp_path, 'wb') as f:\n",
    "                for chunk in r.iter_content(chunk_size=8192):\n",
    "                    f.write(chunk)\n",
    "        # Publish the file under its real name only once it is complete.\n",
    "        os.replace(tmp_path, path)\n",
    "        print(f' ā Saved ā {path}')\n",
    "    except Exception as e:\n",
    "        # Best effort by design: report the problem and keep the notebook running,\n",
    "        # but do not leave a partial download behind.\n",
    "        if os.path.exists(tmp_path):\n",
    "            os.remove(tmp_path)\n",
    "        print(f' ā Failed: {e}')\n",
    "\n",
    "print('Downloading NSL-KDD dataset...')\n",
    "for fname, url in URLS.items():\n",
    "    download_file(url, fname)\n",
    "print('Done.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5kJtQiG5i8OP",
    "colab": {
     "base_uri": 
"https://localhost:8080/", "height": 290 }, "outputId": "985558bb-5630-44dd-9cad-36a33247b54c" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Train shape : (125973, 43)\n", "Test shape : (22544, 43)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " duration protocol_type service flag src_bytes dst_bytes land \\\n", "0 0 tcp ftp_data SF 491 0 0 \n", "1 0 udp other SF 146 0 0 \n", "2 0 tcp private S0 0 0 0 \n", "3 0 tcp http SF 232 8153 0 \n", "4 0 tcp http SF 199 420 0 \n", "\n", " wrong_fragment urgent hot ... dst_host_same_srv_rate \\\n", "0 0 0 0 ... 0.17 \n", "1 0 0 0 ... 0.00 \n", "2 0 0 0 ... 0.10 \n", "3 0 0 0 ... 1.00 \n", "4 0 0 0 ... 1.00 \n", "\n", " dst_host_diff_srv_rate dst_host_same_src_port_rate \\\n", "0 0.03 0.17 \n", "1 0.60 0.88 \n", "2 0.05 0.00 \n", "3 0.00 0.03 \n", "4 0.00 0.00 \n", "\n", " dst_host_srv_diff_host_rate dst_host_serror_rate \\\n", "0 0.00 0.00 \n", "1 0.00 0.00 \n", "2 0.00 1.00 \n", "3 0.04 0.03 \n", "4 0.00 0.00 \n", "\n", " dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate \\\n", "0 0.00 0.05 0.00 \n", "1 0.00 0.00 0.00 \n", "2 1.00 0.00 0.00 \n", "3 0.01 0.00 0.01 \n", "4 0.00 0.00 0.00 \n", "\n", " label difficulty_level \n", "0 normal 20 \n", "1 normal 15 \n", "2 neptune 19 \n", "3 normal 21 \n", "4 normal 21 \n", "\n", "[5 rows x 43 columns]" ], "text/html": [ "\n", "
| \n", " | duration | \n", "protocol_type | \n", "service | \n", "flag | \n", "src_bytes | \n", "dst_bytes | \n", "land | \n", "wrong_fragment | \n", "urgent | \n", "hot | \n", "... | \n", "dst_host_same_srv_rate | \n", "dst_host_diff_srv_rate | \n", "dst_host_same_src_port_rate | \n", "dst_host_srv_diff_host_rate | \n", "dst_host_serror_rate | \n", "dst_host_srv_serror_rate | \n", "dst_host_rerror_rate | \n", "dst_host_srv_rerror_rate | \n", "label | \n", "difficulty_level | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0 | \n", "tcp | \n", "ftp_data | \n", "SF | \n", "491 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0.17 | \n", "0.03 | \n", "0.17 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.05 | \n", "0.00 | \n", "normal | \n", "20 | \n", "
| 1 | \n", "0 | \n", "udp | \n", "other | \n", "SF | \n", "146 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0.00 | \n", "0.60 | \n", "0.88 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "normal | \n", "15 | \n", "
| 2 | \n", "0 | \n", "tcp | \n", "private | \n", "S0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0.10 | \n", "0.05 | \n", "0.00 | \n", "0.00 | \n", "1.00 | \n", "1.00 | \n", "0.00 | \n", "0.00 | \n", "neptune | \n", "19 | \n", "
| 3 | \n", "0 | \n", "tcp | \n", "http | \n", "SF | \n", "232 | \n", "8153 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "1.00 | \n", "0.00 | \n", "0.03 | \n", "0.04 | \n", "0.03 | \n", "0.01 | \n", "0.00 | \n", "0.01 | \n", "normal | \n", "21 | \n", "
| 4 | \n", "0 | \n", "tcp | \n", "http | \n", "SF | \n", "199 | \n", "420 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "1.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "normal | \n", "21 | \n", "
5 rows × 43 columns
\n", "| \n", " | duration | \n", "src_bytes | \n", "dst_bytes | \n", "land | \n", "wrong_fragment | \n", "urgent | \n", "hot | \n", "num_failed_logins | \n", "logged_in | \n", "num_compromised | \n", "... | \n", "dst_host_srv_count | \n", "dst_host_same_srv_rate | \n", "dst_host_diff_srv_rate | \n", "dst_host_same_src_port_rate | \n", "dst_host_srv_diff_host_rate | \n", "dst_host_serror_rate | \n", "dst_host_srv_serror_rate | \n", "dst_host_rerror_rate | \n", "dst_host_srv_rerror_rate | \n", "difficulty_level | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | \n", "125973.00000 | \n", "1.259730e+05 | \n", "1.259730e+05 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "... | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "125973.000000 | \n", "
| mean | \n", "287.14465 | \n", "4.556674e+04 | \n", "1.977911e+04 | \n", "0.000198 | \n", "0.022687 | \n", "0.000111 | \n", "0.204409 | \n", "0.001222 | \n", "0.395736 | \n", "0.279250 | \n", "... | \n", "115.653005 | \n", "0.521242 | \n", "0.082951 | \n", "0.148379 | \n", "0.032542 | \n", "0.284452 | \n", "0.278485 | \n", "0.118832 | \n", "0.120240 | \n", "19.504060 | \n", "
| std | \n", "2604.51531 | \n", "5.870331e+06 | \n", "4.021269e+06 | \n", "0.014086 | \n", "0.253530 | \n", "0.014366 | \n", "2.149968 | \n", "0.045239 | \n", "0.489010 | \n", "23.942042 | \n", "... | \n", "110.702741 | \n", "0.448949 | \n", "0.188922 | \n", "0.308997 | \n", "0.112564 | \n", "0.444784 | \n", "0.445669 | \n", "0.306557 | \n", "0.319459 | \n", "2.291503 | \n", "
| min | \n", "0.00000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
| 25% | \n", "0.00000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "10.000000 | \n", "0.050000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "18.000000 | \n", "
| 50% | \n", "0.00000 | \n", "4.400000e+01 | \n", "0.000000e+00 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "63.000000 | \n", "0.510000 | \n", "0.020000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "20.000000 | \n", "
| 75% | \n", "0.00000 | \n", "2.760000e+02 | \n", "5.160000e+02 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "0.000000 | \n", "... | \n", "255.000000 | \n", "1.000000 | \n", "0.070000 | \n", "0.060000 | \n", "0.020000 | \n", "1.000000 | \n", "1.000000 | \n", "0.000000 | \n", "0.000000 | \n", "21.000000 | \n", "
| max | \n", "42908.00000 | \n", "1.379964e+09 | \n", "1.309937e+09 | \n", "1.000000 | \n", "3.000000 | \n", "3.000000 | \n", "77.000000 | \n", "5.000000 | \n", "1.000000 | \n", "7479.000000 | \n", "... | \n", "255.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "21.000000 | \n", "
8 rows × 39 columns
\n", "