{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "C36kdei0JAGU",
        "outputId": "a3b9ca41-83ba-4246-ebd3-a88937443fd9"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/sklearn/feature_selection/_univariate_selection.py:111: UserWarning: Features [16] are constant.\n",
            "  warnings.warn(\"Features %s are constant.\" % constant_features_idx, UserWarning)\n",
            "/usr/local/lib/python3.12/dist-packages/sklearn/feature_selection/_univariate_selection.py:112: RuntimeWarning: invalid value encountered in divide\n",
            "  f = msb / msw\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "New shape after feature selection: (110596, 50)\n",
            "Epoch 1/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m13s\u001b[0m 5ms/step - accuracy: 0.9748 - loss: 0.0709 - val_accuracy: 0.7912 - val_loss: 0.7181\n",
            "Epoch 2/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9827 - loss: 0.0469 - val_accuracy: 0.7963 - val_loss: 0.8565\n",
            "Epoch 3/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - accuracy: 0.9843 - loss: 0.0415 - val_accuracy: 0.7947 - val_loss: 0.9044\n",
            "Epoch 4/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9858 - loss: 0.0377 - val_accuracy: 0.7976 - val_loss: 0.8448\n",
            "Epoch 5/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9863 - loss: 0.0361 - val_accuracy: 0.8339 - val_loss: 0.8099\n",
            "Epoch 6/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 3ms/step - accuracy: 0.9871 - loss: 0.0340 - val_accuracy: 0.8187 - val_loss: 0.8643\n",
            "Epoch 7/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - accuracy: 0.9875 - loss: 0.0331 - val_accuracy: 0.8238 - val_loss: 0.9187\n",
            "Epoch 8/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9881 - loss: 0.0326 - val_accuracy: 0.8306 - val_loss: 0.8933\n",
            "Epoch 9/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - accuracy: 0.9883 - loss: 0.0316 - val_accuracy: 0.8199 - val_loss: 0.8902\n",
            "Epoch 10/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9885 - loss: 0.0306 - val_accuracy: 0.8251 - val_loss: 0.9340\n",
            "Epoch 11/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9890 - loss: 0.0297 - val_accuracy: 0.8217 - val_loss: 1.0413\n",
            "Epoch 12/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9888 - loss: 0.0295 - val_accuracy: 0.7996 - val_loss: 1.2353\n",
            "Epoch 13/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9893 - loss: 0.0289 - val_accuracy: 0.8299 - val_loss: 1.0090\n",
            "Epoch 14/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - accuracy: 0.9893 - loss: 0.0279 - val_accuracy: 0.8273 - val_loss: 0.8989\n",
            "Epoch 15/15\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9896 - loss: 0.0288 - val_accuracy: 0.8173 - val_loss: 1.1206\n",
            "\u001b[1m705/705\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 2ms/step - accuracy: 0.8173 - loss: 1.1206\n",
            "Final Accuracy: 0.8173350095748901\n",
            "\u001b[1m705/705\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 2ms/step\n",
            "\n",
            "Classification Report:\n",
            "\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.71      0.97      0.82      9711\n",
            "           1       0.97      0.70      0.81     12833\n",
            "\n",
            "    accuracy                           0.82     22544\n",
            "   macro avg       0.84      0.84      0.82     22544\n",
            "weighted avg       0.86      0.82      0.82     22544\n",
            "\n",
            "\n",
            "Confusion Matrix:\n",
            "\n",
            "[[9392  319]\n",
            " [3799 9034]]\n"
          ]
        }
      ],
      "source": [
        "# =========================\n",
        "# 1. IMPORTS\n",
        "# =========================\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import tensorflow as tf\n",
        "\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.feature_selection import SelectKBest, f_classif\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.metrics import classification_report, confusion_matrix\n",
        "\n",
        "# =========================\n",
        "# 2. LOAD DATA\n",
        "# =========================\n",
        "train_path = \"KDDTrain+.txt\"\n",
        "test_path = \"KDDTest+.txt\"\n",
        "\n",
        "columns = [\n",
        "    \"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\"dst_bytes\",\"land\",\n",
        "    \"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\"logged_in\",\n",
        "    \"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\"num_file_creations\",\n",
        "    \"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\"is_host_login\",\n",
        "    \"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\",\"srv_serror_rate\",\n",
        "    \"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\",\"diff_srv_rate\",\n",
        "    \"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\n",
        "    \"dst_host_same_srv_rate\",\"dst_host_diff_srv_rate\",\n",
        "    \"dst_host_same_src_port_rate\",\"dst_host_srv_diff_host_rate\",\n",
        "    \"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n",
        "    \"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\n",
        "    \"label\",\"difficulty\"\n",
        "]\n",
        "\n",
        "train_df = pd.read_csv(train_path, names=columns)\n",
        "test_df = pd.read_csv(test_path, names=columns)\n",
        "\n",
        "# =========================\n",
        "# 3. LABEL CONVERSION\n",
        "# =========================\n",
        "def label_map(x):\n",
        "    return 0 if x == \"normal\" else 1\n",
        "\n",
        "train_df['label'] = train_df['label'].apply(label_map)\n",
        "test_df['label'] = test_df['label'].apply(label_map)\n",
        "\n",
        "# =========================\n",
        "# 4. ONE-HOT ENCODING\n",
        "# =========================\n",
        "categorical_cols = ['protocol_type', 'service', 'flag']\n",
        "\n",
        "train_df = pd.get_dummies(train_df, columns=categorical_cols)\n",
        "test_df = pd.get_dummies(test_df, columns=categorical_cols)\n",
        "\n",
        "train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)\n",
        "\n",
        "# =========================\n",
        "# 5. SPLIT FEATURES\n",
        "# =========================\n",
        "X_train = train_df.drop(['label', 'difficulty'], axis=1)\n",
        "y_train = train_df['label']\n",
        "\n",
        "X_test = test_df.drop(['label', 'difficulty'], axis=1)\n",
        "y_test = test_df['label']\n",
        "\n",
        "# =========================\n",
        "# 6. NORMALIZATION\n",
        "# =========================\n",
        "scaler = StandardScaler()\n",
        "X_train = scaler.fit_transform(X_train)\n",
        "X_test = scaler.transform(X_test)\n",
        "\n",
        "# =========================\n",
        "# 7. FEATURE SELECTION (IMPORTANT)\n",
        "# =========================\n",
        "selector = SelectKBest(score_func=f_classif, k=50)\n",
        "\n",
        "X_train = selector.fit_transform(X_train, y_train)\n",
        "X_test = selector.transform(X_test)\n",
        "\n",
        "print(\"New shape after feature selection:\", X_train.shape)\n",
        "\n",
        "# =========================\n",
        "# 8. BUILD MODEL (IMPROVED)\n",
        "# =========================\n",
        "model = tf.keras.Sequential([\n",
        "    tf.keras.layers.Input(shape=(X_train.shape[1],)),\n",
        "\n",
        "    tf.keras.layers.Dense(128, activation='relu'),\n",
        "    tf.keras.layers.BatchNormalization(),\n",
        "    tf.keras.layers.Dropout(0.3),\n",
        "\n",
        "    tf.keras.layers.Dense(64, activation='relu'),\n",
        "    tf.keras.layers.BatchNormalization(),\n",
        "    tf.keras.layers.Dropout(0.3),\n",
        "\n",
        "    tf.keras.layers.Dense(32, activation='relu'),\n",
        "\n",
        "    tf.keras.layers.Dense(1, activation='sigmoid')\n",
        "])\n",
        "\n",
        "model.compile(\n",
        "    optimizer='adam',\n",
        "    loss='binary_crossentropy',\n",
        "    metrics=['accuracy']\n",
        ")\n",
        "\n",
        "# =========================\n",
        "# 9. TRAIN MODEL\n",
        "# =========================\n",
        "history = model.fit(\n",
        "    X_train, y_train,\n",
        "    epochs=15,\n",
        "    batch_size=64,\n",
        "    validation_data=(X_test, y_test)\n",
        ")\n",
        "\n",
        "# =========================\n",
        "# 10. EVALUATE\n",
        "# =========================\n",
        "loss, acc = model.evaluate(X_test, y_test)\n",
        "print(\"Final Accuracy:\", acc)\n",
        "\n",
        "# =========================\n",
        "# 11. METRICS (IMPORTANT FOR REPORT)\n",
        "# =========================\n",
        "y_pred = (model.predict(X_test) > 0.5).astype(\"int32\")\n",
        "\n",
        "print(\"\\nClassification Report:\\n\")\n",
        "print(classification_report(y_test, y_pred))\n",
        "\n",
        "print(\"\\nConfusion Matrix:\\n\")\n",
        "print(confusion_matrix(y_test, y_pred))"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from tensorflow.keras import layers, models\n",
        "\n",
        "# Train ONLY on normal data\n",
        "X_train_normal = X_train[y_train == 0]\n",
        "\n",
        "# Autoencoder model\n",
        "input_dim = X_train.shape[1]\n",
        "\n",
        "autoencoder = models.Sequential([\n",
        "    layers.Input(shape=(input_dim,)),\n",
        "\n",
        "    layers.Dense(64, activation='relu'),\n",
        "    layers.Dense(32, activation='relu'),\n",
        "    layers.Dense(16, activation='relu'),\n",
        "\n",
        "    layers.Dense(32, activation='relu'),\n",
        "    layers.Dense(64, activation='relu'),\n",
        "\n",
        "    layers.Dense(input_dim, activation='sigmoid')\n",
        "])\n",
        "\n",
        "autoencoder.compile(optimizer='adam', loss='mse')\n",
        "\n",
        "# Train\n",
        "autoencoder.fit(\n",
        "    X_train_normal,\n",
        "    X_train_normal,\n",
        "    epochs=15,\n",
        "    batch_size=64,\n",
        "    validation_data=(X_test, X_test)\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WUgrtCu68UgM",
        "outputId": "8113c359-5ebc-4f8e-e870-32f4bd79c5e9"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Epoch 1/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 5ms/step - loss: 0.2818 - val_loss: 0.6870\n",
            "Epoch 2/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 4ms/step - loss: 0.2517 - val_loss: 0.6827\n",
            "Epoch 3/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2508 - val_loss: 0.6792\n",
            "Epoch 4/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2506 - val_loss: 0.6784\n",
            "Epoch 5/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2500 - val_loss: 0.6769\n",
            "Epoch 6/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 4ms/step - loss: 0.2499 - val_loss: 0.6767\n",
            "Epoch 7/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2499 - val_loss: 0.6818\n",
            "Epoch 8/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2498 - val_loss: 0.6743\n",
            "Epoch 9/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2498 - val_loss: 0.6778\n",
            "Epoch 10/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6721\n",
            "Epoch 11/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6738\n",
            "Epoch 12/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6773\n",
            "Epoch 13/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6788\n",
            "Epoch 14/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2496 - val_loss: 0.6767\n",
            "Epoch 15/15\n",
            "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6770\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.src.callbacks.history.History at 0x79e9b6ce9610>"
            ]
          },
          "metadata": {},
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Reconstruction error\n",
        "reconstructions = autoencoder.predict(X_test)\n",
        "\n",
        "mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)\n",
        "\n",
        "# Threshold\n",
        "# Get reconstruction error for NORMAL training data\n",
        "train_recon = autoencoder.predict(X_train_normal)\n",
        "\n",
        "train_mse = np.mean(np.power(X_train_normal - train_recon, 2), axis=1)\n",
        "\n",
        "# Better threshold\n",
        "threshold = np.percentile(train_mse, 95)\n",
        "\n",
        "# Predictions\n",
        "y_pred_ae = (mse > threshold).astype(int)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2mTcym9l8Wd7",
        "outputId": "11596bb4-da65-41ba-9d72-1f61112f2b76"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m705/705\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 2ms/step\n",
            "\u001b[1m1847/1847\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 2ms/step\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import classification_report\n",
        "\n",
        "print(classification_report(y_test, y_pred_ae))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5QkUi8bSK7XH",
        "outputId": "a4cb6f1a-78ed-4a63-da1c-57aee27d0b46"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.64      0.93      0.75      9711\n",
            "           1       0.91      0.60      0.72     12833\n",
            "\n",
            "    accuracy                           0.74     22544\n",
            "   macro avg       0.78      0.76      0.74     22544\n",
            "weighted avg       0.79      0.74      0.74     22544\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from tensorflow.keras import layers, models\n",
        "\n",
        "# Reshape data for LSTM\n",
        "X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))\n",
        "X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))\n",
        "\n",
        "# Build LSTM model\n",
        "model_lstm = models.Sequential([\n",
        "    layers.LSTM(64, input_shape=(1, X_train.shape[1])),\n",
        "    layers.Dropout(0.3),\n",
        "\n",
        "    layers.Dense(32, activation='relu'),\n",
        "    layers.Dense(1, activation='sigmoid')\n",
        "])\n",
        "\n",
        "model_lstm.compile(\n",
        "    optimizer='adam',\n",
        "    loss='binary_crossentropy',\n",
        "    metrics=['accuracy']\n",
        ")\n",
        "\n",
        "# Train\n",
        "model_lstm.fit(\n",
        "    X_train_lstm, y_train,\n",
        "    epochs=10,\n",
        "    batch_size=64,\n",
        "    validation_data=(X_test_lstm, y_test)\n",
        ")\n",
        "\n",
        "# Evaluate\n",
        "loss, acc = model_lstm.evaluate(X_test_lstm, y_test)\n",
        "print(\"LSTM Accuracy:\", acc)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-WQDtVbqLlPK",
        "outputId": "906b349f-f0d0-40f8-8b9b-0bbdf0d548d6"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Epoch 1/10\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/keras/src/layers/rnn/rnn.py:199: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
            "  super().__init__(**kwargs)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m15s\u001b[0m 6ms/step - accuracy: 0.9768 - loss: 0.0723 - val_accuracy: 0.7811 - val_loss: 0.9115\n",
            "Epoch 2/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 5ms/step - accuracy: 0.9841 - loss: 0.0417 - val_accuracy: 0.7835 - val_loss: 0.9094\n",
            "Epoch 3/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9858 - loss: 0.0375 - val_accuracy: 0.7972 - val_loss: 0.9576\n",
            "Epoch 4/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9873 - loss: 0.0339 - val_accuracy: 0.7971 - val_loss: 1.0328\n",
            "Epoch 5/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9882 - loss: 0.0317 - val_accuracy: 0.8055 - val_loss: 1.0339\n",
            "Epoch 6/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9887 - loss: 0.0302 - val_accuracy: 0.8074 - val_loss: 1.1582\n",
            "Epoch 7/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9893 - loss: 0.0286 - val_accuracy: 0.8054 - val_loss: 1.2248\n",
            "Epoch 8/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9897 - loss: 0.0279 - val_accuracy: 0.8148 - val_loss: 1.1761\n",
            "Epoch 9/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9905 - loss: 0.0266 - val_accuracy: 0.8226 - val_loss: 1.1026\n",
            "Epoch 10/10\n",
            "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9904 - loss: 0.0257 - val_accuracy: 0.8242 - val_loss: 1.1574\n",
            "\u001b[1m705/705\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 3ms/step - accuracy: 0.8242 - loss: 1.1574\n",
            "LSTM Accuracy: 0.8241660594940186\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# =========================\n",
        "# 1. INSTALL + IMPORT\n",
        "# =========================\n",
        "!pip install pyspark\n",
        "\n",
        "from pyspark.sql import SparkSession\n",
        "from pyspark.sql.functions import when\n",
        "\n",
        "# =========================\n",
        "# 2. START SPARK\n",
        "# =========================\n",
        "spark = SparkSession.builder \\\n",
        "    .appName(\"IDS_Project\") \\\n",
        "    .getOrCreate()\n",
        "\n",
        "print(\"Spark Started ✅\")\n",
        "\n",
        "# =========================\n",
        "# 3. LOAD DATA\n",
        "# =========================\n",
        "spark_df = spark.read.csv(\n",
        "    \"KDDTrain+.txt\",\n",
        "    header=False,\n",
        "    inferSchema=True\n",
        ")\n",
        "\n",
        "# =========================\n",
        "# 4. ADD COLUMN NAMES\n",
        "# =========================\n",
        "spark_df = spark_df.toDF(*columns)\n",
        "\n",
        "print(\"Columns assigned ✅\")\n",
        "\n",
        "# =========================\n",
        "# 5. BASIC CHECK\n",
        "# =========================\n",
        "spark_df.show(5)\n",
        "\n",
        "# =========================\n",
        "# 6. DISTRIBUTED LABEL CONVERSION\n",
        "# =========================\n",
        "spark_df = spark_df.withColumn(\n",
        "    \"label\",\n",
        "    when(spark_df[\"label\"] == \"normal\", 0).otherwise(1)\n",
        ")\n",
        "\n",
        "print(\"Label converted ✅\")\n",
        "\n",
        "spark_df.groupBy(\"label\").count().show()\n",
        "\n",
        "# =========================\n",
        "# 7. DISTRIBUTED FEATURE ENGINEERING\n",
        "# =========================\n",
        "spark_df = spark_df.withColumn(\n",
        "    \"bytes_total\",\n",
        "    spark_df[\"src_bytes\"] + spark_df[\"dst_bytes\"]\n",
        ")\n",
        "\n",
        "spark_df.select(\"src_bytes\", \"dst_bytes\", \"bytes_total\").show(5)\n",
        "\n",
        "# =========================\n",
        "# 8. DISTRIBUTED FILTERING\n",
        "# =========================\n",
        "normal_df = spark_df.filter(spark_df[\"label\"] == 0)\n",
        "attack_df = spark_df.filter(spark_df[\"label\"] == 1)\n",
        "\n",
        "print(\"Normal count:\", normal_df.count())\n",
        "print(\"Attack count:\", attack_df.count())\n",
        "\n",
        "# =========================\n",
        "# 9. SHOW DISTRIBUTION\n",
        "# =========================\n",
        "spark_df.groupBy(\"protocol_type\").count().show()\n",
        "\n",
        "print(\"PySpark processing complete ✅\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "IqP3MdTLQpTU",
        "outputId": "70a25dd3-3e6c-42e1-d841-e560231955f0"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: pyspark in /usr/local/lib/python3.12/dist-packages (4.0.2)\n",
            "Requirement already satisfied: py4j<0.10.9.10,>=0.10.9.7 in /usr/local/lib/python3.12/dist-packages (from pyspark) (0.10.9.9)\n",
            "Spark Started ✅\n",
            "Columns assigned ✅\n",
            "+--------+-------------+--------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------+----------+\n",
            "|duration|protocol_type| service|flag|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host_rate|dst_host_count|dst_host_srv_count|dst_host_same_srv_rate|dst_host_diff_srv_rate|dst_host_same_src_port_rate|dst_host_srv_diff_host_rate|dst_host_serror_rate|dst_host_srv_serror_rate|dst_host_rerror_rate|dst_host_srv_rerror_rate|  label|difficulty|\n",
            "+--------+-------------+--------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------+----------+\n",
            "|       0|          tcp|ftp_data|  SF|      491|        0|   0|             0|     0|  0|                0|        0|              0|         0|           0|       0|                 0|         0|               0|                0|            0|             0|    2|        2|        0.0|            0.0|        0.0|            0.0|          1.0|          0.0|               0.0|           150|                25|                  0.17|                  0.03|                       0.17|                        0.0|                 0.0|                     0.0|                0.05|                     0.0| normal|        20|\n",
            "|       0|          udp|   other|  SF|      146|        0|   0|             0|     0|  0|                0|        0|              0|         0|           0|       0|                 0|         0|               0|                0|            0|             0|   13|        1|        0.0|            0.0|        0.0|            0.0|         0.08|         0.15|               0.0|           255|                 1|                   0.0|                   0.6|                       0.88|                        0.0|                 0.0|                     0.0|                 0.0|                     0.0| normal|        15|\n",
            "|       0|          tcp| private|  S0|        0|        0|   0|             0|     0|  0|                0|        0|              0|         0|           0|       0|                 0|         0|               0|                0|            0|             0|  123|        6|        1.0|            1.0|        0.0|            0.0|         0.05|         0.07|               0.0|           255|                26|                   0.1|                  0.05|                        0.0|                        0.0|                 1.0|                     1.0|                 0.0|                     0.0|neptune|        19|\n",
            "|       0|          tcp|    http|  SF|      232|     8153|   0|             0|     0|  0|                0|        1|              0|         0|           0|       0|                 0|         0|               0|                0|            0|             0|    5|        5|        0.2|            0.2|        0.0|            0.0|          1.0|          0.0|               0.0|            30|               255|                   1.0|                   0.0|                       0.03|                       0.04|                0.03|                    0.01|                 0.0|                    0.01| normal|        21|\n",
            "|       0|          tcp|    http|  SF|      199|      420|   0|             0|     0|  0|                0|        1|              0|         0|           0|       0|                 0|         0|               0|                0|            0|             0|   30|       32|        0.0|            0.0|        0.0|            0.0|          1.0|          0.0|              0.09|           255|               255|                   1.0|                   0.0|                        0.0|                        0.0|                 0.0|                     0.0|                 0.0|                     0.0| normal|        21|\n",
            "+--------+-------------+--------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------+----------+\n",
            "only showing top 5 rows\n",
            "Label converted ✅\n",
            "+-----+-----+\n",
            "|label|count|\n",
            "+-----+-----+\n",
            "|    1|58630|\n",
            "|    0|67343|\n",
            "+-----+-----+\n",
            "\n",
            "+---------+---------+-----------+\n",
            "|src_bytes|dst_bytes|bytes_total|\n",
            "+---------+---------+-----------+\n",
            "|      491|        0|        491|\n",
            "|      146|        0|        146|\n",
            "|        0|        0|          0|\n",
            "|      232|     8153|       8385|\n",
            "|      199|      420|        619|\n",
            "+---------+---------+-----------+\n",
            "only showing top 5 rows\n",
            "Normal count: 67343\n",
            "Attack count: 58630\n",
            "+-------------+------+\n",
            "|protocol_type| count|\n",
            "+-------------+------+\n",
            "|          tcp|102689|\n",
            "|          udp| 14993|\n",
            "|         icmp|  8291|\n",
            "+-------------+------+\n",
            "\n",
            "PySpark processing complete ✅\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# =========================\n",
        "# FINAL SECURE API (SIMULATION)\n",
        "# =========================\n",
        "\n",
        "API_KEY = \"12345\"\n",
        "\n",
        "def secure_predict(input_data, api_key):\n",
        "\n",
        "    # 🔐 Security check\n",
        "    if api_key != API_KEY:\n",
        "        return {\"error\": \"Unauthorized access\"}\n",
        "\n",
        "    # Convert input\n",
        "    data = np.array(input_data).reshape(1, -1)\n",
        "\n",
        "    # Model prediction\n",
        "    prediction = model.predict(data)\n",
        "    result = int(prediction[0][0] > 0.5)\n",
        "\n",
        "    return {\n",
        "        \"prediction\": result,\n",
        "        \"message\": \"Attack\" if result == 1 else \"Normal\"\n",
        "    }"
      ],
      "metadata": {
        "id": "Lfe5tGxj6njn"
      },
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sample = X_test[0]\n",
        "\n",
        "output = secure_predict(sample, \"12345\")\n",
        "\n",
        "print(output)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gmfxYPEa7lyg",
        "outputId": "ed3a28fc-b094-432b-b84b-60704d7f41b1"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 267ms/step\n",
            "{'prediction': 1, 'message': 'Attack'}\n"
          ]
        }
      ]
    }
  ]
}