dev1461 committed on
Commit
9e853b6
·
verified ·
1 Parent(s): 934afda

Upload final_project.ipynb

Browse files
Files changed (1) hide show
  1. final_project.ipynb +687 -0
final_project.ipynb ADDED
@@ -0,0 +1,687 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 6,
22
+ "metadata": {
23
+ "colab": {
24
+ "base_uri": "https://localhost:8080/"
25
+ },
26
+ "id": "C36kdei0JAGU",
27
+ "outputId": "a3b9ca41-83ba-4246-ebd3-a88937443fd9"
28
+ },
29
+ "outputs": [
30
+ {
31
+ "output_type": "stream",
32
+ "name": "stderr",
33
+ "text": [
34
+ "/usr/local/lib/python3.12/dist-packages/sklearn/feature_selection/_univariate_selection.py:111: UserWarning: Features [16] are constant.\n",
35
+ " warnings.warn(\"Features %s are constant.\" % constant_features_idx, UserWarning)\n",
36
+ "/usr/local/lib/python3.12/dist-packages/sklearn/feature_selection/_univariate_selection.py:112: RuntimeWarning: invalid value encountered in divide\n",
37
+ " f = msb / msw\n"
38
+ ]
39
+ },
40
+ {
41
+ "output_type": "stream",
42
+ "name": "stdout",
43
+ "text": [
44
+ "New shape after feature selection: (110596, 50)\n",
45
+ "Epoch 1/15\n",
46
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m13s\u001b[0m 5ms/step - accuracy: 0.9748 - loss: 0.0709 - val_accuracy: 0.7912 - val_loss: 0.7181\n",
47
+ "Epoch 2/15\n",
48
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9827 - loss: 0.0469 - val_accuracy: 0.7963 - val_loss: 0.8565\n",
49
+ "Epoch 3/15\n",
50
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - accuracy: 0.9843 - loss: 0.0415 - val_accuracy: 0.7947 - val_loss: 0.9044\n",
51
+ "Epoch 4/15\n",
52
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9858 - loss: 0.0377 - val_accuracy: 0.7976 - val_loss: 0.8448\n",
53
+ "Epoch 5/15\n",
54
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9863 - loss: 0.0361 - val_accuracy: 0.8339 - val_loss: 0.8099\n",
55
+ "Epoch 6/15\n",
56
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 3ms/step - accuracy: 0.9871 - loss: 0.0340 - val_accuracy: 0.8187 - val_loss: 0.8643\n",
57
+ "Epoch 7/15\n",
58
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - accuracy: 0.9875 - loss: 0.0331 - val_accuracy: 0.8238 - val_loss: 0.9187\n",
59
+ "Epoch 8/15\n",
60
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9881 - loss: 0.0326 - val_accuracy: 0.8306 - val_loss: 0.8933\n",
61
+ "Epoch 9/15\n",
62
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - accuracy: 0.9883 - loss: 0.0316 - val_accuracy: 0.8199 - val_loss: 0.8902\n",
63
+ "Epoch 10/15\n",
64
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9885 - loss: 0.0306 - val_accuracy: 0.8251 - val_loss: 0.9340\n",
65
+ "Epoch 11/15\n",
66
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9890 - loss: 0.0297 - val_accuracy: 0.8217 - val_loss: 1.0413\n",
67
+ "Epoch 12/15\n",
68
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9888 - loss: 0.0295 - val_accuracy: 0.7996 - val_loss: 1.2353\n",
69
+ "Epoch 13/15\n",
70
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9893 - loss: 0.0289 - val_accuracy: 0.8299 - val_loss: 1.0090\n",
71
+ "Epoch 14/15\n",
72
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - accuracy: 0.9893 - loss: 0.0279 - val_accuracy: 0.8273 - val_loss: 0.8989\n",
73
+ "Epoch 15/15\n",
74
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - accuracy: 0.9896 - loss: 0.0288 - val_accuracy: 0.8173 - val_loss: 1.1206\n",
75
+ "\u001b[1m705/705\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 2ms/step - accuracy: 0.8173 - loss: 1.1206\n",
76
+ "Final Accuracy: 0.8173350095748901\n",
77
+ "\u001b[1m705/705\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 2ms/step\n",
78
+ "\n",
79
+ "Classification Report:\n",
80
+ "\n",
81
+ " precision recall f1-score support\n",
82
+ "\n",
83
+ " 0 0.71 0.97 0.82 9711\n",
84
+ " 1 0.97 0.70 0.81 12833\n",
85
+ "\n",
86
+ " accuracy 0.82 22544\n",
87
+ " macro avg 0.84 0.84 0.82 22544\n",
88
+ "weighted avg 0.86 0.82 0.82 22544\n",
89
+ "\n",
90
+ "\n",
91
+ "Confusion Matrix:\n",
92
+ "\n",
93
+ "[[9392 319]\n",
94
+ " [3799 9034]]\n"
95
+ ]
96
+ }
97
+ ],
98
+ "source": [
99
+ "# =========================\n",
100
+ "# 1. IMPORTS\n",
101
+ "# =========================\n",
102
+ "import pandas as pd\n",
103
+ "import numpy as np\n",
104
+ "import tensorflow as tf\n",
105
+ "\n",
106
+ "from sklearn.preprocessing import StandardScaler\n",
107
+ "from sklearn.feature_selection import SelectKBest, f_classif\n",
108
+ "from sklearn.model_selection import train_test_split\n",
109
+ "from sklearn.metrics import classification_report, confusion_matrix\n",
110
+ "\n",
111
+ "# =========================\n",
112
+ "# 2. LOAD DATA\n",
113
+ "# =========================\n",
114
+ "train_path = \"KDDTrain+.txt\"\n",
115
+ "test_path = \"KDDTest+.txt\"\n",
116
+ "\n",
117
+ "columns = [\n",
118
+ " \"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\"dst_bytes\",\"land\",\n",
119
+ " \"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\"logged_in\",\n",
120
+ " \"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\"num_file_creations\",\n",
121
+ " \"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\"is_host_login\",\n",
122
+ " \"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\",\"srv_serror_rate\",\n",
123
+ " \"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\",\"diff_srv_rate\",\n",
124
+ " \"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\n",
125
+ " \"dst_host_same_srv_rate\",\"dst_host_diff_srv_rate\",\n",
126
+ " \"dst_host_same_src_port_rate\",\"dst_host_srv_diff_host_rate\",\n",
127
+ " \"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n",
128
+ " \"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\n",
129
+ " \"label\",\"difficulty\"\n",
130
+ "]\n",
131
+ "\n",
132
+ "train_df = pd.read_csv(train_path, names=columns)\n",
133
+ "test_df = pd.read_csv(test_path, names=columns)\n",
134
+ "\n",
135
+ "# =========================\n",
136
+ "# 3. LABEL CONVERSION\n",
137
+ "# =========================\n",
138
+ "def label_map(x):\n",
139
+ " return 0 if x == \"normal\" else 1\n",
140
+ "\n",
141
+ "train_df['label'] = train_df['label'].apply(label_map)\n",
142
+ "test_df['label'] = test_df['label'].apply(label_map)\n",
143
+ "\n",
144
+ "# =========================\n",
145
+ "# 4. ONE-HOT ENCODING\n",
146
+ "# =========================\n",
147
+ "categorical_cols = ['protocol_type', 'service', 'flag']\n",
148
+ "\n",
149
+ "train_df = pd.get_dummies(train_df, columns=categorical_cols)\n",
150
+ "test_df = pd.get_dummies(test_df, columns=categorical_cols)\n",
151
+ "\n",
152
+ "train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)\n",
153
+ "\n",
154
+ "# =========================\n",
155
+ "# 5. SPLIT FEATURES\n",
156
+ "# =========================\n",
157
+ "X_train = train_df.drop(['label', 'difficulty'], axis=1)\n",
158
+ "y_train = train_df['label']\n",
159
+ "\n",
160
+ "X_test = test_df.drop(['label', 'difficulty'], axis=1)\n",
161
+ "y_test = test_df['label']\n",
162
+ "\n",
163
+ "# =========================\n",
164
+ "# 6. NORMALIZATION\n",
165
+ "# =========================\n",
166
+ "scaler = StandardScaler()\n",
167
+ "X_train = scaler.fit_transform(X_train)\n",
168
+ "X_test = scaler.transform(X_test)\n",
169
+ "\n",
170
+ "# =========================\n",
171
+ "# 7. FEATURE SELECTION (IMPORTANT)\n",
172
+ "# =========================\n",
173
+ "selector = SelectKBest(score_func=f_classif, k=50)\n",
174
+ "\n",
175
+ "X_train = selector.fit_transform(X_train, y_train)\n",
176
+ "X_test = selector.transform(X_test)\n",
177
+ "\n",
178
+ "print(\"New shape after feature selection:\", X_train.shape)\n",
179
+ "\n",
180
+ "# =========================\n",
181
+ "# 8. BUILD MODEL (IMPROVED)\n",
182
+ "# =========================\n",
183
+ "model = tf.keras.Sequential([\n",
184
+ " tf.keras.layers.Input(shape=(X_train.shape[1],)),\n",
185
+ "\n",
186
+ " tf.keras.layers.Dense(128, activation='relu'),\n",
187
+ " tf.keras.layers.BatchNormalization(),\n",
188
+ " tf.keras.layers.Dropout(0.3),\n",
189
+ "\n",
190
+ " tf.keras.layers.Dense(64, activation='relu'),\n",
191
+ " tf.keras.layers.BatchNormalization(),\n",
192
+ " tf.keras.layers.Dropout(0.3),\n",
193
+ "\n",
194
+ " tf.keras.layers.Dense(32, activation='relu'),\n",
195
+ "\n",
196
+ " tf.keras.layers.Dense(1, activation='sigmoid')\n",
197
+ "])\n",
198
+ "\n",
199
+ "model.compile(\n",
200
+ " optimizer='adam',\n",
201
+ " loss='binary_crossentropy',\n",
202
+ " metrics=['accuracy']\n",
203
+ ")\n",
204
+ "\n",
205
+ "# =========================\n",
206
+ "# 9. TRAIN MODEL\n",
207
+ "# =========================\n",
208
+ "history = model.fit(\n",
209
+ " X_train, y_train,\n",
210
+ " epochs=15,\n",
211
+ " batch_size=64,\n",
212
+ " validation_data=(X_test, y_test)\n",
213
+ ")\n",
214
+ "\n",
215
+ "# =========================\n",
216
+ "# 10. EVALUATE\n",
217
+ "# =========================\n",
218
+ "loss, acc = model.evaluate(X_test, y_test)\n",
219
+ "print(\"Final Accuracy:\", acc)\n",
220
+ "\n",
221
+ "# =========================\n",
222
+ "# 11. METRICS (IMPORTANT FOR REPORT)\n",
223
+ "# =========================\n",
224
+ "y_pred = (model.predict(X_test) > 0.5).astype(\"int32\")\n",
225
+ "\n",
226
+ "print(\"\\nClassification Report:\\n\")\n",
227
+ "print(classification_report(y_test, y_pred))\n",
228
+ "\n",
229
+ "print(\"\\nConfusion Matrix:\\n\")\n",
230
+ "print(confusion_matrix(y_test, y_pred))"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "source": [
236
+ "from tensorflow.keras import layers, models\n",
237
+ "\n",
238
+ "# Train ONLY on normal data\n",
239
+ "X_train_normal = X_train[y_train == 0]\n",
240
+ "\n",
241
+ "# Autoencoder model\n",
242
+ "input_dim = X_train.shape[1]\n",
243
+ "\n",
244
+ "autoencoder = models.Sequential([\n",
245
+ " layers.Input(shape=(input_dim,)),\n",
246
+ "\n",
247
+ " layers.Dense(64, activation='relu'),\n",
248
+ " layers.Dense(32, activation='relu'),\n",
249
+ " layers.Dense(16, activation='relu'),\n",
250
+ "\n",
251
+ " layers.Dense(32, activation='relu'),\n",
252
+ " layers.Dense(64, activation='relu'),\n",
253
+ "\n",
254
+ " layers.Dense(input_dim, activation='sigmoid')\n",
255
+ "])\n",
256
+ "\n",
257
+ "autoencoder.compile(optimizer='adam', loss='mse')\n",
258
+ "\n",
259
+ "# Train\n",
260
+ "autoencoder.fit(\n",
261
+ " X_train_normal,\n",
262
+ " X_train_normal,\n",
263
+ " epochs=15,\n",
264
+ " batch_size=64,\n",
265
+ " validation_data=(X_test, X_test)\n",
266
+ ")"
267
+ ],
268
+ "metadata": {
269
+ "colab": {
270
+ "base_uri": "https://localhost:8080/"
271
+ },
272
+ "id": "WUgrtCu68UgM",
273
+ "outputId": "8113c359-5ebc-4f8e-e870-32f4bd79c5e9"
274
+ },
275
+ "execution_count": 7,
276
+ "outputs": [
277
+ {
278
+ "output_type": "stream",
279
+ "name": "stdout",
280
+ "text": [
281
+ "Epoch 1/15\n",
282
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 5ms/step - loss: 0.2818 - val_loss: 0.6870\n",
283
+ "Epoch 2/15\n",
284
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 4ms/step - loss: 0.2517 - val_loss: 0.6827\n",
285
+ "Epoch 3/15\n",
286
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2508 - val_loss: 0.6792\n",
287
+ "Epoch 4/15\n",
288
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2506 - val_loss: 0.6784\n",
289
+ "Epoch 5/15\n",
290
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2500 - val_loss: 0.6769\n",
291
+ "Epoch 6/15\n",
292
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 4ms/step - loss: 0.2499 - val_loss: 0.6767\n",
293
+ "Epoch 7/15\n",
294
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2499 - val_loss: 0.6818\n",
295
+ "Epoch 8/15\n",
296
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2498 - val_loss: 0.6743\n",
297
+ "Epoch 9/15\n",
298
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2498 - val_loss: 0.6778\n",
299
+ "Epoch 10/15\n",
300
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6721\n",
301
+ "Epoch 11/15\n",
302
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6738\n",
303
+ "Epoch 12/15\n",
304
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6773\n",
305
+ "Epoch 13/15\n",
306
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6788\n",
307
+ "Epoch 14/15\n",
308
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2496 - val_loss: 0.6767\n",
309
+ "Epoch 15/15\n",
310
+ "\u001b[1m924/924\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 3ms/step - loss: 0.2497 - val_loss: 0.6770\n"
311
+ ]
312
+ },
313
+ {
314
+ "output_type": "execute_result",
315
+ "data": {
316
+ "text/plain": [
317
+ "<keras.src.callbacks.history.History at 0x79e9b6ce9610>"
318
+ ]
319
+ },
320
+ "metadata": {},
321
+ "execution_count": 7
322
+ }
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "source": [
328
+ "# Reconstruction error\n",
329
+ "reconstructions = autoencoder.predict(X_test)\n",
330
+ "\n",
331
+ "mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)\n",
332
+ "\n",
333
+ "# Threshold\n",
334
+ "# Get reconstruction error for NORMAL training data\n",
335
+ "train_recon = autoencoder.predict(X_train_normal)\n",
336
+ "\n",
337
+ "train_mse = np.mean(np.power(X_train_normal - train_recon, 2), axis=1)\n",
338
+ "\n",
339
+ "# Better threshold\n",
340
+ "threshold = np.percentile(train_mse, 95)\n",
341
+ "\n",
342
+ "# Predictions\n",
343
+ "y_pred_ae = (mse > threshold).astype(int)"
344
+ ],
345
+ "metadata": {
346
+ "colab": {
347
+ "base_uri": "https://localhost:8080/"
348
+ },
349
+ "id": "2mTcym9l8Wd7",
350
+ "outputId": "11596bb4-da65-41ba-9d72-1f61112f2b76"
351
+ },
352
+ "execution_count": 8,
353
+ "outputs": [
354
+ {
355
+ "output_type": "stream",
356
+ "name": "stdout",
357
+ "text": [
358
+ "\u001b[1m705/705\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 2ms/step\n",
359
+ "\u001b[1m1847/1847\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 2ms/step\n"
360
+ ]
361
+ }
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "source": [
367
+ "from sklearn.metrics import classification_report\n",
368
+ "\n",
369
+ "print(classification_report(y_test, y_pred_ae))"
370
+ ],
371
+ "metadata": {
372
+ "colab": {
373
+ "base_uri": "https://localhost:8080/"
374
+ },
375
+ "id": "5QkUi8bSK7XH",
376
+ "outputId": "a4cb6f1a-78ed-4a63-da1c-57aee27d0b46"
377
+ },
378
+ "execution_count": 9,
379
+ "outputs": [
380
+ {
381
+ "output_type": "stream",
382
+ "name": "stdout",
383
+ "text": [
384
+ " precision recall f1-score support\n",
385
+ "\n",
386
+ " 0 0.64 0.93 0.75 9711\n",
387
+ " 1 0.91 0.60 0.72 12833\n",
388
+ "\n",
389
+ " accuracy 0.74 22544\n",
390
+ " macro avg 0.78 0.76 0.74 22544\n",
391
+ "weighted avg 0.79 0.74 0.74 22544\n",
392
+ "\n"
393
+ ]
394
+ }
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "source": [
400
+ "from tensorflow.keras import layers, models\n",
401
+ "\n",
402
+ "# Reshape data for LSTM\n",
403
+ "X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))\n",
404
+ "X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))\n",
405
+ "\n",
406
+ "# Build LSTM model\n",
407
+ "model_lstm = models.Sequential([\n",
408
+ " layers.LSTM(64, input_shape=(1, X_train.shape[1])),\n",
409
+ " layers.Dropout(0.3),\n",
410
+ "\n",
411
+ " layers.Dense(32, activation='relu'),\n",
412
+ " layers.Dense(1, activation='sigmoid')\n",
413
+ "])\n",
414
+ "\n",
415
+ "model_lstm.compile(\n",
416
+ " optimizer='adam',\n",
417
+ " loss='binary_crossentropy',\n",
418
+ " metrics=['accuracy']\n",
419
+ ")\n",
420
+ "\n",
421
+ "# Train\n",
422
+ "model_lstm.fit(\n",
423
+ " X_train_lstm, y_train,\n",
424
+ " epochs=10,\n",
425
+ " batch_size=64,\n",
426
+ " validation_data=(X_test_lstm, y_test)\n",
427
+ ")\n",
428
+ "\n",
429
+ "# Evaluate\n",
430
+ "loss, acc = model_lstm.evaluate(X_test_lstm, y_test)\n",
431
+ "print(\"LSTM Accuracy:\", acc)"
432
+ ],
433
+ "metadata": {
434
+ "colab": {
435
+ "base_uri": "https://localhost:8080/"
436
+ },
437
+ "id": "-WQDtVbqLlPK",
438
+ "outputId": "906b349f-f0d0-40f8-8b9b-0bbdf0d548d6"
439
+ },
440
+ "execution_count": 10,
441
+ "outputs": [
442
+ {
443
+ "output_type": "stream",
444
+ "name": "stdout",
445
+ "text": [
446
+ "Epoch 1/10\n"
447
+ ]
448
+ },
449
+ {
450
+ "output_type": "stream",
451
+ "name": "stderr",
452
+ "text": [
453
+ "/usr/local/lib/python3.12/dist-packages/keras/src/layers/rnn/rnn.py:199: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
454
+ " super().__init__(**kwargs)\n"
455
+ ]
456
+ },
457
+ {
458
+ "output_type": "stream",
459
+ "name": "stdout",
460
+ "text": [
461
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m15s\u001b[0m 6ms/step - accuracy: 0.9768 - loss: 0.0723 - val_accuracy: 0.7811 - val_loss: 0.9115\n",
462
+ "Epoch 2/10\n",
463
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 5ms/step - accuracy: 0.9841 - loss: 0.0417 - val_accuracy: 0.7835 - val_loss: 0.9094\n",
464
+ "Epoch 3/10\n",
465
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9858 - loss: 0.0375 - val_accuracy: 0.7972 - val_loss: 0.9576\n",
466
+ "Epoch 4/10\n",
467
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9873 - loss: 0.0339 - val_accuracy: 0.7971 - val_loss: 1.0328\n",
468
+ "Epoch 5/10\n",
469
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9882 - loss: 0.0317 - val_accuracy: 0.8055 - val_loss: 1.0339\n",
470
+ "Epoch 6/10\n",
471
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9887 - loss: 0.0302 - val_accuracy: 0.8074 - val_loss: 1.1582\n",
472
+ "Epoch 7/10\n",
473
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9893 - loss: 0.0286 - val_accuracy: 0.8054 - val_loss: 1.2248\n",
474
+ "Epoch 8/10\n",
475
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9897 - loss: 0.0279 - val_accuracy: 0.8148 - val_loss: 1.1761\n",
476
+ "Epoch 9/10\n",
477
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9905 - loss: 0.0266 - val_accuracy: 0.8226 - val_loss: 1.1026\n",
478
+ "Epoch 10/10\n",
479
+ "\u001b[1m1729/1729\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 6ms/step - accuracy: 0.9904 - loss: 0.0257 - val_accuracy: 0.8242 - val_loss: 1.1574\n",
480
+ "\u001b[1m705/705\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 3ms/step - accuracy: 0.8242 - loss: 1.1574\n",
481
+ "LSTM Accuracy: 0.8241660594940186\n"
482
+ ]
483
+ }
484
+ ]
485
+ },
486
+ {
487
+ "cell_type": "code",
488
+ "source": [
489
+ "# =========================\n",
490
+ "# 1. INSTALL + IMPORT\n",
491
+ "# =========================\n",
492
+ "!pip install pyspark\n",
493
+ "\n",
494
+ "from pyspark.sql import SparkSession\n",
495
+ "from pyspark.sql.functions import when\n",
496
+ "\n",
497
+ "# =========================\n",
498
+ "# 2. START SPARK\n",
499
+ "# =========================\n",
500
+ "spark = SparkSession.builder \\\n",
501
+ " .appName(\"IDS_Project\") \\\n",
502
+ " .getOrCreate()\n",
503
+ "\n",
504
+ "print(\"Spark Started ✅\")\n",
505
+ "\n",
506
+ "# =========================\n",
507
+ "# 3. LOAD DATA\n",
508
+ "# =========================\n",
509
+ "spark_df = spark.read.csv(\n",
510
+ " \"KDDTrain+.txt\",\n",
511
+ " header=False,\n",
512
+ " inferSchema=True\n",
513
+ ")\n",
514
+ "\n",
515
+ "# =========================\n",
516
+ "# 4. ADD COLUMN NAMES\n",
517
+ "# =========================\n",
518
+ "spark_df = spark_df.toDF(*columns)\n",
519
+ "\n",
520
+ "print(\"Columns assigned ✅\")\n",
521
+ "\n",
522
+ "# =========================\n",
523
+ "# 5. BASIC CHECK\n",
524
+ "# =========================\n",
525
+ "spark_df.show(5)\n",
526
+ "\n",
527
+ "# =========================\n",
528
+ "# 6. DISTRIBUTED LABEL CONVERSION\n",
529
+ "# =========================\n",
530
+ "spark_df = spark_df.withColumn(\n",
531
+ " \"label\",\n",
532
+ " when(spark_df[\"label\"] == \"normal\", 0).otherwise(1)\n",
533
+ ")\n",
534
+ "\n",
535
+ "print(\"Label converted ✅\")\n",
536
+ "\n",
537
+ "spark_df.groupBy(\"label\").count().show()\n",
538
+ "\n",
539
+ "# =========================\n",
540
+ "# 7. DISTRIBUTED FEATURE ENGINEERING\n",
541
+ "# =========================\n",
542
+ "spark_df = spark_df.withColumn(\n",
543
+ " \"bytes_total\",\n",
544
+ " spark_df[\"src_bytes\"] + spark_df[\"dst_bytes\"]\n",
545
+ ")\n",
546
+ "\n",
547
+ "spark_df.select(\"src_bytes\", \"dst_bytes\", \"bytes_total\").show(5)\n",
548
+ "\n",
549
+ "# =========================\n",
550
+ "# 8. DISTRIBUTED FILTERING\n",
551
+ "# =========================\n",
552
+ "normal_df = spark_df.filter(spark_df[\"label\"] == 0)\n",
553
+ "attack_df = spark_df.filter(spark_df[\"label\"] == 1)\n",
554
+ "\n",
555
+ "print(\"Normal count:\", normal_df.count())\n",
556
+ "print(\"Attack count:\", attack_df.count())\n",
557
+ "\n",
558
+ "# =========================\n",
559
+ "# 9. SHOW DISTRIBUTION\n",
560
+ "# =========================\n",
561
+ "spark_df.groupBy(\"protocol_type\").count().show()\n",
562
+ "\n",
563
+ "print(\"PySpark processing complete ✅\")"
564
+ ],
565
+ "metadata": {
566
+ "colab": {
567
+ "base_uri": "https://localhost:8080/"
568
+ },
569
+ "id": "IqP3MdTLQpTU",
570
+ "outputId": "70a25dd3-3e6c-42e1-d841-e560231955f0"
571
+ },
572
+ "execution_count": 11,
573
+ "outputs": [
574
+ {
575
+ "output_type": "stream",
576
+ "name": "stdout",
577
+ "text": [
578
+ "Requirement already satisfied: pyspark in /usr/local/lib/python3.12/dist-packages (4.0.2)\n",
579
+ "Requirement already satisfied: py4j<0.10.9.10,>=0.10.9.7 in /usr/local/lib/python3.12/dist-packages (from pyspark) (0.10.9.9)\n",
580
+ "Spark Started ✅\n",
580
+ "Columns assigned ✅\n",
582
+ "+--------+-------------+--------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------+----------+\n",
583
+ "|duration|protocol_type| service|flag|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host_rate|dst_host_count|dst_host_srv_count|dst_host_same_srv_rate|dst_host_diff_srv_rate|dst_host_same_src_port_rate|dst_host_srv_diff_host_rate|dst_host_serror_rate|dst_host_srv_serror_rate|dst_host_rerror_rate|dst_host_srv_rerror_rate| label|difficulty|\n",
584
+ "+--------+-------------+--------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------+----------+\n",
585
+ "| 0| tcp|ftp_data| SF| 491| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 2| 2| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0| 150| 25| 0.17| 0.03| 0.17| 0.0| 0.0| 0.0| 0.05| 0.0| normal| 20|\n",
586
+ "| 0| udp| other| SF| 146| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 13| 1| 0.0| 0.0| 0.0| 0.0| 0.08| 0.15| 0.0| 255| 1| 0.0| 0.6| 0.88| 0.0| 0.0| 0.0| 0.0| 0.0| normal| 15|\n",
587
+ "| 0| tcp| private| S0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 123| 6| 1.0| 1.0| 0.0| 0.0| 0.05| 0.07| 0.0| 255| 26| 0.1| 0.05| 0.0| 0.0| 1.0| 1.0| 0.0| 0.0|neptune| 19|\n",
588
+ "| 0| tcp| http| SF| 232| 8153| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 5| 5| 0.2| 0.2| 0.0| 0.0| 1.0| 0.0| 0.0| 30| 255| 1.0| 0.0| 0.03| 0.04| 0.03| 0.01| 0.0| 0.01| normal| 21|\n",
589
+ "| 0| tcp| http| SF| 199| 420| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 30| 32| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.09| 255| 255| 1.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| normal| 21|\n",
590
+ "+--------+-------------+--------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------+----------+\n",
591
+ "only showing top 5 rows\n",
592
+ "Label converted ✅\n",
593
+ "+-----+-----+\n",
594
+ "|label|count|\n",
595
+ "+-----+-----+\n",
596
+ "| 1|58630|\n",
597
+ "| 0|67343|\n",
598
+ "+-----+-----+\n",
599
+ "\n",
600
+ "+---------+---------+-----------+\n",
601
+ "|src_bytes|dst_bytes|bytes_total|\n",
602
+ "+---------+---------+-----------+\n",
603
+ "| 491| 0| 491|\n",
604
+ "| 146| 0| 146|\n",
605
+ "| 0| 0| 0|\n",
606
+ "| 232| 8153| 8385|\n",
607
+ "| 199| 420| 619|\n",
608
+ "+---------+---------+-----------+\n",
609
+ "only showing top 5 rows\n",
610
+ "Normal count: 67343\n",
611
+ "Attack count: 58630\n",
612
+ "+-------------+------+\n",
613
+ "|protocol_type| count|\n",
614
+ "+-------------+------+\n",
615
+ "| tcp|102689|\n",
616
+ "| udp| 14993|\n",
617
+ "| icmp| 8291|\n",
618
+ "+-------------+------+\n",
619
+ "\n",
620
+ "PySpark processing complete ✅\n"
621
+ ]
622
+ }
623
+ ]
624
+ },
625
+ {
626
+ "cell_type": "code",
627
+ "source": [
628
+ "# =========================\n",
629
+ "# FINAL SECURE API (SIMULATION)\n",
630
+ "# =========================\n",
631
+ "\n",
632
+ "API_KEY = \"12345\"\n",
633
+ "\n",
634
+ "def secure_predict(input_data, api_key):\n",
635
+ "\n",
636
+ " # 🔐 Security check\n",
637
+ " if api_key != API_KEY:\n",
638
+ " return {\"error\": \"Unauthorized access\"}\n",
639
+ "\n",
640
+ " # Convert input\n",
641
+ " data = np.array(input_data).reshape(1, -1)\n",
642
+ "\n",
643
+ " # Model prediction\n",
644
+ " prediction = model.predict(data)\n",
645
+ " result = int(prediction[0][0] > 0.5)\n",
646
+ "\n",
647
+ " return {\n",
648
+ " \"prediction\": result,\n",
649
+ " \"message\": \"Attack\" if result == 1 else \"Normal\"\n",
650
+ " }"
651
+ ],
652
+ "metadata": {
653
+ "id": "Lfe5tGxj6njn"
654
+ },
655
+ "execution_count": 12,
656
+ "outputs": []
657
+ },
658
+ {
659
+ "cell_type": "code",
660
+ "source": [
661
+ "sample = X_test[0]\n",
662
+ "\n",
663
+ "output = secure_predict(sample, \"12345\")\n",
664
+ "\n",
665
+ "print(output)"
666
+ ],
667
+ "metadata": {
668
+ "colab": {
669
+ "base_uri": "https://localhost:8080/"
670
+ },
671
+ "id": "gmfxYPEa7lyg",
672
+ "outputId": "ed3a28fc-b094-432b-b84b-60704d7f41b1"
673
+ },
674
+ "execution_count": 13,
675
+ "outputs": [
676
+ {
677
+ "output_type": "stream",
678
+ "name": "stdout",
679
+ "text": [
680
+ "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 267ms/step\n",
681
+ "{'prediction': 1, 'message': 'Attack'}\n"
682
+ ]
683
+ }
684
+ ]
685
+ }
686
+ ]
687
+ }