19kmunz committed on
Commit 43cec6e · 1 Parent(s): bb1a1a5

Upload Final_Model_Source_Code.ipynb

Files changed (1)
  1. Final_Model_Source_Code.ipynb +797 -0
Final_Model_Source_Code.ipynb ADDED
@@ -0,0 +1,797 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# Install Necessary Packages"
7
+ ],
8
+ "metadata": {
9
+ "id": "GUB8N3k9fq-E"
10
+ }
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {
16
+ "id": "zt59bSq5vcnA"
17
+ },
18
+ "outputs": [],
19
+ "source": [
20
+ "#Necessary installations\n",
21
+ "!pip install datasets evaluate transformers[sentencepiece]\n",
22
+ "!pip install huggingface_hub\n",
23
+ "!pip install pandas\n",
24
+ "!pip install imblearn\n",
25
+ "!pip install torch"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "markdown",
30
+ "source": [
31
+ "# Load the Dataset"
32
+ ],
33
+ "metadata": {
34
+ "id": "9lyEyWBic5RN"
35
+ }
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {
41
+ "id": "QJDszQKe6oxK"
42
+ },
43
+ "outputs": [],
44
+ "source": [
45
+ "from datasets import Features, Value, ClassLabel\n",
46
+ "import pandas as pd\n",
47
+ "\n",
48
+ "from datasets import load_dataset\n",
49
+ "dataset = load_dataset(\"19kmunz/iot-23-preprocessed-minimumcolumns\")\n",
50
+ "print(dataset.shape)"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "metadata": {
56
+ "id": "wRjakUpXD3D9"
57
+ },
58
+ "source": [
59
+ "# Oversample the Dataset"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {
66
+ "id": "wzU5AHGxD2Ut"
67
+ },
68
+ "outputs": [],
69
+ "source": [
70
+ "from imblearn.over_sampling import SMOTE\n",
71
+ "from sklearn.preprocessing import OneHotEncoder\n",
72
+ "from sklearn.preprocessing import LabelEncoder\n",
73
+ "from sklearn.model_selection import train_test_split"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {
80
+ "id": "mT027c7R1t7n"
81
+ },
82
+ "outputs": [],
83
+ "source": [
84
+ "df = dataset['train'].to_pandas()\n"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {
91
+ "id": "v2l9xGpr6bZc"
92
+ },
93
+ "outputs": [],
94
+ "source": [
95
+ "# Separate features and target\n",
96
+ "features = ['id.resp_p', 'proto', 'conn_state', 'orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n",
97
+ "X = df[features]\n",
98
+ "y = df['label']"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "markdown",
103
+ "metadata": {
104
+ "id": "SlFbgG_69B1K"
105
+ },
106
+ "source": [
107
+ "The ADASYN and SMOTE oversampling algorithms expect numeric data, but features like proto are non-numeric categorical columns, and SMOTE cannot handle string values like 'tcp'. So I label-encoded the categorical columns and then applied SMOTE."
108
+ ]
109
+ },
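+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick illustration (a minimal sketch; the sample values below are hypothetical, not read from the dataset) of what the label encoding does: each category string is mapped to an integer, and the mapping can be reversed with inverse_transform."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal label-encoding illustration (hypothetical values, not read from the dataset)\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "\n",
+ "sample_values = ['tcp', 'udp', 'icmp', 'tcp']\n",
+ "le_demo = LabelEncoder()\n",
+ "encoded = le_demo.fit_transform(sample_values)\n",
+ "print(encoded) # [1 2 0 1]; classes are sorted alphabetically\n",
+ "print(le_demo.inverse_transform(encoded)) # back to the original strings"
+ ]
+ },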
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "metadata": {
114
+ "id": "8zSNEGIiWjMZ"
115
+ },
116
+ "outputs": [],
117
+ "source": [
119
+ "# Define categorical columns to be label-encoded\n",
120
+ "cat_cols = ['proto', 'conn_state']\n",
121
+ "\n",
122
+ "# Initialize a dictionary to store label encoders for each column\n",
123
+ "label_encoders = {}\n",
124
+ "label_encoded_columns = {} # Store label-encoded columns\n",
125
+ "\n",
126
+ "for col in cat_cols:\n",
127
+ " le = LabelEncoder()\n",
128
+ " label_encoded = le.fit_transform(df[col])\n",
129
+ " df[col + '_label'] = label_encoded # Create new columns with label-encoded data\n",
130
+ " label_encoders[col] = le\n",
131
+ " label_encoded_columns[col] = label_encoded\n",
132
+ "# Get numeric columns\n",
133
+ "num_cols = ['id.resp_p','orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n",
134
+ "\n",
135
+ "# Extract numeric columns\n",
136
+ "X_num = df[num_cols]\n",
137
+ "\n",
138
+ "# Concatenate label-encoded columns and numeric columns\n",
139
+ "X_combined = pd.concat([df[['proto_label', 'conn_state_label']], X_num], axis=1)\n",
140
+ "\n",
141
+ "# Store the labels in y_os\n",
142
+ "y_os = df['label']\n",
143
+ "y_os1 = df['label'].apply(lambda x: 0 if x == \"Benign\" else 1)\n",
144
+ "\n",
145
+ "# Specify desired number of samples\n",
146
+ "#k_neighbors = 10000 - y_os.shape[0]\n",
147
+ "\n",
148
+ "# Perform oversampling using SMOTE\n",
149
+ "smote = SMOTE(sampling_strategy={0: 5000, 1: 5000})\n",
150
+ "X_combined_os, Y_combined_os = smote.fit_resample(X_combined, y_os1)"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "source": [
156
+ "# Print new class counts\n",
157
+ "print(Y_combined_os.value_counts())\n",
158
+ "print(X_combined_os.shape)"
159
+ ],
160
+ "metadata": {
161
+ "id": "mZ1iMnEIkVAj"
162
+ },
163
+ "execution_count": null,
164
+ "outputs": []
165
+ },
166
+ {
167
+ "cell_type": "markdown",
168
+ "source": [
169
+ "# Split the Dataset"
170
+ ],
171
+ "metadata": {
172
+ "id": "oO9g2nhlbr3o"
173
+ }
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": null,
178
+ "metadata": {
179
+ "id": "OzJI6451n4tE"
180
+ },
181
+ "outputs": [],
182
+ "source": [
183
+ "# Manually define the column names\n",
184
+ "column_names = ['proto_label', 'conn_state_label', 'id.resp_p','orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n",
185
+ "result_column = ['label']\n",
186
+ "\n",
187
+ "# Create a new DataFrame with the oversampled data and specified column names\n",
188
+ "X_combined_os_df = pd.DataFrame(X_combined_os, columns=column_names)\n",
189
+ "Y_combined_os_df = pd.DataFrame(Y_combined_os, columns=result_column)\n",
190
+ "\n",
191
+ "# Print the first 5 rows of the oversampled data\n",
192
+ "print(X_combined_os_df.shape)\n",
193
+ "print(X_combined_os_df.head())"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": null,
199
+ "metadata": {
200
+ "id": "YwnVJ7RqKFRD"
201
+ },
202
+ "outputs": [],
203
+ "source": [
204
+ "# Split oversampled data\n",
205
+ "\n",
206
+ "# Initial split into train and temp test sets\n",
207
+ "X_train, X_temp, y_train, y_temp = train_test_split(X_combined_os_df, Y_combined_os_df, test_size=0.2, random_state=42)\n",
208
+ "\n",
209
+ "# Split the temporary set into validation and test sets\n",
210
+ "X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)\n",
211
+ "\n",
212
+ "print(\"Oversampled dataset shape:\", X_combined_os.shape)\n",
213
+ "print(\"X_train shape:\", X_train.shape)\n",
214
+ "print(\"X_test shape:\", X_test.shape)\n",
215
+ "print(\"X_val shape:\", X_val.shape)\n"
216
+ ]
217
+ },
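+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optional sanity check (a small sketch using the splits created above): confirm that the oversampled classes stay roughly balanced in each split."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Class balance per split (classes were oversampled to 5000/5000 before splitting)\n",
+ "print(y_train['label'].value_counts())\n",
+ "print(y_val['label'].value_counts())\n",
+ "print(y_test['label'].value_counts())"
+ ]
+ },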
218
+ {
219
+ "cell_type": "markdown",
220
+ "metadata": {
221
+ "id": "WHobtry9LI_d"
222
+ },
223
+ "source": [
224
+ "# Tokenize the Dataset"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "markdown",
229
+ "source": [
230
+ "### Run one of the following cells if loading from local files. Otherwise, X_train and y_train are already defined."
231
+ ],
232
+ "metadata": {
233
+ "id": "3UMlgohccAPg"
234
+ }
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "source": [
239
+ "import pandas as pd\n",
240
+ "X_train = pd.read_csv('X_train.csv', index_col=0)\n",
241
+ "y_train = pd.read_csv('y_train.csv', index_col=0)"
242
+ ],
243
+ "metadata": {
244
+ "id": "z2BM318ufee_"
245
+ },
246
+ "execution_count": null,
247
+ "outputs": []
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "import torch\n",
+ "train_encodings = torch.load('train_encodings.pt')\n",
+ "val_encodings = torch.load('val_encodings.pt')\n",
+ "test_encodings = torch.load('test_encodings.pt')"
254
+ "test_encodings = torch.load('test_encodings.pt')"
255
+ ],
256
+ "metadata": {
257
+ "id": "sVTr9fxIMZl9"
258
+ },
259
+ "execution_count": null,
260
+ "outputs": []
261
+ },
262
+ {
263
+ "cell_type": "markdown",
264
+ "source": [
265
+ "### Otherwise, continue running here"
266
+ ],
267
+ "metadata": {
268
+ "id": "j0FyKqdMezWv"
269
+ }
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": null,
274
+ "metadata": {
275
+ "id": "U09fvCzaMn2P"
276
+ },
277
+ "outputs": [],
278
+ "source": [
279
+ "# Dictionary of feature names to use in the make sentence function\n",
280
+ "feature_names = {'id.resp_p':'response port',\n",
281
+ " 'proto_label':'transport protocol',\n",
282
+ " 'orig_pkts':'number of packets sent by the origin',\n",
283
+ " 'conn_state_label':'connection state',\n",
284
+ " 'orig_ip_bytes':'number of IP level bytes sent by the originator',\n",
285
+ " 'resp_ip_bytes':'number of IP level bytes sent by the responder'}\n",
286
+ "\n",
287
+ "# Function to make sentences out of the data\n",
288
+ "def make_sentence(row):\n",
289
+ " sentences = {}\n",
290
+ " for feature in row.keys():\n",
291
+ " if feature != 'label':\n",
292
+ " sentences[feature] = feature_names[feature] + \" is \" + str(row[feature]) + \".\"\n",
293
+ " return sentences"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": null,
299
+ "metadata": {
300
+ "id": "Fe_vj8hO9dNw"
301
+ },
302
+ "outputs": [],
303
+ "source": [
304
+ "# Take all sentence observations and make them into paragraph inputs\n",
305
+ "def make_paragraphs(ser):\n",
306
+ " paragraphs_list = []\n",
307
+ " for index,obs in ser.items():\n",
308
+ " new_para = obs['id.resp_p'] + \" \" + obs['proto_label'] + \" \" + obs['conn_state_label'] + \" \" + obs['orig_pkts'] + \" \" + obs['orig_ip_bytes'] + \" \" + obs['resp_ip_bytes']\n",
309
+ " paragraphs_list.append(new_para)\n",
310
+ " return pd.Series(paragraphs_list, index=ser.index)"
311
+ ]
312
+ },
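+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A small sanity check (a sketch that assumes X_train from the split above is in memory): build the sentence dictionary and paragraph for a single row to see the text that will be tokenized."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Show the paragraph generated for one training row (assumes X_train is defined)\n",
+ "sample_row = X_train.iloc[0]\n",
+ "sample_sentences = make_sentence(sample_row)\n",
+ "sample_paragraph = make_paragraphs(pd.Series([sample_sentences]))\n",
+ "print(sample_paragraph.iloc[0])"
+ ]
+ },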
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": null,
316
+ "metadata": {
317
+ "id": "bNyv9zOlGaBm"
318
+ },
319
+ "outputs": [],
320
+ "source": [
321
+ "from transformers import BertTokenizer\n",
322
+ "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")\n",
323
+ "\n",
324
+ "\n",
325
+ "# Transform the dataset into sentences\n",
326
+ "X_train_sentences = X_train.apply(make_sentence, axis=1)\n",
327
+ "X_val_sentences = X_val.apply(make_sentence, axis=1)\n",
328
+ "X_test_sentences = X_test.apply(make_sentence, axis=1)\n",
329
+ "\n",
330
+ "# Transform the sentences into paragraphs\n",
331
+ "X_train_paragraphs = make_paragraphs(X_train_sentences)\n",
332
+ "X_val_paragraphs = make_paragraphs(X_val_sentences)\n",
333
+ "X_test_paragraphs = make_paragraphs(X_test_sentences)\n",
334
+ "\n",
335
+ "# Turn labels into lists of strings\n",
336
+ "y_train_str = [str(y) for y in y_train['label'].tolist()]\n",
337
+ "y_val_str = [str(y) for y in y_val['label'].tolist()]\n",
338
+ "y_test_str = [str(y) for y in y_test['label'].tolist()]"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": null,
344
+ "metadata": {
345
+ "id": "f5bT1RIEW0O7"
346
+ },
347
+ "outputs": [],
348
+ "source": [
349
+ "import torch\n",
350
+ "# Encode both paragraphs and the labels\n",
351
+ "train_encodings = tokenizer(text=X_train_paragraphs.tolist(), padding='longest', truncation=True, return_tensors='pt')\n",
352
+ "val_encodings = tokenizer(text=X_val_paragraphs.tolist(), padding='longest', truncation=True, return_tensors='pt')\n",
353
+ "test_encodings = tokenizer(text=X_test_paragraphs.tolist(), padding='longest', truncation=True, return_tensors='pt')\n",
354
+ "\n",
355
+ "# Add label tensors\n",
356
+ "y_train_tensor = torch.tensor(y_train['label'].values)\n",
357
+ "y_val_tensor = torch.tensor(y_val['label'].values)\n",
358
+ "y_test_tensor = torch.tensor(y_test['label'].values)\n",
359
+ "\n",
360
+ "train_encodings['labels'] = y_train_tensor\n",
361
+ "val_encodings['labels'] = y_val_tensor\n",
362
+ "test_encodings['labels'] = y_test_tensor"
363
+ ]
364
+ },
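+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optionally, decode one encoded example back to text as a sanity check (a small sketch using the tokenizer and encodings defined above)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Decode the first training example back to text and show its label\n",
+ "print(tokenizer.decode(train_encodings['input_ids'][0]))\n",
+ "print('label:', train_encodings['labels'][0].item())"
+ ]
+ },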
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": null,
368
+ "metadata": {
369
+ "id": "OV600RIVGlTi"
370
+ },
371
+ "outputs": [],
372
+ "source": [
373
+ "torch.save(train_encodings, 'train_encodings.pt')\n",
374
+ "torch.save(val_encodings, 'val_encodings.pt')\n",
375
+ "torch.save(test_encodings, 'test_encodings.pt')"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "markdown",
380
+ "source": [
381
+ "# Finally, prepare the dataset as a Hugging Face Dataset"
382
+ ],
383
+ "metadata": {
384
+ "id": "gev2VE5VcnaY"
385
+ }
386
+ },
387
+ {
388
+ "cell_type": "markdown",
389
+ "metadata": {
390
+ "id": "ZNmaJOCUifpD"
391
+ },
392
+ "source": [
393
+ "### Optional: Load training, validation, and test encodings from Drive or local storage"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "source": [
399
+ "from google.colab import drive\n",
400
+ "drive.mount('/content/drive')"
401
+ ],
402
+ "metadata": {
403
+ "id": "7NlSBStpD_rO"
404
+ },
405
+ "execution_count": null,
406
+ "outputs": []
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "source": [
411
+ "!pip install torch==2.1.0\n",
412
+ "!pip install -U transformers[torch]\n",
413
+ "!pip install optimum[exporters]"
414
+ ],
415
+ "metadata": {
416
+ "id": "okamUGSAmBYN"
417
+ },
418
+ "execution_count": null,
419
+ "outputs": []
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "source": [
424
+ "import torch\n",
425
+ "from transformers import BertTokenizer\n",
426
+ "# Load tensor data back from drive\n",
427
+ "train_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/train_encodings.pt\")\n",
428
+ "val_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/val_encodings.pt\")\n",
429
+ "test_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/test_encodings.pt\")\n",
430
+ "\n",
431
+ "# Load labels tensors back from drive\n",
432
+ "# y_train_tensor = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/y_train_tensor.pt\")\n",
433
+ "# y_val_tensor = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/y_val_tensor.pt\")\n",
434
+ "# y_test_tensor = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/y_test_tensor.pt\")"
435
+ ],
436
+ "metadata": {
437
+ "id": "rVEX0OhgEAJT"
438
+ },
439
+ "execution_count": null,
440
+ "outputs": []
441
+ },
442
+ {
443
+ "cell_type": "code",
444
+ "source": [
445
+ "# FROM LOCAL\n",
446
+ "import torch\n",
447
+ "train_encodings = torch.load(\"train_encodings.pt\")\n",
448
+ "val_encodings = torch.load(\"val_encodings.pt\")\n",
449
+ "test_encodings = torch.load(\"test_encodings.pt\")"
450
+ ],
451
+ "metadata": {
452
+ "id": "Jxbp-oouNHsT"
453
+ },
454
+ "execution_count": null,
455
+ "outputs": []
456
+ },
457
+ {
458
+ "cell_type": "code",
459
+ "source": [
460
+ "print(train_encodings['input_ids'].size())"
461
+ ],
462
+ "metadata": {
463
+ "colab": {
464
+ "base_uri": "https://localhost:8080/"
465
+ },
466
+ "id": "YY7xwbuZlhK4",
467
+ "outputId": "3faf0705-93f8-456e-8dbf-22b406314766"
468
+ },
469
+ "execution_count": null,
470
+ "outputs": [
471
+ {
472
+ "output_type": "stream",
473
+ "name": "stdout",
474
+ "text": [
475
+ "torch.Size([8000, 67])\n"
476
+ ]
477
+ }
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "markdown",
482
+ "source": [
483
+ "### Otherwise, continue running here"
484
+ ],
485
+ "metadata": {
486
+ "id": "dlTL3uj1fKF6"
487
+ }
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "source": [
492
+ "# Use the full encodings\n",
493
+ "train = train_encodings\n",
494
+ "eval = val_encodings\n",
495
+ "test = test_encodings\n",
496
+ "\n",
497
+ "# Or create small subsets to test fine-tuning (uncomment; keep :1000 for the subset)\n",
498
+ "#train = train_encodings[:1000]\n",
499
+ "#eval = val_encodings[:1000]\n",
500
+ "#test = test_encodings[:1000]\n",
501
+ "\n",
502
+ "# Replacing target tensors (use :1000 to match the small subsets)\n",
503
+ "# train['labels'] = y_train_tensor[:1000]\n",
504
+ "# eval['labels'] = y_val_tensor[:1000]\n",
505
+ "# test['labels'] = y_test_tensor[:1000]\n",
506
+ "\n",
507
+ "# Pytorch tensors to HF Dataset\n",
508
+ "from datasets import Dataset\n",
509
+ "train_dataset = Dataset.from_dict(train)\n",
510
+ "eval_dataset = Dataset.from_dict(eval)\n",
511
+ "test_dataset = Dataset.from_dict(test)"
512
+ ],
513
+ "metadata": {
514
+ "id": "llZN2akWHxe5"
515
+ },
516
+ "execution_count": null,
517
+ "outputs": []
518
+ },
519
+ {
520
+ "cell_type": "markdown",
521
+ "metadata": {
522
+ "id": "SRLNGcQFvJAa"
523
+ },
524
+ "source": [
525
+ "# Fine-tune BERT for benign vs malicious"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": null,
531
+ "metadata": {
532
+ "id": "xf2CGlW1dLlH"
533
+ },
534
+ "outputs": [],
535
+ "source": [
536
+ "import torch\n",
537
+ "import torch.nn as nn\n",
538
+ "from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup\n",
539
+ "from transformers import Trainer, TrainingArguments\n",
540
+ "from torch.utils.data import DataLoader, TensorDataset, random_split\n",
541
+ "from sklearn.model_selection import train_test_split\n",
542
+ "from sklearn.utils.class_weight import compute_class_weight"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "source": [
548
+ "import numpy as np\n",
549
+ "import evaluate\n",
550
+ "\n",
551
+ "combined_metrics = evaluate.combine([\"accuracy\", \"f1\"])"
552
+ ],
553
+ "metadata": {
554
+ "id": "maPzffCsAS__"
555
+ },
556
+ "execution_count": null,
557
+ "outputs": []
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "source": [
562
+ "def compute_metrics(eval_pred):\n",
563
+ " logits, labels = eval_pred\n",
564
+ " predictions = np.argmax(logits, axis=-1)\n",
565
+ " results = combined_metrics.compute(predictions=predictions, references=labels)\n",
566
+ " print(f\"Accuracy: {results['accuracy']:.3%} | F1: {results['f1']:.3f}\")\n",
567
+ " return results"
568
+ ],
569
+ "metadata": {
570
+ "id": "Subi5OZxAvlh"
571
+ },
572
+ "execution_count": null,
573
+ "outputs": []
574
+ },
575
+ {
576
+ "cell_type": "code",
577
+ "source": [
578
+ "# Load pretrained BERT model\n",
579
+ "model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)\n",
580
+ "\n",
581
+ "# OR Load local model\n",
582
+ "# model = BertForSequenceClassification.from_pretrained('./model', num_labels=2)"
583
+ ],
584
+ "metadata": {
585
+ "id": "OWLPaQn9ysMg"
586
+ },
587
+ "execution_count": null,
588
+ "outputs": []
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "source": [
593
+ "# Define TrainingArguments\n",
594
+ "training_args = TrainingArguments(\n",
595
+ " output_dir='./results',\n",
596
+ " num_train_epochs=6,\n",
597
+ " per_device_train_batch_size=32,\n",
598
+ " # per_device_eval_batch_size=16,\n",
599
+ " warmup_steps=500,\n",
600
+ " weight_decay=0.01,\n",
601
+ " logging_dir='./logs',\n",
602
+ " # logging_steps=0.10,\n",
603
+ " eval_steps=0.10,\n",
604
+ " save_steps=0.10,\n",
605
+ " logging_strategy='epoch',\n",
606
+ " evaluation_strategy='epoch',\n",
607
+ " save_strategy='epoch',\n",
608
+ " save_total_limit=2,\n",
609
+ " load_best_model_at_end=True\n",
610
+ ")\n",
611
+ "\n",
612
+ "# Create Trainer instance\n",
613
+ "trainer = Trainer(\n",
614
+ " model=model,\n",
615
+ " args=training_args,\n",
616
+ " train_dataset=train_dataset,\n",
617
+ " eval_dataset=eval_dataset,\n",
618
+ " compute_metrics=compute_metrics\n",
619
+ ")\n",
620
+ "\n",
621
+ "# Train\n",
622
+ "trainer.train()"
623
+ ],
624
+ "metadata": {
625
+ "id": "7a-zvoP0j8C8"
626
+ },
627
+ "execution_count": null,
628
+ "outputs": []
629
+ },
630
+ {
631
+ "cell_type": "code",
632
+ "source": [
633
+ "print(test_dataset)"
634
+ ],
635
+ "metadata": {
636
+ "id": "SzDiVYRf23dp"
637
+ },
638
+ "execution_count": null,
639
+ "outputs": []
640
+ },
641
+ {
642
+ "cell_type": "code",
643
+ "execution_count": null,
644
+ "metadata": {
645
+ "id": "TlxpJByQXL_w"
646
+ },
647
+ "outputs": [],
648
+ "source": [
649
+ "# Evaluate on the held-out test set\n",
650
+ "trainer.evaluate(eval_dataset=test_dataset)"
651
+ ]
652
+ },
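+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal single-example inference sketch (assumes the tokenizer from the tokenization section and the fine-tuned model above; the example paragraph and its values are hypothetical)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical single-example prediction with the fine-tuned model\n",
+ "import torch\n",
+ "\n",
+ "example = ('response port is 80. transport protocol is 1. connection state is 5. '\n",
+ " 'number of packets sent by the origin is 3. '\n",
+ " 'number of IP level bytes sent by the originator is 180. '\n",
+ " 'number of IP level bytes sent by the responder is 0.')\n",
+ "model.eval()\n",
+ "inputs = tokenizer(example, return_tensors='pt').to(model.device)\n",
+ "with torch.no_grad():\n",
+ " logits = model(**inputs).logits\n",
+ "pred = logits.argmax(dim=-1).item()\n",
+ "print('predicted class:', pred) # 0 = Benign, 1 = Malicious (per the label mapping above)"
+ ]
+ },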
653
+ {
654
+ "cell_type": "code",
655
+ "source": [
656
+ "model.save_pretrained('./model')"
657
+ ],
658
+ "metadata": {
659
+ "id": "dqMkv8aA5Tdk"
660
+ },
661
+ "execution_count": null,
662
+ "outputs": []
663
+ },
664
+ {
665
+ "cell_type": "markdown",
666
+ "source": [
667
+ "# Save to Hugging Face"
668
+ ],
669
+ "metadata": {
670
+ "id": "-qKGOqJTWt3a"
671
+ }
672
+ },
673
+ {
674
+ "cell_type": "code",
675
+ "source": [
676
+ "from huggingface_hub import create_repo"
677
+ ],
678
+ "metadata": {
679
+ "id": "m0mCacsshEhy"
680
+ },
681
+ "execution_count": null,
682
+ "outputs": []
683
+ },
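+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of pushing the fine-tuned model to the Hub. The repository id below is a placeholder, and the cell assumes the model and tokenizer objects from earlier cells plus a valid Hugging Face access token."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import notebook_login, create_repo\n",
+ "\n",
+ "notebook_login() # authenticate with a Hugging Face access token\n",
+ "\n",
+ "repo_id = 'your-username/iot23-bert' # placeholder repo id\n",
+ "create_repo(repo_id, exist_ok=True)\n",
+ "\n",
+ "# Push the fine-tuned model and tokenizer to the new repository\n",
+ "model.push_to_hub(repo_id)\n",
+ "tokenizer.push_to_hub(repo_id)"
+ ]
+ },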
684
+ {
685
+ "cell_type": "code",
686
+ "source": [
687
+ "!pip install cupy --upgrade"
688
+ ],
689
+ "metadata": {
690
+ "id": "Ba-kOs8WqQTl"
691
+ },
692
+ "execution_count": null,
693
+ "outputs": []
694
+ },
695
+ {
696
+ "cell_type": "code",
697
+ "source": [
698
+ "# libcuda.so.1"
699
+ ],
700
+ "metadata": {
701
+ "id": "AK2rcA5-qyGh"
702
+ },
703
+ "execution_count": null,
704
+ "outputs": []
705
+ },
706
+ {
707
+ "cell_type": "code",
708
+ "source": [
709
+ "!pip install onnxruntime\n",
710
+ "import onnxruntime as rt\n",
711
+ "import onnx\n",
712
+ "import cv2"
713
+ ],
714
+ "metadata": {
715
+ "id": "8z2pir6uo-vM"
716
+ },
717
+ "execution_count": null,
718
+ "outputs": []
719
+ },
720
+ {
721
+ "cell_type": "code",
722
+ "source": [
723
+ "!optimum-cli export onnx --model ./model --task text-classification ./results/checkpoint-10"
724
+ ],
725
+ "metadata": {
726
+ "id": "WlderhErraWX"
727
+ },
728
+ "execution_count": null,
729
+ "outputs": []
730
+ },
731
+ {
732
+ "cell_type": "code",
733
+ "source": [
734
+ "from optimum.onnxruntime import ORTModelForSequenceClassification\n",
+ "\n",
+ "# Export the fine-tuned model saved at ./model to ONNX via Optimum\n",
+ "ort_model = ORTModelForSequenceClassification.from_pretrained('./model', export=True)\n",
+ "\n",
+ "ort_model.save_pretrained(\"./results/checkpoint-10\")"
739
+ ],
740
+ "metadata": {
741
+ "id": "NzPr5eIkZfi7"
742
+ },
743
+ "execution_count": null,
744
+ "outputs": []
745
+ },
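+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Once an ONNX file exists (for example IoT23_Log_Prediction.onnx written by torch.onnx.export in the next cell), it can be checked and run with onnxruntime. A minimal sketch, assuming train_encodings is still in memory; note that the export below does not declare dynamic axes, so the input must keep the traced shape."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity-check the exported graph and run it with onnxruntime\n",
+ "import numpy as np\n",
+ "import onnx\n",
+ "import onnxruntime as rt\n",
+ "\n",
+ "onnx.checker.check_model(onnx.load('IoT23_Log_Prediction.onnx'))\n",
+ "\n",
+ "sess = rt.InferenceSession('IoT23_Log_Prediction.onnx')\n",
+ "# The graph was traced without dynamic_axes, so feed the same-shaped input_ids\n",
+ "feed = {'input_ids': train_encodings['input_ids'].numpy().astype(np.int64)}\n",
+ "(logits,) = sess.run(['labels'], feed)\n",
+ "print(logits[:5].argmax(axis=-1)) # predicted class for the first few examples"
+ ]
+ },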
746
+ {
747
+ "cell_type": "code",
748
+ "source": [
749
+ "# Export model\n",
750
+ "import torch\n",
751
+ "# Get input ids\n",
752
+ "input_ids = train_dataset['input_ids']\n",
753
+ "# Convert to torch tensor\n",
754
+ "input_ids = torch.tensor(input_ids)\n",
755
+ "\n",
756
+ "torch.onnx.export(model, # Model being run\n",
757
+ " input_ids, # Model input\n",
758
+ " \"IoT23_Log_Prediction.onnx\",# Where to save the model\n",
759
+ " export_params=True, # Store model parameters\n",
760
+ " output_names=['labels'],\n",
761
+ " opset_version=11, # ONNX version\n",
762
+ " do_constant_folding=True, # Optimize\n",
763
+ " input_names = ['input_ids'])"
764
+ ],
765
+ "metadata": {
766
+ "id": "ZM8xTkjeTm0c"
767
+ },
768
+ "execution_count": null,
769
+ "outputs": []
770
+ }
771
+ ],
772
+ "metadata": {
773
+ "colab": {
774
+ "provenance": [],
775
+ "collapsed_sections": [
776
+ "td-xtcTdcoVO",
777
+ "GUB8N3k9fq-E",
778
+ "9lyEyWBic5RN",
779
+ "wRjakUpXD3D9",
780
+ "oO9g2nhlbr3o",
781
+ "3UMlgohccAPg",
782
+ "gev2VE5VcnaY",
783
+ "ZNmaJOCUifpD",
784
+ "L0eqXeQUTpXM"
785
+ ]
786
+ },
787
+ "kernelspec": {
788
+ "display_name": "Python 3",
789
+ "name": "python3"
790
+ },
791
+ "language_info": {
792
+ "name": "python"
793
+ }
794
+ },
795
+ "nbformat": 4,
796
+ "nbformat_minor": 0
797
+ }