kristiangnordby committed on
Commit
74bd76e
·
verified ·
1 Parent(s): b888779

Upload nsLABSE.ipynb

Browse files
Files changed (1) hide show
  1. nsLABSE.ipynb +627 -0
nsLABSE.ipynb ADDED
@@ -0,0 +1,627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "9112d5ff-60e3-41f4-b407-2b7a209354a2",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import gzip\n",
12
+ "import json\n",
13
+ "import random\n",
14
+ "import torch\n",
15
+ "import torch.nn as nn\n",
16
+ "import torch.optim as optim\n",
17
+ "from torch.utils.data import DataLoader, TensorDataset\n",
18
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score\n",
19
+ "import numpy as np"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "id": "76e80b80-604b-4a5a-a3a1-6e8196d7aa10",
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "\n",
33
+ "📁 Models will be saved to: /home/knordby/Documents/labeling/models\n"
34
+ ]
35
+ }
36
+ ],
37
+ "source": [
38
+ "random.seed(42)\n",
39
+ "np.random.seed(42)\n",
40
+ "\n",
41
+ "models_dir = \"/home/knordby/Documents/labeling/models\"\n",
42
+ "os.makedirs(models_dir, exist_ok=True)\n",
43
+ "print(f\"\\nπŸ“ Models will be saved to: {models_dir}\")"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "id": "502e273f-b249-4b55-9680-8b68ce8539bd",
49
+ "metadata": {},
50
+ "source": [
51
+ "### Load the data\n",
52
+ "Here we load our embeddings as well as our pre-saved labels for each article."
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 4,
58
+ "id": "0b963ac1-3ffa-4079-9a0d-fd87f0cb2267",
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "name": "stdout",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "\n",
66
+ "[1/4] Loading embeddings...\n",
67
+ " Loading general_sample_200K embeddings...\n",
68
+ " Loaded 199793 embeddings from 200K dataset\n",
69
+ " Loading ns_biased_sample_70K_labse_embedding ...\n",
70
+ " Loaded 60637 embeddings from 70K dataset\n",
71
+ " Total embeddings after merge: 260430\n"
72
+ ]
73
+ }
74
+ ],
75
+ "source": [
76
+ "print(\"\\n[1/4] Loading embeddings...\")\n",
77
+ "\n",
78
+ "# Load 200K general embeddings\n",
79
+ "print(\" Loading general_sample_200K embeddings...\")\n",
80
+ "with gzip.open('general_sample_200K_embedding_labse.jsonl.gz', 'rt') as f:\n",
81
+ " _200k_embeddings = json.load(f)\n",
82
+ "_200k_embeddings = {k.replace('.json', ''): v for k, v in _200k_embeddings.items()}\n",
83
+ "print(f\" Loaded {len(_200k_embeddings)} embeddings from 200K dataset\")\n",
84
+ "\n",
85
+ "# Load 70K NS-biased embeddings\n",
86
+ "print(\" Loading ns_biased_sample_70K_labse_embedding ...\")\n",
87
+ "with gzip.open('data/ns_biased_sample_70K_labse_embedding.jsonl.gz', 'rt') as f:\n",
88
+ " _70k_embeddings = json.load(f)\n",
89
+ "_70k_embeddings = {k.replace('.json', ''): v for k, v in _70k_embeddings.items()}\n",
90
+ "print(f\" Loaded {len(_70k_embeddings)} embeddings from 70K dataset\")\n",
91
+ "\n",
92
+ "# Merge embeddings\n",
93
+ "labse_embeddings_dict = _70k_embeddings | _200k_embeddings\n",
94
+ "print(f\" Total embeddings after merge: {len(labse_embeddings_dict)}\")"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 7,
100
+ "id": "90245f20-97e0-42ba-9cbc-04c78f7bcc01",
101
+ "metadata": {},
102
+ "outputs": [
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "CPU times: user 228 ms, sys: 54.2 ms, total: 282 ms\n",
108
+ "Wall time: 280 ms\n"
109
+ ]
110
+ }
111
+ ],
112
+ "source": [
113
+ "%%time\n",
114
+ "data = np.load('ns_gemma_embeddings_with_ids.npz')\n",
115
+ "ids = data['ids'] # Shape: (N,)\n",
116
+ "labels = data['labels'] "
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 9,
122
+ "id": "12dcc500-b882-4675-97db-813c6d3564c6",
123
+ "metadata": {},
124
+ "outputs": [
125
+ {
126
+ "data": {
127
+ "text/plain": [
128
+ "212205"
129
+ ]
130
+ },
131
+ "execution_count": 9,
132
+ "metadata": {},
133
+ "output_type": "execute_result"
134
+ }
135
+ ],
136
+ "source": [
137
+ "embeddings_list = []\n",
138
+ "for idx in ids:\n",
139
+ " embeddings_list.append(labse_embeddings_dict[idx])\n",
140
+ "len(embeddings_list)"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 10,
146
+ "id": "9d248341-e1c9-4418-8035-1ed4215e9b65",
147
+ "metadata": {
148
+ "scrolled": true
149
+ },
150
+ "outputs": [],
151
+ "source": [
152
+ "embeddings = np.array(embeddings_list)"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 11,
158
+ "id": "9c881f9e-7d07-45ad-9edb-473829e36791",
159
+ "metadata": {},
160
+ "outputs": [
161
+ {
162
+ "data": {
163
+ "text/plain": [
164
+ "((212205, 768), (212205,), (212205,))"
165
+ ]
166
+ },
167
+ "execution_count": 11,
168
+ "metadata": {},
169
+ "output_type": "execute_result"
170
+ }
171
+ ],
172
+ "source": [
173
+ "embeddings.shape, ids.shape, labels.shape"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "id": "f61ed063-23c2-4919-8a3b-1a296f067290",
179
+ "metadata": {},
180
+ "source": [
181
+ "### Prepare Data"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 12,
187
+ "id": "85b8a065-adc1-4acd-ab7a-9976172f4512",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "name": "stdout",
192
+ "output_type": "stream",
193
+ "text": [
194
+ "\n",
195
+ "[3/4] Preparing train/test split...\n",
196
+ "x_train: 0.8\n",
197
+ "test size: 0.2\n"
198
+ ]
199
+ }
200
+ ],
201
+ "source": [
202
+ "from sklearn.model_selection import train_test_split\n",
203
+ "print(\"\\n[3/4] Preparing train/test split...\")\n",
204
+ "\n",
205
+ "x_train,x_test, y_train,y_test = train_test_split(embeddings, labels, train_size = 0.8, stratify = labels)\n",
206
+ "print(\"x_train: \", len(x_train)/(len(x_train)+len(x_test)))\n",
207
+ "print(\"test size: \", len(x_test)/(len(x_train)+len(x_test)))"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "markdown",
212
+ "id": "030e8ac8-e22c-4144-a79f-f74d461d88ed",
213
+ "metadata": {},
214
+ "source": [
215
+ "#### Dataset Stats"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 13,
221
+ "id": "7888c8cd-df43-4378-8599-56c031dcb9c4",
222
+ "metadata": {},
223
+ "outputs": [
224
+ {
225
+ "name": "stdout",
226
+ "output_type": "stream",
227
+ "text": [
228
+ "\n",
229
+ "📊 Dataset Statistics:\n",
230
+ " Training set shape: (169764, 768)\n",
231
+ " Test set shape: (42441, 768)\n",
232
+ " Embedding dimension: 768\n",
233
+ "\n",
234
+ " Label Distribution:\n",
235
+ " • Training - NS: 46238 (27.2%)\n",
236
+ " • Training - Non-NS: 123526 (72.8%)\n",
237
+ " • Test - NS: 11559 (27.2%)\n",
238
+ " • Test - Non-NS: 30882 (72.8%)\n"
239
+ ]
240
+ }
241
+ ],
242
+ "source": [
243
+ "print(f\"\\nπŸ“Š Dataset Statistics:\")\n",
244
+ "print(f\" Training set shape: {x_train.shape}\")\n",
245
+ "print(f\" Test set shape: {x_test.shape}\")\n",
246
+ "print(f\" Embedding dimension: {x_train.shape[1]}\")\n",
247
+ "print(f\"\\n Label Distribution:\")\n",
248
+ "print(f\" β€’ Training - NS: {sum(y_train)} ({sum(y_train)/len(y_train)*100:.1f}%)\")\n",
249
+ "print(f\" β€’ Training - Non-NS: {len(y_train)-sum(y_train)} ({(len(y_train)-sum(y_train))/len(y_train)*100:.1f}%)\")\n",
250
+ "print(f\" β€’ Test - NS: {sum(y_test)} ({sum(y_test)/len(y_test)*100:.1f}%)\")\n",
251
+ "print(f\" β€’ Test - Non-NS: {len(y_test)-sum(y_test)} ({(len(y_test)-sum(y_test))/len(y_test)*100:.1f}%)\")"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "markdown",
256
+ "id": "a6a6ba0a-274b-4de3-af75-66332a9ad399",
257
+ "metadata": {},
258
+ "source": [
259
+ "### Build the Model"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 14,
265
+ "id": "7020d7af-30dd-4f35-8028-a3eccfd9fa71",
266
+ "metadata": {},
267
+ "outputs": [
268
+ {
269
+ "name": "stdout",
270
+ "output_type": "stream",
271
+ "text": [
272
+ "Using device: cuda\n",
273
+ "======================================================================\n",
274
+ "MODEL BUILT\n",
275
+ "======================================================================\n",
276
+ "Architecture: CyberClassifier\n",
277
+ "Input dimension: 768\n",
278
+ "Hidden layers: 512 -> 256 -> 128\n",
279
+ "Output: 1 (binary classification)\n",
280
+ "Total parameters: 561,409\n",
281
+ "Trainable parameters: 561,409\n",
282
+ "Device: cuda\n",
283
+ "======================================================================\n",
284
+ "\n"
285
+ ]
286
+ }
287
+ ],
288
+ "source": [
289
+ "from torch_models import *\n",
290
+ "\n",
291
+ "# Check GPU\n",
292
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
293
+ "print(f\"Using device: {device}\")\n",
294
+ "\n",
295
+ "# Build model\n",
296
+ "model, optimizer, criterion = build_model(\n",
297
+ " input_dim=x_train.shape[1], # Auto-detect from your data\n",
298
+ " device=device\n",
299
+ ")"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 15,
305
+ "id": "5ddf9bdd-4c58-4be8-a07c-dfdbabb9ff84",
306
+ "metadata": {},
307
+ "outputs": [
308
+ {
309
+ "name": "stdout",
310
+ "output_type": "stream",
311
+ "text": [
312
+ "======================================================================\n",
313
+ "TRAINING\n",
314
+ "======================================================================\n",
315
+ "Epochs: 80\n",
316
+ "Batch size: 512\n",
317
+ "Training samples: 144299\n",
318
+ "Validation samples: 25465\n",
319
+ "Early stopping patience: 15\n",
320
+ "======================================================================\n",
321
+ "\n",
322
+ "Epoch 1/80 - Time: 3.04s\n",
323
+ " Train - Loss: 0.2658, Acc: 0.8886, AUC: 0.9416\n",
324
+ " Val - Loss: 0.2422, Acc: 0.8995, AUC: 0.9508, Precision: 0.8581, Recall: 0.7561\n",
325
+ " ✓ Best model saved (AUC: 0.9508)\n",
326
+ "\n",
327
+ "Epoch 2/80 - Time: 6.38s\n",
328
+ " Train - Loss: 0.2168, Acc: 0.9103, AUC: 0.9615\n",
329
+ " Val - Loss: 0.2414, Acc: 0.9001, AUC: 0.9513, Precision: 0.8341, Recall: 0.7907\n",
330
+ " ✓ Best model saved (AUC: 0.9513)\n",
331
+ "\n",
332
+ "Epoch 3/80 - Time: 3.07s\n",
333
+ " Train - Loss: 0.1930, Acc: 0.9203, AUC: 0.9698\n",
334
+ " Val - Loss: 0.2456, Acc: 0.8999, AUC: 0.9509, Precision: 0.8371, Recall: 0.7853\n",
335
+ " No improvement (patience: 1/15)\n",
336
+ "\n",
337
+ "Epoch 4/80 - Time: 7.56s\n",
338
+ " Train - Loss: 0.1695, Acc: 0.9314, AUC: 0.9770\n",
339
+ " Val - Loss: 0.2550, Acc: 0.8976, AUC: 0.9479, Precision: 0.8385, Recall: 0.7731\n",
340
+ " No improvement (patience: 2/15)\n",
341
+ "\n",
342
+ "Epoch 5/80 - Time: 3.20s\n",
343
+ " Train - Loss: 0.1455, Acc: 0.9412, AUC: 0.9834\n",
344
+ " Val - Loss: 0.2709, Acc: 0.8984, AUC: 0.9472, Precision: 0.8304, Recall: 0.7879\n",
345
+ " No improvement (patience: 3/15)\n",
346
+ "\n",
347
+ "Epoch 6/80 - Time: 3.05s\n",
348
+ " Train - Loss: 0.1191, Acc: 0.9529, AUC: 0.9891\n",
349
+ " Val - Loss: 0.2914, Acc: 0.8952, AUC: 0.9462, Precision: 0.8221, Recall: 0.7850\n",
350
+ " No improvement (patience: 4/15)\n",
351
+ "\n",
352
+ "Epoch 7/80 - Time: 6.17s\n",
353
+ " Train - Loss: 0.0957, Acc: 0.9631, AUC: 0.9931\n",
354
+ " Val - Loss: 0.3189, Acc: 0.8966, AUC: 0.9453, Precision: 0.8172, Recall: 0.7992\n",
355
+ " No improvement (patience: 5/15)\n",
356
+ "\n",
357
+ "Epoch 8/80 - Time: 2.63s\n",
358
+ " Train - Loss: 0.0750, Acc: 0.9718, AUC: 0.9958\n",
359
+ " Val - Loss: 0.3464, Acc: 0.8881, AUC: 0.9379, Precision: 0.8167, Recall: 0.7595\n",
360
+ " No improvement (patience: 6/15)\n",
361
+ "\n",
362
+ "Epoch 9/80 - Time: 2.61s\n",
363
+ " Train - Loss: 0.0359, Acc: 0.9898, AUC: 0.9993\n",
364
+ " Val - Loss: 0.3760, Acc: 0.8983, AUC: 0.9456, Precision: 0.8238, Recall: 0.7973\n",
365
+ " No improvement (patience: 7/15)\n",
366
+ "\n",
367
+ "Epoch 10/80 - Time: 5.37s\n",
368
+ " Train - Loss: 0.0167, Acc: 0.9972, AUC: 0.9999\n",
369
+ " Val - Loss: 0.4248, Acc: 0.8958, AUC: 0.9448, Precision: 0.8202, Recall: 0.7907\n",
370
+ " No improvement (patience: 8/15)\n",
371
+ "\n",
372
+ "Epoch 11/80 - Time: 2.62s\n",
373
+ " Train - Loss: 0.0110, Acc: 0.9984, AUC: 0.9999\n",
374
+ " Val - Loss: 0.4642, Acc: 0.8971, AUC: 0.9447, Precision: 0.8243, Recall: 0.7907\n",
375
+ " No improvement (patience: 9/15)\n",
376
+ "\n",
377
+ "Epoch 12/80 - Time: 5.45s\n",
378
+ " Train - Loss: 0.0090, Acc: 0.9985, AUC: 0.9999\n",
379
+ " Val - Loss: 0.5033, Acc: 0.8981, AUC: 0.9441, Precision: 0.8278, Recall: 0.7902\n",
380
+ " No improvement (patience: 10/15)\n",
381
+ "\n",
382
+ "Epoch 13/80 - Time: 2.74s\n",
383
+ " Train - Loss: 0.0087, Acc: 0.9986, AUC: 0.9999\n",
384
+ " Val - Loss: 0.5202, Acc: 0.8930, AUC: 0.9398, Precision: 0.8153, Recall: 0.7852\n",
385
+ " No improvement (patience: 11/15)\n",
386
+ "\n",
387
+ "Epoch 14/80 - Time: 2.75s\n",
388
+ " Train - Loss: 0.0105, Acc: 0.9977, AUC: 0.9999\n",
389
+ " Val - Loss: 0.5484, Acc: 0.8939, AUC: 0.9412, Precision: 0.8136, Recall: 0.7918\n",
390
+ " No improvement (patience: 12/15)\n",
391
+ "\n",
392
+ "Epoch 15/80 - Time: 6.03s\n",
393
+ " Train - Loss: 0.0063, Acc: 0.9991, AUC: 0.9999\n",
394
+ " Val - Loss: 0.5518, Acc: 0.8958, AUC: 0.9433, Precision: 0.8234, Recall: 0.7860\n",
395
+ " No improvement (patience: 13/15)\n",
396
+ "\n",
397
+ "Epoch 16/80 - Time: 2.99s\n",
398
+ " Train - Loss: 0.0031, Acc: 0.9996, AUC: 1.0000\n",
399
+ " Val - Loss: 0.5710, Acc: 0.8966, AUC: 0.9427, Precision: 0.8210, Recall: 0.7931\n",
400
+ " No improvement (patience: 14/15)\n",
401
+ "\n",
402
+ "Epoch 17/80 - Time: 2.64s\n",
403
+ " Train - Loss: 0.0028, Acc: 0.9995, AUC: 1.0000\n",
404
+ " Val - Loss: 0.5870, Acc: 0.8965, AUC: 0.9425, Precision: 0.8287, Recall: 0.7814\n",
405
+ " No improvement (patience: 15/15)\n",
406
+ "\n",
407
+ "⚠️ Early stopping triggered after 17 epochs\n",
408
+ "\n",
409
+ "======================================================================\n",
410
+ "Loading best model...\n",
411
+ "✅ Best model loaded (AUC: 0.9513)\n",
411
+ "💾 Model saved to: /home/knordby/Documents/labeling/models/ns_labseEmbeddings.pt\n",
413
+ "⏱️ Total training time: 68.87s (1.15m)\n",
414
+ "======================================================================\n",
415
+ "\n"
416
+ ]
417
+ }
418
+ ],
419
+ "source": [
420
+ "# Set save path\n",
421
+ "model_path = '/home/knordby/Documents/labeling/models/ns_labseEmbeddings.pt'\n",
422
+ "\n",
423
+ "# Train\n",
424
+ "model, history = train_model(\n",
425
+ " model, optimizer, criterion,\n",
426
+ " x_train, y_train, x_test, y_test,\n",
427
+ " device=device,\n",
428
+ " epochs=80,\n",
429
+ " batch_size=512,\n",
430
+ " model_path=model_path\n",
431
+ ")"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "markdown",
436
+ "id": "736256f5-1fa1-4b37-b4da-5f38e6a9e9d6",
437
+ "metadata": {},
438
+ "source": [
439
+ "### Evaluate the Model's Performance Against the Test Set"
440
+ ]
441
+ },
442
+ {
443
+ "cell_type": "code",
444
+ "execution_count": 16,
445
+ "id": "a1d5e970-c4b7-4218-bf64-c23414e4bc96",
446
+ "metadata": {},
447
+ "outputs": [
448
+ {
449
+ "name": "stdout",
450
+ "output_type": "stream",
451
+ "text": [
452
+ "======================================================================\n",
453
+ "📈 CYBERSECURITY CLASSIFIER - FINAL TEST RESULTS\n",
454
+ "======================================================================\n",
455
+ " Loss: 0.2405\n",
456
+ " Accuracy: 0.8991 (89.91%)\n",
457
+ " Precision: 0.8307\n",
458
+ " Recall: 0.7906\n",
459
+ " AUC: 0.9520\n",
460
+ " F1 Score: 0.8102\n",
461
+ "\n",
462
+ "Confusion Matrix:\n",
463
+ " Predicted\n",
464
+ " Negative Positive\n",
465
+ "Actual Negative 29020 1862\n",
466
+ " Positive 2420 9139\n",
467
+ "\n",
468
+ "Detailed Metrics:\n",
469
+ " True Positives: 9139\n",
470
+ " True Negatives: 29020\n",
471
+ " False Positives: 1862\n",
472
+ " False Negatives: 2420\n",
473
+ " Specificity: 0.9397\n",
474
+ " NPV: 0.9230\n",
475
+ "\n",
476
+ "Classification Report:\n",
477
+ " precision recall f1-score support\n",
478
+ "\n",
479
+ " Non-Cyber 0.9230 0.9397 0.9313 30882\n",
480
+ " Cyber 0.8307 0.7906 0.8102 11559\n",
481
+ "\n",
482
+ " accuracy 0.8991 42441\n",
483
+ " macro avg 0.8769 0.8652 0.8707 42441\n",
484
+ "weighted avg 0.8979 0.8991 0.8983 42441\n",
485
+ "\n",
486
+ "======================================================================\n",
487
+ "\n",
488
+ "Test AUC: 0.9520\n"
489
+ ]
490
+ }
491
+ ],
492
+ "source": [
493
+ "# Evaluate with detailed metrics\n",
494
+ "y_pred_probs, metrics = evaluate_model(\n",
495
+ " model, x_test, y_test,\n",
496
+ " device=device\n",
497
+ ")\n",
498
+ "\n",
499
+ "# Access individual metrics if needed\n",
500
+ "print(f\"Test AUC: {metrics['auc']:.4f}\")"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "markdown",
505
+ "id": "af1ddbc6-372a-4d2c-9f04-e4f3987165db",
506
+ "metadata": {},
507
+ "source": [
508
+ "### Push the Model"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": 17,
514
+ "id": "7ef72e71-49af-4d6b-9b44-3f1dcb03bcf9",
515
+ "metadata": {},
516
+ "outputs": [
517
+ {
518
+ "name": "stdout",
519
+ "output_type": "stream",
520
+ "text": [
521
+ "\n",
522
+ "======================================================================\n",
523
+ "PUSHING MODEL TO HUGGINGFACE\n",
524
+ "======================================================================\n",
525
+ "Repository: kristiangnordby/natSecLabse\n",
526
+ "Private: False\n",
527
+ "======================================================================\n",
528
+ "\n",
529
+ "✅ Repository created/verified: kristiangnordby/natSecLabse\n",
530
+ "\n",
531
+ "📝 Creating model card...\n",
532
+ "⚙️ Saving configuration...\n",
533
+ "🏗️ Saving model architecture...\n",
534
+ "💾 Preparing model checkpoint...\n",
535
+ "\n",
536
+ "📤 Uploading files to HuggingFace...\n",
537
+ " ✓ Uploaded: README.md\n",
538
+ " ✓ Uploaded: config.json\n",
539
+ " ✓ Uploaded: model_architecture.py\n"
540
+ ]
541
+ },
542
+ {
543
+ "data": {
544
+ "application/vnd.jupyter.widget-view+json": {
545
+ "model_id": "4d5047df11684f3b80a564d8f16ac91a",
546
+ "version_major": 2,
547
+ "version_minor": 0
548
+ },
549
+ "text/plain": [
550
+ "Processing Files (0 / 0): | | 0.00B / 0.00B "
551
+ ]
552
+ },
553
+ "metadata": {},
554
+ "output_type": "display_data"
555
+ },
556
+ {
557
+ "data": {
558
+ "application/vnd.jupyter.widget-view+json": {
559
+ "model_id": "542c46ddcea94a4dacd99d075a29a407",
560
+ "version_major": 2,
561
+ "version_minor": 0
562
+ },
563
+ "text/plain": [
564
+ "New Data Upload: | | 0.00B / 0.00B "
565
+ ]
566
+ },
567
+ "metadata": {},
568
+ "output_type": "display_data"
569
+ },
570
+ {
571
+ "name": "stdout",
572
+ "output_type": "stream",
573
+ "text": [
574
+ " ✓ Uploaded: model.pt\n",
575
+ "\n",
576
+ "======================================================================\n",
577
+ "✅ MODEL SUCCESSFULLY PUSHED TO HUGGINGFACE!\n",
578
+ "======================================================================\n",
579
+ "🔗 View your model at: https://huggingface.co/kristiangnordby/natSecLabse\n",
580
+ "======================================================================\n",
581
+ "\n",
582
+ "Model available at: https://huggingface.co/kristiangnordby/natSecLabse\n"
583
+ ]
584
+ }
585
+ ],
586
+ "source": [
587
+ "from push_to_huggingface import push_to_huggingface\n",
588
+ "\n",
589
+ "with open(\"hf_token.txt\",'r') as f:\n",
590
+ " token = f.read()\n",
591
+ "\n",
592
+ "# Push your model (after training and evaluation)\n",
593
+ "repo_url = push_to_huggingface(\n",
594
+ " model_path='/home/knordby/Documents/labeling/models/ns_labseEmbeddings.pt',\n",
595
+ " repo_name='natSecLabse', # Choose your repo name\n",
596
+ " metrics=metrics, # From evaluate_model()\n",
597
+ " input_dim=x_train.shape[1], # Your embedding dimension\n",
598
+ " hf_token=token, # Your token\n",
599
+ " private=False # Set True if you want private repo\n",
600
+ ")\n",
601
+ "\n",
602
+ "print(f\"Model available at: {repo_url}\")"
603
+ ]
604
+ }
605
+ ],
606
+ "metadata": {
607
+ "kernelspec": {
608
+ "display_name": "vanilla",
609
+ "language": "python",
610
+ "name": "vanilla"
611
+ },
612
+ "language_info": {
613
+ "codemirror_mode": {
614
+ "name": "ipython",
615
+ "version": 3
616
+ },
617
+ "file_extension": ".py",
618
+ "mimetype": "text/x-python",
619
+ "name": "python",
620
+ "nbconvert_exporter": "python",
621
+ "pygments_lexer": "ipython3",
622
+ "version": "3.10.19"
623
+ }
624
+ },
625
+ "nbformat": 4,
626
+ "nbformat_minor": 5
627
+ }