Reynier committed on
Commit
91373a6
·
verified ·
1 Parent(s): 65e1da1

Add CNN_Patron_WL.ipynb

Browse files
Files changed (1) hide show
  1. notebooks/CNN_Patron_WL.ipynb +646 -0
notebooks/CNN_Patron_WL.ipynb ADDED
@@ -0,0 +1,646 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "code",
21
+ "source": [
22
+ "from google.colab import drive # Importing the library to mount Google Drive\n",
23
+ "drive.mount('/content/drive') # Mounting Google Drive in Colab environment"
24
+ ],
25
+ "metadata": {
26
+ "colab": {
27
+ "base_uri": "https://localhost:8080/"
28
+ },
29
+ "id": "71FJxLKc1343",
30
+ "outputId": "b6eae6c8-c52a-47b1-d5e9-8f43bf78cf31"
31
+ },
32
+ "execution_count": 1,
33
+ "outputs": [
34
+ {
35
+ "output_type": "stream",
36
+ "name": "stdout",
37
+ "text": [
38
+ "Mounted at /content/drive\n"
39
+ ]
40
+ }
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 2,
46
+ "metadata": {
47
+ "colab": {
48
+ "base_uri": "https://localhost:8080/"
49
+ },
50
+ "id": "YXAm488r1DJw",
51
+ "outputId": "8d425156-f6c8-4474-ebf7-0fc11e009675"
52
+ },
53
+ "outputs": [
54
+ {
55
+ "output_type": "stream",
56
+ "name": "stdout",
57
+ "text": [
58
+ " domain family Label\n",
59
+ "0 nailconsiderable.ru suppobox dga\n",
60
+ "1 stilldelight.net suppobox dga\n",
61
+ "2 kimberleekatheryn.net suppobox dga\n",
62
+ "3 soilbeen.net suppobox dga\n",
63
+ "4 visitform.net suppobox dga\n",
64
+ "... ... ... ...\n",
65
+ "159995 dhuhaa.com legit notdga\n",
66
+ "159996 sdmetalcrew.org legit notdga\n",
67
+ "159997 melbcampcontuligol.ga legit notdga\n",
68
+ "159998 pl-enthusiast.net legit notdga\n",
69
+ "159999 rd-forum.ru legit notdga\n",
70
+ "\n",
71
+ "[160000 rows x 3 columns]\n"
72
+ ]
73
+ }
74
+ ],
75
+ "source": [
76
+ "import pandas as pd\n",
77
+ "\n",
78
+ "# File paths\n",
79
+ "train_df_file = \"/content/drive/My Drive/MOE_DGA/train_wl.csv\"\n",
80
+ "\n",
81
+ "train_df = pd.read_csv(train_df_file)\n",
82
+ "\n",
83
+ "train_df = train_df.rename(columns={\"label\": \"Label\"})\n",
84
+ "\n",
85
+ "\n",
86
+ "print(train_df)"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "source": [
92
+ "# 📦 Instalar PyTorch si hace falta (Colab ya lo trae normalmente)\n",
93
+ "# !pip install torch torchvision scikit-learn pandas\n",
94
+ "\n",
95
+ "# 📚 1. Importar librerías\n",
96
+ "import torch\n",
97
+ "import torch.nn as nn\n",
98
+ "import torch.optim as optim\n",
99
+ "from torch.utils.data import Dataset, DataLoader\n",
100
+ "from sklearn.model_selection import train_test_split\n",
101
+ "from sklearn.metrics import classification_report\n",
102
+ "import pandas as pd\n",
103
+ "import numpy as np\n",
104
+ "import string\n",
105
+ "\n",
106
+ "# 📐 2. Definir preprocesamiento de texto\n",
107
+ "CHARS = string.ascii_lowercase + string.digits + \"-._\"\n",
108
+ "CHAR2IDX = {c: i+1 for i, c in enumerate(CHARS)} # 0 para padding\n",
109
+ "MAXLEN = 75 # Longitud máxima del dominio\n",
110
+ "\n",
111
+ "def encode_domain(domain):\n",
112
+ " domain = domain.lower()\n",
113
+ " return [CHAR2IDX.get(c, 0) for c in domain[:MAXLEN]] + [0] * (MAXLEN - len(domain))\n",
114
+ "\n",
115
+ "# 🧹 3. Dataset personalizado\n",
116
+ "class DGADataset(Dataset):\n",
117
+ " def __init__(self, df):\n",
118
+ " self.domains = [encode_domain(d) for d in df[\"domain\"]]\n",
119
+ " self.labels = [1 if label == \"dga\" else 0 for label in df[\"Label\"]]\n",
120
+ "\n",
121
+ " def __len__(self):\n",
122
+ " return len(self.domains)\n",
123
+ "\n",
124
+ " def __getitem__(self, idx):\n",
125
+ " return torch.tensor(self.domains[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)\n",
126
+ "\n",
127
+ "# 🧠 4. Modelo CNN\n",
128
+ "class DGACNN(nn.Module):\n",
129
+ " def __init__(self, vocab_size, embedding_dim=32, num_classes=2):\n",
130
+ " super().__init__()\n",
131
+ " self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)\n",
132
+ " self.conv1 = nn.Conv1d(embedding_dim, 64, kernel_size=3, padding=1)\n",
133
+ " self.relu = nn.ReLU()\n",
134
+ " self.pool = nn.MaxPool1d(2)\n",
135
+ " self.dropout = nn.Dropout(0.3)\n",
136
+ " self.fc = nn.Linear(64 * (MAXLEN // 2), num_classes)\n",
137
+ "\n",
138
+ " def forward(self, x):\n",
139
+ " x = self.embedding(x).transpose(1, 2)\n",
140
+ " x = self.pool(self.relu(self.conv1(x)))\n",
141
+ " x = x.view(x.size(0), -1)\n",
142
+ " x = self.dropout(x)\n",
143
+ " return self.fc(x)\n",
144
+ "\n",
145
+ "# 🏋️‍♂️ 5. Función de entrenamiento\n",
146
+ "def train_model(model, dataloader, epochs=3, lr=1e-3):\n",
147
+ " criterion = nn.CrossEntropyLoss()\n",
148
+ " optimizer = optim.Adam(model.parameters(), lr=lr)\n",
149
+ " model.train()\n",
150
+ "\n",
151
+ " for epoch in range(epochs):\n",
152
+ " total_loss, correct = 0, 0\n",
153
+ " for x_batch, y_batch in dataloader:\n",
154
+ " x_batch, y_batch = x_batch.to(device), y_batch.to(device)\n",
155
+ " outputs = model(x_batch)\n",
156
+ " loss = criterion(outputs, y_batch)\n",
157
+ " optimizer.zero_grad()\n",
158
+ " loss.backward()\n",
159
+ " optimizer.step()\n",
160
+ "\n",
161
+ " total_loss += loss.item()\n",
162
+ " correct += (outputs.argmax(1) == y_batch).sum().item()\n",
163
+ "\n",
164
+ " acc = correct / len(dataloader.dataset)\n",
165
+ " print(f\"📈 Epoch {epoch+1}: Loss={total_loss:.4f}, Accuracy={acc:.4f}\")\n",
166
+ "\n",
167
+ "# 🧪 6. Evaluación\n",
168
+ "def evaluate_model(model, dataloader):\n",
169
+ " model.eval()\n",
170
+ " y_true, y_pred = [], []\n",
171
+ " with torch.no_grad():\n",
172
+ " for x, y in dataloader:\n",
173
+ " x, y = x.to(device), y.to(device)\n",
174
+ " outputs = model(x)\n",
175
+ " preds = outputs.argmax(dim=1)\n",
176
+ " y_true.extend(y.cpu().numpy())\n",
177
+ " y_pred.extend(preds.cpu().numpy())\n",
178
+ "\n",
179
+ " print(\"\\n📊 Classification Report:\\n\")\n",
180
+ " print(classification_report(y_true, y_pred, target_names=[\"notdga\", \"dga\"]))\n",
181
+ "\n",
182
+ "# ⚙️ 7. Preparar datos (cargar tu DataFrame aquí)\n",
183
+ "# 👇 Reemplaza esto con tu método real para cargar train_df\n",
184
+ "# train_df = pd.read_csv(\"tu_archivo.csv\")\n",
185
+ "# o si ya está en memoria, asegúrate de que se llame 'train_df'\n",
186
+ "\n",
187
+ "train_df = train_df.rename(columns={\"Labels\": \"Label\"}) # Normalizar nombre de columna\n",
188
+ "\n",
189
+ "train_data, test_data = train_test_split(train_df, test_size=0.02, stratify=train_df[\"Label\"], random_state=42)\n",
190
+ "train_loader = DataLoader(DGADataset(train_data), batch_size=64, shuffle=True)\n",
191
+ "test_loader = DataLoader(DGADataset(test_data), batch_size=64)\n",
192
+ "\n",
193
+ "# 🚀 8. Entrenar y evaluar\n",
194
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
195
+ "model = DGACNN(vocab_size=len(CHAR2IDX)+1).to(device)\n",
196
+ "\n",
197
+ "train_model(model, train_loader, epochs=50)\n",
198
+ "evaluate_model(model, test_loader)\n",
199
+ "\n",
200
+ "# 💾 9. Guardar modelo entrenado (opcional)\n",
201
+ "torch.save(model.state_dict(), \"dga_cnn_model_wl.pth\")\n",
202
+ "\n"
203
+ ],
204
+ "metadata": {
205
+ "colab": {
206
+ "base_uri": "https://localhost:8080/"
207
+ },
208
+ "id": "kEk4Sbxf1_8n",
209
+ "outputId": "5938dc14-9f67-4794-c2c8-8229ac56b37b"
210
+ },
211
+ "execution_count": 3,
212
+ "outputs": [
213
+ {
214
+ "output_type": "stream",
215
+ "name": "stdout",
216
+ "text": [
217
+ "📈 Epoch 1: Loss=1227.4861, Accuracy=0.7598\n",
218
+ "📈 Epoch 2: Loss=1111.8431, Accuracy=0.7938\n",
219
+ "📈 Epoch 3: Loss=1085.7899, Accuracy=0.8011\n",
220
+ "📈 Epoch 4: Loss=1065.8432, Accuracy=0.8062\n",
221
+ "📈 Epoch 5: Loss=1053.0915, Accuracy=0.8091\n",
222
+ "📈 Epoch 6: Loss=1043.4482, Accuracy=0.8117\n",
223
+ "📈 Epoch 7: Loss=1037.5315, Accuracy=0.8147\n",
224
+ "📈 Epoch 8: Loss=1029.9196, Accuracy=0.8156\n",
225
+ "📈 Epoch 9: Loss=1029.8009, Accuracy=0.8168\n",
226
+ "📈 Epoch 10: Loss=1021.3644, Accuracy=0.8175\n",
227
+ "📈 Epoch 11: Loss=1018.4461, Accuracy=0.8186\n",
228
+ "📈 Epoch 12: Loss=1015.0625, Accuracy=0.8189\n",
229
+ "📈 Epoch 13: Loss=1011.4315, Accuracy=0.8195\n",
230
+ "📈 Epoch 14: Loss=1007.4491, Accuracy=0.8208\n",
231
+ "📈 Epoch 15: Loss=1008.6686, Accuracy=0.8221\n",
232
+ "📈 Epoch 16: Loss=1002.7266, Accuracy=0.8218\n",
233
+ "📈 Epoch 17: Loss=1003.0112, Accuracy=0.8230\n",
234
+ "📈 Epoch 18: Loss=1001.5669, Accuracy=0.8233\n",
235
+ "📈 Epoch 19: Loss=998.4341, Accuracy=0.8238\n",
236
+ "📈 Epoch 20: Loss=999.5484, Accuracy=0.8229\n",
237
+ "📈 Epoch 21: Loss=996.5214, Accuracy=0.8238\n",
238
+ "📈 Epoch 22: Loss=995.0207, Accuracy=0.8244\n",
239
+ "📈 Epoch 23: Loss=990.1331, Accuracy=0.8265\n",
240
+ "📈 Epoch 24: Loss=990.1392, Accuracy=0.8263\n",
241
+ "📈 Epoch 25: Loss=992.0144, Accuracy=0.8248\n",
242
+ "📈 Epoch 26: Loss=986.3864, Accuracy=0.8256\n",
243
+ "📈 Epoch 27: Loss=990.5259, Accuracy=0.8248\n",
244
+ "📈 Epoch 28: Loss=992.1301, Accuracy=0.8242\n",
245
+ "📈 Epoch 29: Loss=984.8474, Accuracy=0.8265\n",
246
+ "📈 Epoch 30: Loss=985.9777, Accuracy=0.8260\n",
247
+ "📈 Epoch 31: Loss=984.4458, Accuracy=0.8269\n",
248
+ "📈 Epoch 32: Loss=982.8169, Accuracy=0.8269\n",
249
+ "📈 Epoch 33: Loss=982.3414, Accuracy=0.8271\n",
250
+ "📈 Epoch 34: Loss=981.2389, Accuracy=0.8268\n",
251
+ "📈 Epoch 35: Loss=979.3540, Accuracy=0.8266\n",
252
+ "📈 Epoch 36: Loss=978.4779, Accuracy=0.8277\n",
253
+ "📈 Epoch 37: Loss=977.5119, Accuracy=0.8283\n",
254
+ "📈 Epoch 38: Loss=977.7488, Accuracy=0.8276\n",
255
+ "📈 Epoch 39: Loss=978.3020, Accuracy=0.8291\n",
256
+ "📈 Epoch 40: Loss=976.9651, Accuracy=0.8282\n",
257
+ "📈 Epoch 41: Loss=977.0974, Accuracy=0.8281\n",
258
+ "📈 Epoch 42: Loss=975.9940, Accuracy=0.8289\n",
259
+ "📈 Epoch 43: Loss=973.7795, Accuracy=0.8290\n",
260
+ "📈 Epoch 44: Loss=976.7709, Accuracy=0.8290\n",
261
+ "📈 Epoch 45: Loss=972.3411, Accuracy=0.8291\n",
262
+ "📈 Epoch 46: Loss=971.9982, Accuracy=0.8302\n",
263
+ "📈 Epoch 47: Loss=974.0462, Accuracy=0.8289\n",
264
+ "📈 Epoch 48: Loss=972.3418, Accuracy=0.8291\n",
265
+ "📈 Epoch 49: Loss=973.1150, Accuracy=0.8294\n",
266
+ "📈 Epoch 50: Loss=970.1658, Accuracy=0.8295\n",
267
+ "\n",
268
+ "📊 Classification Report:\n",
269
+ "\n",
270
+ " precision recall f1-score support\n",
271
+ "\n",
272
+ " notdga 0.82 0.87 0.85 1600\n",
273
+ " dga 0.86 0.81 0.83 1600\n",
274
+ "\n",
275
+ " accuracy 0.84 3200\n",
276
+ " macro avg 0.84 0.84 0.84 3200\n",
277
+ "weighted avg 0.84 0.84 0.84 3200\n",
278
+ "\n"
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "source": [
286
+ "def predict_domain(model, domain_name):\n",
287
+ " model.eval()\n",
288
+ " encoded = encode_domain(domain_name)\n",
289
+ " input_tensor = torch.tensor([encoded], dtype=torch.long).to(device)\n",
290
+ "\n",
291
+ " with torch.no_grad():\n",
292
+ " output = model(input_tensor)\n",
293
+ " prediction = torch.argmax(output, dim=1).item()\n",
294
+ "\n",
295
+ " return \"dga\" if prediction == 1 else \"notdga\"\n"
296
+ ],
297
+ "metadata": {
298
+ "id": "42Qef2Y_9ISA"
299
+ },
300
+ "execution_count": 4,
301
+ "outputs": []
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "source": [
306
+ "# Probar un dominio\n",
307
+ "test_domain = \"marca.com\"\n",
308
+ "result = predict_domain(model, test_domain)\n",
309
+ "print(f\"🔍 El dominio '{test_domain}' fue clasificado como: {result.upper()}\")\n"
310
+ ],
311
+ "metadata": {
312
+ "colab": {
313
+ "base_uri": "https://localhost:8080/"
314
+ },
315
+ "id": "e_e8u_dR9KOh",
316
+ "outputId": "3ff44a27-085c-477e-827e-94e3556a3a56"
317
+ },
318
+ "execution_count": 5,
319
+ "outputs": [
320
+ {
321
+ "output_type": "stream",
322
+ "name": "stdout",
323
+ "text": [
324
+ "🔍 El dominio 'marca.com' fue clasificado como: NOTDGA\n"
325
+ ]
326
+ }
327
+ ]
328
+ },
329
+ {
330
+ "cell_type": "code",
331
+ "source": [
332
+ "import time\n",
333
+ "from torch.utils.data import DataLoader\n",
334
+ "import gzip\n",
335
+ "\n",
336
+ "def predict_batch_with_timing(model, domains):\n",
337
+ " model.eval()\n",
338
+ " predictions = []\n",
339
+ " timings = []\n",
340
+ "\n",
341
+ " for domain in domains:\n",
342
+ " start_time = time.time()\n",
343
+ "\n",
344
+ " # Codificar y predecir dominio individual\n",
345
+ " encoded = encode_domain(domain)\n",
346
+ " inputs = torch.tensor([encoded], dtype=torch.long).to(device)\n",
347
+ "\n",
348
+ " with torch.no_grad():\n",
349
+ " outputs = model(inputs)\n",
350
+ " pred = outputs.argmax(dim=1).cpu().numpy()[0]\n",
351
+ "\n",
352
+ " end_time = time.time()\n",
353
+ "\n",
354
+ " predictions.append(pred)\n",
355
+ " timings.append(end_time - start_time)\n",
356
+ "\n",
357
+ " return predictions, timings\n",
358
+ "\n",
359
+ "# Código principal modificado\n",
360
+ "families = [\n",
361
+ " 'matsnu.gz',\n",
362
+ " 'suppobox.gz',\n",
363
+ " 'charbot.gz',\n",
364
+ " 'gozi.gz',\n",
365
+ " 'manuelita.gz',\n",
366
+ " 'rovnix.gz',\n",
367
+ " 'deception.gz',\n",
368
+ " 'nymaim.gz'\n",
369
+ "]\n",
370
+ "\n",
371
+ "runs = 30\n",
372
+ "for family in families:\n",
373
+ " print(f\"🔍 Procesando familia: {family}\")\n",
374
+ " dga_reader = pd.read_csv(f'/content/drive/My Drive/Familias_Test/{family}', chunksize=50)\n",
375
+ " legit_reader = pd.read_csv('/content/drive/My Drive/Familias_Test/legit.gz', chunksize=50)\n",
376
+ "\n",
377
+ " for run in range(runs):\n",
378
+ " print(f\" ▶️ Run {run+1}/{runs}\", end=\"\\r\")\n",
379
+ " dga_chunk = dga_reader.get_chunk()\n",
380
+ " legit_chunk = legit_reader.get_chunk()\n",
381
+ " df_chunk = pd.concat([dga_chunk, legit_chunk]).reset_index(drop=True)\n",
382
+ "\n",
383
+ " # Obtener predicciones y tiempos\n",
384
+ " preds, times = predict_batch_with_timing(model, df_chunk[\"domain\"].values)\n",
385
+ "\n",
386
+ " df_chunk[\"pred\"] = preds\n",
387
+ " df_chunk[\"query_time\"] = times # ✅ Tiempo por dominio\n",
388
+ "\n",
389
+ " df_chunk.to_csv(\n",
390
+ " f\"/content/drive/My Drive/results/results_CNN_PyTorch_{family}_{run}.csv.gz\",\n",
391
+ " index=False,\n",
392
+ " compression=\"gzip\"\n",
393
+ " )\n"
394
+ ],
395
+ "metadata": {
396
+ "colab": {
397
+ "base_uri": "https://localhost:8080/"
398
+ },
399
+ "id": "a2-uSZ0O-Jp-",
400
+ "outputId": "a54483c6-c0fe-48ce-93c1-44d5f91eacd6"
401
+ },
402
+ "execution_count": 6,
403
+ "outputs": [
404
+ {
405
+ "output_type": "stream",
406
+ "name": "stdout",
407
+ "text": [
408
+ "🔍 Procesando familia: matsnu.gz\n",
409
+ "🔍 Procesando familia: suppobox.gz\n",
410
+ "🔍 Procesando familia: charbot.gz\n",
411
+ "🔍 Procesando familia: gozi.gz\n",
412
+ "🔍 Procesando familia: manuelita.gz\n",
413
+ "🔍 Procesando familia: rovnix.gz\n",
414
+ "🔍 Procesando familia: deception.gz\n",
415
+ "🔍 Procesando familia: nymaim.gz\n"
416
+ ]
417
+ }
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "source": [
423
+ "from torch.utils.data import DataLoader\n",
424
+ "import gzip\n",
425
+ "\n",
426
+ "def predict_batch_with_timing(model, domains):\n",
427
+ " model.eval()\n",
428
+ " predictions = []\n",
429
+ " timings = []\n",
430
+ "\n",
431
+ " for domain in domains:\n",
432
+ " start_time = time.time()\n",
433
+ "\n",
434
+ " # Codificar y predecir dominio individual\n",
435
+ " encoded = encode_domain(domain)\n",
436
+ " inputs = torch.tensor([encoded], dtype=torch.long).to(device)\n",
437
+ "\n",
438
+ " with torch.no_grad():\n",
439
+ " outputs = model(inputs)\n",
440
+ " pred = outputs.argmax(dim=1).cpu().numpy()[0]\n",
441
+ "\n",
442
+ " end_time = time.time()\n",
443
+ "\n",
444
+ " predictions.append(pred)\n",
445
+ " timings.append(end_time - start_time)\n",
446
+ "\n",
447
+ " return predictions, timings\n",
448
+ "\n",
449
+ "\n",
450
+ "\n",
451
# Benchmark the three families added later. The legit reader skips its
# first 30 chunks so these runs consume legit domains that were not
# already used by the previous benchmark loop.
families = ['bigviktor.gz', 'pizd.gz', 'ngioweb.gz']

runs = 30
for family in families:
    print(f"🔍 Procesando familia: {family}")

    dga_reader = pd.read_csv(f'/content/drive/My Drive/New_Families/{family}', chunksize=50)
    legit_reader = pd.read_csv('/content/drive/My Drive/Familias_Test/legit.gz', chunksize=50)

    # Skip the first 30 legit chunks (already consumed above).
    for _ in range(30):
        legit_reader.get_chunk()

    for run in range(runs):
        print(f" ▶️ Run {run+1}/{runs}", end="\r")
        dga_chunk = dga_reader.get_chunk()
        legit_chunk = legit_reader.get_chunk()
        df_chunk = pd.concat([dga_chunk, legit_chunk]).reset_index(drop=True)

        # Predictions plus one wall-clock measurement per domain.
        preds, times = predict_batch_with_timing(model, df_chunk["domain"].values)
        df_chunk["pred"] = preds
        df_chunk["query_time"] = times  # per-domain latency in seconds

        df_chunk.to_csv(
            f"/content/drive/My Drive/results/results_CNN_PyTorch_{family}_{run}.csv.gz",
            index=False,
            compression="gzip"
        )
486
+ "\n",
487
+ "\n",
488
+ "\n",
489
+ "\n"
490
+ ],
491
+ "metadata": {
492
+ "colab": {
493
+ "base_uri": "https://localhost:8080/"
494
+ },
495
+ "id": "cB-p7y_5xn6G",
496
+ "outputId": "ebd3eaa1-a4fb-4e20-ccab-1a373f8bda40"
497
+ },
498
+ "execution_count": 7,
499
+ "outputs": [
500
+ {
501
+ "output_type": "stream",
502
+ "name": "stdout",
503
+ "text": [
504
+ "🔍 Procesando familia: bigviktor.gz\n",
505
+ "🔍 Procesando familia: pizd.gz\n",
506
+ "🔍 Procesando familia: ngioweb.gz\n"
507
+ ]
508
+ }
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "source": [
514
+ "families = [\n",
515
+ " 'matsnu.gz',\n",
516
+ " 'suppobox.gz',\n",
517
+ " 'charbot.gz',\n",
518
+ " 'gozi.gz',\n",
519
+ " 'manuelita.gz',\n",
520
+ " 'rovnix.gz',\n",
521
+ " 'deception.gz',\n",
522
+ " 'nymaim.gz',\n",
523
+ " 'bigviktor.gz',\n",
524
+ " 'pizd.gz',\n",
525
+ " 'ngioweb.gz'\n",
526
+ "]\n",
527
+ "\n",
528
+ "from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix\n",
529
+ "import numpy as np\n",
530
+ "import pandas as pd\n",
531
+ "\n",
532
+ "def fpr_tpr(y, ypred):\n",
533
+ " tn, fp, fn, tp = confusion_matrix(y, ypred).ravel()\n",
534
+ " fpr = fp / (fp + tn) if (fp + tn) > 0 else 0\n",
535
+ " tpr = tp / (tp + fn) if (tp + fn) > 0 else 0\n",
536
+ " return fpr, tpr\n",
537
+ "\n",
538
+ "# Listas para métricas globales\n",
539
+ "all_acc, all_pre, all_rec, all_f1 = [], [], [], []\n",
540
+ "all_fpr, all_tpr, all_qt, all_qts = [], [], [], []\n",
541
+ "total_unknowns_global = 0\n",
542
+ "\n",
543
+ "\n",
544
+ "for family in families:\n",
545
+ " acc = []\n",
546
+ " pre = []\n",
547
+ " rec = []\n",
548
+ " f1 = []\n",
549
+ " fpr = []\n",
550
+ " tpr = []\n",
551
+ " qt = []\n",
552
+ " qts = []\n",
553
+ " total_unknowns = 0\n",
554
+ " for run in range(runs):\n",
555
+ " path = f'/content/drive/My Drive/results/results_CNN_PyTorch_{family}_{run}.csv.gz'\n",
556
+ " df = pd.read_csv(path)\n",
557
+ " #print(df)\n",
558
+ " y_true = (df[\"label\"] == 'dga').astype(int)\n",
559
+ " y_pred = df[\"pred\"]\n",
560
+ "\n",
561
+ " # Métricas\n",
562
+ " acc.append(accuracy_score(y_true, y_pred))\n",
563
+ " pre.append(precision_score(y_true, y_pred, zero_division=0))\n",
564
+ " rec.append(recall_score(y_true, y_pred, zero_division=0))\n",
565
+ " f1.append(f1_score(y_true, y_pred, zero_division=0))\n",
566
+ " fpr_val, tpr_val = fpr_tpr(y_true, y_pred)\n",
567
+ " fpr.append(fpr_val)\n",
568
+ " tpr.append(tpr_val)\n",
569
+ "\n",
570
+ " if 'query_time' in df.columns:\n",
571
+ " qt.append(df['query_time'].mean())\n",
572
+ " qts.append(df['query_time'].std())\n",
573
+ "\n",
574
+ " # Promedios por familia\n",
575
+ " if acc: # solo si hubo archivos válidos\n",
576
+ " print(f'{family.split(\".\")[0]:15}: '\n",
577
+ " f'acc:{np.mean(acc):.2f}±{np.std(acc):.3f} '\n",
578
+ " f'f1:{np.mean(f1):.2f}±{np.std(f1):.3f} '\n",
579
+ " f'pre:{np.mean(pre):.2f}±{np.std(pre):.3f} '\n",
580
+ " f'rec:{np.mean(rec):.2f}±{np.std(rec):.3f} '\n",
581
+ " f'FPR:{np.mean(fpr):.2f}±{np.std(fpr):.3f} '\n",
582
+ " f'TPR:{np.mean(tpr):.2f}±{np.std(tpr):.3f} '\n",
583
+ " f'QT:{np.mean(qt):.5f}±{np.std(qt):.5f} '\n",
584
+ " f'Unknowns: {total_unknowns}')\n",
585
+ "\n",
586
+ " all_acc.append(np.mean(acc))\n",
587
+ " all_pre.append(np.mean(pre))\n",
588
+ " all_rec.append(np.mean(rec))\n",
589
+ " all_f1.append(np.mean(f1))\n",
590
+ " all_fpr.append(np.mean(fpr))\n",
591
+ " all_tpr.append(np.mean(tpr))\n",
592
+ " all_qt.append(np.mean(qt))\n",
593
+ " all_qts.append(np.mean(qts))\n",
594
+ " total_unknowns_global += total_unknowns\n",
595
+ "\n",
596
+ "# 🔍 Métricas globales\n",
597
+ "print(\"\\n### 📊 Métricas globales ###\")\n",
598
+ "print(f'Accuracy : {np.mean(all_acc):.2f}')\n",
599
+ "print(f'F1-Score : {np.mean(all_f1):.2f}')\n",
600
+ "print(f'Precision : {np.mean(all_pre):.2f}')\n",
601
+ "print(f'Recall : {np.mean(all_rec):.2f}')\n",
602
+ "print(f'FPR : {np.mean(all_fpr):.2f}')\n",
603
+ "print(f'TPR : {np.mean(all_tpr):.2f}')\n",
604
+ "print(f'Query time : {np.mean(all_qt):.5f} ± {np.mean(all_qts):.5f}')\n",
605
+ "print(f'Total unknown classifications: {total_unknowns_global}')\n"
606
+ ],
607
+ "metadata": {
608
+ "colab": {
609
+ "base_uri": "https://localhost:8080/"
610
+ },
611
+ "id": "Ta40upzq_H5c",
612
+ "outputId": "eb0dfd23-7ab8-41cb-da46-21d00a721b18"
613
+ },
614
+ "execution_count": 8,
615
+ "outputs": [
616
+ {
617
+ "output_type": "stream",
618
+ "name": "stdout",
619
+ "text": [
620
+ "matsnu : acc:0.90±0.029 f1:0.90±0.026 pre:0.86±0.042 rec:0.95±0.033 FPR:0.15±0.055 TPR:0.95±0.033 QT:0.00043±0.00006 Unknowns: 0\n",
621
+ "suppobox : acc:0.92±0.027 f1:0.93±0.024 pre:0.87±0.041 rec:1.00±0.004 FPR:0.15±0.055 TPR:1.00±0.004 QT:0.00043±0.00004 Unknowns: 0\n",
622
+ "charbot : acc:0.80±0.037 f1:0.79±0.039 pre:0.83±0.051 rec:0.76±0.051 FPR:0.15±0.055 TPR:0.76±0.051 QT:0.00043±0.00003 Unknowns: 0\n",
623
+ "gozi : acc:0.81±0.060 f1:0.80±0.071 pre:0.83±0.055 rec:0.77±0.110 FPR:0.15±0.055 TPR:0.77±0.110 QT:0.00052±0.00008 Unknowns: 0\n",
624
+ "manuelita : acc:0.50±0.038 f1:0.23±0.060 pre:0.50±0.134 rec:0.15±0.041 FPR:0.15±0.055 TPR:0.15±0.041 QT:0.00049±0.00011 Unknowns: 0\n",
625
+ "rovnix : acc:0.92±0.030 f1:0.92±0.026 pre:0.87±0.042 rec:0.99±0.014 FPR:0.15±0.055 TPR:0.99±0.014 QT:0.00044±0.00006 Unknowns: 0\n",
626
+ "deception : acc:0.92±0.028 f1:0.92±0.024 pre:0.87±0.042 rec:0.99±0.012 FPR:0.15±0.055 TPR:0.99±0.012 QT:0.00042±0.00003 Unknowns: 0\n",
627
+ "nymaim : acc:0.82±0.043 f1:0.82±0.046 pre:0.84±0.051 rec:0.80±0.060 FPR:0.15±0.055 TPR:0.80±0.060 QT:0.00043±0.00005 Unknowns: 0\n",
628
+ "bigviktor : acc:0.60±0.041 f1:0.47±0.061 pre:0.70±0.086 rec:0.36±0.057 FPR:0.16±0.054 TPR:0.36±0.057 QT:0.00043±0.00005 Unknowns: 0\n",
629
+ "pizd : acc:0.91±0.028 f1:0.91±0.025 pre:0.86±0.041 rec:0.97±0.019 FPR:0.16±0.054 TPR:0.97±0.019 QT:0.00049±0.00009 Unknowns: 0\n",
630
+ "ngioweb : acc:0.66±0.052 f1:0.58±0.072 pre:0.75±0.080 rec:0.47±0.071 FPR:0.16±0.054 TPR:0.47±0.071 QT:0.00059±0.00012 Unknowns: 0\n",
631
+ "\n",
632
+ "### 📊 Métricas globales ###\n",
633
+ "Accuracy : 0.80\n",
634
+ "F1-Score : 0.75\n",
635
+ "Precision : 0.80\n",
636
+ "Recall : 0.75\n",
637
+ "FPR : 0.15\n",
638
+ "TPR : 0.75\n",
639
+ "Query time : 0.00046 ± 0.00012\n",
640
+ "Total unknown classifications: 0\n"
641
+ ]
642
+ }
643
+ ]
644
+ }
645
+ ]
646
+ }