Che237 committed on
Commit
3d2a643
Β·
verified Β·
1 Parent(s): e4ce7b0

Add 02_feature_engineering.ipynb

Browse files
notebooks/02_feature_engineering.ipynb ADDED
@@ -0,0 +1,752 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b101ef36",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 02 - Feature Engineering\n",
9
+ "\n",
10
+ "## CyberForge AI - Security-Focused Feature Extraction\n",
11
+ "\n",
12
+ "This notebook performs feature engineering for cybersecurity ML models.\n",
13
+ "\n",
14
+ "### Feature Categories:\n",
15
+ "1. **URL Features** - Domain, path, query analysis\n",
16
+ "2. **Network Features** - Request patterns, headers, protocols\n",
17
+ "3. **JavaScript Behavior** - Script patterns, suspicious calls\n",
18
+ "4. **Browser Artifacts** - Cookies, localStorage, fingerprinting\n",
19
+ "5. **Security Indicators** - SSL, headers, CSP\n",
20
+ "\n",
21
+ "### Alignment with Backend:\n",
22
+ "- Features match WebScraperAPIService output format\n",
23
+ "- Compatible with ThreatService detection patterns\n",
24
+ "- Supports real-time inference requirements"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "id": "13b7ad76",
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "import json\n",
35
+ "import pandas as pd\n",
36
+ "import numpy as np\n",
37
+ "from pathlib import Path\n",
38
+ "from typing import Dict, List, Any, Optional\n",
39
+ "from urllib.parse import urlparse, parse_qs\n",
40
+ "import re\n",
41
+ "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
42
+ "import warnings\n",
43
+ "warnings.filterwarnings('ignore')\n",
44
+ "\n",
45
+ "# Load configuration\n",
46
+ "config_path = Path(\"../notebook_config.json\")\n",
47
+ "with open(config_path) as f:\n",
48
+ " CONFIG = json.load(f)\n",
49
+ "\n",
50
+ "DATASETS_DIR = Path(CONFIG[\"datasets_dir\"])\n",
51
+ "PROCESSED_DIR = DATASETS_DIR / \"processed\"\n",
52
+ "FEATURES_DIR = DATASETS_DIR / \"features\"\n",
53
+ "FEATURES_DIR.mkdir(parents=True, exist_ok=True)\n",
54
+ "\n",
55
+ "print(f\"βœ“ Configuration loaded\")\n",
56
+ "print(f\"βœ“ Features output: {FEATURES_DIR}\")"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "markdown",
61
+ "id": "1a336f82",
62
+ "metadata": {},
63
+ "source": [
64
+ "## 1. URL Feature Extraction\n",
65
+ "\n",
66
+ "Extract security-relevant features from URLs."
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": null,
72
+ "id": "6aab702d",
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "try:\n",
77
+ " import tldextract\n",
78
+ "except ImportError:\n",
79
+ " import subprocess\n",
80
+ "    import sys\n",
+ "    subprocess.run([sys.executable, '-m', 'pip', 'install', 'tldextract', '-q'], check=True)\n",
81
+ " import tldextract\n",
82
+ "\n",
83
+ "class URLFeatureExtractor:\n",
84
+ " \"\"\"\n",
85
+ " Extract security-relevant features from URLs.\n",
86
+ " Aligned with backend ThreatService URL analysis.\n",
87
+ " \"\"\"\n",
88
+ " \n",
89
+ " # Suspicious patterns from ThreatService\n",
90
+ " SUSPICIOUS_KEYWORDS = ['phishing', 'malware', 'suspicious', 'hack', 'scam', \n",
91
+ " 'login', 'verify', 'account', 'secure', 'update']\n",
92
+ " INJECTION_PATTERNS = [r'data:text/html', r'javascript:', r'vbscript:']\n",
93
+ " \n",
94
+ " def __init__(self):\n",
95
+ " self.ip_pattern = re.compile(r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}')\n",
96
+ " \n",
97
+ " def extract(self, url: str) -> Dict[str, Any]:\n",
98
+ " \"\"\"Extract all URL features\"\"\"\n",
99
+ " if not isinstance(url, str) or not url:\n",
100
+ " return self._empty_features()\n",
101
+ " \n",
102
+ " try:\n",
103
+ " parsed = urlparse(url)\n",
104
+ " extracted = tldextract.extract(url)\n",
105
+ " \n",
106
+ " features = {\n",
107
+ " # Basic URL structure\n",
108
+ " 'url_length': len(url),\n",
109
+ " 'domain_length': len(parsed.netloc),\n",
110
+ " 'path_length': len(parsed.path),\n",
111
+ " 'query_length': len(parsed.query),\n",
112
+ " \n",
113
+ " # Domain analysis\n",
114
+ " 'subdomain_count': len(extracted.subdomain.split('.')) if extracted.subdomain else 0,\n",
115
+ " 'domain_depth': url.count('/') - 2, # Minus protocol slashes\n",
116
+ " 'has_subdomain': len(extracted.subdomain) > 0,\n",
117
+ " \n",
118
+ " # Protocol security\n",
119
+ " 'is_https': parsed.scheme == 'https',\n",
120
+ " 'has_port': parsed.port is not None,\n",
121
+ " 'non_standard_port': parsed.port not in [None, 80, 443],\n",
122
+ " \n",
123
+ " # Suspicious indicators\n",
124
+ " 'has_ip_address': bool(self.ip_pattern.search(url)),\n",
125
+ " 'suspicious_keyword_count': sum(1 for kw in self.SUSPICIOUS_KEYWORDS if kw in url.lower()),\n",
126
+ " 'has_injection_pattern': any(re.search(p, url, re.I) for p in self.INJECTION_PATTERNS),\n",
127
+ " \n",
128
+ " # Character analysis\n",
129
+ " 'digit_count': sum(c.isdigit() for c in url),\n",
130
+ " 'special_char_count': sum(not c.isalnum() and c not in '/:.' for c in url),\n",
131
+ " 'hyphen_count': url.count('-'),\n",
132
+ " 'underscore_count': url.count('_'),\n",
133
+ " 'at_symbol': '@' in url,\n",
134
+ " \n",
135
+ " # Query parameters\n",
136
+ " 'param_count': len(parse_qs(parsed.query)),\n",
137
+ " 'has_query': len(parsed.query) > 0,\n",
138
+ " \n",
139
+ " # TLD analysis\n",
140
+ " 'tld': extracted.suffix,\n",
141
+ " 'tld_length': len(extracted.suffix),\n",
142
+ " 'is_common_tld': extracted.suffix in ['com', 'org', 'net', 'edu', 'gov'],\n",
143
+ " }\n",
144
+ " \n",
145
+ " return features\n",
146
+ " \n",
147
+ " except Exception as e:\n",
148
+ " return self._empty_features()\n",
149
+ " \n",
150
+ " def _empty_features(self) -> Dict:\n",
151
+ " \"\"\"Return empty feature dict for invalid URLs\"\"\"\n",
152
+ " return {\n",
153
+ " 'url_length': 0, 'domain_length': 0, 'path_length': 0, 'query_length': 0,\n",
154
+ " 'subdomain_count': 0, 'domain_depth': 0, 'has_subdomain': False,\n",
155
+ " 'is_https': False, 'has_port': False, 'non_standard_port': False,\n",
156
+ " 'has_ip_address': False, 'suspicious_keyword_count': 0, 'has_injection_pattern': False,\n",
157
+ " 'digit_count': 0, 'special_char_count': 0, 'hyphen_count': 0, 'underscore_count': 0,\n",
158
+ " 'at_symbol': False, 'param_count': 0, 'has_query': False,\n",
159
+ " 'tld': '', 'tld_length': 0, 'is_common_tld': False\n",
160
+ " }\n",
161
+ " \n",
162
+ " def extract_batch(self, urls: List[str]) -> pd.DataFrame:\n",
163
+ " \"\"\"Extract features from multiple URLs\"\"\"\n",
164
+ " features = [self.extract(url) for url in urls]\n",
165
+ " return pd.DataFrame(features)\n",
166
+ "\n",
167
+ "url_extractor = URLFeatureExtractor()\n",
168
+ "print(\"βœ“ URL Feature Extractor initialized\")\n",
169
+ "\n",
170
+ "# Test\n",
171
+ "test_features = url_extractor.extract(\"https://suspicious-login.example.com/verify?id=123\")\n",
172
+ "print(f\"\\nTest features extracted: {len(test_features)} features\")"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "markdown",
177
+ "id": "d907161a",
178
+ "metadata": {},
179
+ "source": [
180
+ "## 2. Network Request Feature Extraction\n",
181
+ "\n",
182
+ "Features for HTTP request analysis (aligned with WebScraperAPIService)."
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "id": "191e80a3",
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "class NetworkFeatureExtractor:\n",
193
+ " \"\"\"\n",
194
+ " Extract features from network request data.\n",
195
+ " Matches WebScraperAPIService network_requests format.\n",
196
+ " \"\"\"\n",
197
+ " \n",
198
+ " RISKY_CONTENT_TYPES = ['application/javascript', 'text/javascript', 'application/x-javascript']\n",
199
+ " \n",
200
+ " def extract_from_requests(self, requests: List[Dict]) -> Dict[str, Any]:\n",
201
+ " \"\"\"Extract features from a list of network requests\"\"\"\n",
202
+ " if not requests:\n",
203
+ " return self._empty_features()\n",
204
+ " \n",
205
+ " # Request type counts\n",
206
+ " types = [r.get('type', 'unknown').lower() for r in requests]\n",
207
+ " methods = [r.get('method', 'GET').upper() for r in requests]\n",
208
+ " statuses = [r.get('status', 0) for r in requests]\n",
209
+ " \n",
210
+ " return {\n",
211
+ " # Volume metrics\n",
212
+ " 'total_requests': len(requests),\n",
213
+ " 'script_requests': types.count('script'),\n",
214
+ " 'xhr_requests': types.count('xhr'),\n",
215
+ " 'image_requests': types.count('image'),\n",
216
+ " 'stylesheet_requests': types.count('stylesheet'),\n",
217
+ " 'document_requests': types.count('document'),\n",
218
+ " \n",
219
+ " # Method distribution\n",
220
+ " 'get_requests': methods.count('GET'),\n",
221
+ " 'post_requests': methods.count('POST'),\n",
222
+ " 'other_method_requests': len([m for m in methods if m not in ['GET', 'POST']]),\n",
223
+ " \n",
224
+ " # Status analysis\n",
225
+ " 'successful_requests': sum(1 for s in statuses if 200 <= s < 300),\n",
226
+ " 'redirect_requests': sum(1 for s in statuses if 300 <= s < 400),\n",
227
+ " 'client_error_requests': sum(1 for s in statuses if 400 <= s < 500),\n",
228
+ " 'server_error_requests': sum(1 for s in statuses if s >= 500),\n",
229
+ " 'failed_request_ratio': sum(1 for s in statuses if s >= 400) / max(len(requests), 1),\n",
230
+ " \n",
231
+ " # Size metrics\n",
232
+ " 'total_size_kb': sum(r.get('size', 0) for r in requests) / 1024,\n",
233
+ " 'avg_request_size': np.mean([r.get('size', 0) for r in requests]) if requests else 0,\n",
234
+ " \n",
235
+ " # Domain diversity\n",
236
+ " 'unique_domains': len(set(self._extract_domain(r.get('url', '')) for r in requests)),\n",
237
+ " 'third_party_ratio': self._calculate_third_party_ratio(requests),\n",
238
+ " }\n",
239
+ " \n",
240
+ " def _extract_domain(self, url: str) -> str:\n",
241
+ " try:\n",
242
+ " return urlparse(url).netloc\n",
243
+ "        except Exception:\n",
244
+ " return ''\n",
245
+ " \n",
246
+ " def _calculate_third_party_ratio(self, requests: List[Dict]) -> float:\n",
247
+ " if not requests:\n",
248
+ " return 0.0\n",
249
+ " domains = [self._extract_domain(r.get('url', '')) for r in requests]\n",
250
+ " if not domains:\n",
251
+ " return 0.0\n",
252
+ " main_domain = max(set(domains), key=domains.count) if domains else ''\n",
253
+ " third_party = sum(1 for d in domains if d and d != main_domain)\n",
254
+ " return third_party / len(requests)\n",
255
+ " \n",
256
+ " def _empty_features(self) -> Dict:\n",
257
+ " return {\n",
258
+ " 'total_requests': 0, 'script_requests': 0, 'xhr_requests': 0,\n",
259
+ " 'image_requests': 0, 'stylesheet_requests': 0, 'document_requests': 0,\n",
260
+ " 'get_requests': 0, 'post_requests': 0, 'other_method_requests': 0,\n",
261
+ " 'successful_requests': 0, 'redirect_requests': 0,\n",
262
+ " 'client_error_requests': 0, 'server_error_requests': 0, 'failed_request_ratio': 0,\n",
263
+ " 'total_size_kb': 0, 'avg_request_size': 0,\n",
264
+ " 'unique_domains': 0, 'third_party_ratio': 0\n",
265
+ " }\n",
266
+ "\n",
267
+ "network_extractor = NetworkFeatureExtractor()\n",
268
+ "print(\"βœ“ Network Feature Extractor initialized\")"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "markdown",
273
+ "id": "32d319c6",
274
+ "metadata": {},
275
+ "source": [
276
+ "## 3. Security Header Feature Extraction\n",
277
+ "\n",
278
+ "Features based on HTTP security headers."
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "id": "cddfef62",
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "class SecurityHeaderExtractor:\n",
289
+ " \"\"\"\n",
290
+ " Extract features from HTTP security headers.\n",
291
+ " Aligned with WebScraperAPIService security_report.\n",
292
+ " \"\"\"\n",
293
+ " \n",
294
+ " SECURITY_HEADERS = [\n",
295
+ " 'Content-Security-Policy',\n",
296
+ " 'X-Content-Type-Options',\n",
297
+ " 'X-Frame-Options',\n",
298
+ " 'X-XSS-Protection',\n",
299
+ " 'Strict-Transport-Security',\n",
300
+ " 'Referrer-Policy',\n",
301
+ " 'Permissions-Policy',\n",
302
+ " 'X-Permitted-Cross-Domain-Policies'\n",
303
+ " ]\n",
304
+ " \n",
305
+ " def extract(self, headers: Dict[str, str], security_report: Dict = None) -> Dict[str, Any]:\n",
306
+ " \"\"\"Extract security header features\"\"\"\n",
307
+ " headers_lower = {k.lower(): v for k, v in (headers or {}).items()}\n",
308
+ " \n",
309
+ " features = {}\n",
310
+ " \n",
311
+ " # Check each security header\n",
312
+ " for header in self.SECURITY_HEADERS:\n",
313
+ " key = f\"has_{header.lower().replace('-', '_')}\"\n",
314
+ " features[key] = header.lower() in headers_lower\n",
315
+ " \n",
316
+ " # Aggregate metrics\n",
317
+ " features['security_headers_count'] = sum(1 for h in self.SECURITY_HEADERS if h.lower() in headers_lower)\n",
318
+ " features['security_headers_ratio'] = features['security_headers_count'] / len(self.SECURITY_HEADERS)\n",
319
+ " features['missing_security_headers'] = len(self.SECURITY_HEADERS) - features['security_headers_count']\n",
320
+ " \n",
321
+ " # From security report if available\n",
322
+ " if security_report:\n",
323
+ " features['is_https'] = security_report.get('is_https', False)\n",
324
+ " features['has_mixed_content'] = security_report.get('mixed_content', False)\n",
325
+ " features['has_insecure_cookies'] = security_report.get('insecure_cookies', False)\n",
326
+ " \n",
327
+ " return features\n",
328
+ " \n",
329
+ " def calculate_security_score(self, features: Dict) -> float:\n",
330
+ " \"\"\"Calculate overall security score (0-100)\"\"\"\n",
331
+ " score = 0\n",
332
+ " \n",
333
+ " # Headers (40 points max)\n",
334
+ " score += features.get('security_headers_ratio', 0) * 40\n",
335
+ " \n",
336
+ " # HTTPS (30 points)\n",
337
+ " if features.get('is_https', False):\n",
338
+ " score += 30\n",
339
+ " \n",
340
+ " # No mixed content (15 points)\n",
341
+ " if not features.get('has_mixed_content', True):\n",
342
+ " score += 15\n",
343
+ " \n",
344
+ " # Secure cookies (15 points)\n",
345
+ " if not features.get('has_insecure_cookies', True):\n",
346
+ " score += 15\n",
347
+ " \n",
348
+ " return min(100, max(0, score))\n",
349
+ "\n",
350
+ "header_extractor = SecurityHeaderExtractor()\n",
351
+ "print(\"βœ“ Security Header Extractor initialized\")"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "markdown",
356
+ "id": "c176789d",
357
+ "metadata": {},
358
+ "source": [
359
+ "## 4. JavaScript Behavior Feature Extraction"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": null,
365
+ "id": "7443a87a",
366
+ "metadata": {},
367
+ "outputs": [],
368
+ "source": [
369
+ "class JavaScriptFeatureExtractor:\n",
370
+ " \"\"\"\n",
371
+ " Extract features from JavaScript behavior analysis.\n",
372
+ " Supports desktop app browser monitoring.\n",
373
+ " \"\"\"\n",
374
+ " \n",
375
+ " SUSPICIOUS_APIS = [\n",
376
+ " 'eval', 'document.write', 'innerHTML', 'outerHTML',\n",
377
+ " 'localStorage', 'sessionStorage', 'indexedDB',\n",
378
+ " 'navigator.geolocation', 'navigator.credentials',\n",
379
+ " 'crypto.subtle', 'WebSocket'\n",
380
+ " ]\n",
381
+ " \n",
382
+ " OBFUSCATION_PATTERNS = [\n",
383
+ " r'\\\\x[0-9a-fA-F]{2}', # Hex encoding\n",
384
+ " r'\\\\u[0-9a-fA-F]{4}', # Unicode encoding\n",
385
+ " r'atob\\(', # Base64 decode\n",
386
+ " r'String\\.fromCharCode', # Char code obfuscation\n",
387
+ " r'unescape\\(', # URL decode\n",
388
+ " ]\n",
389
+ " \n",
390
+ " def extract_from_console_logs(self, logs: List[Dict]) -> Dict[str, Any]:\n",
391
+ " \"\"\"Extract features from console logs\"\"\"\n",
392
+ " if not logs:\n",
393
+ " return self._empty_features()\n",
394
+ " \n",
395
+ " levels = [log.get('level', 'log').lower() for log in logs]\n",
396
+ " messages = [log.get('message', '') for log in logs]\n",
397
+ " all_text = ' '.join(messages)\n",
398
+ " \n",
399
+ " return {\n",
400
+ " 'console_log_count': len(logs),\n",
401
+ " 'console_error_count': levels.count('error'),\n",
402
+ " 'console_warning_count': levels.count('warning'),\n",
403
+ " 'console_info_count': levels.count('info'),\n",
404
+ " 'error_ratio': levels.count('error') / max(len(logs), 1),\n",
405
+ " 'has_security_errors': any('security' in m.lower() or 'cors' in m.lower() for m in messages),\n",
406
+ " 'has_csp_violations': any('content security policy' in m.lower() for m in messages),\n",
407
+ " }\n",
408
+ " \n",
409
+ " def analyze_script_content(self, script: str) -> Dict[str, Any]:\n",
410
+ " \"\"\"Analyze JavaScript code for suspicious patterns\"\"\"\n",
411
+ " if not script:\n",
412
+ " return self._empty_script_features()\n",
413
+ " \n",
414
+ " return {\n",
415
+ " 'script_length': len(script),\n",
416
+ " 'suspicious_api_count': sum(1 for api in self.SUSPICIOUS_APIS if api in script),\n",
417
+ " 'obfuscation_score': sum(len(re.findall(p, script)) for p in self.OBFUSCATION_PATTERNS),\n",
418
+ " 'has_eval': 'eval(' in script or 'eval (' in script,\n",
419
+ " 'has_document_write': 'document.write' in script,\n",
420
+ " 'has_inline_event_handlers': bool(re.search(r'on\\w+\\s*=', script)),\n",
421
+ " 'external_url_count': len(re.findall(r'https?://[^\\s\"\\')]+', script)),\n",
422
+ " 'function_count': len(re.findall(r'function\\s*\\w*\\s*\\(', script)),\n",
423
+ " }\n",
424
+ " \n",
425
+ " def _empty_features(self) -> Dict:\n",
426
+ " return {\n",
427
+ " 'console_log_count': 0, 'console_error_count': 0, 'console_warning_count': 0,\n",
428
+ " 'console_info_count': 0, 'error_ratio': 0, 'has_security_errors': False,\n",
429
+ " 'has_csp_violations': False\n",
430
+ " }\n",
431
+ " \n",
432
+ " def _empty_script_features(self) -> Dict:\n",
433
+ " return {\n",
434
+ " 'script_length': 0, 'suspicious_api_count': 0, 'obfuscation_score': 0,\n",
435
+ " 'has_eval': False, 'has_document_write': False, 'has_inline_event_handlers': False,\n",
436
+ " 'external_url_count': 0, 'function_count': 0\n",
437
+ " }\n",
438
+ "\n",
439
+ "js_extractor = JavaScriptFeatureExtractor()\n",
440
+ "print(\"βœ“ JavaScript Feature Extractor initialized\")"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "markdown",
445
+ "id": "5b31de89",
446
+ "metadata": {},
447
+ "source": [
448
+ "## 5. Unified Feature Pipeline"
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "code",
453
+ "execution_count": null,
454
+ "id": "b9fd30ae",
455
+ "metadata": {},
456
+ "outputs": [],
457
+ "source": [
458
+ "class CyberForgeFeaturePipeline:\n",
459
+ " \"\"\"\n",
460
+ " Unified feature extraction pipeline for CyberForge AI.\n",
461
+ " Combines all extractors for comprehensive security feature engineering.\n",
462
+ " \"\"\"\n",
463
+ " \n",
464
+ " def __init__(self):\n",
465
+ " self.url_extractor = URLFeatureExtractor()\n",
466
+ " self.network_extractor = NetworkFeatureExtractor()\n",
467
+ " self.header_extractor = SecurityHeaderExtractor()\n",
468
+ " self.js_extractor = JavaScriptFeatureExtractor()\n",
469
+ " self.scaler = StandardScaler()\n",
470
+ " self.label_encoder = LabelEncoder()\n",
471
+ " self.feature_names = []\n",
472
+ " \n",
473
+ " def extract_website_features(self, scraped_data: Dict) -> Dict[str, Any]:\n",
474
+ " \"\"\"Extract all features from website scraped data\"\"\"\n",
475
+ " features = {}\n",
476
+ " \n",
477
+ " # URL features\n",
478
+ " url_features = self.url_extractor.extract(scraped_data.get('url', ''))\n",
479
+ " features.update({f\"url_{k}\": v for k, v in url_features.items() if k != 'tld'})\n",
480
+ " \n",
481
+ " # Network features\n",
482
+ " network_features = self.network_extractor.extract_from_requests(\n",
483
+ " scraped_data.get('network_requests', [])\n",
484
+ " )\n",
485
+ " features.update({f\"net_{k}\": v for k, v in network_features.items()})\n",
486
+ " \n",
487
+ " # Security header features\n",
488
+ " header_features = self.header_extractor.extract(\n",
489
+ " scraped_data.get('response_headers', {}),\n",
490
+ " scraped_data.get('security_report', {})\n",
491
+ " )\n",
492
+ " features.update({f\"sec_{k}\": v for k, v in header_features.items()})\n",
493
+ " \n",
494
+ " # JavaScript features\n",
495
+ " js_features = self.js_extractor.extract_from_console_logs(\n",
496
+ " scraped_data.get('console_logs', [])\n",
497
+ " )\n",
498
+ " features.update({f\"js_{k}\": v for k, v in js_features.items()})\n",
499
+ " \n",
500
+ " # Calculate risk score\n",
501
+ " features['security_score'] = self.header_extractor.calculate_security_score(header_features)\n",
502
+ " \n",
503
+ " return features\n",
504
+ " \n",
505
+ " def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:\n",
506
+ " \"\"\"Process a dataset and extract URL features\"\"\"\n",
507
+ " if url_column not in df.columns:\n",
508
+ " print(f\" ⚠ No '{url_column}' column found\")\n",
509
+ " return df\n",
510
+ " \n",
511
+ " # Extract URL features\n",
512
+ " url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))\n",
513
+ " url_df = pd.DataFrame(url_features.tolist())\n",
514
+ "        url_df = url_df.drop(columns=['tld'], errors='ignore')\n",
+ "        url_df.columns = [f\"url_{c}\" for c in url_df.columns]\n",
515
+ " \n",
516
+ " # Combine with original features\n",
517
+ " result = pd.concat([df.reset_index(drop=True), url_df.reset_index(drop=True)], axis=1)\n",
518
+ " \n",
519
+ " return result\n",
520
+ " \n",
521
+ " def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:\n",
522
+ " \"\"\"Prepare features for model training\"\"\"\n",
523
+ " df = df.copy()\n",
524
+ " \n",
525
+ " # Separate features and labels\n",
526
+ " if label_column in df.columns:\n",
527
+ " y = df[label_column]\n",
528
+ " X = df.drop(columns=[label_column])\n",
529
+ " else:\n",
530
+ " y = None\n",
531
+ " X = df\n",
532
+ " \n",
533
+ " # Select numeric columns only\n",
534
+ " numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n",
535
+ " X_numeric = X[numeric_cols].fillna(0)\n",
536
+ " \n",
537
+ " # Convert boolean to int\n",
538
+ " bool_cols = X.select_dtypes(include=[bool]).columns.tolist()\n",
539
+ " for col in bool_cols:\n",
540
+ " X_numeric[col] = X[col].astype(int)\n",
541
+ " \n",
542
+ " self.feature_names = X_numeric.columns.tolist()\n",
543
+ " \n",
544
+ " # Encode labels if present\n",
545
+ " if y is not None:\n",
546
+ " if y.dtype == 'object':\n",
547
+ " y = self.label_encoder.fit_transform(y)\n",
548
+ " else:\n",
549
+ " y = y.values\n",
550
+ " \n",
551
+ " return X_numeric, y\n",
552
+ "\n",
553
+ "pipeline = CyberForgeFeaturePipeline()\n",
554
+ "print(\"βœ“ Feature Pipeline initialized\")"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "markdown",
559
+ "id": "cd70536a",
560
+ "metadata": {},
561
+ "source": [
562
+ "## 6. Process Datasets"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": null,
568
+ "id": "7e334044",
569
+ "metadata": {},
570
+ "outputs": [],
571
+ "source": [
572
+ "# Load manifest\n",
573
+ "manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
574
+ "if manifest_path.exists():\n",
575
+ " with open(manifest_path) as f:\n",
576
+ " manifest = json.load(f)\n",
577
+ " print(f\"βœ“ Loaded manifest with {len(manifest)} datasets\")\n",
578
+ "else:\n",
579
+ " print(\"⚠ No manifest found. Run 01_data_acquisition.ipynb first.\")\n",
580
+ " manifest = []"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "execution_count": null,
586
+ "id": "0b049596",
587
+ "metadata": {},
588
+ "outputs": [],
589
+ "source": [
590
+ "# Process each dataset\n",
591
+ "processed_datasets = {}\n",
592
+ "feature_stats = []\n",
593
+ "\n",
594
+ "print(\"Processing datasets for feature engineering...\\n\")\n",
595
+ "\n",
596
+ "for entry in manifest:\n",
597
+ " name = entry['name']\n",
598
+ " path = Path(\"..\") / entry['path']\n",
599
+ " \n",
600
+ " if not path.exists():\n",
601
+ " print(f\" ⚠ {name}: File not found\")\n",
602
+ " continue\n",
603
+ " \n",
604
+ " print(f\" Processing: {name}\")\n",
605
+ " \n",
606
+ " try:\n",
607
+ " df = pd.read_csv(path)\n",
608
+ " \n",
609
+ " # Check for URL column to extract URL features\n",
610
+ " url_cols = [c for c in df.columns if 'url' in c.lower()]\n",
611
+ " if url_cols:\n",
612
+ " df = pipeline.process_dataset(df, url_column=url_cols[0])\n",
613
+ " \n",
614
+ " # Prepare for training\n",
615
+ " X, y = pipeline.prepare_for_training(df)\n",
616
+ " \n",
617
+ " processed_datasets[name] = {\n",
618
+ " 'X': X,\n",
619
+ " 'y': y,\n",
620
+ " 'feature_names': pipeline.feature_names,\n",
621
+ " 'n_samples': len(X),\n",
622
+ " 'n_features': len(pipeline.feature_names)\n",
623
+ " }\n",
624
+ " \n",
625
+ " print(f\" βœ“ {len(X)} samples, {len(pipeline.feature_names)} features\")\n",
626
+ " \n",
627
+ " feature_stats.append({\n",
628
+ " 'name': name,\n",
629
+ " 'samples': len(X),\n",
630
+ " 'features': len(pipeline.feature_names),\n",
631
+ " 'has_labels': y is not None\n",
632
+ " })\n",
633
+ " \n",
634
+ " except Exception as e:\n",
635
+ " print(f\" ⚠ Error: {e}\")\n",
636
+ "\n",
637
+ "print(f\"\\nβœ“ Processed {len(processed_datasets)} datasets\")"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "markdown",
642
+ "id": "096db774",
643
+ "metadata": {},
644
+ "source": [
645
+ "## 7. Save Feature-Engineered Data"
646
+ ]
647
+ },
648
+ {
649
+ "cell_type": "code",
650
+ "execution_count": null,
651
+ "id": "9bb49674",
652
+ "metadata": {},
653
+ "outputs": [],
654
+ "source": [
655
+ "import joblib\n",
656
+ "\n",
657
+ "# Save processed datasets\n",
658
+ "feature_manifest = []\n",
659
+ "\n",
660
+ "print(\"Saving feature-engineered datasets...\")\n",
661
+ "\n",
662
+ "for name, data in processed_datasets.items():\n",
663
+ " # Save as parquet for efficiency\n",
664
+ " output_path = FEATURES_DIR / f\"{name}_features.parquet\"\n",
665
+ " \n",
666
+ " # Create dataframe with features\n",
667
+ " df_features = data['X'].copy()\n",
668
+ " if data['y'] is not None:\n",
669
+ " df_features['label'] = data['y']\n",
670
+ " \n",
671
+ " df_features.to_parquet(output_path, index=False)\n",
672
+ " \n",
673
+ " feature_manifest.append({\n",
674
+ " 'name': name,\n",
675
+ " 'path': str(output_path.relative_to(DATASETS_DIR.parent)),\n",
676
+ " 'samples': data['n_samples'],\n",
677
+ " 'features': data['n_features'],\n",
678
+ " 'feature_names': data['feature_names'],\n",
679
+ " 'has_labels': data['y'] is not None\n",
680
+ " })\n",
681
+ " \n",
682
+ " print(f\" βœ“ Saved: {output_path.name}\")\n",
683
+ "\n",
684
+ "# Save feature manifest\n",
685
+ "manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
686
+ "with open(manifest_path, \"w\") as f:\n",
687
+ " json.dump(feature_manifest, f, indent=2)\n",
688
+ "\n",
689
+ "# Save pipeline for inference\n",
690
+ "pipeline_path = FEATURES_DIR / \"feature_pipeline.pkl\"\n",
691
+ "joblib.dump(pipeline, pipeline_path)\n",
692
+ "\n",
693
+ "print(f\"\\nβœ“ Feature manifest saved to: {manifest_path}\")\n",
694
+ "print(f\"βœ“ Feature pipeline saved to: {pipeline_path}\")"
695
+ ]
696
+ },
697
+ {
698
+ "cell_type": "markdown",
699
+ "id": "1fe65eae",
700
+ "metadata": {},
701
+ "source": [
702
+ "## 8. Summary"
703
+ ]
704
+ },
705
+ {
706
+ "cell_type": "code",
707
+ "execution_count": null,
708
+ "id": "02cc2a14",
709
+ "metadata": {},
710
+ "outputs": [],
711
+ "source": [
712
+ "print(\"\\n\" + \"=\" * 60)\n",
713
+ "print(\"FEATURE ENGINEERING COMPLETE\")\n",
714
+ "print(\"=\" * 60)\n",
715
+ "\n",
716
+ "total_samples = sum(d['n_samples'] for d in processed_datasets.values())\n",
717
+ "total_features = max(d['n_features'] for d in processed_datasets.values()) if processed_datasets else 0\n",
718
+ "\n",
719
+ "print(f\"\"\"\n",
720
+ "πŸ”§ Feature Engineering Summary:\n",
721
+ " - Datasets processed: {len(processed_datasets)}\n",
722
+ " - Total samples: {total_samples:,}\n",
723
+ " - Max features: {total_features}\n",
724
+ " - Output directory: {FEATURES_DIR}\n",
725
+ "\n",
726
+ "πŸ“Š Feature Categories:\n",
727
+ " - URL Features: Domain, path, security indicators\n",
728
+ " - Network Features: Request patterns, status codes\n",
729
+ " - Security Headers: CSP, HSTS, X-Frame-Options\n",
730
+ " - JavaScript: Console logs, suspicious APIs\n",
731
+ "\n",
732
+ "πŸ“ Datasets Ready for Training:\"\"\")\n",
733
+ "\n",
734
+ "for entry in feature_manifest:\n",
735
+ " print(f\" βœ“ {entry['name']}: {entry['samples']:,} samples, {entry['features']} features\")\n",
736
+ "\n",
737
+ "print(f\"\"\"\n",
738
+ "Next step:\n",
739
+ " β†’ 03_model_training.ipynb\n",
740
+ "\"\"\")\n",
741
+ "print(\"=\" * 60)"
742
+ ]
743
+ }
744
+ ],
745
+ "metadata": {
746
+ "language_info": {
747
+ "name": "python"
748
+ }
749
+ },
750
+ "nbformat": 4,
751
+ "nbformat_minor": 5
752
+ }