import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Any, Optional
from urllib.parse import urlparse, parse_qs
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Locations searched for the notebook configuration, in priority order.
# The last entry is the fallback: it is opened even when it does not exist,
# so a missing configuration surfaces as FileNotFoundError on that path.
_CONFIG_LOCATIONS = (
    Path("notebook_config.json"),
    Path("/home/user/app/notebooks/notebook_config.json"),
)
config_path = next(
    (candidate for candidate in _CONFIG_LOCATIONS[:-1] if candidate.exists()),
    _CONFIG_LOCATIONS[-1],
)
CONFIG = json.loads(config_path.read_text())

# Directory layout derived from the "datasets_dir" configuration entry.
DATASETS_DIR = Path(CONFIG["datasets_dir"])
PROCESSED_DIR = DATASETS_DIR / "processed"
FEATURES_DIR = DATASETS_DIR / "features"
FEATURES_DIR.mkdir(exist_ok=True)  # the parent (DATASETS_DIR) must already exist

print(f"✓ Configuration loaded")
print(f"✓ Features output: {FEATURES_DIR}")
class URLFeatureExtractor:
    """
    Extract security-relevant features from URLs.
    Aligned with backend ThreatService URL analysis.

    Every call to ``extract`` returns a dict with the same keys as
    ``_empty_features``; invalid input yields that all-zero/False dict.
    """

    # Suspicious patterns from ThreatService
    SUSPICIOUS_KEYWORDS = ['phishing', 'malware', 'suspicious', 'hack', 'scam',
                           'login', 'verify', 'account', 'secure', 'update']
    INJECTION_PATTERNS = [r'data:text/html', r'javascript:', r'vbscript:']
    COMMON_TLDS = ['com', 'org', 'net', 'edu', 'gov']

    # Optional "scheme:" followed by the "//" authority marker at the start of the URL.
    _AUTHORITY_PREFIX = re.compile(r'^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//')

    def __init__(self):
        self.ip_pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')

    @staticmethod
    def _parse_url(url: str):
        """Parse ``url``, treating scheme-less input such as ``example.com/a`` as a host.

        ``urlparse`` only recognises a network location after ``//``; without it
        the host ends up in ``path`` and ``netloc`` is empty. When neither a
        scheme nor a netloc was found, the URL is re-parsed with a ``//`` prefix.
        If that re-parse (or its port) is invalid, the plain parse is kept.
        """
        parsed = urlparse(url)
        if parsed.scheme or parsed.netloc:
            return parsed
        try:
            reparsed = urlparse('//' + url)
            reparsed.port  # accessing .port validates it; raises ValueError when malformed
        except ValueError:
            return parsed
        return reparsed

    def extract(self, url: str) -> Dict[str, Any]:
        """Extract all URL features.

        Args:
            url: The URL to analyse. Non-string or empty values are accepted.

        Returns:
            Feature dict. Non-string/empty input, or any error while parsing,
            returns ``_empty_features()`` instead of raising.
        """
        if not isinstance(url, str) or not url:
            return self._empty_features()

        try:
            parsed = self._parse_url(url)
            extracted = tldextract.extract(url)
            url_lower = url.lower()
            port = parsed.port  # raises ValueError for a non-numeric / out-of-range port
            # Only subtract the two protocol slashes when the URL actually has them;
            # otherwise scheme-less URLs would get a negative depth.
            protocol_slashes = 2 if self._AUTHORITY_PREFIX.match(url) else 0

            features = {
                # Basic URL structure
                'url_length': len(url),
                'domain_length': len(parsed.netloc),
                'path_length': len(parsed.path),
                'query_length': len(parsed.query),

                # Domain analysis
                'subdomain_count': len(extracted.subdomain.split('.')) if extracted.subdomain else 0,
                'domain_depth': url.count('/') - protocol_slashes,
                'has_subdomain': len(extracted.subdomain) > 0,

                # Protocol security
                'is_https': parsed.scheme == 'https',
                'has_port': port is not None,
                'non_standard_port': port not in [None, 80, 443],

                # Suspicious indicators
                'has_ip_address': bool(self.ip_pattern.search(url)),
                'suspicious_keyword_count': sum(1 for kw in self.SUSPICIOUS_KEYWORDS if kw in url_lower),
                'has_injection_pattern': any(re.search(p, url, re.I) for p in self.INJECTION_PATTERNS),

                # Character analysis
                'digit_count': sum(c.isdigit() for c in url),
                'special_char_count': sum(not c.isalnum() and c not in '/:.' for c in url),
                'hyphen_count': url.count('-'),
                'underscore_count': url.count('_'),
                'at_symbol': '@' in url,

                # Query parameters
                'param_count': len(parse_qs(parsed.query)),
                'has_query': len(parsed.query) > 0,

                # TLD analysis
                'tld': extracted.suffix,
                'tld_length': len(extracted.suffix),
                'is_common_tld': extracted.suffix in self.COMMON_TLDS,
            }

            return features

        except Exception:
            # Deliberate best-effort: one bad URL must not abort a batch.
            # NOTE(review): this also hides non-URL failures (e.g. a missing
            # tldextract install) behind all-zero features.
            return self._empty_features()

    def _empty_features(self) -> Dict:
        """Return empty feature dict for invalid URLs (same keys as ``extract``)."""
        return {
            'url_length': 0, 'domain_length': 0, 'path_length': 0, 'query_length': 0,
            'subdomain_count': 0, 'domain_depth': 0, 'has_subdomain': False,
            'is_https': False, 'has_port': False, 'non_standard_port': False,
            'has_ip_address': False, 'suspicious_keyword_count': 0, 'has_injection_pattern': False,
            'digit_count': 0, 'special_char_count': 0, 'hyphen_count': 0, 'underscore_count': 0,
            'at_symbol': False, 'param_count': 0, 'has_query': False,
            'tld': '', 'tld_length': 0, 'is_common_tld': False
        }

    def extract_batch(self, urls: List[str]) -> pd.DataFrame:
        """Extract features from multiple URLs, one DataFrame row per URL."""
        features = [self.extract(url) for url in urls]
        return pd.DataFrame(features)

url_extractor = URLFeatureExtractor()
print("✓ URL Feature Extractor initialized")

# Test
test_features = url_extractor.extract("https://suspicious-login.example.com/verify?id=123")
print(f"\nTest features extracted: {len(test_features)} features")
class NetworkFeatureExtractor:
    """
    Extract features from network request data.
    Matches WebScraperAPIService network_requests format.

    Each request is read as a dict with optional ``type``, ``method``,
    ``status``, ``size`` and ``url`` keys; missing or ``None`` values fall back
    to ``'unknown'``, ``'GET'``, ``0``, ``0`` and ``''`` respectively.
    """

    RISKY_CONTENT_TYPES = ['application/javascript', 'text/javascript', 'application/x-javascript']

    @staticmethod
    def _to_number(value, default=0):
        """Return ``value`` as a number; non-numeric or missing values become ``default``."""
        if isinstance(value, (int, float)):
            return value
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def extract_from_requests(self, requests: List[Dict]) -> Dict[str, Any]:
        """Extract features from a list of network requests.

        Args:
            requests: Request dicts; ``None`` or an empty list is accepted.

        Returns:
            Dict with the same keys as ``_empty_features``.
        """
        if not requests:
            return self._empty_features()

        total = len(requests)
        # `or` (rather than a .get default) also covers keys present with a None value.
        types = [str(r.get('type') or 'unknown').lower() for r in requests]
        methods = [str(r.get('method') or 'GET').upper() for r in requests]
        statuses = [self._to_number(r.get('status')) for r in requests]
        sizes = [self._to_number(r.get('size')) for r in requests]
        domains = [self._extract_domain(r.get('url', '')) for r in requests]

        return {
            # Volume metrics
            'total_requests': total,
            'script_requests': types.count('script'),
            'xhr_requests': types.count('xhr'),
            'image_requests': types.count('image'),
            'stylesheet_requests': types.count('stylesheet'),
            'document_requests': types.count('document'),

            # Method distribution
            'get_requests': methods.count('GET'),
            'post_requests': methods.count('POST'),
            'other_method_requests': len([m for m in methods if m not in ['GET', 'POST']]),

            # Status analysis
            'successful_requests': sum(1 for s in statuses if 200 <= s < 300),
            'redirect_requests': sum(1 for s in statuses if 300 <= s < 400),
            'client_error_requests': sum(1 for s in statuses if 400 <= s < 500),
            'server_error_requests': sum(1 for s in statuses if s >= 500),
            'failed_request_ratio': sum(1 for s in statuses if s >= 400) / total,

            # Size metrics
            'total_size_kb': sum(sizes) / 1024,
            'avg_request_size': np.mean(sizes),

            # Domain diversity (requests without a parseable host are not a domain)
            'unique_domains': len({d for d in domains if d}),
            'third_party_ratio': self._calculate_third_party_ratio(requests),
        }

    def _extract_domain(self, url: str) -> str:
        """Return the network location of ``url``, or ``''`` when it has none or is invalid."""
        if not isinstance(url, str):
            return ''
        try:
            return urlparse(url).netloc
        except ValueError:  # e.g. malformed IPv6 literal
            return ''

    def _calculate_third_party_ratio(self, requests: List[Dict]) -> float:
        """Fraction of requests whose domain differs from the most frequent domain.

        The most frequent non-empty domain is taken as the first-party domain;
        ties resolve to the domain seen first, so the result is deterministic.
        Requests without a domain are never counted as third party but still
        count towards the denominator.
        """
        if not requests:
            return 0.0
        domains = [self._extract_domain(r.get('url', '')) for r in requests]
        counts: Dict[str, int] = {}
        for domain in domains:
            if domain:
                counts[domain] = counts.get(domain, 0) + 1
        if not counts:
            return 0.0
        # dicts keep insertion order and max() returns the first maximum.
        main_domain = max(counts, key=counts.get)
        third_party = sum(n for domain, n in counts.items() if domain != main_domain)
        return third_party / len(requests)

    def _empty_features(self) -> Dict:
        """Return the all-zero feature dict used when there are no requests."""
        return {
            'total_requests': 0, 'script_requests': 0, 'xhr_requests': 0,
            'image_requests': 0, 'stylesheet_requests': 0, 'document_requests': 0,
            'get_requests': 0, 'post_requests': 0, 'other_method_requests': 0,
            'successful_requests': 0, 'redirect_requests': 0,
            'client_error_requests': 0, 'server_error_requests': 0, 'failed_request_ratio': 0,
            'total_size_kb': 0, 'avg_request_size': 0,
            'unique_domains': 0, 'third_party_ratio': 0
        }

network_extractor = NetworkFeatureExtractor()
print("✓ Network Feature Extractor initialized")
class SecurityHeaderExtractor:
    """
    Extract features from HTTP security headers.
    Aligned with WebScraperAPIService security_report.
    """

    SECURITY_HEADERS = [
        'Content-Security-Policy',
        'X-Content-Type-Options',
        'X-Frame-Options',
        'X-XSS-Protection',
        'Strict-Transport-Security',
        'Referrer-Policy',
        'Permissions-Policy',
        'X-Permitted-Cross-Domain-Policies'
    ]

    def extract(self, headers: Dict[str, str], security_report: Dict = None) -> Dict[str, Any]:
        """Extract security header features.

        Args:
            headers: Response headers; names are matched case-insensitively.
                ``None`` is treated as no headers.
            security_report: Optional dict read for ``is_https``,
                ``mixed_content`` and ``insecure_cookies``.

        Returns:
            One ``has_<header>`` flag per entry in ``SECURITY_HEADERS`` plus
            count/ratio/missing aggregates. The three report-derived keys are
            only present when ``security_report`` is non-empty.
        """
        present = {name.lower() for name in (headers or {})}

        features = {}
        # Only presence is checked; header values are not inspected.
        for header in self.SECURITY_HEADERS:
            key = f"has_{header.lower().replace('-', '_')}"
            features[key] = header.lower() in present

        # Aggregate metrics
        found = sum(1 for h in self.SECURITY_HEADERS if h.lower() in present)
        features['security_headers_count'] = found
        features['security_headers_ratio'] = found / len(self.SECURITY_HEADERS)
        features['missing_security_headers'] = len(self.SECURITY_HEADERS) - found

        # From security report if available.
        # NOTE(review): without a report these keys are absent, so the feature
        # set differs between rows with and without a report.
        if security_report:
            features['is_https'] = security_report.get('is_https', False)
            features['has_mixed_content'] = security_report.get('mixed_content', False)
            features['has_insecure_cookies'] = security_report.get('insecure_cookies', False)

        return features

    def calculate_security_score(self, features: Dict) -> float:
        """Calculate overall security score (0-100).

        Weights: header ratio up to 40, HTTPS 30, no mixed content 15, no
        insecure cookies 15. Missing mixed-content / cookie keys are treated
        as insecure, so they earn no points.
        """
        score = features.get('security_headers_ratio', 0) * 40

        if features.get('is_https', False):
            score += 30
        if not features.get('has_mixed_content', True):
            score += 15
        if not features.get('has_insecure_cookies', True):
            score += 15

        return min(100, max(0, score))

header_extractor = SecurityHeaderExtractor()
print("✓ Security Header Extractor initialized")


class JavaScriptFeatureExtractor:
    """
    Extract features from JavaScript behavior analysis.
    Supports desktop app browser monitoring.
    """

    SUSPICIOUS_APIS = [
        'eval', 'document.write', 'innerHTML', 'outerHTML',
        'localStorage', 'sessionStorage', 'indexedDB',
        'navigator.geolocation', 'navigator.credentials',
        'crypto.subtle', 'WebSocket'
    ]

    OBFUSCATION_PATTERNS = [
        r'\\x[0-9a-fA-F]{2}',      # Hex encoding
        r'\\u[0-9a-fA-F]{4}',      # Unicode encoding
        r'atob\(',                 # Base64 decode
        r'String\.fromCharCode',   # Char code obfuscation
        r'unescape\(',             # URL decode
    ]

    def extract_from_console_logs(self, logs: List[Dict]) -> Dict[str, Any]:
        """Extract features from console logs.

        Args:
            logs: Dicts read for ``level`` and ``message``. Missing or ``None``
                values fall back to ``'log'`` and ``''``; non-string values are
                converted with ``str`` so a malformed entry cannot raise.

        Returns:
            Dict with the same keys as ``_empty_features``.
        """
        if not logs:
            return self._empty_features()

        levels = [str(log.get('level') or 'log').lower() for log in logs]
        messages = [str(log.get('message') or '').lower() for log in logs]
        error_count = levels.count('error')

        return {
            'console_log_count': len(logs),
            'console_error_count': error_count,
            'console_warning_count': levels.count('warning'),
            'console_info_count': levels.count('info'),
            'error_ratio': error_count / max(len(logs), 1),
            'has_security_errors': any('security' in m or 'cors' in m for m in messages),
            'has_csp_violations': any('content security policy' in m for m in messages),
        }

    def analyze_script_content(self, script: str) -> Dict[str, Any]:
        """Analyze JavaScript code for suspicious patterns.

        All checks are plain substring or regex matches on the source text, so
        they are heuristics and can match inside unrelated identifiers.
        Empty input returns ``_empty_script_features()``.
        """
        if not script:
            return self._empty_script_features()

        return {
            'script_length': len(script),
            'suspicious_api_count': sum(1 for api in self.SUSPICIOUS_APIS if api in script),
            'obfuscation_score': sum(len(re.findall(p, script)) for p in self.OBFUSCATION_PATTERNS),
            'has_eval': 'eval(' in script or 'eval (' in script,
            'has_document_write': 'document.write' in script,
            'has_inline_event_handlers': bool(re.search(r'on\w+\s*=', script)),
            'external_url_count': len(re.findall(r'https?://[^\s"\')]+', script)),
            'function_count': len(re.findall(r'function\s*\w*\s*\(', script)),
        }

    def _empty_features(self) -> Dict:
        """Return the zeroed console-log feature dict."""
        return {
            'console_log_count': 0, 'console_error_count': 0, 'console_warning_count': 0,
            'console_info_count': 0, 'error_ratio': 0, 'has_security_errors': False,
            'has_csp_violations': False
        }

    def _empty_script_features(self) -> Dict:
        """Return the zeroed script-content feature dict."""
        return {
            'script_length': 0, 'suspicious_api_count': 0, 'obfuscation_score': 0,
            'has_eval': False, 'has_document_write': False, 'has_inline_event_handlers': False,
            'external_url_count': 0, 'function_count': 0
        }

js_extractor = JavaScriptFeatureExtractor()
print("✓ JavaScript Feature Extractor initialized")
class CyberForgeFeaturePipeline:
    """
    Unified feature extraction pipeline.
    Combines all extractors for complete feature engineering.

    The constructor binds the module-level extractor instances
    (``url_extractor``, ``network_extractor``, ``header_extractor``,
    ``js_extractor``), so those must exist before instantiation.
    """

    # Column names (compared case-insensitively) treated as labels when the
    # requested ``label_column`` is not present in the frame.
    LABEL_CANDIDATES = ('label', 'target', 'class', 'is_malicious', 'attack_type',
                        'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y')

    def __init__(self):
        self.url_extractor = url_extractor
        self.network_extractor = network_extractor
        self.header_extractor = header_extractor
        self.js_extractor = js_extractor
        self.scaler = StandardScaler()        # created here; not fitted by any method of this class
        self.label_encoder = LabelEncoder()   # re-fitted on every prepare_for_training call with object labels
        self.feature_names = []

    def extract_website_features(self, scraped_data: Dict) -> Dict[str, Any]:
        """Extract all features from website scraped data.

        Reads ``url``, ``network_requests``, ``response_headers``,
        ``security_report`` and ``console_logs`` from ``scraped_data`` (each
        optional) and returns one flat dict. Keys are prefixed ``url_``,
        ``net_``, ``sec_`` and ``js_``; the string ``tld`` URL feature is
        omitted and an unprefixed ``security_score`` is added.
        """
        features = {}

        # URL features
        url_features = self.url_extractor.extract(scraped_data.get('url', ''))
        features.update({f"url_{k}": v for k, v in url_features.items() if k != 'tld'})

        # Network features
        network_features = self.network_extractor.extract_from_requests(
            scraped_data.get('network_requests', [])
        )
        features.update({f"net_{k}": v for k, v in network_features.items()})

        # Security header features
        header_features = self.header_extractor.extract(
            scraped_data.get('response_headers', {}),
            scraped_data.get('security_report', {})
        )
        features.update({f"sec_{k}": v for k, v in header_features.items()})

        # JavaScript features
        js_features = self.js_extractor.extract_from_console_logs(
            scraped_data.get('console_logs', [])
        )
        features.update({f"js_{k}": v for k, v in js_features.items()})

        # Calculate risk score
        features['security_score'] = self.header_extractor.calculate_security_score(header_features)

        return features

    def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:
        """Process a dataset and extract URL features.

        Returns a new frame in which ``url_column`` is replaced by ``url_``
        prefixed feature columns (without ``tld``) and the index is reset.
        If the column is missing or extraction fails, a warning is printed and
        ``df`` is returned unchanged.
        """
        if url_column not in df.columns:
            print(f" Warning: No '{url_column}' column found")
            return df

        try:
            # Extract URL features
            url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))
            url_df = pd.DataFrame(url_features.tolist())

            # Drop non-numeric 'tld' column before renaming
            if 'tld' in url_df.columns:
                url_df = url_df.drop(columns=['tld'])

            # Rename columns with url_ prefix
            url_df.columns = [f"url_{c}" for c in url_df.columns]

            # Both sides get a fresh 0..n-1 index so the concat aligns row by row.
            result_df = df.drop(columns=[url_column]).reset_index(drop=True)
            result = pd.concat([result_df, url_df.reset_index(drop=True)], axis=1)

            return result
        except Exception as e:
            print(f" Warning: URL feature extraction error: {e}")
            return df

    def _find_label_column(self, df: pd.DataFrame, label_column: str) -> Optional[Any]:
        """Return the label column of ``df``, or ``None`` when there is none.

        Priority: exact ``label_column`` match, then a case-insensitive match of
        ``label_column``, then the first column (in frame order) whose name is
        in ``LABEL_CANDIDATES``.
        """
        columns = list(df.columns)
        if label_column in columns:
            return label_column
        if label_column:
            wanted = str(label_column).lower()
            for col in columns:
                if str(col).lower() == wanted:
                    return col
        for col in columns:
            # str() so non-string column names (e.g. integers) cannot raise.
            if str(col).lower() in self.LABEL_CANDIDATES:
                return col
        return None

    def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:
        """Prepare features for model training.

        Args:
            df: Input frame; it is copied, never modified.
            label_column: Preferred label column. When absent, well-known label
                names are tried (see ``_find_label_column``).

        Returns:
            ``(X, y)``: ``X`` holds the numeric columns (NaN filled with 0)
            followed by boolean columns cast to int; other dtypes are dropped.
            ``y`` is ``None`` without a label column, the encoded labels for
            object-dtype labels, otherwise the raw values.

        Side effects:
            Sets ``self.feature_names`` to the columns of ``X`` and re-fits
            ``self.label_encoder`` when labels are object dtype.
        """
        df = df.copy()

        # Separate features and labels
        actual_label_col = self._find_label_column(df, label_column)
        if actual_label_col is not None:
            y = df[actual_label_col]
            X = df.drop(columns=[actual_label_col])
        else:
            y = None
            X = df

        # Select numeric columns only
        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        bool_cols = X.select_dtypes(include=[bool]).columns.tolist()

        X_numeric = X[numeric_cols].fillna(0)

        # Convert boolean to int
        for col in bool_cols:
            X_numeric[col] = X[col].astype(int)

        self.feature_names = X_numeric.columns.tolist()

        # Encode labels if present
        if y is not None:
            if y.dtype == 'object':
                y = self.label_encoder.fit_transform(y)
            else:
                y = y.values

        return X_numeric, y
pipeline = CyberForgeFeaturePipeline()
print("Feature Pipeline initialized")

# Load manifest
manifest_path = PROCESSED_DIR / "manifest.json"
if manifest_path.exists():
    with open(manifest_path) as f:
        manifest = json.load(f)
    print(f"✓ Loaded manifest with {len(manifest)} datasets")
else:
    print("⚠ No manifest found. Run 01_data_acquisition.ipynb first.")
    manifest = []

# Column names (lower-cased) recognised as labels; same list the pipeline uses.
LABEL_CANDIDATES = {'label', 'target', 'class', 'is_malicious', 'attack_type',
                    'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y'}


def _is_text_column(series: pd.Series) -> bool:
    """True when ``series`` is stored as object or pandas string dtype."""
    return pd.api.types.is_object_dtype(series) or isinstance(series.dtype, pd.StringDtype)


# Process each dataset
processed_datasets = {}
feature_stats = []

print("Processing datasets for feature engineering...\n")

for entry in manifest:
    name = entry['name']
    # NOTE(review): manifest paths are resolved relative to the parent of the
    # working directory — confirm this matches how 01_data_acquisition writes them.
    path = Path("..") / entry['path']

    if not path.exists():
        print(f" ⚠ {name}: File not found")
        continue

    print(f" Processing: {name}")

    try:
        df = pd.read_csv(path)

        # IMPORTANT: Extract and preserve label column BEFORE processing
        original_label = None
        label_col_name = None
        for col in df.columns:
            if str(col).lower() in LABEL_CANDIDATES:
                original_label = df[col].copy()
                label_col_name = col
                print(f" Found label column: {col}")
                break

        # Only text columns can hold URLs. A numeric column that merely has
        # "url" in its name (e.g. a precomputed url_length) must stay a feature
        # instead of being dropped and replaced by all-zero URL features.
        url_cols = [c for c in df.columns
                    if 'url' in str(c).lower() and _is_text_column(df[c])]
        if url_cols:
            df = pipeline.process_dataset(df, url_column=url_cols[0])

        # Prepare for training
        X, y = pipeline.prepare_for_training(df)

        # If y is None but we found original_label, use that
        if y is None and original_label is not None:
            y = original_label.values
            print(f" Restored label from original: {label_col_name}")

        processed_datasets[name] = {
            'X': X,
            'y': y,
            'feature_names': pipeline.feature_names,
            'n_samples': len(X),
            'n_features': len(pipeline.feature_names)
        }

        label_status = "with labels" if y is not None else "no labels"
        print(f" ✓ {len(X)} samples, {len(pipeline.feature_names)} features ({label_status})")

        feature_stats.append({
            'name': name,
            'samples': len(X),
            'features': len(pipeline.feature_names),
            'has_labels': y is not None
        })

    except Exception as e:
        # Best-effort: a broken dataset is reported and skipped, not fatal.
        print(f" ⚠ Error: {e}")

print(f"\n✓ Processed {len(processed_datasets)} datasets")
import joblib

# Write one parquet file per processed dataset and collect a manifest entry
# describing each file for the next notebook.
feature_manifest = []

print("Saving feature-engineered datasets...")

for dataset_name, bundle in processed_datasets.items():
    output_path = FEATURES_DIR / f"{dataset_name}_features.parquet"
    labels = bundle['y']
    has_labels = labels is not None

    # Features plus, when available, the labels in a 'label' column.
    df_features = bundle['X'].copy()
    if has_labels:
        df_features['label'] = labels

    df_features.to_parquet(output_path, index=False)

    feature_manifest.append({
        'name': dataset_name,
        # Stored relative to the parent of the datasets directory.
        'path': str(output_path.relative_to(DATASETS_DIR.parent)),
        'samples': bundle['n_samples'],
        'features': bundle['n_features'],
        'feature_names': bundle['feature_names'],
        'has_labels': has_labels
    })

    print(f" ✓ Saved: {output_path.name}")

# Save feature manifest
manifest_path = FEATURES_DIR / "feature_manifest.json"
manifest_path.write_text(json.dumps(feature_manifest, indent=2))

# Save pipeline for inference.
# NOTE(review): the pipeline class is defined in this notebook, so loading the
# pickle elsewhere needs the same class definitions importable — confirm the
# inference side provides them.
pipeline_path = FEATURES_DIR / "feature_pipeline.pkl"
joblib.dump(pipeline, pipeline_path)

print(f"\n✓ Feature manifest saved to: {manifest_path}")
print(f"✓ Feature pipeline saved to: {pipeline_path}")
# Final report: totals across all processed datasets, then one line per saved file.
banner = "=" * 60

print("\n" + banner)
print("FEATURE ENGINEERING COMPLETE")
print(banner)

dataset_infos = list(processed_datasets.values())
total_samples = sum(info['n_samples'] for info in dataset_infos)
# max() of an empty sequence raises, so fall back to 0 when nothing was processed.
total_features = max(info['n_features'] for info in dataset_infos) if processed_datasets else 0

summary_lines = [
    "",
    "🔧 Feature Engineering Summary:",
    f" - Datasets processed: {len(processed_datasets)}",
    f" - Total samples: {total_samples:,}",
    f" - Max features: {total_features}",
    f" - Output directory: {FEATURES_DIR}",
    "",
    "📊 Feature Categories:",
    " - URL Features: Domain, path, security indicators",
    " - Network Features: Request patterns, status codes",
    " - Security Headers: CSP, HSTS, X-Frame-Options",
    " - JavaScript: Console logs, suspicious APIs",
    "",
    "📁 Datasets Ready for Training:",
]
print("\n".join(summary_lines))

for entry in feature_manifest:
    print(f" ✓ {entry['name']}: {entry['samples']:,} samples, {entry['features']} features")

next_step_lines = ["", "Next step:", " → 03_model_training.ipynb", ""]
print("\n".join(next_step_lines))
print(banner)