Spaces:

Che237
/

cyberforge

Running

File size: 32,688 Bytes

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b101ef36",
   "metadata": {},
   "source": [
    "# 02 - Feature Engineering\n",
    "\n",
    "## CyberForge AI - Security-Focused Feature Extraction\n",
    "\n",
    "This notebook performs feature engineering for cybersecurity ML models.\n",
    "\n",
    "### Feature Categories:\n",
    "1. **URL Features** - Domain, path, query analysis\n",
    "2. **Network Features** - Request patterns, headers, protocols\n",
    "3. **JavaScript Behavior** - Script patterns, suspicious calls\n",
    "4. **Browser Artifacts** - Cookies, localStorage, fingerprinting (listed for completeness; not extracted in this notebook)\n",
    "5. **Security Indicators** - SSL, headers, CSP\n",
    "\n",
    "### Alignment with Backend:\n",
    "- Features match WebScraperAPIService output format\n",
    "- Compatible with ThreatService detection patterns\n",
    "- Supports real-time inference requirements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13b7ad76",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "from typing import Dict, List, Any, Optional\n",
    "from urllib.parse import urlparse, parse_qs\n",
    "import re\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Load configuration: prefer a config file in the working directory,\n",
    "# otherwise fall back to the absolute /home/user/app/notebooks location.\n",
    "config_path = Path(\"notebook_config.json\")\n",
    "if not config_path.exists():\n",
    "    config_path = Path(\"/home/user/app/notebooks/notebook_config.json\")\n",
    "# JSON is UTF-8 by specification; do not depend on the platform default encoding.\n",
    "with open(config_path, encoding=\"utf-8\") as f:\n",
    "    CONFIG = json.load(f)\n",
    "\n",
    "DATASETS_DIR = Path(CONFIG[\"datasets_dir\"])\n",
    "PROCESSED_DIR = DATASETS_DIR / \"processed\"\n",
    "FEATURES_DIR = DATASETS_DIR / \"features\"\n",
    "# parents=True: create the datasets directory as well when it is missing,\n",
    "# instead of failing with FileNotFoundError.\n",
    "FEATURES_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "print(\"\u2713 Configuration loaded\")\n",
    "print(f\"\u2713 Features output: {FEATURES_DIR}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1a336f82",
   "metadata": {},
   "source": [
    "## 1. URL Feature Extraction\n",
    "\n",
    "Extract security-relevant features from URLs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aab702d",
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    import tldextract\n",
    "except ImportError:\n",
    "    # Install with the interpreter that runs this kernel: a bare `pip` on PATH\n",
    "    # may belong to a different environment. check=True surfaces a failed\n",
    "    # install here instead of as a second, unexplained ImportError below.\n",
    "    import subprocess\n",
    "    import sys\n",
    "    subprocess.run([sys.executable, '-m', 'pip', 'install', 'tldextract', '-q'], check=True)\n",
    "    import tldextract\n",
    "\n",
    "class URLFeatureExtractor:\n",
    "    \"\"\"\n",
    "    Extract security-relevant features from URLs.\n",
    "    Aligned with backend ThreatService URL analysis.\n",
    "\n",
    "    extract() always returns the same set of keys, so extract_batch()\n",
    "    yields a rectangular DataFrame.\n",
    "    \"\"\"\n",
    "    \n",
    "    # Suspicious patterns from ThreatService\n",
    "    SUSPICIOUS_KEYWORDS = ['phishing', 'malware', 'suspicious', 'hack', 'scam', \n",
    "                          'login', 'verify', 'account', 'secure', 'update']\n",
    "    INJECTION_PATTERNS = [r'data:text/html', r'javascript:', r'vbscript:']\n",
    "    \n",
    "    def __init__(self):\n",
    "        # Dotted-quad lookalike (octet ranges are not validated); it is searched\n",
    "        # across the whole URL string, not only the host.\n",
    "        self.ip_pattern = re.compile(r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}')\n",
    "    \n",
    "    def extract(self, url: str) -> Dict[str, Any]:\n",
    "        \"\"\"Extract all URL features.\n",
    "\n",
    "        Args:\n",
    "            url: URL to analyse.\n",
    "\n",
    "        Returns:\n",
    "            Dict of features. Non-string or empty input, or a URL that cannot\n",
    "            be parsed, yields the neutral dict from _empty_features().\n",
    "        \"\"\"\n",
    "        if not isinstance(url, str) or not url:\n",
    "            return self._empty_features()\n",
    "        \n",
    "        try:\n",
    "            parsed = urlparse(url)\n",
    "            extracted = tldextract.extract(url)\n",
    "            port = self._safe_port(parsed)\n",
    "            url_lower = url.lower()\n",
    "            # Only URLs with a network location / scheme separator carry the two\n",
    "            # protocol slashes; subtracting them unconditionally made the depth\n",
    "            # negative for scheme-less URLs.\n",
    "            protocol_slashes = 2 if (parsed.netloc or '://' in url) else 0\n",
    "            \n",
    "            features = {\n",
    "                # Basic URL structure\n",
    "                'url_length': len(url),\n",
    "                'domain_length': len(parsed.netloc),\n",
    "                'path_length': len(parsed.path),\n",
    "                'query_length': len(parsed.query),\n",
    "                \n",
    "                # Domain analysis\n",
    "                'subdomain_count': len(extracted.subdomain.split('.')) if extracted.subdomain else 0,\n",
    "                'domain_depth': url.count('/') - protocol_slashes,\n",
    "                'has_subdomain': len(extracted.subdomain) > 0,\n",
    "                \n",
    "                # Protocol security\n",
    "                'is_https': parsed.scheme == 'https',\n",
    "                'has_port': port is not None,\n",
    "                'non_standard_port': port not in (None, 80, 443),\n",
    "                \n",
    "                # Suspicious indicators\n",
    "                'has_ip_address': bool(self.ip_pattern.search(url)),\n",
    "                'suspicious_keyword_count': sum(1 for kw in self.SUSPICIOUS_KEYWORDS if kw in url_lower),\n",
    "                'has_injection_pattern': any(re.search(p, url, re.I) for p in self.INJECTION_PATTERNS),\n",
    "                \n",
    "                # Character analysis\n",
    "                'digit_count': sum(c.isdigit() for c in url),\n",
    "                'special_char_count': sum(not c.isalnum() and c not in '/:.' for c in url),\n",
    "                'hyphen_count': url.count('-'),\n",
    "                'underscore_count': url.count('_'),\n",
    "                'at_symbol': '@' in url,\n",
    "                \n",
    "                # Query parameters\n",
    "                'param_count': len(parse_qs(parsed.query)),\n",
    "                'has_query': len(parsed.query) > 0,\n",
    "                \n",
    "                # TLD analysis\n",
    "                'tld': extracted.suffix,\n",
    "                'tld_length': len(extracted.suffix),\n",
    "                'is_common_tld': extracted.suffix in ['com', 'org', 'net', 'edu', 'gov'],\n",
    "            }\n",
    "            \n",
    "            return features\n",
    "            \n",
    "        except Exception:\n",
    "            # Deliberate best effort: one unparsable URL must not abort a batch.\n",
    "            return self._empty_features()\n",
    "    \n",
    "    @staticmethod\n",
    "    def _safe_port(parsed) -> Optional[int]:\n",
    "        \"\"\"Return the explicit port, None when absent, or -1 when malformed.\n",
    "\n",
    "        Reading .port raises ValueError for a non-numeric or out-of-range port.\n",
    "        Previously that discarded every feature of the URL; the -1 sentinel\n",
    "        keeps the remaining features and still flags the port as present and\n",
    "        non-standard.\n",
    "        \"\"\"\n",
    "        try:\n",
    "            return parsed.port\n",
    "        except ValueError:\n",
    "            return -1\n",
    "    \n",
    "    def _empty_features(self) -> Dict:\n",
    "        \"\"\"Return empty feature dict for invalid URLs (same keys as extract()).\"\"\"\n",
    "        return {\n",
    "            'url_length': 0, 'domain_length': 0, 'path_length': 0, 'query_length': 0,\n",
    "            'subdomain_count': 0, 'domain_depth': 0, 'has_subdomain': False,\n",
    "            'is_https': False, 'has_port': False, 'non_standard_port': False,\n",
    "            'has_ip_address': False, 'suspicious_keyword_count': 0, 'has_injection_pattern': False,\n",
    "            'digit_count': 0, 'special_char_count': 0, 'hyphen_count': 0, 'underscore_count': 0,\n",
    "            'at_symbol': False, 'param_count': 0, 'has_query': False,\n",
    "            'tld': '', 'tld_length': 0, 'is_common_tld': False\n",
    "        }\n",
    "    \n",
    "    def extract_batch(self, urls: List[str]) -> pd.DataFrame:\n",
    "        \"\"\"Extract features from multiple URLs, one DataFrame row per URL.\"\"\"\n",
    "        features = [self.extract(url) for url in urls]\n",
    "        return pd.DataFrame(features)\n",
    "\n",
    "# Module-level instance; CyberForgeFeaturePipeline.__init__ picks it up by name.\n",
    "url_extractor = URLFeatureExtractor()\n",
    "print(\"\u2713 URL Feature Extractor initialized\")\n",
    "\n",
    "# Smoke test: extract features from one sample URL and report how many were produced\n",
    "test_features = url_extractor.extract(\"https://suspicious-login.example.com/verify?id=123\")\n",
    "print(f\"\\nTest features extracted: {len(test_features)} features\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d907161a",
   "metadata": {},
   "source": [
    "## 2. Network Request Feature Extraction\n",
    "\n",
    "Features for HTTP request analysis (aligned with WebScraperAPIService)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "191e80a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "class NetworkFeatureExtractor:\n",
    "    \"\"\"\n",
    "    Extract features from network request data.\n",
    "    Matches WebScraperAPIService network_requests format.\n",
    "\n",
    "    Each request is read as a dict with optional 'type', 'method', 'status',\n",
    "    'size' and 'url' keys. Missing, null or non-numeric values fall back to\n",
    "    neutral defaults instead of raising.\n",
    "    \"\"\"\n",
    "    \n",
    "    RISKY_CONTENT_TYPES = ['application/javascript', 'text/javascript', 'application/x-javascript']\n",
    "    \n",
    "    def extract_from_requests(self, requests: List[Dict]) -> Dict[str, Any]:\n",
    "        \"\"\"Extract features from a list of network requests.\n",
    "\n",
    "        Args:\n",
    "            requests: Request dicts; None or an empty list yields the all-zero\n",
    "                dict from _empty_features().\n",
    "\n",
    "        Returns:\n",
    "            Dict of volume, method, status, size and domain-diversity features.\n",
    "        \"\"\"\n",
    "        if not requests:\n",
    "            return self._empty_features()\n",
    "        \n",
    "        total = len(requests)\n",
    "        # `or default` also covers keys that are present but null.\n",
    "        types = [str(r.get('type') or 'unknown').lower() for r in requests]\n",
    "        methods = [str(r.get('method') or 'GET').upper() for r in requests]\n",
    "        statuses = [self._to_number(r.get('status')) for r in requests]\n",
    "        sizes = [self._to_number(r.get('size')) for r in requests]\n",
    "        # Requests without a parsable host do not count as a domain.\n",
    "        domains = {self._extract_domain(r.get('url', '')) for r in requests} - {''}\n",
    "        \n",
    "        return {\n",
    "            # Volume metrics\n",
    "            'total_requests': total,\n",
    "            'script_requests': types.count('script'),\n",
    "            'xhr_requests': types.count('xhr'),\n",
    "            'image_requests': types.count('image'),\n",
    "            'stylesheet_requests': types.count('stylesheet'),\n",
    "            'document_requests': types.count('document'),\n",
    "            \n",
    "            # Method distribution\n",
    "            'get_requests': methods.count('GET'),\n",
    "            'post_requests': methods.count('POST'),\n",
    "            'other_method_requests': sum(1 for m in methods if m not in ('GET', 'POST')),\n",
    "            \n",
    "            # Status analysis\n",
    "            'successful_requests': sum(1 for s in statuses if 200 <= s < 300),\n",
    "            'redirect_requests': sum(1 for s in statuses if 300 <= s < 400),\n",
    "            'client_error_requests': sum(1 for s in statuses if 400 <= s < 500),\n",
    "            'server_error_requests': sum(1 for s in statuses if s >= 500),\n",
    "            'failed_request_ratio': sum(1 for s in statuses if s >= 400) / total,\n",
    "            \n",
    "            # Size metrics\n",
    "            'total_size_kb': sum(sizes) / 1024,\n",
    "            'avg_request_size': float(np.mean(sizes)),\n",
    "            \n",
    "            # Domain diversity\n",
    "            'unique_domains': len(domains),\n",
    "            'third_party_ratio': self._calculate_third_party_ratio(requests),\n",
    "        }\n",
    "    \n",
    "    @staticmethod\n",
    "    def _to_number(value: Any) -> float:\n",
    "        \"\"\"Coerce a status/size field to a number; 0 for null, non-numeric or NaN.\"\"\"\n",
    "        try:\n",
    "            number = float(value)\n",
    "        except (TypeError, ValueError):\n",
    "            return 0\n",
    "        # NaN is the only value that differs from itself; it would poison sums.\n",
    "        return number if number == number else 0\n",
    "    \n",
    "    def _extract_domain(self, url: str) -> str:\n",
    "        \"\"\"Return the network location of url, or '' when it is unavailable.\"\"\"\n",
    "        if not isinstance(url, str):\n",
    "            return ''\n",
    "        try:\n",
    "            return urlparse(url).netloc\n",
    "        except ValueError:\n",
    "            # urlparse rejects some malformed hosts (e.g. a broken IPv6 literal).\n",
    "            return ''\n",
    "    \n",
    "    def _calculate_third_party_ratio(self, requests: List[Dict]) -> float:\n",
    "        \"\"\"Share of requests that go to a domain other than the most frequent one.\n",
    "\n",
    "        The most frequent non-empty domain is taken as the first party.\n",
    "        Requests without a parsable domain are never counted as third party\n",
    "        and can no longer be selected as the first party.\n",
    "        \"\"\"\n",
    "        if not requests:\n",
    "            return 0.0\n",
    "        counts = {}\n",
    "        for r in requests:\n",
    "            domain = self._extract_domain(r.get('url', ''))\n",
    "            if domain:\n",
    "                counts[domain] = counts.get(domain, 0) + 1\n",
    "        if not counts:\n",
    "            return 0.0\n",
    "        # Ties do not matter: only the size of the largest group is used.\n",
    "        third_party = sum(counts.values()) - max(counts.values())\n",
    "        return third_party / len(requests)\n",
    "    \n",
    "    def _empty_features(self) -> Dict:\n",
    "        \"\"\"Return the all-zero feature dict (same keys as extract_from_requests()).\"\"\"\n",
    "        return {\n",
    "            'total_requests': 0, 'script_requests': 0, 'xhr_requests': 0,\n",
    "            'image_requests': 0, 'stylesheet_requests': 0, 'document_requests': 0,\n",
    "            'get_requests': 0, 'post_requests': 0, 'other_method_requests': 0,\n",
    "            'successful_requests': 0, 'redirect_requests': 0,\n",
    "            'client_error_requests': 0, 'server_error_requests': 0, 'failed_request_ratio': 0,\n",
    "            'total_size_kb': 0, 'avg_request_size': 0,\n",
    "            'unique_domains': 0, 'third_party_ratio': 0\n",
    "        }\n",
    "\n",
    "# Module-level instance; CyberForgeFeaturePipeline.__init__ picks it up by name.\n",
    "network_extractor = NetworkFeatureExtractor()\n",
    "print(\"\u2713 Network Feature Extractor initialized\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32d319c6",
   "metadata": {},
   "source": [
    "## 3. Security Header Feature Extraction\n",
    "\n",
    "Features based on HTTP security headers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cddfef62",
   "metadata": {},
   "outputs": [],
   "source": [
    "class SecurityHeaderExtractor:\n",
    "    \"\"\"\n",
    "    Extract features from HTTP security headers.\n",
    "    Aligned with WebScraperAPIService security_report.\n",
    "\n",
    "    extract() returns the same keys whether or not a security report is\n",
    "    supplied, so rows built from it always share one schema.\n",
    "    \"\"\"\n",
    "    \n",
    "    SECURITY_HEADERS = [\n",
    "        'Content-Security-Policy',\n",
    "        'X-Content-Type-Options',\n",
    "        'X-Frame-Options',\n",
    "        'X-XSS-Protection',\n",
    "        'Strict-Transport-Security',\n",
    "        'Referrer-Policy',\n",
    "        'Permissions-Policy',\n",
    "        'X-Permitted-Cross-Domain-Policies'\n",
    "    ]\n",
    "    \n",
    "    def extract(self, headers: Dict[str, str], security_report: Optional[Dict] = None) -> Dict[str, Any]:\n",
    "        \"\"\"Extract security header features.\n",
    "\n",
    "        Args:\n",
    "            headers: Response headers; names are matched case-insensitively.\n",
    "                None is treated as no headers.\n",
    "            security_report: Optional dict read for 'is_https',\n",
    "                'mixed_content' and 'insecure_cookies'.\n",
    "\n",
    "        Returns:\n",
    "            One has_<header> flag per entry in SECURITY_HEADERS, aggregate\n",
    "            counts, the three report flags (False when no report was given)\n",
    "            and 'has_security_report'.\n",
    "        \"\"\"\n",
    "        headers_lower = {str(k).lower(): v for k, v in (headers or {}).items()}\n",
    "        \n",
    "        features = {}\n",
    "        \n",
    "        # Check each security header\n",
    "        present = 0\n",
    "        for header in self.SECURITY_HEADERS:\n",
    "            key = f\"has_{header.lower().replace('-', '_')}\"\n",
    "            features[key] = header.lower() in headers_lower\n",
    "            present += features[key]\n",
    "        \n",
    "        # Aggregate metrics\n",
    "        features['security_headers_count'] = present\n",
    "        features['security_headers_ratio'] = present / len(self.SECURITY_HEADERS)\n",
    "        features['missing_security_headers'] = len(self.SECURITY_HEADERS) - present\n",
    "        \n",
    "        # Report-derived flags are always emitted: previously they were missing\n",
    "        # whenever the report was absent or empty, which gave rows different keys.\n",
    "        report = security_report or {}\n",
    "        features['is_https'] = report.get('is_https', False)\n",
    "        features['has_mixed_content'] = report.get('mixed_content', False)\n",
    "        features['has_insecure_cookies'] = report.get('insecure_cookies', False)\n",
    "        # Lets the score tell 'no problem reported' apart from 'nothing reported'.\n",
    "        features['has_security_report'] = bool(security_report)\n",
    "        \n",
    "        return features\n",
    "    \n",
    "    def calculate_security_score(self, features: Dict) -> float:\n",
    "        \"\"\"Calculate overall security score (0-100).\n",
    "\n",
    "        Up to 40 points for the header ratio, 30 for HTTPS, 15 for no mixed\n",
    "        content and 15 for no insecure cookies. The last two are only awarded\n",
    "        when a security report backed the features; feature dicts without the\n",
    "        'has_security_report' key are scored exactly as before.\n",
    "        \"\"\"\n",
    "        score = 0\n",
    "        report_present = features.get('has_security_report', True)\n",
    "        \n",
    "        # Headers (40 points max)\n",
    "        score += features.get('security_headers_ratio', 0) * 40\n",
    "        \n",
    "        # HTTPS (30 points)\n",
    "        if features.get('is_https', False):\n",
    "            score += 30\n",
    "        \n",
    "        # No mixed content (15 points)\n",
    "        if report_present and not features.get('has_mixed_content', True):\n",
    "            score += 15\n",
    "        \n",
    "        # Secure cookies (15 points)\n",
    "        if report_present and not features.get('has_insecure_cookies', True):\n",
    "            score += 15\n",
    "        \n",
    "        return min(100, max(0, score))\n",
    "\n",
    "# Module-level instance; CyberForgeFeaturePipeline.__init__ picks it up by name.\n",
    "header_extractor = SecurityHeaderExtractor()\n",
    "print(\"\u2713 Security Header Extractor initialized\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c176789d",
   "metadata": {},
   "source": [
    "## 4. JavaScript Behavior Feature Extraction\n",
    "\n",
    "Features derived from browser console logs and from static patterns in script source."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7443a87a",
   "metadata": {},
   "outputs": [],
   "source": [
    "class JavaScriptFeatureExtractor:\n",
    "    \"\"\"\n",
    "    Extract features from JavaScript behavior analysis.\n",
    "    Supports desktop app browser monitoring.\n",
    "    \"\"\"\n",
    "    \n",
    "    SUSPICIOUS_APIS = [\n",
    "        'eval', 'document.write', 'innerHTML', 'outerHTML',\n",
    "        'localStorage', 'sessionStorage', 'indexedDB',\n",
    "        'navigator.geolocation', 'navigator.credentials',\n",
    "        'crypto.subtle', 'WebSocket'\n",
    "    ]\n",
    "    \n",
    "    OBFUSCATION_PATTERNS = [\n",
    "        r'\\\\x[0-9a-fA-F]{2}',  # Hex encoding\n",
    "        r'\\\\u[0-9a-fA-F]{4}',  # Unicode encoding\n",
    "        r'atob\\(',              # Base64 decode\n",
    "        r'String\\.fromCharCode', # Char code obfuscation\n",
    "        r'unescape\\(',          # URL decode\n",
    "    ]\n",
    "    \n",
    "    def extract_from_console_logs(self, logs: List[Dict]) -> Dict[str, Any]:\n",
    "        \"\"\"Extract features from console logs.\n",
    "\n",
    "        Args:\n",
    "            logs: Dicts read for 'level' and 'message'. Null or non-string\n",
    "                values are tolerated; None or an empty list yields the\n",
    "                all-zero dict from _empty_features().\n",
    "\n",
    "        Returns:\n",
    "            Counts per level, the error ratio and two message-based flags.\n",
    "        \"\"\"\n",
    "        if not logs:\n",
    "            return self._empty_features()\n",
    "        \n",
    "        # `or default` also covers keys that are present but null; str() covers\n",
    "        # non-string payloads such as numbers or objects.\n",
    "        levels = [str(log.get('level') or 'log').lower() for log in logs]\n",
    "        messages = [str(log.get('message') or '').lower() for log in logs]\n",
    "        error_count = levels.count('error')\n",
    "        \n",
    "        return {\n",
    "            'console_log_count': len(logs),\n",
    "            'console_error_count': error_count,\n",
    "            # 'warn' is the console method name and is emitted by some collectors.\n",
    "            'console_warning_count': levels.count('warning') + levels.count('warn'),\n",
    "            'console_info_count': levels.count('info'),\n",
    "            'error_ratio': error_count / len(logs),\n",
    "            'has_security_errors': any('security' in m or 'cors' in m for m in messages),\n",
    "            'has_csp_violations': any('content security policy' in m for m in messages),\n",
    "        }\n",
    "    \n",
    "    def analyze_script_content(self, script: str) -> Dict[str, Any]:\n",
    "        \"\"\"Analyze JavaScript code for suspicious patterns.\n",
    "\n",
    "        Args:\n",
    "            script: JavaScript source text; empty or None yields the all-zero\n",
    "                dict from _empty_script_features().\n",
    "\n",
    "        Returns:\n",
    "            Length, API/obfuscation counts and boolean pattern flags.\n",
    "        \"\"\"\n",
    "        if not script:\n",
    "            return self._empty_script_features()\n",
    "        \n",
    "        return {\n",
    "            'script_length': len(script),\n",
    "            # Plain substring counts: 'eval' also matches e.g. 'evaluate'.\n",
    "            'suspicious_api_count': sum(1 for api in self.SUSPICIOUS_APIS if api in script),\n",
    "            'obfuscation_score': sum(len(re.findall(p, script)) for p in self.OBFUSCATION_PATTERNS),\n",
    "            # Word boundary so identifiers ending in 'eval' (retrieval(...)) do not match.\n",
    "            'has_eval': bool(re.search(r'\\beval\\s*\\(', script)),\n",
    "            'has_document_write': 'document.write' in script,\n",
    "            # Word boundary so 'config =' or 'money =' do not match; (?!=) skips '=='.\n",
    "            'has_inline_event_handlers': bool(re.search(r'\\bon[a-z]+\\s*=(?!=)', script, re.I)),\n",
    "            'external_url_count': len(re.findall(r'https?://[^\\s\"\\')]+', script)),\n",
    "            'function_count': len(re.findall(r'function\\s*\\w*\\s*\\(', script)),\n",
    "        }\n",
    "    \n",
    "    def _empty_features(self) -> Dict:\n",
    "        \"\"\"Return the all-zero console feature dict.\"\"\"\n",
    "        return {\n",
    "            'console_log_count': 0, 'console_error_count': 0, 'console_warning_count': 0,\n",
    "            'console_info_count': 0, 'error_ratio': 0, 'has_security_errors': False,\n",
    "            'has_csp_violations': False\n",
    "        }\n",
    "    \n",
    "    def _empty_script_features(self) -> Dict:\n",
    "        \"\"\"Return the all-zero script feature dict.\"\"\"\n",
    "        return {\n",
    "            'script_length': 0, 'suspicious_api_count': 0, 'obfuscation_score': 0,\n",
    "            'has_eval': False, 'has_document_write': False, 'has_inline_event_handlers': False,\n",
    "            'external_url_count': 0, 'function_count': 0\n",
    "        }\n",
    "\n",
    "# Module-level instance; CyberForgeFeaturePipeline.__init__ picks it up by name.\n",
    "js_extractor = JavaScriptFeatureExtractor()\n",
    "print(\"\u2713 JavaScript Feature Extractor initialized\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b31de89",
   "metadata": {},
   "source": [
    "## 5. Unified Feature Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9fd30ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "class CyberForgeFeaturePipeline:\n",
    "    \"\"\"\n",
    "    Unified feature extraction pipeline.\n",
    "    Combines all extractors for complete feature engineering.\n",
    "    \"\"\"\n",
    "    \n",
    "    # Column names (matched case-insensitively) used as the label when the\n",
    "    # requested label column is not present in the frame.\n",
    "    LABEL_CANDIDATES = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
    "                        'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n",
    "    \n",
    "    def __init__(self):\n",
    "        # Reuses the module-level extractor instances created in earlier cells.\n",
    "        self.url_extractor = url_extractor\n",
    "        self.network_extractor = network_extractor\n",
    "        self.header_extractor = header_extractor\n",
    "        self.js_extractor = js_extractor\n",
    "        self.scaler = StandardScaler()\n",
    "        self.label_encoder = LabelEncoder()\n",
    "        # Column names of the most recent prepare_for_training() result.\n",
    "        self.feature_names = []\n",
    "    \n",
    "    def extract_website_features(self, scraped_data: Dict) -> Dict[str, Any]:\n",
    "        \"\"\"Extract all features from website scraped data.\n",
    "\n",
    "        Args:\n",
    "            scraped_data: Dict read for 'url', 'network_requests',\n",
    "                'response_headers', 'security_report' and 'console_logs'.\n",
    "\n",
    "        Returns:\n",
    "            Flat dict with url_/net_/sec_/js_ prefixed features plus\n",
    "            'security_score'. The textual 'tld' URL feature is left out.\n",
    "        \"\"\"\n",
    "        features = {}\n",
    "        \n",
    "        # URL features\n",
    "        url_features = self.url_extractor.extract(scraped_data.get('url', ''))\n",
    "        features.update({f\"url_{k}\": v for k, v in url_features.items() if k != 'tld'})\n",
    "        \n",
    "        # Network features\n",
    "        network_features = self.network_extractor.extract_from_requests(\n",
    "            scraped_data.get('network_requests', [])\n",
    "        )\n",
    "        features.update({f\"net_{k}\": v for k, v in network_features.items()})\n",
    "        \n",
    "        # Security header features\n",
    "        header_features = self.header_extractor.extract(\n",
    "            scraped_data.get('response_headers', {}),\n",
    "            scraped_data.get('security_report', {})\n",
    "        )\n",
    "        features.update({f\"sec_{k}\": v for k, v in header_features.items()})\n",
    "        \n",
    "        # JavaScript features\n",
    "        js_features = self.js_extractor.extract_from_console_logs(\n",
    "            scraped_data.get('console_logs', [])\n",
    "        )\n",
    "        features.update({f\"js_{k}\": v for k, v in js_features.items()})\n",
    "        \n",
    "        # Calculate risk score\n",
    "        features['security_score'] = self.header_extractor.calculate_security_score(header_features)\n",
    "        \n",
    "        return features\n",
    "    \n",
    "    def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:\n",
    "        \"\"\"Process a dataset and extract URL features.\n",
    "\n",
    "        Replaces url_column with url_-prefixed feature columns (without the\n",
    "        textual 'tld'). If the column is missing or extraction fails, a\n",
    "        warning is printed and df is returned unchanged.\n",
    "        \"\"\"\n",
    "        if url_column not in df.columns:\n",
    "            print(f\"  Warning: No '{url_column}' column found\")\n",
    "            return df\n",
    "        \n",
    "        try:\n",
    "            # Extract URL features\n",
    "            url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))\n",
    "            url_df = pd.DataFrame(url_features.tolist())\n",
    "            \n",
    "            # Drop non-numeric 'tld' column before renaming\n",
    "            if 'tld' in url_df.columns:\n",
    "                url_df = url_df.drop(columns=['tld'])\n",
    "            \n",
    "            # Rename columns with url_ prefix\n",
    "            url_df.columns = [f\"url_{c}\" for c in url_df.columns]\n",
    "            \n",
    "            # Combine with original features (drop original url column to avoid issues)\n",
    "            result_df = df.drop(columns=[url_column]).reset_index(drop=True)\n",
    "            result = pd.concat([result_df, url_df.reset_index(drop=True)], axis=1)\n",
    "            \n",
    "            return result\n",
    "        except Exception as e:\n",
    "            print(f\"  Warning: URL feature extraction error: {e}\")\n",
    "            return df\n",
    "    \n",
    "    def _find_label_column(self, df: pd.DataFrame, label_column: Optional[str]) -> Optional[Any]:\n",
    "        \"\"\"Return the label column of df, or None when there is none.\n",
    "\n",
    "        The requested label_column wins (case-insensitive match); otherwise\n",
    "        the first column, in frame order, whose name is in LABEL_CANDIDATES.\n",
    "        \"\"\"\n",
    "        # str() keeps non-string column names (e.g. integers) from raising.\n",
    "        lowered = [(str(col).lower(), col) for col in df.columns]\n",
    "        if label_column is not None:\n",
    "            wanted = str(label_column).lower()\n",
    "            for name, col in lowered:\n",
    "                if name == wanted:\n",
    "                    return col\n",
    "        candidates = {c.lower() for c in self.LABEL_CANDIDATES}\n",
    "        for name, col in lowered:\n",
    "            if name in candidates:\n",
    "                return col\n",
    "        return None\n",
    "    \n",
    "    def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:\n",
    "        \"\"\"Prepare features for model training.\n",
    "\n",
    "        Args:\n",
    "            df: Input frame; it is not modified.\n",
    "            label_column: Preferred label column (previously ignored). When\n",
    "                absent, the LABEL_CANDIDATES names are tried instead.\n",
    "\n",
    "        Returns:\n",
    "            (X_numeric, y): numeric and boolean columns with NaN and +/-inf\n",
    "            replaced by 0 and booleans cast to int, and the labels as an\n",
    "            array (None when no label column exists). Non-numeric labels are\n",
    "            encoded with self.label_encoder, which is re-fitted on every call.\n",
    "        \"\"\"\n",
    "        df = df.copy()\n",
    "        \n",
    "        # Separate features and labels\n",
    "        actual_label_col = self._find_label_column(df, label_column)\n",
    "        if actual_label_col is not None:\n",
    "            y = df[actual_label_col]\n",
    "            X = df.drop(columns=[actual_label_col])\n",
    "        else:\n",
    "            y = None\n",
    "            X = df\n",
    "        \n",
    "        # Select numeric columns only\n",
    "        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n",
    "        bool_cols = X.select_dtypes(include=[bool]).columns.tolist()\n",
    "        \n",
    "        # Infinite values (e.g. from rate columns divided by a zero duration)\n",
    "        # are rejected by scikit-learn estimators, so treat them like NaN.\n",
    "        X_numeric = X[numeric_cols].replace([np.inf, -np.inf], np.nan).fillna(0)\n",
    "        \n",
    "        # Convert boolean to int\n",
    "        for col in bool_cols:\n",
    "            X_numeric[col] = X[col].astype(int)\n",
    "        \n",
    "        self.feature_names = X_numeric.columns.tolist()\n",
    "        \n",
    "        # Encode labels if present\n",
    "        if y is not None:\n",
    "            if pd.api.types.is_numeric_dtype(y):\n",
    "                y = y.values\n",
    "            else:\n",
    "                # Covers object, string and categorical labels; astype(str) keeps\n",
    "                # mixed-type or missing labels from breaking the encoder's sort.\n",
    "                y = self.label_encoder.fit_transform(y.astype(str))\n",
    "        \n",
    "        return X_numeric, y\n",
    "\n",
    "# Module-level pipeline instance used by the dataset-processing and save cells below.\n",
    "pipeline = CyberForgeFeaturePipeline()\n",
    "print(\"Feature Pipeline initialized\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd70536a",
   "metadata": {},
   "source": [
    "## 6. Process Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e334044",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the dataset manifest from PROCESSED_DIR; use an empty list when it is absent\n",
    "manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
    "if not manifest_path.exists():\n",
    "    manifest = []\n",
    "    print(\"\u26a0 No manifest found. Run 01_data_acquisition.ipynb first.\")\n",
    "else:\n",
    "    with open(manifest_path) as f:\n",
    "        manifest = json.load(f)\n",
    "    print(f\"\u2713 Loaded manifest with {len(manifest)} datasets\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b049596",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process each dataset\n",
    "processed_datasets = {}\n",
    "feature_stats = []\n",
    "\n",
    "# Column names (matched case-insensitively) recognised as the label column.\n",
    "label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
    "                   'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n",
    "label_candidate_names = {lc.lower() for lc in label_candidates}\n",
    "\n",
    "print(\"Processing datasets for feature engineering...\\n\")\n",
    "\n",
    "for entry in manifest:\n",
    "    name = entry['name']\n",
    "    # Resolve relative to the parent of the working directory first (original\n",
    "    # behaviour), then relative to the parent of DATASETS_DIR so the cell also\n",
    "    # works when the kernel was not started in the notebooks folder.\n",
    "    # NOTE(review): assumes manifest paths are relative to DATASETS_DIR.parent,\n",
    "    # like the paths this notebook writes to feature_manifest.json - confirm.\n",
    "    path = Path(\"..\") / entry['path']\n",
    "    if not path.exists():\n",
    "        path = DATASETS_DIR.parent / entry['path']\n",
    "    \n",
    "    if not path.exists():\n",
    "        print(f\"  \u26a0 {name}: File not found\")\n",
    "        continue\n",
    "    \n",
    "    print(f\"  Processing: {name}\")\n",
    "    \n",
    "    try:\n",
    "        df = pd.read_csv(path)\n",
    "        \n",
    "        # IMPORTANT: Extract and preserve label column BEFORE processing\n",
    "        original_label = None\n",
    "        label_col_name = None\n",
    "        for col in df.columns:\n",
    "            if str(col).lower() in label_candidate_names:\n",
    "                original_label = df[col].copy()\n",
    "                label_col_name = col\n",
    "                print(f\"    Found label column: {col}\")\n",
    "                break\n",
    "        \n",
    "        # Check for URL column to extract URL features. Only non-numeric columns\n",
    "        # can hold URLs: a numeric column whose name merely contains 'url'\n",
    "        # (e.g. a precomputed url_length) must not be dropped and replaced by\n",
    "        # all-zero extracted features.\n",
    "        url_cols = [c for c in df.columns\n",
    "                    if 'url' in str(c).lower() and not pd.api.types.is_numeric_dtype(df[c])]\n",
    "        if url_cols:\n",
    "            df = pipeline.process_dataset(df, url_column=url_cols[0])\n",
    "        \n",
    "        # Prepare for training\n",
    "        X, y = pipeline.prepare_for_training(df)\n",
    "        \n",
    "        # If y is None but we found original_label, use that\n",
    "        if y is None and original_label is not None:\n",
    "            y = original_label.values\n",
    "            print(f\"    Restored label from original: {label_col_name}\")\n",
    "        \n",
    "        processed_datasets[name] = {\n",
    "            'X': X,\n",
    "            'y': y,\n",
    "            'feature_names': pipeline.feature_names,\n",
    "            'n_samples': len(X),\n",
    "            'n_features': len(pipeline.feature_names)\n",
    "        }\n",
    "        \n",
    "        label_status = \"with labels\" if y is not None else \"no labels\"\n",
    "        print(f\"    \u2713 {len(X)} samples, {len(pipeline.feature_names)} features ({label_status})\")\n",
    "        \n",
    "        feature_stats.append({\n",
    "            'name': name,\n",
    "            'samples': len(X),\n",
    "            'features': len(pipeline.feature_names),\n",
    "            'has_labels': y is not None\n",
    "        })\n",
    "        \n",
    "    except Exception as e:\n",
    "        # Best effort per dataset: report the failure and continue with the next one.\n",
    "        print(f\"    \u26a0 Error: {e}\")\n",
    "\n",
    "print(f\"\\n\u2713 Processed {len(processed_datasets)} datasets\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "096db774",
   "metadata": {},
   "source": [
    "## 7. Save Feature-Engineered Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bb49674",
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "\n",
    "# Write every processed dataset to parquet and collect one manifest entry per dataset\n",
    "feature_manifest = []\n",
    "\n",
    "print(\"Saving feature-engineered datasets...\")\n",
    "\n",
    "for name, data in processed_datasets.items():\n",
    "    labels = data['y']\n",
    "    has_labels = labels is not None\n",
    "    output_path = FEATURES_DIR / f\"{name}_features.parquet\"\n",
    "    \n",
    "    # Feature columns plus, when labels exist, a 'label' column\n",
    "    df_features = data['X'].copy()\n",
    "    if has_labels:\n",
    "        df_features['label'] = labels\n",
    "    \n",
    "    df_features.to_parquet(output_path, index=False)\n",
    "    \n",
    "    # The stored path is relative to the parent of the datasets directory\n",
    "    manifest_entry = {\n",
    "        'name': name,\n",
    "        'path': str(output_path.relative_to(DATASETS_DIR.parent)),\n",
    "        'samples': data['n_samples'],\n",
    "        'features': data['n_features'],\n",
    "        'feature_names': data['feature_names'],\n",
    "        'has_labels': has_labels\n",
    "    }\n",
    "    feature_manifest.append(manifest_entry)\n",
    "    \n",
    "    print(f\"  \u2713 Saved: {output_path.name}\")\n",
    "\n",
    "# Describe the saved datasets in a JSON manifest next to them\n",
    "manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
    "with open(manifest_path, \"w\") as f:\n",
    "    json.dump(feature_manifest, f, indent=2)\n",
    "\n",
    "# Pickle the pipeline object for inference\n",
    "# NOTE(review): the pipeline classes are defined in this notebook, so loading the\n",
    "# pickle presumably needs the same class definitions importable - confirm.\n",
    "pipeline_path = FEATURES_DIR / \"feature_pipeline.pkl\"\n",
    "joblib.dump(pipeline, pipeline_path)\n",
    "\n",
    "print(f\"\\n\u2713 Feature manifest saved to: {manifest_path}\")\n",
    "print(f\"\u2713 Feature pipeline saved to: {pipeline_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1fe65eae",
   "metadata": {},
   "source": [
    "## 8. Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02cc2a14",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final report: totals across all processed datasets, then one line per saved dataset\n",
    "banner = \"=\" * 60\n",
    "print(\"\\n\" + banner)\n",
    "print(\"FEATURE ENGINEERING COMPLETE\")\n",
    "print(banner)\n",
    "\n",
    "dataset_summaries = list(processed_datasets.values())\n",
    "total_samples = sum(d['n_samples'] for d in dataset_summaries)\n",
    "# default=0 covers the case where no dataset was processed\n",
    "total_features = max((d['n_features'] for d in dataset_summaries), default=0)\n",
    "\n",
    "print(f\"\"\"\n",
    "\ud83d\udd27 Feature Engineering Summary:\n",
    "   - Datasets processed: {len(processed_datasets)}\n",
    "   - Total samples: {total_samples:,}\n",
    "   - Max features: {total_features}\n",
    "   - Output directory: {FEATURES_DIR}\n",
    "\n",
    "\ud83d\udcca Feature Categories:\n",
    "   - URL Features: Domain, path, security indicators\n",
    "   - Network Features: Request patterns, status codes\n",
    "   - Security Headers: CSP, HSTS, X-Frame-Options\n",
    "   - JavaScript: Console logs, suspicious APIs\n",
    "\n",
    "\ud83d\udcc1 Datasets Ready for Training:\"\"\")\n",
    "\n",
    "for entry in feature_manifest:\n",
    "    print(f\"   \u2713 {entry['name']}: {entry['samples']:,} samples, {entry['features']} features\")\n",
    "\n",
    "print(\"\\nNext step:\\n  \u2192 03_model_training.ipynb\\n\")\n",
    "print(banner)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}