Add 01_data_acquisition.ipynb
notebooks/01_data_acquisition.ipynb
ADDED
@@ -0,0 +1,549 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "417baa98",
   "metadata": {},
   "source": [
    "# 01 - Data Acquisition\n",
    "\n",
    "## CyberForge AI - Cybersecurity Data Collection & Preparation\n",
    "\n",
    "This notebook handles all data acquisition for the CyberForge AI ML pipeline.\n",
    "\n",
    "### Data Sources:\n",
    "1. **Public Datasets** - Legal, publicly available cybersecurity datasets\n",
    "2. **Web Scraper API** - Real-time website security data collection\n",
    "3. **Hugging Face Datasets** - Pre-uploaded CyberForge datasets\n",
    "4. **Synthetic Data** - Generated data for edge cases\n",
    "\n",
    "### Output:\n",
    "- Cleaned, normalized datasets ready for feature engineering\n",
    "- Data validation reports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e0aeada",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load configuration from environment setup\n",
    "import json\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "import httpx\n",
    "import asyncio\n",
    "from datetime import datetime\n",
    "from typing import Dict, List, Any, Optional\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Load notebook config\n",
    "config_path = Path(\"../notebook_config.json\")\n",
    "if config_path.exists():\n",
    "    with open(config_path) as f:\n",
    "        CONFIG = json.load(f)\n",
    "    print(\"✓ Configuration loaded\")\n",
    "else:\n",
    "    raise FileNotFoundError(\"Run 00_environment_setup.ipynb first!\")\n",
    "\n",
    "DATASETS_DIR = Path(CONFIG[\"datasets_dir\"])\n",
    "print(f\"✓ Datasets directory: {DATASETS_DIR}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9493ab50",
   "metadata": {},
   "source": [
    "## 1. Web Scraper API Data Collection\n",
    "\n",
    "Collect real-time website security data using the WebScrapper.live API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "876b9e05",
   "metadata": {},
   "outputs": [],
   "source": [
    "class WebScraperDataCollector:\n",
    "    \"\"\"\n",
    "    Collects website security data via the WebScrapper.live API.\n",
    "    This aligns with the backend WebScraperAPIService.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        self.api_url = \"http://webscrapper.live/api/scrape\"\n",
    "        # Read the key from the environment rather than hardcoding a secret\n",
    "        # in the notebook (the WEBSCRAPER_API_KEY variable name is assumed here)\n",
    "        self.api_key = os.environ.get(\"WEBSCRAPER_API_KEY\", \"\")\n",
    "        self.timeout = 60.0\n",
    "\n",
    "    async def scrape_website(self, url: str) -> Dict[str, Any]:\n",
    "        \"\"\"Scrape a single website and return security data\"\"\"\n",
    "        try:\n",
    "            async with httpx.AsyncClient(timeout=self.timeout) as client:\n",
    "                response = await client.post(\n",
    "                    self.api_url,\n",
    "                    json={\"url\": url},\n",
    "                    headers={\n",
    "                        \"Content-Type\": \"application/json\",\n",
    "                        \"X-API-Key\": self.api_key\n",
    "                    }\n",
    "                )\n",
    "\n",
    "                if response.status_code == 200:\n",
    "                    data = response.json()\n",
    "                    return {\n",
    "                        \"success\": True,\n",
    "                        \"url\": url,\n",
    "                        \"security_report\": data.get(\"security_report\", {}),\n",
    "                        \"network_requests\": data.get(\"network_requests\", []),\n",
    "                        \"console_logs\": data.get(\"console_logs\", []),\n",
    "                        \"performance_metrics\": data.get(\"performance_metrics\", {}),\n",
    "                        \"response_headers\": data.get(\"response_headers\", {}),\n",
    "                        \"html_length\": len(data.get(\"html\", \"\") or data.get(\"content\", \"\")),\n",
    "                        \"scraped_at\": datetime.now().isoformat()\n",
    "                    }\n",
    "                else:\n",
    "                    return {\"success\": False, \"url\": url, \"error\": f\"Status {response.status_code}\"}\n",
    "        except Exception as e:\n",
    "            return {\"success\": False, \"url\": url, \"error\": str(e)}\n",
    "\n",
    "    async def collect_batch(self, urls: List[str]) -> List[Dict]:\n",
    "        \"\"\"Collect data from multiple URLs\"\"\"\n",
    "        results = []\n",
    "        for url in urls:\n",
    "            print(f\"  Scraping: {url}\")\n",
    "            result = await self.scrape_website(url)\n",
    "            results.append(result)\n",
    "            await asyncio.sleep(1)  # Rate limiting\n",
    "        return results\n",
    "\n",
    "    def extract_security_features(self, data: Dict) -> Optional[Dict]:\n",
    "        \"\"\"Extract security-relevant features from scraped data\"\"\"\n",
    "        if not data.get(\"success\"):\n",
    "            return None\n",
    "\n",
    "        security_report = data.get(\"security_report\", {})\n",
    "        network_requests = data.get(\"network_requests\", [])\n",
    "        console_logs = data.get(\"console_logs\", [])\n",
    "\n",
    "        # Extract features aligned with backend needs\n",
    "        return {\n",
    "            \"url\": data[\"url\"],\n",
    "            \"is_https\": security_report.get(\"is_https\", False),\n",
    "            \"has_mixed_content\": security_report.get(\"mixed_content\", False),\n",
    "            \"missing_headers_count\": len(security_report.get(\"missing_security_headers\", [])),\n",
    "            \"has_insecure_cookies\": security_report.get(\"insecure_cookies\", False),\n",
    "            \"total_requests\": len(network_requests),\n",
    "            \"external_requests\": sum(1 for r in network_requests if self._is_external(r, data[\"url\"])),\n",
    "            \"failed_requests\": sum(1 for r in network_requests if r.get(\"status\", 200) >= 400),\n",
    "            \"console_errors\": sum(1 for log in console_logs if log.get(\"level\") == \"error\"),\n",
    "            \"console_warnings\": sum(1 for log in console_logs if log.get(\"level\") == \"warning\"),\n",
    "            \"html_size\": data.get(\"html_length\", 0),\n",
    "            \"scraped_at\": data[\"scraped_at\"]\n",
    "        }\n",
    "\n",
    "    def _is_external(self, request: Dict, base_url: str) -> bool:\n",
    "        \"\"\"Check if a request is to an external domain\"\"\"\n",
    "        try:\n",
    "            from urllib.parse import urlparse\n",
    "            base_domain = urlparse(base_url).netloc\n",
    "            req_domain = urlparse(request.get(\"url\", \"\")).netloc\n",
    "            return base_domain != req_domain\n",
    "        except Exception:\n",
    "            return False\n",
    "\n",
    "scraper = WebScraperDataCollector()\n",
    "print(\"✓ Web Scraper Data Collector initialized\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aec0ba34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect sample data from known safe/unsafe websites for training\n",
    "SAMPLE_URLS = [\n",
    "    # Known safe websites\n",
    "    \"https://www.google.com\",\n",
    "    \"https://github.com\",\n",
    "    \"https://www.microsoft.com\",\n",
    "    \"https://www.amazon.com\",\n",
    "    \"https://www.wikipedia.org\",\n",
    "    # Test sites\n",
    "    \"https://example.com\",\n",
    "    \"https://httpbin.org\",\n",
    "]\n",
    "\n",
    "print(f\"Collecting data from {len(SAMPLE_URLS)} URLs...\")\n",
    "\n",
    "# Jupyter already runs an event loop, so use top-level await instead of\n",
    "# asyncio.get_event_loop().run_until_complete(), which would raise here\n",
    "scraped_data = await scraper.collect_batch(SAMPLE_URLS[:3])  # Limited for demo\n",
    "\n",
    "# Extract features\n",
    "features_list = [scraper.extract_security_features(d) for d in scraped_data if d.get(\"success\")]\n",
    "if features_list:\n",
    "    web_scraper_df = pd.DataFrame(features_list)\n",
    "    print(f\"\\n✓ Collected {len(web_scraper_df)} website security profiles\")\n",
    "    display(web_scraper_df.head())\n",
    "else:\n",
    "    print(\"⚠ No data collected - API may be unavailable\")\n",
    "    web_scraper_df = pd.DataFrame()"
   ]
  },
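  {
   "cell_type": "markdown",
   "id": "b7d1e2f3",
   "metadata": {},
   "source": [
    "Persist the scraped profiles so later stages can pick them up alongside the other datasets. A minimal sketch; the `web_scraper_profiles.csv` filename is an illustrative choice, not a fixed pipeline contract."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4d5e6f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the scraped profiles (if any) next to the other raw datasets\n",
    "if not web_scraper_df.empty:\n",
    "    scraper_output = DATASETS_DIR / \"web_scraper_profiles.csv\"  # illustrative name\n",
    "    web_scraper_df.to_csv(scraper_output, index=False)\n",
    "    print(f\"✓ Saved {len(web_scraper_df)} profiles to {scraper_output}\")\n",
    "else:\n",
    "    print(\"⚠ Nothing to save - web scraper returned no data\")"
   ]
  },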
  {
   "cell_type": "markdown",
   "id": "0dadb1ac",
   "metadata": {},
   "source": [
    "## 2. Load Hugging Face Datasets\n",
    "\n",
    "Load pre-uploaded CyberForge datasets from Hugging Face."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9be784df",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import hf_hub_download, list_repo_files\n",
    "\n",
    "HF_DATASET_REPO = \"Che237/cyberforge-datasets\"\n",
    "\n",
    "def list_available_datasets():\n",
    "    \"\"\"List all available datasets in the HF repository\"\"\"\n",
    "    try:\n",
    "        files = list_repo_files(HF_DATASET_REPO, repo_type=\"dataset\")\n",
    "        csv_files = [f for f in files if f.endswith('.csv')]\n",
    "        return csv_files\n",
    "    except Exception as e:\n",
    "        print(f\"⚠ Could not list HF datasets: {e}\")\n",
    "        return []\n",
    "\n",
    "def download_dataset(file_path: str, local_dir: Path = DATASETS_DIR) -> Optional[Path]:\n",
    "    \"\"\"Download a specific dataset from Hugging Face\"\"\"\n",
    "    try:\n",
    "        local_path = hf_hub_download(\n",
    "            repo_id=HF_DATASET_REPO,\n",
    "            filename=file_path,\n",
    "            repo_type=\"dataset\",\n",
    "            cache_dir=str(local_dir / \"cache\")\n",
    "        )\n",
    "        return Path(local_path)\n",
    "    except Exception as e:\n",
    "        print(f\"⚠ Could not download {file_path}: {e}\")\n",
    "        return None\n",
    "\n",
    "# List available datasets\n",
    "print(\"Available datasets on Hugging Face:\")\n",
    "available_files = list_available_datasets()\n",
    "for f in available_files[:20]:  # Show first 20\n",
    "    print(f\"  - {f}\")\n",
    "if len(available_files) > 20:\n",
    "    print(f\"  ... and {len(available_files) - 20} more\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc9ef68b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Priority datasets for training (aligned with backend requirements)\n",
    "PRIORITY_DATASETS = {\n",
    "    \"network_intrusion\": \"network_intrusion/network_intrusion_processed.csv\",\n",
    "    \"phishing_detection\": \"phishing_detection/phishing_detection_processed.csv\",\n",
    "    \"malware_detection\": \"malware_detection/malware_detection_processed.csv\",\n",
    "    \"anomaly_detection\": \"anomaly_detection/anomaly_detection_processed.csv\",\n",
    "    \"web_attack_detection\": \"web_attack_detection/web_attack_detection_processed.csv\",\n",
    "}\n",
    "\n",
    "loaded_datasets = {}\n",
    "\n",
    "print(\"Downloading priority datasets...\")\n",
    "for name, path in PRIORITY_DATASETS.items():\n",
    "    if path in available_files:\n",
    "        print(f\"  Downloading: {name}\")\n",
    "        local_path = download_dataset(path)\n",
    "        if local_path:\n",
    "            try:\n",
    "                df = pd.read_csv(local_path)\n",
    "                loaded_datasets[name] = df\n",
    "                print(f\"    ✓ {name}: {len(df)} samples, {len(df.columns)} features\")\n",
    "            except Exception as e:\n",
    "                print(f\"    ⚠ Could not load {name}: {e}\")\n",
    "    else:\n",
    "        print(f\"  ⚠ {name}: Not found in repository\")\n",
    "\n",
    "print(f\"\\n✓ Loaded {len(loaded_datasets)} datasets\")"
   ]
  },
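  {
   "cell_type": "markdown",
   "id": "f0a1b2c3",
   "metadata": {},
   "source": [
    "Peek at one of the downloaded datasets to confirm it parsed as expected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4b5c6d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first loaded dataset, if any\n",
    "if loaded_datasets:\n",
    "    preview_name = next(iter(loaded_datasets))\n",
    "    print(f\"Preview of {preview_name}:\")\n",
    "    display(loaded_datasets[preview_name].head())\n",
    "else:\n",
    "    print(\"⚠ No datasets loaded yet\")"
   ]
  },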
  {
   "cell_type": "markdown",
   "id": "3fadcbfc",
   "metadata": {},
   "source": [
    "## 3. Load Local Datasets\n",
    "\n",
    "Load any datasets already present in the local datasets directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40cad5c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_local_datasets(datasets_dir: Path) -> Dict[str, pd.DataFrame]:\n",
    "    \"\"\"Load all CSV datasets from the local directory\"\"\"\n",
    "    datasets = {}\n",
    "\n",
    "    for csv_file in datasets_dir.rglob(\"*_processed.csv\"):\n",
    "        try:\n",
    "            name = csv_file.stem.replace(\"_processed\", \"\")\n",
    "            df = pd.read_csv(csv_file)\n",
    "            datasets[name] = df\n",
    "            print(f\"  ✓ {name}: {len(df)} samples\")\n",
    "        except Exception as e:\n",
    "            print(f\"  ⚠ {csv_file.name}: {e}\")\n",
    "\n",
    "    return datasets\n",
    "\n",
    "print(\"Loading local datasets...\")\n",
    "local_datasets = load_local_datasets(DATASETS_DIR)\n",
    "\n",
    "# Merge with the HF datasets; anything already downloaded from HF is kept\n",
    "for name, df in local_datasets.items():\n",
    "    if name not in loaded_datasets:\n",
    "        loaded_datasets[name] = df\n",
    "\n",
    "print(f\"\\n✓ Total datasets available: {len(loaded_datasets)}\")"
   ]
  },
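  {
   "cell_type": "markdown",
   "id": "d8e9f0a1",
   "metadata": {},
   "source": [
    "### Synthetic Data for Edge Cases\n",
    "\n",
    "Data source 4 from the overview: generated samples for edge cases. A minimal sketch only - the two features and the threshold rule below are illustrative placeholders, and real synthetic data should mirror the schema of the dataset it augments."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2f3a4b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal synthetic edge-case generator (illustrative schema)\n",
    "rng = np.random.default_rng(42)\n",
    "n_samples = 500\n",
    "\n",
    "synthetic_df = pd.DataFrame({\n",
    "    # Placeholder features: request volume and error rate per session\n",
    "    \"total_requests\": rng.integers(0, 500, n_samples),\n",
    "    \"error_rate\": rng.uniform(0.0, 1.0, n_samples),\n",
    "})\n",
    "# Placeholder rule: flag high-volume, high-error sessions as malicious\n",
    "synthetic_df[\"label\"] = (\n",
    "    (synthetic_df[\"total_requests\"] > 400) & (synthetic_df[\"error_rate\"] > 0.8)\n",
    ").astype(int)\n",
    "\n",
    "loaded_datasets[\"synthetic_edge_cases\"] = synthetic_df\n",
    "print(f\"✓ Generated {len(synthetic_df)} synthetic samples\")"
   ]
  },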
  {
   "cell_type": "markdown",
   "id": "a3584926",
   "metadata": {},
   "source": [
    "## 4. Data Validation & Quality Checks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4adefa74",
   "metadata": {},
   "outputs": [],
   "source": [
    "def validate_dataset(name: str, df: pd.DataFrame) -> Dict[str, Any]:\n",
    "    \"\"\"Validate dataset quality and return a report\"\"\"\n",
    "    report = {\n",
    "        \"name\": name,\n",
    "        \"samples\": len(df),\n",
    "        \"features\": len(df.columns),\n",
    "        \"missing_values\": df.isnull().sum().sum(),\n",
    "        \"missing_pct\": (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100,\n",
    "        \"duplicate_rows\": df.duplicated().sum(),\n",
    "        \"numeric_columns\": len(df.select_dtypes(include=[np.number]).columns),\n",
    "        \"categorical_columns\": len(df.select_dtypes(include=['object', 'category']).columns),\n",
    "        \"memory_mb\": df.memory_usage(deep=True).sum() / (1024 * 1024),\n",
    "        \"has_label\": any(col in df.columns for col in ['label', 'target', 'class', 'is_malicious', 'attack_type']),\n",
    "        \"valid\": True\n",
    "    }\n",
    "\n",
    "    # Validation checks\n",
    "    issues = []\n",
    "    if report[\"samples\"] < 100:\n",
    "        issues.append(\"Too few samples (<100)\")\n",
    "    if report[\"missing_pct\"] > 50:\n",
    "        issues.append(\"Too many missing values (>50%)\")\n",
    "    if not report[\"has_label\"]:\n",
    "        issues.append(\"No label column found\")\n",
    "\n",
    "    report[\"issues\"] = issues\n",
    "    report[\"valid\"] = len(issues) == 0\n",
    "\n",
    "    return report\n",
    "\n",
    "# Validate all datasets\n",
    "validation_reports = []\n",
    "print(\"Validating datasets...\\n\")\n",
    "print(f\"{'Dataset':<30} {'Samples':>10} {'Features':>10} {'Missing %':>10} {'Valid':>8}\")\n",
    "print(\"-\" * 75)\n",
    "\n",
    "for name, df in loaded_datasets.items():\n",
    "    report = validate_dataset(name, df)\n",
    "    validation_reports.append(report)\n",
    "    status = \"✓\" if report[\"valid\"] else \"⚠\"\n",
    "    print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
    "\n",
    "valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
    "print(f\"\\n✓ {len(valid_datasets)} datasets passed validation\")"
   ]
  },
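  {
   "cell_type": "markdown",
   "id": "f6a7b8c9",
   "metadata": {},
   "source": [
    "For datasets that fail validation, list the specific issues so they can be fixed or excluded deliberately rather than silently dropped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0b1c2d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Detail the issues behind any failed validations\n",
    "invalid_reports = [r for r in validation_reports if not r[\"valid\"]]\n",
    "if invalid_reports:\n",
    "    print(\"Datasets with issues:\")\n",
    "    for r in invalid_reports:\n",
    "        for issue in r[\"issues\"]:\n",
    "            print(f\"  ⚠ {r['name']}: {issue}\")\n",
    "else:\n",
    "    print(\"✓ No validation issues found\")"
   ]
  },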
  {
   "cell_type": "markdown",
   "id": "0603447f",
   "metadata": {},
   "source": [
    "## 5. Data Normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6544e100",
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize_dataset(df: pd.DataFrame, name: str) -> pd.DataFrame:\n",
    "    \"\"\"Normalize a dataset for consistent processing\"\"\"\n",
    "    df = df.copy()\n",
    "\n",
    "    # Standardize column names\n",
    "    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')\n",
    "\n",
    "    # Find and standardize the label column\n",
    "    label_columns = ['label', 'target', 'class', 'is_malicious', 'attack_type', 'attack', 'category']\n",
    "    for col in label_columns:\n",
    "        if col in df.columns:\n",
    "            df = df.rename(columns={col: 'label'})\n",
    "            break\n",
    "\n",
    "    # Handle missing values\n",
    "    numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
    "    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())\n",
    "\n",
    "    categorical_cols = df.select_dtypes(include=['object', 'category']).columns\n",
    "    for col in categorical_cols:\n",
    "        if col != 'label':\n",
    "            df[col] = df[col].fillna('unknown')\n",
    "\n",
    "    # Remove duplicates\n",
    "    df = df.drop_duplicates()\n",
    "\n",
    "    # Add metadata (note: attrs live in memory only and are not written to CSV)\n",
    "    df.attrs['dataset_name'] = name\n",
    "    df.attrs['processed_at'] = datetime.now().isoformat()\n",
    "\n",
    "    return df\n",
    "\n",
    "# Normalize all valid datasets\n",
    "normalized_datasets = {}\n",
    "print(\"Normalizing datasets...\")\n",
    "\n",
    "for name in valid_datasets:\n",
    "    if name in loaded_datasets:\n",
    "        df = normalize_dataset(loaded_datasets[name], name)\n",
    "        normalized_datasets[name] = df\n",
    "        print(f\"  ✓ {name}: {len(df)} samples after normalization\")\n",
    "\n",
    "print(f\"\\n✓ Normalized {len(normalized_datasets)} datasets\")"
   ]
  },
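  {
   "cell_type": "markdown",
   "id": "b4c5d6e7",
   "metadata": {},
   "source": [
    "A quick sanity check after normalization: the label distribution of each dataset, which matters downstream for stratified splits and class weighting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8d9e0f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect label balance per normalized dataset\n",
    "for name, df in normalized_datasets.items():\n",
    "    if \"label\" in df.columns:\n",
    "        counts = df[\"label\"].value_counts()\n",
    "        print(f\"{name}: {dict(counts.head(5))}\")\n",
    "    else:\n",
    "        print(f\"{name}: no label column after normalization\")"
   ]
  },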
  {
   "cell_type": "markdown",
   "id": "a9a016d4",
   "metadata": {},
   "source": [
    "## 6. Save Processed Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91248c3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the processed data directory\n",
    "PROCESSED_DIR = DATASETS_DIR / \"processed\"\n",
    "PROCESSED_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Save each normalized dataset\n",
    "print(\"Saving processed datasets...\")\n",
    "dataset_manifest = []\n",
    "\n",
    "for name, df in normalized_datasets.items():\n",
    "    output_path = PROCESSED_DIR / f\"{name}_ready.csv\"\n",
    "    df.to_csv(output_path, index=False)\n",
    "\n",
    "    manifest_entry = {\n",
    "        \"name\": name,\n",
    "        \"path\": str(output_path.relative_to(DATASETS_DIR.parent)),\n",
    "        \"samples\": len(df),\n",
    "        \"features\": len(df.columns),\n",
    "        \"has_label\": \"label\" in df.columns,\n",
    "        \"processed_at\": datetime.now().isoformat()\n",
    "    }\n",
    "    dataset_manifest.append(manifest_entry)\n",
    "    print(f\"  ✓ Saved: {output_path.name}\")\n",
    "\n",
    "# Save the manifest\n",
    "manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
    "with open(manifest_path, \"w\") as f:\n",
    "    json.dump(dataset_manifest, f, indent=2)\n",
    "\n",
    "print(f\"\\n✓ Dataset manifest saved to: {manifest_path}\")"
   ]
  },
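  {
   "cell_type": "markdown",
   "id": "d2e3f4a5",
   "metadata": {},
   "source": [
    "Sanity-check the manifest by reloading it and reading back the first dataset. Manifest paths are relative to the project root (the parent of the datasets directory), as written above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6f7a8b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round-trip check: reload the manifest and read back the first dataset\n",
    "with open(manifest_path) as f:\n",
    "    manifest_check = json.load(f)\n",
    "\n",
    "if manifest_check:\n",
    "    first = manifest_check[0]\n",
    "    check_df = pd.read_csv(DATASETS_DIR.parent / first[\"path\"])\n",
    "    assert len(check_df) == first[\"samples\"], \"Manifest sample count mismatch\"\n",
    "    print(f\"✓ Manifest round-trip OK: {first['name']} ({len(check_df)} rows)\")"
   ]
  },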
  {
   "cell_type": "markdown",
   "id": "40ba332c",
   "metadata": {},
   "source": [
    "## 7. Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ef2f995",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n\" + \"=\" * 60)\n",
    "print(\"DATA ACQUISITION COMPLETE\")\n",
    "print(\"=\" * 60)\n",
    "\n",
    "total_samples = sum(len(df) for df in normalized_datasets.values())\n",
    "\n",
    "print(f\"\"\"\n",
    "Data Collection Summary:\n",
    "  - Datasets processed: {len(normalized_datasets)}\n",
    "  - Total samples: {total_samples:,}\n",
    "  - Output directory: {PROCESSED_DIR}\n",
    "\n",
    "Datasets Ready for Feature Engineering:\"\"\")\n",
    "\n",
    "for entry in dataset_manifest:\n",
    "    print(f\"  ✓ {entry['name']}: {entry['samples']:,} samples\")\n",
    "\n",
    "print(f\"\"\"\n",
    "Next step:\n",
    "  → 02_feature_engineering.ipynb\n",
    "\"\"\")\n",
    "print(\"=\" * 60)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}