Spaces:

Che237
/

cyberforge

Sleeping

App Files Community

Che237 committed on Jan 31

Commit

460801e

verified ·

1 Parent(s): ca20dd7

Fix async to sync httpx - remove asyncio.get_event_loop

Browse files

Files changed (1) hide show

notebooks/01_data_acquisition.ipynb +37 -38

notebooks/01_data_acquisition.ipynb CHANGED Viewed

@@ -50,12 +50,12 @@
     "if config_path.exists():\n",
     "    with open(config_path) as f:\n",
     "        CONFIG = json.load(f)\n",
-    "    print(\"✓ Configuration loaded\")\n",
     "else:\n",
     "    raise FileNotFoundError(\"Run 00_environment_setup.ipynb first!\")\n",
     "\n",
     "DATASETS_DIR = Path(CONFIG.get(\"datasets_dir\", \"/home/user/app/datasets\"))\n",
-    "print(f\"✓ Datasets directory: {DATASETS_DIR}\")"
    ]
   },
   {
@@ -78,7 +78,7 @@
     "class WebScraperDataCollector:\n",
     "    \"\"\"\n",
     "    Collects website security data via WebScrapper.live API.\n",
-    "    This aligns with the backend WebScraperAPIService.\n",
     "    \"\"\"\n",
     "    \n",
     "    def __init__(self):\n",
@@ -86,11 +86,11 @@
     "        self.api_key = \"sk-fd14eaa7bceb478db7afc7256e514d2b\"\n",
     "        self.timeout = 60.0\n",
     "    \n",
-    "    async def scrape_website(self, url: str) -> Dict[str, Any]:\n",
-    "        \"\"\"Scrape a single website and return security data\"\"\"\n",
     "        try:\n",
-    "            async with httpx.AsyncClient(timeout=self.timeout) as client:\n",
-    "                response = await client.post(\n",
     "                    self.api_url,\n",
     "                    json={\"url\": url},\n",
     "                    headers={\n",
@@ -117,14 +117,15 @@
     "        except Exception as e:\n",
     "            return {\"success\": False, \"url\": url, \"error\": str(e)}\n",
     "    \n",
-    "    async def collect_batch(self, urls: List[str]) -> List[Dict]:\n",
-    "        \"\"\"Collect data from multiple URLs\"\"\"\n",
     "        results = []\n",
     "        for url in urls:\n",
     "            print(f\"  Scraping: {url}\")\n",
-    "            result = await self.scrape_website(url)\n",
     "            results.append(result)\n",
-    "            await asyncio.sleep(1)  # Rate limiting\n",
     "        return results\n",
     "    \n",
     "    def extract_security_features(self, data: Dict) -> Dict:\n",
@@ -136,7 +137,6 @@
     "        network_requests = data.get(\"network_requests\", [])\n",
     "        console_logs = data.get(\"console_logs\", [])\n",
     "        \n",
-    "        # Extract features aligned with backend needs\n",
     "        return {\n",
     "            \"url\": data[\"url\"],\n",
     "            \"is_https\": security_report.get(\"is_https\", False),\n",
@@ -163,7 +163,7 @@
     "            return False\n",
     "\n",
     "scraper = WebScraperDataCollector()\n",
-    "print(\"✓ Web Scraper Data Collector initialized\")"
    ]
   },
   {
@@ -188,18 +188,17 @@
     "\n",
     "print(f\"Collecting data from {len(SAMPLE_URLS)} URLs...\")\n",
     "\n",
-    "# Run async collection\n",
-    "loop = asyncio.get_event_loop()\n",
-    "scraped_data = loop.run_until_complete(scraper.collect_batch(SAMPLE_URLS[:3]))  # Limited for demo\n",
     "\n",
     "# Extract features\n",
     "features_list = [scraper.extract_security_features(d) for d in scraped_data if d.get(\"success\")]\n",
     "if features_list:\n",
     "    web_scraper_df = pd.DataFrame(features_list)\n",
-    "    print(f\"\\n✓ Collected {len(web_scraper_df)} website security profiles\")\n",
     "    display(web_scraper_df.head())\n",
     "else:\n",
-    "    print(\"⚠ No data collected - API may be unavailable\")\n",
     "    web_scraper_df = pd.DataFrame()"
    ]
   },
@@ -232,7 +231,7 @@
     "        csv_files = [f for f in files if f.endswith('.csv')]\n",
     "        return csv_files\n",
     "    except Exception as e:\n",
-    "        print(f\"⚠ Could not list HF datasets: {e}\")\n",
     "        return []\n",
     "\n",
     "def download_dataset(file_path: str, local_dir: Path = DATASETS_DIR) -> Optional[Path]:\n",
@@ -246,7 +245,7 @@
     "        )\n",
     "        return Path(local_path)\n",
     "    except Exception as e:\n",
-    "        print(f\"⚠ Could not download {file_path}: {e}\")\n",
     "        return None\n",
     "\n",
     "# List available datasets\n",
@@ -284,13 +283,13 @@
     "            try:\n",
     "                df = pd.read_csv(local_path)\n",
     "                loaded_datasets[name] = df\n",
-    "                print(f\"    ✓ {name}: {len(df)} samples, {len(df.columns)} features\")\n",
     "            except Exception as e:\n",
-    "                print(f\"    ⚠ Could not load {name}: {e}\")\n",
     "    else:\n",
-    "        print(f\"  ⚠ {name}: Not found in repository\")\n",
     "\n",
-    "print(f\"\\n✓ Loaded {len(loaded_datasets)} datasets\")"
    ]
   },
   {
@@ -319,9 +318,9 @@
     "            name = csv_file.stem.replace(\"_processed\", \"\")\n",
     "            df = pd.read_csv(csv_file)\n",
     "            datasets[name] = df\n",
-    "            print(f\"  ✓ {name}: {len(df)} samples\")\n",
     "        except Exception as e:\n",
-    "            print(f\"  ⚠ {csv_file.name}: {e}\")\n",
     "    \n",
     "    return datasets\n",
     "\n",
@@ -333,7 +332,7 @@
     "    if name not in loaded_datasets:\n",
     "        loaded_datasets[name] = df\n",
     "\n",
-    "print(f\"\\n✓ Total datasets available: {len(loaded_datasets)}\")"
    ]
   },
   {
@@ -390,11 +389,11 @@
     "for name, df in loaded_datasets.items():\n",
     "    report = validate_dataset(name, df)\n",
     "    validation_reports.append(report)\n",
-    "    status = \"✓\" if report[\"valid\"] else \"⚠\"\n",
     "    print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
     "\n",
     "valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
-    "print(f\"\\n✓ {len(valid_datasets)} datasets passed validation\")"
    ]
   },
   {
@@ -452,9 +451,9 @@
     "    if name in loaded_datasets:\n",
     "        df = normalize_dataset(loaded_datasets[name], name)\n",
     "        normalized_datasets[name] = df\n",
-    "        print(f\"  ✓ {name}: {len(df)} samples after normalization\")\n",
     "\n",
-    "print(f\"\\n✓ Normalized {len(normalized_datasets)} datasets\")"
    ]
   },
   {
@@ -493,14 +492,14 @@
     "        \"processed_at\": datetime.now().isoformat()\n",
     "    }\n",
     "    dataset_manifest.append(manifest_entry)\n",
-    "    print(f\"  ✓ Saved: {output_path.name}\")\n",
     "\n",
     "# Save manifest\n",
     "manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
     "with open(manifest_path, \"w\") as f:\n",
     "    json.dump(dataset_manifest, f, indent=2)\n",
     "\n",
-    "print(f\"\\n✓ Dataset manifest saved to: {manifest_path}\")"
    ]
   },
   {
@@ -525,19 +524,19 @@
     "total_samples = sum(len(df) for df in normalized_datasets.values())\n",
     "\n",
     "print(f\"\"\"\n",
-    "📊 Data Collection Summary:\n",
     "   - Datasets processed: {len(normalized_datasets)}\n",
     "   - Total samples: {total_samples:,}\n",
     "   - Output directory: {PROCESSED_DIR}\n",
     "\n",
-    "📁 Datasets Ready for Feature Engineering:\"\"\")\n",
     "\n",
     "for entry in dataset_manifest:\n",
-    "    print(f\"   ✓ {entry['name']}: {entry['samples']:,} samples\")\n",
     "\n",
     "print(f\"\"\"\n",
     "Next step:\n",
-    "  → 02_feature_engineering.ipynb\n",
     "\"\"\")\n",
     "print(\"=\" * 60)"
    ]
@@ -550,4 +549,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}

     "if config_path.exists():\n",
     "    with open(config_path) as f:\n",
     "        CONFIG = json.load(f)\n",
+    "    print(\"\u2713 Configuration loaded\")\n",
     "else:\n",
     "    raise FileNotFoundError(\"Run 00_environment_setup.ipynb first!\")\n",
     "\n",
     "DATASETS_DIR = Path(CONFIG.get(\"datasets_dir\", \"/home/user/app/datasets\"))\n",
+    "print(f\"\u2713 Datasets directory: {DATASETS_DIR}\")"
    ]
   },
   {
     "class WebScraperDataCollector:\n",
     "    \"\"\"\n",
     "    Collects website security data via WebScrapper.live API.\n",
+    "    Uses synchronous httpx for Jupyter compatibility.\n",
     "    \"\"\"\n",
     "    \n",
     "    def __init__(self):\n",
     "        self.api_key = \"sk-fd14eaa7bceb478db7afc7256e514d2b\"\n",
     "        self.timeout = 60.0\n",
     "    \n",
+    "    def scrape_website(self, url: str) -> Dict[str, Any]:\n",
+    "        \"\"\"Scrape a single website and return security data (sync version)\"\"\"\n",
     "        try:\n",
+    "            with httpx.Client(timeout=self.timeout) as client:\n",
+    "                response = client.post(\n",
     "                    self.api_url,\n",
     "                    json={\"url\": url},\n",
     "                    headers={\n",
     "        except Exception as e:\n",
     "            return {\"success\": False, \"url\": url, \"error\": str(e)}\n",
     "    \n",
+    "    def collect_batch(self, urls: List[str]) -> List[Dict]:\n",
+    "        \"\"\"Collect data from multiple URLs (sync version)\"\"\"\n",
+    "        import time\n",
     "        results = []\n",
     "        for url in urls:\n",
     "            print(f\"  Scraping: {url}\")\n",
+    "            result = self.scrape_website(url)\n",
     "            results.append(result)\n",
+    "            time.sleep(1)  # Rate limiting\n",
     "        return results\n",
     "    \n",
     "    def extract_security_features(self, data: Dict) -> Dict:\n",
     "        network_requests = data.get(\"network_requests\", [])\n",
     "        console_logs = data.get(\"console_logs\", [])\n",
     "        \n",
     "        return {\n",
     "            \"url\": data[\"url\"],\n",
     "            \"is_https\": security_report.get(\"is_https\", False),\n",
     "            return False\n",
     "\n",
     "scraper = WebScraperDataCollector()\n",
+    "print(\"\u2713 Web Scraper Data Collector initialized\")\n"
    ]
   },
   {
     "\n",
     "print(f\"Collecting data from {len(SAMPLE_URLS)} URLs...\")\n",
     "\n",
+    "# Run synchronous collection (Jupyter-compatible)\n",
+    "scraped_data = scraper.collect_batch(SAMPLE_URLS[:3])  # Limited for demo\n",
     "\n",
     "# Extract features\n",
     "features_list = [scraper.extract_security_features(d) for d in scraped_data if d.get(\"success\")]\n",
     "if features_list:\n",
     "    web_scraper_df = pd.DataFrame(features_list)\n",
+    "    print(f\"\\n\u2713 Collected {len(web_scraper_df)} website security profiles\")\n",
     "    display(web_scraper_df.head())\n",
     "else:\n",
+    "    print(\"\u26a0 No data collected - API may be unavailable\")\n",
     "    web_scraper_df = pd.DataFrame()"
    ]
   },
     "        csv_files = [f for f in files if f.endswith('.csv')]\n",
     "        return csv_files\n",
     "    except Exception as e:\n",
+    "        print(f\"\u26a0 Could not list HF datasets: {e}\")\n",
     "        return []\n",
     "\n",
     "def download_dataset(file_path: str, local_dir: Path = DATASETS_DIR) -> Optional[Path]:\n",
     "        )\n",
     "        return Path(local_path)\n",
     "    except Exception as e:\n",
+    "        print(f\"\u26a0 Could not download {file_path}: {e}\")\n",
     "        return None\n",
     "\n",
     "# List available datasets\n",
     "            try:\n",
     "                df = pd.read_csv(local_path)\n",
     "                loaded_datasets[name] = df\n",
+    "                print(f\"    \u2713 {name}: {len(df)} samples, {len(df.columns)} features\")\n",
     "            except Exception as e:\n",
+    "                print(f\"    \u26a0 Could not load {name}: {e}\")\n",
     "    else:\n",
+    "        print(f\"  \u26a0 {name}: Not found in repository\")\n",
     "\n",
+    "print(f\"\\n\u2713 Loaded {len(loaded_datasets)} datasets\")"
    ]
   },
   {
     "            name = csv_file.stem.replace(\"_processed\", \"\")\n",
     "            df = pd.read_csv(csv_file)\n",
     "            datasets[name] = df\n",
+    "            print(f\"  \u2713 {name}: {len(df)} samples\")\n",
     "        except Exception as e:\n",
+    "            print(f\"  \u26a0 {csv_file.name}: {e}\")\n",
     "    \n",
     "    return datasets\n",
     "\n",
     "    if name not in loaded_datasets:\n",
     "        loaded_datasets[name] = df\n",
     "\n",
+    "print(f\"\\n\u2713 Total datasets available: {len(loaded_datasets)}\")"
    ]
   },
   {
     "for name, df in loaded_datasets.items():\n",
     "    report = validate_dataset(name, df)\n",
     "    validation_reports.append(report)\n",
+    "    status = \"\u2713\" if report[\"valid\"] else \"\u26a0\"\n",
     "    print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
     "\n",
     "valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
+    "print(f\"\\n\u2713 {len(valid_datasets)} datasets passed validation\")"
    ]
   },
   {
     "    if name in loaded_datasets:\n",
     "        df = normalize_dataset(loaded_datasets[name], name)\n",
     "        normalized_datasets[name] = df\n",
+    "        print(f\"  \u2713 {name}: {len(df)} samples after normalization\")\n",
     "\n",
+    "print(f\"\\n\u2713 Normalized {len(normalized_datasets)} datasets\")"
    ]
   },
   {
     "        \"processed_at\": datetime.now().isoformat()\n",
     "    }\n",
     "    dataset_manifest.append(manifest_entry)\n",
+    "    print(f\"  \u2713 Saved: {output_path.name}\")\n",
     "\n",
     "# Save manifest\n",
     "manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
     "with open(manifest_path, \"w\") as f:\n",
     "    json.dump(dataset_manifest, f, indent=2)\n",
     "\n",
+    "print(f\"\\n\u2713 Dataset manifest saved to: {manifest_path}\")"
    ]
   },
   {
     "total_samples = sum(len(df) for df in normalized_datasets.values())\n",
     "\n",
     "print(f\"\"\"\n",
+    "\ud83d\udcca Data Collection Summary:\n",
     "   - Datasets processed: {len(normalized_datasets)}\n",
     "   - Total samples: {total_samples:,}\n",
     "   - Output directory: {PROCESSED_DIR}\n",
     "\n",
+    "\ud83d\udcc1 Datasets Ready for Feature Engineering:\"\"\")\n",
     "\n",
     "for entry in dataset_manifest:\n",
+    "    print(f\"   \u2713 {entry['name']}: {entry['samples']:,} samples\")\n",
     "\n",
     "print(f\"\"\"\n",
     "Next step:\n",
+    "  \u2192 02_feature_engineering.ipynb\n",
     "\"\"\")\n",
     "print(\"=\" * 60)"
    ]
  },
  "nbformat": 4,
  "nbformat_minor": 5
+}