Spaces:
Running
Running
Fix async to sync httpx - remove asyncio.get_event_loop
Browse files
notebooks/01_data_acquisition.ipynb
CHANGED
|
@@ -50,12 +50,12 @@
|
|
| 50 |
"if config_path.exists():\n",
|
| 51 |
" with open(config_path) as f:\n",
|
| 52 |
" CONFIG = json.load(f)\n",
|
| 53 |
-
" print(\"
|
| 54 |
"else:\n",
|
| 55 |
" raise FileNotFoundError(\"Run 00_environment_setup.ipynb first!\")\n",
|
| 56 |
"\n",
|
| 57 |
"DATASETS_DIR = Path(CONFIG.get(\"datasets_dir\", \"/home/user/app/datasets\"))\n",
|
| 58 |
-
"print(f\"
|
| 59 |
]
|
| 60 |
},
|
| 61 |
{
|
|
@@ -78,7 +78,7 @@
|
|
| 78 |
"class WebScraperDataCollector:\n",
|
| 79 |
" \"\"\"\n",
|
| 80 |
" Collects website security data via WebScrapper.live API.\n",
|
| 81 |
-
"
|
| 82 |
" \"\"\"\n",
|
| 83 |
" \n",
|
| 84 |
" def __init__(self):\n",
|
|
@@ -86,11 +86,11 @@
|
|
| 86 |
" self.api_key = \"sk-fd14eaa7bceb478db7afc7256e514d2b\"\n",
|
| 87 |
" self.timeout = 60.0\n",
|
| 88 |
" \n",
|
| 89 |
-
"
|
| 90 |
-
" \"\"\"Scrape a single website and return security data\"\"\"\n",
|
| 91 |
" try:\n",
|
| 92 |
-
"
|
| 93 |
-
" response =
|
| 94 |
" self.api_url,\n",
|
| 95 |
" json={\"url\": url},\n",
|
| 96 |
" headers={\n",
|
|
@@ -117,14 +117,15 @@
|
|
| 117 |
" except Exception as e:\n",
|
| 118 |
" return {\"success\": False, \"url\": url, \"error\": str(e)}\n",
|
| 119 |
" \n",
|
| 120 |
-
"
|
| 121 |
-
" \"\"\"Collect data from multiple URLs\"\"\"\n",
|
|
|
|
| 122 |
" results = []\n",
|
| 123 |
" for url in urls:\n",
|
| 124 |
" print(f\" Scraping: {url}\")\n",
|
| 125 |
-
" result =
|
| 126 |
" results.append(result)\n",
|
| 127 |
-
"
|
| 128 |
" return results\n",
|
| 129 |
" \n",
|
| 130 |
" def extract_security_features(self, data: Dict) -> Dict:\n",
|
|
@@ -136,7 +137,6 @@
|
|
| 136 |
" network_requests = data.get(\"network_requests\", [])\n",
|
| 137 |
" console_logs = data.get(\"console_logs\", [])\n",
|
| 138 |
" \n",
|
| 139 |
-
" # Extract features aligned with backend needs\n",
|
| 140 |
" return {\n",
|
| 141 |
" \"url\": data[\"url\"],\n",
|
| 142 |
" \"is_https\": security_report.get(\"is_https\", False),\n",
|
|
@@ -163,7 +163,7 @@
|
|
| 163 |
" return False\n",
|
| 164 |
"\n",
|
| 165 |
"scraper = WebScraperDataCollector()\n",
|
| 166 |
-
"print(\"
|
| 167 |
]
|
| 168 |
},
|
| 169 |
{
|
|
@@ -188,18 +188,17 @@
|
|
| 188 |
"\n",
|
| 189 |
"print(f\"Collecting data from {len(SAMPLE_URLS)} URLs...\")\n",
|
| 190 |
"\n",
|
| 191 |
-
"# Run
|
| 192 |
-
"
|
| 193 |
-
"scraped_data = loop.run_until_complete(scraper.collect_batch(SAMPLE_URLS[:3])) # Limited for demo\n",
|
| 194 |
"\n",
|
| 195 |
"# Extract features\n",
|
| 196 |
"features_list = [scraper.extract_security_features(d) for d in scraped_data if d.get(\"success\")]\n",
|
| 197 |
"if features_list:\n",
|
| 198 |
" web_scraper_df = pd.DataFrame(features_list)\n",
|
| 199 |
-
" print(f\"\\n
|
| 200 |
" display(web_scraper_df.head())\n",
|
| 201 |
"else:\n",
|
| 202 |
-
" print(\"
|
| 203 |
" web_scraper_df = pd.DataFrame()"
|
| 204 |
]
|
| 205 |
},
|
|
@@ -232,7 +231,7 @@
|
|
| 232 |
" csv_files = [f for f in files if f.endswith('.csv')]\n",
|
| 233 |
" return csv_files\n",
|
| 234 |
" except Exception as e:\n",
|
| 235 |
-
" print(f\"
|
| 236 |
" return []\n",
|
| 237 |
"\n",
|
| 238 |
"def download_dataset(file_path: str, local_dir: Path = DATASETS_DIR) -> Optional[Path]:\n",
|
|
@@ -246,7 +245,7 @@
|
|
| 246 |
" )\n",
|
| 247 |
" return Path(local_path)\n",
|
| 248 |
" except Exception as e:\n",
|
| 249 |
-
" print(f\"
|
| 250 |
" return None\n",
|
| 251 |
"\n",
|
| 252 |
"# List available datasets\n",
|
|
@@ -284,13 +283,13 @@
|
|
| 284 |
" try:\n",
|
| 285 |
" df = pd.read_csv(local_path)\n",
|
| 286 |
" loaded_datasets[name] = df\n",
|
| 287 |
-
" print(f\"
|
| 288 |
" except Exception as e:\n",
|
| 289 |
-
" print(f\"
|
| 290 |
" else:\n",
|
| 291 |
-
" print(f\"
|
| 292 |
"\n",
|
| 293 |
-
"print(f\"\\n
|
| 294 |
]
|
| 295 |
},
|
| 296 |
{
|
|
@@ -319,9 +318,9 @@
|
|
| 319 |
" name = csv_file.stem.replace(\"_processed\", \"\")\n",
|
| 320 |
" df = pd.read_csv(csv_file)\n",
|
| 321 |
" datasets[name] = df\n",
|
| 322 |
-
" print(f\"
|
| 323 |
" except Exception as e:\n",
|
| 324 |
-
" print(f\"
|
| 325 |
" \n",
|
| 326 |
" return datasets\n",
|
| 327 |
"\n",
|
|
@@ -333,7 +332,7 @@
|
|
| 333 |
" if name not in loaded_datasets:\n",
|
| 334 |
" loaded_datasets[name] = df\n",
|
| 335 |
"\n",
|
| 336 |
-
"print(f\"\\n
|
| 337 |
]
|
| 338 |
},
|
| 339 |
{
|
|
@@ -390,11 +389,11 @@
|
|
| 390 |
"for name, df in loaded_datasets.items():\n",
|
| 391 |
" report = validate_dataset(name, df)\n",
|
| 392 |
" validation_reports.append(report)\n",
|
| 393 |
-
" status = \"
|
| 394 |
" print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
|
| 395 |
"\n",
|
| 396 |
"valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
|
| 397 |
-
"print(f\"\\n
|
| 398 |
]
|
| 399 |
},
|
| 400 |
{
|
|
@@ -452,9 +451,9 @@
|
|
| 452 |
" if name in loaded_datasets:\n",
|
| 453 |
" df = normalize_dataset(loaded_datasets[name], name)\n",
|
| 454 |
" normalized_datasets[name] = df\n",
|
| 455 |
-
" print(f\"
|
| 456 |
"\n",
|
| 457 |
-
"print(f\"\\n
|
| 458 |
]
|
| 459 |
},
|
| 460 |
{
|
|
@@ -493,14 +492,14 @@
|
|
| 493 |
" \"processed_at\": datetime.now().isoformat()\n",
|
| 494 |
" }\n",
|
| 495 |
" dataset_manifest.append(manifest_entry)\n",
|
| 496 |
-
" print(f\"
|
| 497 |
"\n",
|
| 498 |
"# Save manifest\n",
|
| 499 |
"manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
|
| 500 |
"with open(manifest_path, \"w\") as f:\n",
|
| 501 |
" json.dump(dataset_manifest, f, indent=2)\n",
|
| 502 |
"\n",
|
| 503 |
-
"print(f\"\\n
|
| 504 |
]
|
| 505 |
},
|
| 506 |
{
|
|
@@ -525,19 +524,19 @@
|
|
| 525 |
"total_samples = sum(len(df) for df in normalized_datasets.values())\n",
|
| 526 |
"\n",
|
| 527 |
"print(f\"\"\"\n",
|
| 528 |
-
"
|
| 529 |
" - Datasets processed: {len(normalized_datasets)}\n",
|
| 530 |
" - Total samples: {total_samples:,}\n",
|
| 531 |
" - Output directory: {PROCESSED_DIR}\n",
|
| 532 |
"\n",
|
| 533 |
-
"
|
| 534 |
"\n",
|
| 535 |
"for entry in dataset_manifest:\n",
|
| 536 |
-
" print(f\"
|
| 537 |
"\n",
|
| 538 |
"print(f\"\"\"\n",
|
| 539 |
"Next step:\n",
|
| 540 |
-
"
|
| 541 |
"\"\"\")\n",
|
| 542 |
"print(\"=\" * 60)"
|
| 543 |
]
|
|
@@ -550,4 +549,4 @@
|
|
| 550 |
},
|
| 551 |
"nbformat": 4,
|
| 552 |
"nbformat_minor": 5
|
| 553 |
-
}
|
|
|
|
| 50 |
"if config_path.exists():\n",
|
| 51 |
" with open(config_path) as f:\n",
|
| 52 |
" CONFIG = json.load(f)\n",
|
| 53 |
+
" print(\"\u2713 Configuration loaded\")\n",
|
| 54 |
"else:\n",
|
| 55 |
" raise FileNotFoundError(\"Run 00_environment_setup.ipynb first!\")\n",
|
| 56 |
"\n",
|
| 57 |
"DATASETS_DIR = Path(CONFIG.get(\"datasets_dir\", \"/home/user/app/datasets\"))\n",
|
| 58 |
+
"print(f\"\u2713 Datasets directory: {DATASETS_DIR}\")"
|
| 59 |
]
|
| 60 |
},
|
| 61 |
{
|
|
|
|
| 78 |
"class WebScraperDataCollector:\n",
|
| 79 |
" \"\"\"\n",
|
| 80 |
" Collects website security data via WebScrapper.live API.\n",
|
| 81 |
+
" Uses synchronous httpx for Jupyter compatibility.\n",
|
| 82 |
" \"\"\"\n",
|
| 83 |
" \n",
|
| 84 |
" def __init__(self):\n",
|
|
|
|
| 86 |
" self.api_key = \"sk-fd14eaa7bceb478db7afc7256e514d2b\"\n",
|
| 87 |
" self.timeout = 60.0\n",
|
| 88 |
" \n",
|
| 89 |
+
" def scrape_website(self, url: str) -> Dict[str, Any]:\n",
|
| 90 |
+
" \"\"\"Scrape a single website and return security data (sync version)\"\"\"\n",
|
| 91 |
" try:\n",
|
| 92 |
+
" with httpx.Client(timeout=self.timeout) as client:\n",
|
| 93 |
+
" response = client.post(\n",
|
| 94 |
" self.api_url,\n",
|
| 95 |
" json={\"url\": url},\n",
|
| 96 |
" headers={\n",
|
|
|
|
| 117 |
" except Exception as e:\n",
|
| 118 |
" return {\"success\": False, \"url\": url, \"error\": str(e)}\n",
|
| 119 |
" \n",
|
| 120 |
+
" def collect_batch(self, urls: List[str]) -> List[Dict]:\n",
|
| 121 |
+
" \"\"\"Collect data from multiple URLs (sync version)\"\"\"\n",
|
| 122 |
+
" import time\n",
|
| 123 |
" results = []\n",
|
| 124 |
" for url in urls:\n",
|
| 125 |
" print(f\" Scraping: {url}\")\n",
|
| 126 |
+
" result = self.scrape_website(url)\n",
|
| 127 |
" results.append(result)\n",
|
| 128 |
+
" time.sleep(1) # Rate limiting\n",
|
| 129 |
" return results\n",
|
| 130 |
" \n",
|
| 131 |
" def extract_security_features(self, data: Dict) -> Dict:\n",
|
|
|
|
| 137 |
" network_requests = data.get(\"network_requests\", [])\n",
|
| 138 |
" console_logs = data.get(\"console_logs\", [])\n",
|
| 139 |
" \n",
|
|
|
|
| 140 |
" return {\n",
|
| 141 |
" \"url\": data[\"url\"],\n",
|
| 142 |
" \"is_https\": security_report.get(\"is_https\", False),\n",
|
|
|
|
| 163 |
" return False\n",
|
| 164 |
"\n",
|
| 165 |
"scraper = WebScraperDataCollector()\n",
|
| 166 |
+
"print(\"\u2713 Web Scraper Data Collector initialized\")\n"
|
| 167 |
]
|
| 168 |
},
|
| 169 |
{
|
|
|
|
| 188 |
"\n",
|
| 189 |
"print(f\"Collecting data from {len(SAMPLE_URLS)} URLs...\")\n",
|
| 190 |
"\n",
|
| 191 |
+
"# Run synchronous collection (Jupyter-compatible)\n",
|
| 192 |
+
"scraped_data = scraper.collect_batch(SAMPLE_URLS[:3]) # Limited for demo\n",
|
|
|
|
| 193 |
"\n",
|
| 194 |
"# Extract features\n",
|
| 195 |
"features_list = [scraper.extract_security_features(d) for d in scraped_data if d.get(\"success\")]\n",
|
| 196 |
"if features_list:\n",
|
| 197 |
" web_scraper_df = pd.DataFrame(features_list)\n",
|
| 198 |
+
" print(f\"\\n\u2713 Collected {len(web_scraper_df)} website security profiles\")\n",
|
| 199 |
" display(web_scraper_df.head())\n",
|
| 200 |
"else:\n",
|
| 201 |
+
" print(\"\u26a0 No data collected - API may be unavailable\")\n",
|
| 202 |
" web_scraper_df = pd.DataFrame()"
|
| 203 |
]
|
| 204 |
},
|
|
|
|
| 231 |
" csv_files = [f for f in files if f.endswith('.csv')]\n",
|
| 232 |
" return csv_files\n",
|
| 233 |
" except Exception as e:\n",
|
| 234 |
+
" print(f\"\u26a0 Could not list HF datasets: {e}\")\n",
|
| 235 |
" return []\n",
|
| 236 |
"\n",
|
| 237 |
"def download_dataset(file_path: str, local_dir: Path = DATASETS_DIR) -> Optional[Path]:\n",
|
|
|
|
| 245 |
" )\n",
|
| 246 |
" return Path(local_path)\n",
|
| 247 |
" except Exception as e:\n",
|
| 248 |
+
" print(f\"\u26a0 Could not download {file_path}: {e}\")\n",
|
| 249 |
" return None\n",
|
| 250 |
"\n",
|
| 251 |
"# List available datasets\n",
|
|
|
|
| 283 |
" try:\n",
|
| 284 |
" df = pd.read_csv(local_path)\n",
|
| 285 |
" loaded_datasets[name] = df\n",
|
| 286 |
+
" print(f\" \u2713 {name}: {len(df)} samples, {len(df.columns)} features\")\n",
|
| 287 |
" except Exception as e:\n",
|
| 288 |
+
" print(f\" \u26a0 Could not load {name}: {e}\")\n",
|
| 289 |
" else:\n",
|
| 290 |
+
" print(f\" \u26a0 {name}: Not found in repository\")\n",
|
| 291 |
"\n",
|
| 292 |
+
"print(f\"\\n\u2713 Loaded {len(loaded_datasets)} datasets\")"
|
| 293 |
]
|
| 294 |
},
|
| 295 |
{
|
|
|
|
| 318 |
" name = csv_file.stem.replace(\"_processed\", \"\")\n",
|
| 319 |
" df = pd.read_csv(csv_file)\n",
|
| 320 |
" datasets[name] = df\n",
|
| 321 |
+
" print(f\" \u2713 {name}: {len(df)} samples\")\n",
|
| 322 |
" except Exception as e:\n",
|
| 323 |
+
" print(f\" \u26a0 {csv_file.name}: {e}\")\n",
|
| 324 |
" \n",
|
| 325 |
" return datasets\n",
|
| 326 |
"\n",
|
|
|
|
| 332 |
" if name not in loaded_datasets:\n",
|
| 333 |
" loaded_datasets[name] = df\n",
|
| 334 |
"\n",
|
| 335 |
+
"print(f\"\\n\u2713 Total datasets available: {len(loaded_datasets)}\")"
|
| 336 |
]
|
| 337 |
},
|
| 338 |
{
|
|
|
|
| 389 |
"for name, df in loaded_datasets.items():\n",
|
| 390 |
" report = validate_dataset(name, df)\n",
|
| 391 |
" validation_reports.append(report)\n",
|
| 392 |
+
" status = \"\u2713\" if report[\"valid\"] else \"\u26a0\"\n",
|
| 393 |
" print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
|
| 394 |
"\n",
|
| 395 |
"valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
|
| 396 |
+
"print(f\"\\n\u2713 {len(valid_datasets)} datasets passed validation\")"
|
| 397 |
]
|
| 398 |
},
|
| 399 |
{
|
|
|
|
| 451 |
" if name in loaded_datasets:\n",
|
| 452 |
" df = normalize_dataset(loaded_datasets[name], name)\n",
|
| 453 |
" normalized_datasets[name] = df\n",
|
| 454 |
+
" print(f\" \u2713 {name}: {len(df)} samples after normalization\")\n",
|
| 455 |
"\n",
|
| 456 |
+
"print(f\"\\n\u2713 Normalized {len(normalized_datasets)} datasets\")"
|
| 457 |
]
|
| 458 |
},
|
| 459 |
{
|
|
|
|
| 492 |
" \"processed_at\": datetime.now().isoformat()\n",
|
| 493 |
" }\n",
|
| 494 |
" dataset_manifest.append(manifest_entry)\n",
|
| 495 |
+
" print(f\" \u2713 Saved: {output_path.name}\")\n",
|
| 496 |
"\n",
|
| 497 |
"# Save manifest\n",
|
| 498 |
"manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
|
| 499 |
"with open(manifest_path, \"w\") as f:\n",
|
| 500 |
" json.dump(dataset_manifest, f, indent=2)\n",
|
| 501 |
"\n",
|
| 502 |
+
"print(f\"\\n\u2713 Dataset manifest saved to: {manifest_path}\")"
|
| 503 |
]
|
| 504 |
},
|
| 505 |
{
|
|
|
|
| 524 |
"total_samples = sum(len(df) for df in normalized_datasets.values())\n",
|
| 525 |
"\n",
|
| 526 |
"print(f\"\"\"\n",
|
| 527 |
+
"\ud83d\udcca Data Collection Summary:\n",
|
| 528 |
" - Datasets processed: {len(normalized_datasets)}\n",
|
| 529 |
" - Total samples: {total_samples:,}\n",
|
| 530 |
" - Output directory: {PROCESSED_DIR}\n",
|
| 531 |
"\n",
|
| 532 |
+
"\ud83d\udcc1 Datasets Ready for Feature Engineering:\"\"\")\n",
|
| 533 |
"\n",
|
| 534 |
"for entry in dataset_manifest:\n",
|
| 535 |
+
" print(f\" \u2713 {entry['name']}: {entry['samples']:,} samples\")\n",
|
| 536 |
"\n",
|
| 537 |
"print(f\"\"\"\n",
|
| 538 |
"Next step:\n",
|
| 539 |
+
" \u2192 02_feature_engineering.ipynb\n",
|
| 540 |
"\"\"\")\n",
|
| 541 |
"print(\"=\" * 60)"
|
| 542 |
]
|
|
|
|
| 549 |
},
|
| 550 |
"nbformat": 4,
|
| 551 |
"nbformat_minor": 5
|
| 552 |
+
}
|