Che237 committed on
Commit
460801e
·
verified ·
1 Parent(s): ca20dd7

Fix async to sync httpx - remove asyncio.get_event_loop

Browse files
Files changed (1) hide show
  1. notebooks/01_data_acquisition.ipynb +37 -38
notebooks/01_data_acquisition.ipynb CHANGED
@@ -50,12 +50,12 @@
50
  "if config_path.exists():\n",
51
  " with open(config_path) as f:\n",
52
  " CONFIG = json.load(f)\n",
53
- " print(\" Configuration loaded\")\n",
54
  "else:\n",
55
  " raise FileNotFoundError(\"Run 00_environment_setup.ipynb first!\")\n",
56
  "\n",
57
  "DATASETS_DIR = Path(CONFIG.get(\"datasets_dir\", \"/home/user/app/datasets\"))\n",
58
- "print(f\" Datasets directory: {DATASETS_DIR}\")"
59
  ]
60
  },
61
  {
@@ -78,7 +78,7 @@
78
  "class WebScraperDataCollector:\n",
79
  " \"\"\"\n",
80
  " Collects website security data via WebScrapper.live API.\n",
81
- " This aligns with the backend WebScraperAPIService.\n",
82
  " \"\"\"\n",
83
  " \n",
84
  " def __init__(self):\n",
@@ -86,11 +86,11 @@
86
  " self.api_key = \"sk-fd14eaa7bceb478db7afc7256e514d2b\"\n",
87
  " self.timeout = 60.0\n",
88
  " \n",
89
- " async def scrape_website(self, url: str) -> Dict[str, Any]:\n",
90
- " \"\"\"Scrape a single website and return security data\"\"\"\n",
91
  " try:\n",
92
- " async with httpx.AsyncClient(timeout=self.timeout) as client:\n",
93
- " response = await client.post(\n",
94
  " self.api_url,\n",
95
  " json={\"url\": url},\n",
96
  " headers={\n",
@@ -117,14 +117,15 @@
117
  " except Exception as e:\n",
118
  " return {\"success\": False, \"url\": url, \"error\": str(e)}\n",
119
  " \n",
120
- " async def collect_batch(self, urls: List[str]) -> List[Dict]:\n",
121
- " \"\"\"Collect data from multiple URLs\"\"\"\n",
 
122
  " results = []\n",
123
  " for url in urls:\n",
124
  " print(f\" Scraping: {url}\")\n",
125
- " result = await self.scrape_website(url)\n",
126
  " results.append(result)\n",
127
- " await asyncio.sleep(1) # Rate limiting\n",
128
  " return results\n",
129
  " \n",
130
  " def extract_security_features(self, data: Dict) -> Dict:\n",
@@ -136,7 +137,6 @@
136
  " network_requests = data.get(\"network_requests\", [])\n",
137
  " console_logs = data.get(\"console_logs\", [])\n",
138
  " \n",
139
- " # Extract features aligned with backend needs\n",
140
  " return {\n",
141
  " \"url\": data[\"url\"],\n",
142
  " \"is_https\": security_report.get(\"is_https\", False),\n",
@@ -163,7 +163,7 @@
163
  " return False\n",
164
  "\n",
165
  "scraper = WebScraperDataCollector()\n",
166
- "print(\" Web Scraper Data Collector initialized\")"
167
  ]
168
  },
169
  {
@@ -188,18 +188,17 @@
188
  "\n",
189
  "print(f\"Collecting data from {len(SAMPLE_URLS)} URLs...\")\n",
190
  "\n",
191
- "# Run async collection\n",
192
- "loop = asyncio.get_event_loop()\n",
193
- "scraped_data = loop.run_until_complete(scraper.collect_batch(SAMPLE_URLS[:3])) # Limited for demo\n",
194
  "\n",
195
  "# Extract features\n",
196
  "features_list = [scraper.extract_security_features(d) for d in scraped_data if d.get(\"success\")]\n",
197
  "if features_list:\n",
198
  " web_scraper_df = pd.DataFrame(features_list)\n",
199
- " print(f\"\\n Collected {len(web_scraper_df)} website security profiles\")\n",
200
  " display(web_scraper_df.head())\n",
201
  "else:\n",
202
- " print(\" No data collected - API may be unavailable\")\n",
203
  " web_scraper_df = pd.DataFrame()"
204
  ]
205
  },
@@ -232,7 +231,7 @@
232
  " csv_files = [f for f in files if f.endswith('.csv')]\n",
233
  " return csv_files\n",
234
  " except Exception as e:\n",
235
- " print(f\" Could not list HF datasets: {e}\")\n",
236
  " return []\n",
237
  "\n",
238
  "def download_dataset(file_path: str, local_dir: Path = DATASETS_DIR) -> Optional[Path]:\n",
@@ -246,7 +245,7 @@
246
  " )\n",
247
  " return Path(local_path)\n",
248
  " except Exception as e:\n",
249
- " print(f\" Could not download {file_path}: {e}\")\n",
250
  " return None\n",
251
  "\n",
252
  "# List available datasets\n",
@@ -284,13 +283,13 @@
284
  " try:\n",
285
  " df = pd.read_csv(local_path)\n",
286
  " loaded_datasets[name] = df\n",
287
- " print(f\" {name}: {len(df)} samples, {len(df.columns)} features\")\n",
288
  " except Exception as e:\n",
289
- " print(f\" Could not load {name}: {e}\")\n",
290
  " else:\n",
291
- " print(f\" {name}: Not found in repository\")\n",
292
  "\n",
293
- "print(f\"\\n Loaded {len(loaded_datasets)} datasets\")"
294
  ]
295
  },
296
  {
@@ -319,9 +318,9 @@
319
  " name = csv_file.stem.replace(\"_processed\", \"\")\n",
320
  " df = pd.read_csv(csv_file)\n",
321
  " datasets[name] = df\n",
322
- " print(f\" {name}: {len(df)} samples\")\n",
323
  " except Exception as e:\n",
324
- " print(f\" {csv_file.name}: {e}\")\n",
325
  " \n",
326
  " return datasets\n",
327
  "\n",
@@ -333,7 +332,7 @@
333
  " if name not in loaded_datasets:\n",
334
  " loaded_datasets[name] = df\n",
335
  "\n",
336
- "print(f\"\\n Total datasets available: {len(loaded_datasets)}\")"
337
  ]
338
  },
339
  {
@@ -390,11 +389,11 @@
390
  "for name, df in loaded_datasets.items():\n",
391
  " report = validate_dataset(name, df)\n",
392
  " validation_reports.append(report)\n",
393
- " status = \"✓\" if report[\"valid\"] else \"⚠\"\n",
394
  " print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
395
  "\n",
396
  "valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
397
- "print(f\"\\n {len(valid_datasets)} datasets passed validation\")"
398
  ]
399
  },
400
  {
@@ -452,9 +451,9 @@
452
  " if name in loaded_datasets:\n",
453
  " df = normalize_dataset(loaded_datasets[name], name)\n",
454
  " normalized_datasets[name] = df\n",
455
- " print(f\" {name}: {len(df)} samples after normalization\")\n",
456
  "\n",
457
- "print(f\"\\n Normalized {len(normalized_datasets)} datasets\")"
458
  ]
459
  },
460
  {
@@ -493,14 +492,14 @@
493
  " \"processed_at\": datetime.now().isoformat()\n",
494
  " }\n",
495
  " dataset_manifest.append(manifest_entry)\n",
496
- " print(f\" Saved: {output_path.name}\")\n",
497
  "\n",
498
  "# Save manifest\n",
499
  "manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
500
  "with open(manifest_path, \"w\") as f:\n",
501
  " json.dump(dataset_manifest, f, indent=2)\n",
502
  "\n",
503
- "print(f\"\\n Dataset manifest saved to: {manifest_path}\")"
504
  ]
505
  },
506
  {
@@ -525,19 +524,19 @@
525
  "total_samples = sum(len(df) for df in normalized_datasets.values())\n",
526
  "\n",
527
  "print(f\"\"\"\n",
528
- "📊 Data Collection Summary:\n",
529
  " - Datasets processed: {len(normalized_datasets)}\n",
530
  " - Total samples: {total_samples:,}\n",
531
  " - Output directory: {PROCESSED_DIR}\n",
532
  "\n",
533
- "📁 Datasets Ready for Feature Engineering:\"\"\")\n",
534
  "\n",
535
  "for entry in dataset_manifest:\n",
536
- " print(f\" {entry['name']}: {entry['samples']:,} samples\")\n",
537
  "\n",
538
  "print(f\"\"\"\n",
539
  "Next step:\n",
540
- " 02_feature_engineering.ipynb\n",
541
  "\"\"\")\n",
542
  "print(\"=\" * 60)"
543
  ]
@@ -550,4 +549,4 @@
550
  },
551
  "nbformat": 4,
552
  "nbformat_minor": 5
553
- }
 
50
  "if config_path.exists():\n",
51
  " with open(config_path) as f:\n",
52
  " CONFIG = json.load(f)\n",
53
+ " print(\"\u2713 Configuration loaded\")\n",
54
  "else:\n",
55
  " raise FileNotFoundError(\"Run 00_environment_setup.ipynb first!\")\n",
56
  "\n",
57
  "DATASETS_DIR = Path(CONFIG.get(\"datasets_dir\", \"/home/user/app/datasets\"))\n",
58
+ "print(f\"\u2713 Datasets directory: {DATASETS_DIR}\")"
59
  ]
60
  },
61
  {
 
78
  "class WebScraperDataCollector:\n",
79
  " \"\"\"\n",
80
  " Collects website security data via WebScrapper.live API.\n",
81
+ " Uses synchronous httpx for Jupyter compatibility.\n",
82
  " \"\"\"\n",
83
  " \n",
84
  " def __init__(self):\n",
 
86
  " self.api_key = \"sk-fd14eaa7bceb478db7afc7256e514d2b\"\n",
87
  " self.timeout = 60.0\n",
88
  " \n",
89
+ " def scrape_website(self, url: str) -> Dict[str, Any]:\n",
90
+ " \"\"\"Scrape a single website and return security data (sync version)\"\"\"\n",
91
  " try:\n",
92
+ " with httpx.Client(timeout=self.timeout) as client:\n",
93
+ " response = client.post(\n",
94
  " self.api_url,\n",
95
  " json={\"url\": url},\n",
96
  " headers={\n",
 
117
  " except Exception as e:\n",
118
  " return {\"success\": False, \"url\": url, \"error\": str(e)}\n",
119
  " \n",
120
+ " def collect_batch(self, urls: List[str]) -> List[Dict]:\n",
121
+ " \"\"\"Collect data from multiple URLs (sync version)\"\"\"\n",
122
+ " import time\n",
123
  " results = []\n",
124
  " for url in urls:\n",
125
  " print(f\" Scraping: {url}\")\n",
126
+ " result = self.scrape_website(url)\n",
127
  " results.append(result)\n",
128
+ " time.sleep(1) # Rate limiting\n",
129
  " return results\n",
130
  " \n",
131
  " def extract_security_features(self, data: Dict) -> Dict:\n",
 
137
  " network_requests = data.get(\"network_requests\", [])\n",
138
  " console_logs = data.get(\"console_logs\", [])\n",
139
  " \n",
 
140
  " return {\n",
141
  " \"url\": data[\"url\"],\n",
142
  " \"is_https\": security_report.get(\"is_https\", False),\n",
 
163
  " return False\n",
164
  "\n",
165
  "scraper = WebScraperDataCollector()\n",
166
+ "print(\"\u2713 Web Scraper Data Collector initialized\")\n"
167
  ]
168
  },
169
  {
 
188
  "\n",
189
  "print(f\"Collecting data from {len(SAMPLE_URLS)} URLs...\")\n",
190
  "\n",
191
+ "# Run synchronous collection (Jupyter-compatible)\n",
192
+ "scraped_data = scraper.collect_batch(SAMPLE_URLS[:3]) # Limited for demo\n",
 
193
  "\n",
194
  "# Extract features\n",
195
  "features_list = [scraper.extract_security_features(d) for d in scraped_data if d.get(\"success\")]\n",
196
  "if features_list:\n",
197
  " web_scraper_df = pd.DataFrame(features_list)\n",
198
+ " print(f\"\\n\u2713 Collected {len(web_scraper_df)} website security profiles\")\n",
199
  " display(web_scraper_df.head())\n",
200
  "else:\n",
201
+ " print(\"\u26a0 No data collected - API may be unavailable\")\n",
202
  " web_scraper_df = pd.DataFrame()"
203
  ]
204
  },
 
231
  " csv_files = [f for f in files if f.endswith('.csv')]\n",
232
  " return csv_files\n",
233
  " except Exception as e:\n",
234
+ " print(f\"\u26a0 Could not list HF datasets: {e}\")\n",
235
  " return []\n",
236
  "\n",
237
  "def download_dataset(file_path: str, local_dir: Path = DATASETS_DIR) -> Optional[Path]:\n",
 
245
  " )\n",
246
  " return Path(local_path)\n",
247
  " except Exception as e:\n",
248
+ " print(f\"\u26a0 Could not download {file_path}: {e}\")\n",
249
  " return None\n",
250
  "\n",
251
  "# List available datasets\n",
 
283
  " try:\n",
284
  " df = pd.read_csv(local_path)\n",
285
  " loaded_datasets[name] = df\n",
286
+ " print(f\" \u2713 {name}: {len(df)} samples, {len(df.columns)} features\")\n",
287
  " except Exception as e:\n",
288
+ " print(f\" \u26a0 Could not load {name}: {e}\")\n",
289
  " else:\n",
290
+ " print(f\" \u26a0 {name}: Not found in repository\")\n",
291
  "\n",
292
+ "print(f\"\\n\u2713 Loaded {len(loaded_datasets)} datasets\")"
293
  ]
294
  },
295
  {
 
318
  " name = csv_file.stem.replace(\"_processed\", \"\")\n",
319
  " df = pd.read_csv(csv_file)\n",
320
  " datasets[name] = df\n",
321
+ " print(f\" \u2713 {name}: {len(df)} samples\")\n",
322
  " except Exception as e:\n",
323
+ " print(f\" \u26a0 {csv_file.name}: {e}\")\n",
324
  " \n",
325
  " return datasets\n",
326
  "\n",
 
332
  " if name not in loaded_datasets:\n",
333
  " loaded_datasets[name] = df\n",
334
  "\n",
335
+ "print(f\"\\n\u2713 Total datasets available: {len(loaded_datasets)}\")"
336
  ]
337
  },
338
  {
 
389
  "for name, df in loaded_datasets.items():\n",
390
  " report = validate_dataset(name, df)\n",
391
  " validation_reports.append(report)\n",
392
+ " status = \"\u2713\" if report[\"valid\"] else \"\u26a0\"\n",
393
  " print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
394
  "\n",
395
  "valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
396
+ "print(f\"\\n\u2713 {len(valid_datasets)} datasets passed validation\")"
397
  ]
398
  },
399
  {
 
451
  " if name in loaded_datasets:\n",
452
  " df = normalize_dataset(loaded_datasets[name], name)\n",
453
  " normalized_datasets[name] = df\n",
454
+ " print(f\" \u2713 {name}: {len(df)} samples after normalization\")\n",
455
  "\n",
456
+ "print(f\"\\n\u2713 Normalized {len(normalized_datasets)} datasets\")"
457
  ]
458
  },
459
  {
 
492
  " \"processed_at\": datetime.now().isoformat()\n",
493
  " }\n",
494
  " dataset_manifest.append(manifest_entry)\n",
495
+ " print(f\" \u2713 Saved: {output_path.name}\")\n",
496
  "\n",
497
  "# Save manifest\n",
498
  "manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
499
  "with open(manifest_path, \"w\") as f:\n",
500
  " json.dump(dataset_manifest, f, indent=2)\n",
501
  "\n",
502
+ "print(f\"\\n\u2713 Dataset manifest saved to: {manifest_path}\")"
503
  ]
504
  },
505
  {
 
524
  "total_samples = sum(len(df) for df in normalized_datasets.values())\n",
525
  "\n",
526
  "print(f\"\"\"\n",
527
+ "\ud83d\udcca Data Collection Summary:\n",
528
  " - Datasets processed: {len(normalized_datasets)}\n",
529
  " - Total samples: {total_samples:,}\n",
530
  " - Output directory: {PROCESSED_DIR}\n",
531
  "\n",
532
+ "\ud83d\udcc1 Datasets Ready for Feature Engineering:\"\"\")\n",
533
  "\n",
534
  "for entry in dataset_manifest:\n",
535
+ " print(f\" \u2713 {entry['name']}: {entry['samples']:,} samples\")\n",
536
  "\n",
537
  "print(f\"\"\"\n",
538
  "Next step:\n",
539
+ " \u2192 02_feature_engineering.ipynb\n",
540
  "\"\"\")\n",
541
  "print(\"=\" * 60)"
542
  ]
 
549
  },
550
  "nbformat": 4,
551
  "nbformat_minor": 5
552
+ }