Che237 committed on
Commit
e4ce7b0
·
verified ·
1 Parent(s): f808c77

Add 01_data_acquisition.ipynb

Browse files
Files changed (1) hide show
  1. notebooks/01_data_acquisition.ipynb +549 -0
notebooks/01_data_acquisition.ipynb ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "417baa98",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 01 - Data Acquisition\n",
9
+ "\n",
10
+ "## CyberForge AI - Cybersecurity Data Collection & Preparation\n",
11
+ "\n",
12
+ "This notebook handles all data acquisition for the CyberForge AI ML pipeline.\n",
13
+ "\n",
14
+ "### Data Sources:\n",
15
+ "1. **Public Datasets** - Legal, publicly available cybersecurity datasets\n",
16
+ "2. **Web Scraper API** - Real-time website security data collection\n",
17
+ "3. **Hugging Face Datasets** - Pre-uploaded CyberForge datasets\n",
18
+ "4. **Synthetic Data** - Generated data for edge cases\n",
19
+ "\n",
20
+ "### Output:\n",
21
+ "- Cleaned, normalized datasets ready for feature engineering\n",
22
+ "- Data validation reports"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "id": "8e0aeada",
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "# Load configuration from environment setup\n",
33
+ "import json\n",
34
+ "import pandas as pd\n",
35
+ "import numpy as np\n",
36
+ "from pathlib import Path\n",
37
+ "import httpx\n",
38
+ "import asyncio\n",
39
+ "from datetime import datetime\n",
40
+ "from typing import Dict, List, Any, Optional\n",
41
+ "import warnings\n",
42
+ "warnings.filterwarnings('ignore')\n",
43
+ "\n",
44
+ "# Load notebook config\n",
45
+ "config_path = Path(\"../notebook_config.json\")\n",
46
+ "if config_path.exists():\n",
47
+ " with open(config_path) as f:\n",
48
+ " CONFIG = json.load(f)\n",
49
+ " print(\"βœ“ Configuration loaded\")\n",
50
+ "else:\n",
51
+ " raise FileNotFoundError(\"Run 00_environment_setup.ipynb first!\")\n",
52
+ "\n",
53
+ "DATASETS_DIR = Path(CONFIG[\"datasets_dir\"])\n",
54
+ "print(f\"βœ“ Datasets directory: {DATASETS_DIR}\")"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "id": "9493ab50",
60
+ "metadata": {},
61
+ "source": [
62
+ "## 1. Web Scraper API Data Collection\n",
63
+ "\n",
64
+ "Collect real-time website security data using the WebScrapper.live API."
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": null,
70
+ "id": "876b9e05",
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "class WebScraperDataCollector:\n",
75
+ " \"\"\"\n",
76
+ " Collects website security data via WebScrapper.live API.\n",
77
+ " This aligns with the backend WebScraperAPIService.\n",
78
+ " \"\"\"\n",
79
+ " \n",
80
+ " def __init__(self):\n",
81
+ " self.api_url = \"http://webscrapper.live/api/scrape\"\n",
82
+ " self.api_key = \"sk-fd14eaa7bceb478db7afc7256e514d2b\"\n",
83
+ " self.timeout = 60.0\n",
84
+ " \n",
85
+ " async def scrape_website(self, url: str) -> Dict[str, Any]:\n",
86
+ " \"\"\"Scrape a single website and return security data\"\"\"\n",
87
+ " try:\n",
88
+ " async with httpx.AsyncClient(timeout=self.timeout) as client:\n",
89
+ " response = await client.post(\n",
90
+ " self.api_url,\n",
91
+ " json={\"url\": url},\n",
92
+ " headers={\n",
93
+ " \"Content-Type\": \"application/json\",\n",
94
+ " \"X-API-Key\": self.api_key\n",
95
+ " }\n",
96
+ " )\n",
97
+ " \n",
98
+ " if response.status_code == 200:\n",
99
+ " data = response.json()\n",
100
+ " return {\n",
101
+ " \"success\": True,\n",
102
+ " \"url\": url,\n",
103
+ " \"security_report\": data.get(\"security_report\", {}),\n",
104
+ " \"network_requests\": data.get(\"network_requests\", []),\n",
105
+ " \"console_logs\": data.get(\"console_logs\", []),\n",
106
+ " \"performance_metrics\": data.get(\"performance_metrics\", {}),\n",
107
+ " \"response_headers\": data.get(\"response_headers\", {}),\n",
108
+ " \"html_length\": len(data.get(\"html\", \"\") or data.get(\"content\", \"\")),\n",
109
+ " \"scraped_at\": datetime.now().isoformat()\n",
110
+ " }\n",
111
+ " else:\n",
112
+ " return {\"success\": False, \"url\": url, \"error\": f\"Status {response.status_code}\"}\n",
113
+ " except Exception as e:\n",
114
+ " return {\"success\": False, \"url\": url, \"error\": str(e)}\n",
115
+ " \n",
116
+ " async def collect_batch(self, urls: List[str]) -> List[Dict]:\n",
117
+ " \"\"\"Collect data from multiple URLs\"\"\"\n",
118
+ " results = []\n",
119
+ " for url in urls:\n",
120
+ " print(f\" Scraping: {url}\")\n",
121
+ " result = await self.scrape_website(url)\n",
122
+ " results.append(result)\n",
123
+ " await asyncio.sleep(1) # Rate limiting\n",
124
+ " return results\n",
125
+ " \n",
126
+ " def extract_security_features(self, data: Dict) -> Dict:\n",
127
+ " \"\"\"Extract security-relevant features from scraped data\"\"\"\n",
128
+ " if not data.get(\"success\"):\n",
129
+ " return None\n",
130
+ " \n",
131
+ " security_report = data.get(\"security_report\", {})\n",
132
+ " network_requests = data.get(\"network_requests\", [])\n",
133
+ " console_logs = data.get(\"console_logs\", [])\n",
134
+ " \n",
135
+ " # Extract features aligned with backend needs\n",
136
+ " return {\n",
137
+ " \"url\": data[\"url\"],\n",
138
+ " \"is_https\": security_report.get(\"is_https\", False),\n",
139
+ " \"has_mixed_content\": security_report.get(\"mixed_content\", False),\n",
140
+ " \"missing_headers_count\": len(security_report.get(\"missing_security_headers\", [])),\n",
141
+ " \"has_insecure_cookies\": security_report.get(\"insecure_cookies\", False),\n",
142
+ " \"total_requests\": len(network_requests),\n",
143
+ " \"external_requests\": sum(1 for r in network_requests if self._is_external(r, data[\"url\"])),\n",
144
+ " \"failed_requests\": sum(1 for r in network_requests if r.get(\"status\", 200) >= 400),\n",
145
+ " \"console_errors\": sum(1 for log in console_logs if log.get(\"level\") == \"error\"),\n",
146
+ " \"console_warnings\": sum(1 for log in console_logs if log.get(\"level\") == \"warning\"),\n",
147
+ " \"html_size\": data.get(\"html_length\", 0),\n",
148
+ " \"scraped_at\": data[\"scraped_at\"]\n",
149
+ " }\n",
150
+ " \n",
151
+ " def _is_external(self, request: Dict, base_url: str) -> bool:\n",
152
+ " \"\"\"Check if a request is to an external domain\"\"\"\n",
153
+ " try:\n",
154
+ " from urllib.parse import urlparse\n",
155
+ " base_domain = urlparse(base_url).netloc\n",
156
+ " req_domain = urlparse(request.get(\"url\", \"\")).netloc\n",
157
+ " return base_domain != req_domain\n",
158
+ " except:\n",
159
+ " return False\n",
160
+ "\n",
161
+ "scraper = WebScraperDataCollector()\n",
162
+ "print(\"βœ“ Web Scraper Data Collector initialized\")"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "id": "aec0ba34",
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "# Collect sample data from known-safe websites and neutral test sites for training\n",
173
+ "SAMPLE_URLS = [\n",
174
+ " # Known safe websites\n",
175
+ " \"https://www.google.com\",\n",
176
+ " \"https://github.com\",\n",
177
+ " \"https://www.microsoft.com\",\n",
178
+ " \"https://www.amazon.com\",\n",
179
+ " \"https://www.wikipedia.org\",\n",
180
+ " # Test sites\n",
181
+ " \"https://example.com\",\n",
182
+ " \"https://httpbin.org\",\n",
183
+ "]\n",
184
+ "\n",
185
+ "print(f\"Collecting data from {len(SAMPLE_URLS)} URLs...\")\n",
186
+ "\n",
187
+ "# Run async collection\n",
188
+ "loop = asyncio.get_event_loop()\n",
189
+ "scraped_data = loop.run_until_complete(scraper.collect_batch(SAMPLE_URLS[:3])) # Limited for demo\n",
190
+ "\n",
191
+ "# Extract features\n",
192
+ "features_list = [scraper.extract_security_features(d) for d in scraped_data if d.get(\"success\")]\n",
193
+ "if features_list:\n",
194
+ " web_scraper_df = pd.DataFrame(features_list)\n",
195
+ " print(f\"\\nβœ“ Collected {len(web_scraper_df)} website security profiles\")\n",
196
+ " display(web_scraper_df.head())\n",
197
+ "else:\n",
198
+ " print(\"⚠ No data collected - API may be unavailable\")\n",
199
+ " web_scraper_df = pd.DataFrame()"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "markdown",
204
+ "id": "0dadb1ac",
205
+ "metadata": {},
206
+ "source": [
207
+ "## 2. Load Hugging Face Datasets\n",
208
+ "\n",
209
+ "Load pre-uploaded CyberForge datasets from Hugging Face."
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": null,
215
+ "id": "9be784df",
216
+ "metadata": {},
217
+ "outputs": [],
218
+ "source": [
219
+ "from huggingface_hub import hf_hub_download, list_repo_files\n",
220
+ "import os\n",
221
+ "\n",
222
+ "HF_DATASET_REPO = \"Che237/cyberforge-datasets\"\n",
223
+ "\n",
224
+ "def list_available_datasets():\n",
225
+ " \"\"\"List all available datasets in the HF repository\"\"\"\n",
226
+ " try:\n",
227
+ " files = list_repo_files(HF_DATASET_REPO, repo_type=\"dataset\")\n",
228
+ " csv_files = [f for f in files if f.endswith('.csv')]\n",
229
+ " return csv_files\n",
230
+ " except Exception as e:\n",
231
+ " print(f\"⚠ Could not list HF datasets: {e}\")\n",
232
+ " return []\n",
233
+ "\n",
234
+ "def download_dataset(file_path: str, local_dir: Path = DATASETS_DIR) -> Optional[Path]:\n",
235
+ " \"\"\"Download a specific dataset from Hugging Face\"\"\"\n",
236
+ " try:\n",
237
+ " local_path = hf_hub_download(\n",
238
+ " repo_id=HF_DATASET_REPO,\n",
239
+ " filename=file_path,\n",
240
+ " repo_type=\"dataset\",\n",
241
+ " cache_dir=str(local_dir / \"cache\")\n",
242
+ " )\n",
243
+ " return Path(local_path)\n",
244
+ " except Exception as e:\n",
245
+ " print(f\"⚠ Could not download {file_path}: {e}\")\n",
246
+ " return None\n",
247
+ "\n",
248
+ "# List available datasets\n",
249
+ "print(\"Available datasets on Hugging Face:\")\n",
250
+ "available_files = list_available_datasets()\n",
251
+ "for f in available_files[:20]: # Show first 20\n",
252
+ " print(f\" - {f}\")\n",
253
+ "print(f\" ... and {len(available_files) - 20} more\" if len(available_files) > 20 else \"\")"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": null,
259
+ "id": "cc9ef68b",
260
+ "metadata": {},
261
+ "outputs": [],
262
+ "source": [
263
+ "# Priority datasets for training (aligned with backend requirements)\n",
264
+ "PRIORITY_DATASETS = {\n",
265
+ " \"network_intrusion\": \"network_intrusion/network_intrusion_processed.csv\",\n",
266
+ " \"phishing_detection\": \"phishing_detection/phishing_detection_processed.csv\",\n",
267
+ " \"malware_detection\": \"malware_detection/malware_detection_processed.csv\",\n",
268
+ " \"anomaly_detection\": \"anomaly_detection/anomaly_detection_processed.csv\",\n",
269
+ " \"web_attack_detection\": \"web_attack_detection/web_attack_detection_processed.csv\",\n",
270
+ "}\n",
271
+ "\n",
272
+ "loaded_datasets = {}\n",
273
+ "\n",
274
+ "print(\"Downloading priority datasets...\")\n",
275
+ "for name, path in PRIORITY_DATASETS.items():\n",
276
+ " if path in available_files:\n",
277
+ " print(f\" Downloading: {name}\")\n",
278
+ " local_path = download_dataset(path)\n",
279
+ " if local_path:\n",
280
+ " try:\n",
281
+ " df = pd.read_csv(local_path)\n",
282
+ " loaded_datasets[name] = df\n",
283
+ " print(f\" βœ“ {name}: {len(df)} samples, {len(df.columns)} features\")\n",
284
+ " except Exception as e:\n",
285
+ " print(f\" ⚠ Could not load {name}: {e}\")\n",
286
+ " else:\n",
287
+ " print(f\" ⚠ {name}: Not found in repository\")\n",
288
+ "\n",
289
+ "print(f\"\\nβœ“ Loaded {len(loaded_datasets)} datasets\")"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "markdown",
294
+ "id": "3fadcbfc",
295
+ "metadata": {},
296
+ "source": [
297
+ "## 3. Load Local Datasets\n",
298
+ "\n",
299
+ "Load any datasets already present in the local datasets directory."
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": null,
305
+ "id": "40cad5c5",
306
+ "metadata": {},
307
+ "outputs": [],
308
+ "source": [
309
+ "def load_local_datasets(datasets_dir: Path) -> Dict[str, pd.DataFrame]:\n",
310
+ " \"\"\"Load all CSV datasets from local directory\"\"\"\n",
311
+ " datasets = {}\n",
312
+ " \n",
313
+ " for csv_file in datasets_dir.rglob(\"*_processed.csv\"):\n",
314
+ " try:\n",
315
+ " name = csv_file.stem.replace(\"_processed\", \"\")\n",
316
+ " df = pd.read_csv(csv_file)\n",
317
+ " datasets[name] = df\n",
318
+ " print(f\" βœ“ {name}: {len(df)} samples\")\n",
319
+ " except Exception as e:\n",
320
+ " print(f\" ⚠ {csv_file.name}: {e}\")\n",
321
+ " \n",
322
+ " return datasets\n",
323
+ "\n",
324
+ "print(\"Loading local datasets...\")\n",
325
+ "local_datasets = load_local_datasets(DATASETS_DIR)\n",
326
+ "\n",
327
+ "# Merge with HF datasets (datasets already loaded from HF take precedence; local ones only fill in missing names)\n",
328
+ "for name, df in local_datasets.items():\n",
329
+ " if name not in loaded_datasets:\n",
330
+ " loaded_datasets[name] = df\n",
331
+ "\n",
332
+ "print(f\"\\nβœ“ Total datasets available: {len(loaded_datasets)}\")"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "markdown",
337
+ "id": "a3584926",
338
+ "metadata": {},
339
+ "source": [
340
+ "## 4. Data Validation & Quality Checks"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": null,
346
+ "id": "4adefa74",
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "def validate_dataset(name: str, df: pd.DataFrame) -> Dict[str, Any]:\n",
351
+ " \"\"\"Validate dataset quality and return report\"\"\"\n",
352
+ " report = {\n",
353
+ " \"name\": name,\n",
354
+ " \"samples\": len(df),\n",
355
+ " \"features\": len(df.columns),\n",
356
+ " \"missing_values\": df.isnull().sum().sum(),\n",
357
+ " \"missing_pct\": (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100,\n",
358
+ " \"duplicate_rows\": df.duplicated().sum(),\n",
359
+ " \"numeric_columns\": len(df.select_dtypes(include=[np.number]).columns),\n",
360
+ " \"categorical_columns\": len(df.select_dtypes(include=['object', 'category']).columns),\n",
361
+ " \"memory_mb\": df.memory_usage(deep=True).sum() / (1024 * 1024),\n",
362
+ " \"has_label\": any(col in df.columns for col in ['label', 'target', 'class', 'is_malicious', 'attack_type']),\n",
363
+ " \"valid\": True\n",
364
+ " }\n",
365
+ " \n",
366
+ " # Validation checks\n",
367
+ " issues = []\n",
368
+ " if report[\"samples\"] < 100:\n",
369
+ " issues.append(\"Too few samples (<100)\")\n",
370
+ " if report[\"missing_pct\"] > 50:\n",
371
+ " issues.append(\"Too many missing values (>50%)\")\n",
372
+ " if not report[\"has_label\"]:\n",
373
+ " issues.append(\"No label column found\")\n",
374
+ " \n",
375
+ " report[\"issues\"] = issues\n",
376
+ " report[\"valid\"] = len(issues) == 0\n",
377
+ " \n",
378
+ " return report\n",
379
+ "\n",
380
+ "# Validate all datasets\n",
381
+ "validation_reports = []\n",
382
+ "print(\"Validating datasets...\\n\")\n",
383
+ "print(f\"{'Dataset':<30} {'Samples':>10} {'Features':>10} {'Missing %':>10} {'Valid':>8}\")\n",
384
+ "print(\"-\" * 75)\n",
385
+ "\n",
386
+ "for name, df in loaded_datasets.items():\n",
387
+ " report = validate_dataset(name, df)\n",
388
+ " validation_reports.append(report)\n",
389
+ " status = \"βœ“\" if report[\"valid\"] else \"⚠\"\n",
390
+ " print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
391
+ "\n",
392
+ "valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
393
+ "print(f\"\\nβœ“ {len(valid_datasets)} datasets passed validation\")"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "markdown",
398
+ "id": "0603447f",
399
+ "metadata": {},
400
+ "source": [
401
+ "## 5. Data Normalization"
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "code",
406
+ "execution_count": null,
407
+ "id": "6544e100",
408
+ "metadata": {},
409
+ "outputs": [],
410
+ "source": [
411
+ "def normalize_dataset(df: pd.DataFrame, name: str) -> pd.DataFrame:\n",
412
+ " \"\"\"Normalize dataset for consistent processing\"\"\"\n",
413
+ " df = df.copy()\n",
414
+ " \n",
415
+ " # Standardize column names\n",
416
+ " df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')\n",
417
+ " \n",
418
+ " # Find and standardize label column\n",
419
+ " label_columns = ['label', 'target', 'class', 'is_malicious', 'attack_type', 'attack', 'category']\n",
420
+ " for col in label_columns:\n",
421
+ " if col in df.columns:\n",
422
+ " df = df.rename(columns={col: 'label'})\n",
423
+ " break\n",
424
+ " \n",
425
+ " # Handle missing values\n",
426
+ " numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
427
+ " df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())\n",
428
+ " \n",
429
+ " categorical_cols = df.select_dtypes(include=['object', 'category']).columns\n",
430
+ " for col in categorical_cols:\n",
431
+ " if col != 'label':\n",
432
+ " df[col] = df[col].fillna('unknown')\n",
433
+ " \n",
434
+ " # Remove duplicates\n",
435
+ " df = df.drop_duplicates()\n",
436
+ " \n",
437
+ " # Add metadata\n",
438
+ " df.attrs['dataset_name'] = name\n",
439
+ " df.attrs['processed_at'] = datetime.now().isoformat()\n",
440
+ " \n",
441
+ " return df\n",
442
+ "\n",
443
+ "# Normalize all valid datasets\n",
444
+ "normalized_datasets = {}\n",
445
+ "print(\"Normalizing datasets...\")\n",
446
+ "\n",
447
+ "for name in valid_datasets:\n",
448
+ " if name in loaded_datasets:\n",
449
+ " df = normalize_dataset(loaded_datasets[name], name)\n",
450
+ " normalized_datasets[name] = df\n",
451
+ " print(f\" βœ“ {name}: {len(df)} samples after normalization\")\n",
452
+ "\n",
453
+ "print(f\"\\nβœ“ Normalized {len(normalized_datasets)} datasets\")"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "markdown",
458
+ "id": "a9a016d4",
459
+ "metadata": {},
460
+ "source": [
461
+ "## 6. Save Processed Data"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": null,
467
+ "id": "91248c3b",
468
+ "metadata": {},
469
+ "outputs": [],
470
+ "source": [
471
+ "# Create processed data directory\n",
472
+ "PROCESSED_DIR = DATASETS_DIR / \"processed\"\n",
473
+ "PROCESSED_DIR.mkdir(exist_ok=True)\n",
474
+ "\n",
475
+ "# Save each normalized dataset\n",
476
+ "print(\"Saving processed datasets...\")\n",
477
+ "dataset_manifest = []\n",
478
+ "\n",
479
+ "for name, df in normalized_datasets.items():\n",
480
+ " output_path = PROCESSED_DIR / f\"{name}_ready.csv\"\n",
481
+ " df.to_csv(output_path, index=False)\n",
482
+ " \n",
483
+ " manifest_entry = {\n",
484
+ " \"name\": name,\n",
485
+ " \"path\": str(output_path.relative_to(DATASETS_DIR.parent)),\n",
486
+ " \"samples\": len(df),\n",
487
+ " \"features\": len(df.columns),\n",
488
+ " \"has_label\": \"label\" in df.columns,\n",
489
+ " \"processed_at\": datetime.now().isoformat()\n",
490
+ " }\n",
491
+ " dataset_manifest.append(manifest_entry)\n",
492
+ " print(f\" βœ“ Saved: {output_path.name}\")\n",
493
+ "\n",
494
+ "# Save manifest\n",
495
+ "manifest_path = PROCESSED_DIR / \"manifest.json\"\n",
496
+ "with open(manifest_path, \"w\") as f:\n",
497
+ " json.dump(dataset_manifest, f, indent=2)\n",
498
+ "\n",
499
+ "print(f\"\\nβœ“ Dataset manifest saved to: {manifest_path}\")"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "markdown",
504
+ "id": "40ba332c",
505
+ "metadata": {},
506
+ "source": [
507
+ "## 7. Summary"
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "code",
512
+ "execution_count": null,
513
+ "id": "3ef2f995",
514
+ "metadata": {},
515
+ "outputs": [],
516
+ "source": [
517
+ "print(\"\\n\" + \"=\" * 60)\n",
518
+ "print(\"DATA ACQUISITION COMPLETE\")\n",
519
+ "print(\"=\" * 60)\n",
520
+ "\n",
521
+ "total_samples = sum(len(df) for df in normalized_datasets.values())\n",
522
+ "\n",
523
+ "print(f\"\"\"\n",
524
+ "πŸ“Š Data Collection Summary:\n",
525
+ " - Datasets processed: {len(normalized_datasets)}\n",
526
+ " - Total samples: {total_samples:,}\n",
527
+ " - Output directory: {PROCESSED_DIR}\n",
528
+ "\n",
529
+ "πŸ“ Datasets Ready for Feature Engineering:\"\"\")\n",
530
+ "\n",
531
+ "for entry in dataset_manifest:\n",
532
+ " print(f\" βœ“ {entry['name']}: {entry['samples']:,} samples\")\n",
533
+ "\n",
534
+ "print(f\"\"\"\n",
535
+ "Next step:\n",
536
+ " β†’ 02_feature_engineering.ipynb\n",
537
+ "\"\"\")\n",
538
+ "print(\"=\" * 60)"
539
+ ]
540
+ }
541
+ ],
542
+ "metadata": {
543
+ "language_info": {
544
+ "name": "python"
545
+ }
546
+ },
547
+ "nbformat": 4,
548
+ "nbformat_minor": 5
549
+ }