Che237 committed on
Commit
f808c77
·
verified ·
1 Parent(s): 02911db

Add 00_environment_setup.ipynb

Browse files
Files changed (1) hide show
  1. notebooks/00_environment_setup.ipynb +471 -0
notebooks/00_environment_setup.ipynb ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "17e1f6f6",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 00 - Environment Setup\n",
9
+ "\n",
10
+ "## CyberForge AI - ML Pipeline Environment Configuration\n",
11
+ "\n",
12
+ "This notebook sets up the complete environment for the CyberForge AI machine learning pipeline.\n",
13
+ "\n",
14
+ "### What this notebook does:\n",
15
+ "1. Validates Python version and system requirements\n",
16
+ "2. Installs and pins all dependencies\n",
17
+ "3. Configures GPU/CPU detection\n",
18
+ "4. Sets up Gemini API connectivity\n",
19
+ "5. Validates Web Scraper API connection\n",
20
+ "6. Creates necessary directories\n",
21
+ "\n",
22
+ "### Prerequisites:\n",
23
+ "- Python 3.10+ (3.11 recommended)\n",
24
+ "- Access to Gemini API (API key required)\n",
25
+ "- Access to WebScrapper.live API"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "markdown",
30
+ "id": "33029fa4",
31
+ "metadata": {},
32
+ "source": [
33
+ "## 1. System Validation"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": null,
39
+ "id": "076fa991",
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "import sys\n",
44
+ "import platform\n",
45
+ "import os\n",
46
+ "from pathlib import Path\n",
47
+ "\n",
48
+ "print(\"=\" * 60)\n",
49
+ "print(\"CYBERFORGE AI - ENVIRONMENT VALIDATION\")\n",
50
+ "print(\"=\" * 60)\n",
51
+ "\n",
52
+ "# Python version check\n",
53
+ "python_version = sys.version_info\n",
54
+ "print(f\"\\n✓ Python Version: {python_version.major}.{python_version.minor}.{python_version.micro}\")\n",
55
+ "\n",
56
+ "if python_version.major < 3 or (python_version.major == 3 and python_version.minor < 10):\n",
57
+ " raise EnvironmentError(\"Python 3.10+ is required. Please upgrade your Python installation.\")\n",
58
+ "\n",
59
+ "# System info\n",
60
+ "print(f\"✓ Platform: {platform.system()} {platform.release()}\")\n",
61
+ "print(f\"✓ Architecture: {platform.machine()}\")\n",
62
+ "print(f\"✓ Processor: {platform.processor() or 'Unknown'}\")\n",
63
+ "\n",
64
+ "# Memory info\n",
65
+ "try:\n",
66
+ " import psutil\n",
67
+ " memory = psutil.virtual_memory()\n",
68
+ " print(f\"✓ Available Memory: {memory.available / (1024**3):.2f} GB / {memory.total / (1024**3):.2f} GB\")\n",
69
+ "except ImportError:\n",
70
+ " print(\"⚠ psutil not installed - memory check skipped\")\n",
71
+ "\n",
72
+ "print(\"\\n\" + \"=\" * 60)"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "markdown",
77
+ "id": "45e95831",
78
+ "metadata": {},
79
+ "source": [
80
+ "## 2. Install Dependencies"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": null,
86
+ "id": "faa9b079",
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "# Core dependencies with minimum-version constraints (lower bounds, not exact pins)\n",
91
+ "DEPENDENCIES = \"\"\"\n",
92
+ "# Core ML/AI\n",
93
+ "numpy>=1.24.0,<2.0.0\n",
94
+ "pandas>=2.0.0\n",
95
+ "scikit-learn>=1.3.0\n",
96
+ "scipy>=1.11.0\n",
97
+ "\n",
98
+ "# Deep Learning\n",
99
+ "torch>=2.0.0\n",
100
+ "transformers>=4.30.0\n",
101
+ "\n",
102
+ "# Gemini API\n",
103
+ "google-generativeai>=0.3.0\n",
104
+ "\n",
105
+ "# Data Processing\n",
106
+ "joblib>=1.3.0\n",
107
+ "tqdm>=4.65.0\n",
108
+ "\n",
109
+ "# Feature Engineering\n",
110
+ "tldextract>=5.0.0\n",
111
+ "validators>=0.22.0\n",
112
+ "ipaddress>=1.0.23\n",
113
+ "\n",
114
+ "# Web/API\n",
115
+ "httpx>=0.25.0\n",
116
+ "aiohttp>=3.8.0\n",
117
+ "requests>=2.31.0\n",
118
+ "\n",
119
+ "# Hugging Face\n",
120
+ "huggingface_hub>=0.19.0\n",
121
+ "\n",
122
+ "# Utilities\n",
123
+ "python-dotenv>=1.0.0\n",
124
+ "pyyaml>=6.0.0\n",
125
+ "psutil>=5.9.0\n",
126
+ "\"\"\"\n",
127
+ "\n",
128
+ "# Write requirements file\n",
129
+ "requirements_path = Path(\"../requirements_notebooks.txt\")\n",
130
+ "requirements_path.write_text(DEPENDENCIES.strip())\n",
131
+ "print(f\"✓ Requirements written to: {requirements_path.absolute()}\")"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "id": "7dc8c6ca",
138
+ "metadata": {},
139
+ "outputs": [],
140
+ "source": [
141
+ "# Install dependencies into the interpreter that runs this kernel (sys.executable)\n",
142
+ "import subprocess\n",
143
+ "\n",
144
+ "print(\"Installing dependencies... This may take a few minutes.\")\n",
145
+ "result = subprocess.run(\n",
146
+ "    [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-r\", str(requirements_path)],\n",
147
+ "    capture_output=True,\n",
148
+ "    text=True\n",
149
+ ")\n",
150
+ "# A non-zero exit code is a failure, not a warning; pip prints its error summary at the end of stderr.\n",
151
+ "if result.returncode == 0:\n",
152
+ "    print(\"✓ All dependencies installed successfully!\")\n",
153
+ "else:\n",
154
+ "    print(f\"⚠ Dependency installation FAILED (exit code {result.returncode}): {result.stderr[-500:] if result.stderr else 'no error output'}\")"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "markdown",
159
+ "id": "c11760cc",
160
+ "metadata": {},
161
+ "source": [
162
+ "## 3. GPU/CPU Detection"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "id": "d1b948c4",
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "import torch\n",
173
+ "\n",
174
+ "print(\"=\" * 60)\n",
175
+ "print(\"COMPUTE DEVICE DETECTION\")\n",
176
+ "print(\"=\" * 60)\n",
177
+ "\n",
178
+ "# Probe accelerators up front; preference order is CUDA, then Apple MPS, then CPU\n",
179
+ "cuda_available = torch.cuda.is_available()\n",
180
+ "mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()\n",
181
+ "print(f\"\\n✓ PyTorch Version: {torch.__version__}\")\n",
182
+ "print(f\"✓ CUDA Available: {cuda_available}\")\n",
183
+ "\n",
184
+ "if cuda_available:\n",
185
+ "    print(f\"✓ CUDA Version: {torch.version.cuda}\")\n",
186
+ "    print(f\"✓ GPU Count: {torch.cuda.device_count()}\")\n",
187
+ "    for i in range(torch.cuda.device_count()):\n",
188
+ "        props = torch.cuda.get_device_properties(i)\n",
189
+ "        print(f\" - GPU {i}: {props.name} ({props.total_memory / (1024**3):.2f} GB)\")\n",
190
+ "    DEVICE = torch.device(\"cuda\")\n",
191
+ "elif mps_available:\n",
192
+ "    # Apple Silicon: report MPS instead of first claiming that no GPU was detected\n",
193
+ "    print(\"✓ Apple MPS (Metal) available\")\n",
194
+ "    DEVICE = torch.device(\"mps\")\n",
195
+ "else:\n",
196
+ "    print(\"⚠ No GPU detected - using CPU for training\")\n",
197
+ "    DEVICE = torch.device(\"cpu\")\n",
198
+ "\n",
199
+ "print(f\"\\n✓ Selected Device: {DEVICE}\")\n",
200
+ "print(\"=\" * 60)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "id": "d39ddbf5",
206
+ "metadata": {},
207
+ "source": [
208
+ "## 4. Environment Variables & API Configuration"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": null,
214
+ "id": "0f63a5ce",
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "from dotenv import load_dotenv\n",
219
+ "import os\n",
220
+ "\n",
221
+ "# Load environment variables from .env file\n",
222
+ "env_path = Path(\"../.env\")\n",
223
+ "if env_path.exists():\n",
224
+ "    load_dotenv(env_path)\n",
225
+ "    print(f\"✓ Loaded environment from: {env_path.absolute()}\")\n",
226
+ "else:\n",
227
+ "    print(f\"⚠ No .env file found at {env_path.absolute()}\")\n",
228
+ "\n",
229
+ "# Configuration class: every secret is read from the environment / .env, never stored in the notebook\n",
230
+ "class Config:\n",
231
+ "    # API Keys (an empty string means \"not configured\")\n",
232
+ "    GEMINI_API_KEY = os.getenv(\"GEMINI_API_KEY\", \"\")\n",
233
+ "    HUGGINGFACE_TOKEN = os.getenv(\"HUGGINGFACE_API_TOKEN\", \"\")\n",
234
+ "    WEBSCRAPER_API_KEY = os.getenv(\"WEBSCRAPER_API_KEY\", \"\")  # WebScrapper.live API\n",
235
+ "    WEBSCRAPER_API_URL = \"http://webscrapper.live/api/scrape\"  # NOTE(review): plain HTTP exposes the API key in transit; prefer https if the service supports it\n",
236
+ "\n",
237
+ "    # Paths (resolved relative to the notebook's parent directory)\n",
238
+ "    BASE_DIR = Path(\"..\").resolve()\n",
239
+ "    DATASETS_DIR = BASE_DIR / \"datasets\"\n",
240
+ "    MODELS_DIR = BASE_DIR / \"models\"\n",
241
+ "    ARTIFACTS_DIR = BASE_DIR / \"artifacts\"\n",
242
+ "\n",
243
+ "    # ML Settings\n",
244
+ "    RANDOM_STATE = 42\n",
245
+ "    TEST_SIZE = 0.2\n",
246
+ "    CV_FOLDS = 5\n",
247
+ "\n",
248
+ "    # Device (module-level DEVICE set by the GPU/CPU detection cell)\n",
249
+ "    DEVICE = DEVICE\n",
250
+ "\n",
251
+ "config = Config()\n",
252
+ "\n",
253
+ "# Report which API keys are present (values are never printed)\n",
254
+ "print(\"\\n\" + \"=\" * 60)\n",
255
+ "print(\"API CONFIGURATION STATUS\")\n",
256
+ "print(\"=\" * 60)\n",
257
+ "print(f\"✓ Gemini API Key: {'Configured' if config.GEMINI_API_KEY else '⚠ NOT SET'}\")\n",
258
+ "print(f\"✓ HuggingFace Token: {'Configured' if config.HUGGINGFACE_TOKEN else '⚠ NOT SET'}\")\n",
259
+ "print(f\"✓ WebScraper API: {'Configured' if config.WEBSCRAPER_API_KEY else '⚠ NOT SET'}\")"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "markdown",
264
+ "id": "126b5f7f",
265
+ "metadata": {},
266
+ "source": [
267
+ "## 5. Gemini API Connectivity Test"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": null,
273
+ "id": "14cef3bc",
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "import google.generativeai as genai\n",
278
+ "\n",
279
+ "def test_gemini_connection():\n",
280
+ "    \"\"\"Send one minimal prompt to Gemini; return (success, reply text or error message).\"\"\"\n",
281
+ "    if not config.GEMINI_API_KEY:\n",
282
+ "        return False, \"API key not configured\"\n",
283
+ "\n",
284
+ "    try:\n",
285
+ "        genai.configure(api_key=config.GEMINI_API_KEY)\n",
286
+ "        model = genai.GenerativeModel('gemini-1.5-flash')\n",
287
+ "        response = model.generate_content(\"Respond with only: OK\")\n",
288
+ "        return True, response.text.strip()\n",
289
+ "    except Exception as e:  # connectivity probe: report the failure instead of raising\n",
290
+ "        return False, str(e)\n",
291
+ "\n",
292
+ "print(\"Testing Gemini API connection...\")\n",
293
+ "success, message = test_gemini_connection()\n",
294
+ "\n",
295
+ "if success:\n",
296
+ "    print(f\"✓ Gemini API: Connected successfully\")\n",
297
+ "else:\n",
298
+ "    print(f\"⚠ Gemini API: Connection failed - {message}\")"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "markdown",
303
+ "id": "628ac121",
304
+ "metadata": {},
305
+ "source": [
306
+ "## 6. Web Scraper API Connectivity Test"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "id": "beb1b036",
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": [
316
+ "import httpx\n",
317
+ "\n",
318
+ "async def test_webscraper_connection():\n",
319
+ "    \"\"\"POST one scrape request to WebScrapper.live; return (success, status message).\"\"\"\n",
320
+ "    try:\n",
321
+ "        async with httpx.AsyncClient(timeout=30.0) as client:\n",
322
+ "            response = await client.post(\n",
323
+ "                config.WEBSCRAPER_API_URL,\n",
324
+ "                json={\"url\": \"https://example.com\"},\n",
325
+ "                headers={\n",
326
+ "                    \"Content-Type\": \"application/json\",\n",
327
+ "                    \"X-API-Key\": config.WEBSCRAPER_API_KEY\n",
328
+ "                }\n",
329
+ "            )\n",
330
+ "            if response.status_code == 200:\n",
331
+ "                return True, \"Connected\"\n",
332
+ "            else:\n",
333
+ "                return False, f\"Status {response.status_code}\"\n",
334
+ "    except Exception as e:  # connectivity probe: report the failure instead of raising\n",
335
+ "        return False, str(e)\n",
336
+ "\n",
337
+ "print(\"Testing Web Scraper API connection...\")\n",
338
+ "\n",
339
+ "# Run async test\n",
340
+ "# A Jupyter kernel already runs an asyncio event loop, so the usual\n",
341
+ "# loop.run_until_complete(...) call raises\n",
342
+ "# \"RuntimeError: This event loop is already running\" inside a cell.\n",
343
+ "# IPython's autoawait lets the cell await the coroutine directly on the\n",
344
+ "# kernel's own loop. Outside IPython, use\n",
345
+ "# asyncio.run(test_webscraper_connection()) instead.\n",
346
+ "\n",
347
+ "success, message = await test_webscraper_connection()\n",
348
+ "\n",
349
+ "if success:\n",
350
+ "    print(f\"✓ WebScraper API: Connected successfully\")\n",
351
+ "else:\n",
352
+ "    print(f\"⚠ WebScraper API: {message}\")"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "markdown",
357
+ "id": "75ee0f51",
358
+ "metadata": {},
359
+ "source": [
360
+ "## 7. Create Directory Structure"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": null,
366
+ "id": "776236f8",
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "# Create necessary directories\n",
371
+ "directories = [\n",
372
+ " config.DATASETS_DIR,\n",
373
+ " config.MODELS_DIR,\n",
374
+ " config.ARTIFACTS_DIR,\n",
375
+ " config.BASE_DIR / \"logs\",\n",
376
+ " config.BASE_DIR / \"cache\",\n",
377
+ "]\n",
378
+ "\n",
379
+ "print(\"Creating directory structure...\")\n",
380
+ "for directory in directories:\n",
381
+ " directory.mkdir(parents=True, exist_ok=True)\n",
382
+ " print(f\" ✓ {directory}\")\n",
383
+ "\n",
384
+ "print(\"\\n✓ Directory structure ready!\")"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "markdown",
389
+ "id": "a6fe27eb",
390
+ "metadata": {},
391
+ "source": [
392
+ "## 8. Save Configuration for Other Notebooks"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "code",
397
+ "execution_count": null,
398
+ "id": "6b854bac",
399
+ "metadata": {},
400
+ "outputs": [],
401
+ "source": [
402
+ "import json\n",
403
+ "from datetime import datetime\n",
404
+ "# Export configuration for other notebooks (presence flags only; no API key values are written)\n",
405
+ "notebook_config = {\n",
406
+ "    \"device\": str(DEVICE),\n",
407
+ "    \"python_version\": f\"{python_version.major}.{python_version.minor}.{python_version.micro}\",\n",
408
+ "    \"torch_version\": torch.__version__,\n",
409
+ "    \"cuda_available\": cuda_available,\n",
410
+ "    \"base_dir\": str(config.BASE_DIR),\n",
411
+ "    \"datasets_dir\": str(config.DATASETS_DIR),\n",
412
+ "    \"models_dir\": str(config.MODELS_DIR),\n",
413
+ "    \"artifacts_dir\": str(config.ARTIFACTS_DIR),\n",
414
+ "    \"random_state\": config.RANDOM_STATE,\n",
415
+ "    \"test_size\": config.TEST_SIZE,\n",
416
+ "    \"cv_folds\": config.CV_FOLDS,\n",
417
+ "    \"gemini_configured\": bool(config.GEMINI_API_KEY),\n",
418
+ "    \"huggingface_configured\": bool(config.HUGGINGFACE_TOKEN),\n",
419
+ "    \"created_at\": str(datetime.now())  # stdlib clock: pandas (pd) is never imported in this notebook\n",
420
+ "}\n",
421
+ "\n",
422
+ "config_path = config.BASE_DIR / \"notebook_config.json\"\n",
423
+ "with open(config_path, \"w\") as f:\n",
424
+ "    json.dump(notebook_config, f, indent=2)\n",
425
+ "\n",
426
+ "print(f\"✓ Configuration saved to: {config_path}\")\n",
427
+ "print(\"\\n\" + json.dumps(notebook_config, indent=2))"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "markdown",
432
+ "id": "ac7ada25",
433
+ "metadata": {},
434
+ "source": [
435
+ "## 9. Environment Summary"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": null,
441
+ "id": "f409be56",
442
+ "metadata": {},
443
+ "outputs": [],
444
+ "source": [
445
+ "print(\"\\n\" + \"=\" * 60)\n",
446
+ "print(\"ENVIRONMENT SETUP COMPLETE\")\n",
447
+ "print(\"=\" * 60)\n",
448
+ "print(f\"\"\"\n",
449
+ "✅ Python: {python_version.major}.{python_version.minor}.{python_version.micro}\n",
450
+ "✅ Device: {DEVICE}\n",
451
+ "✅ PyTorch: {torch.__version__}\n",
452
+ "✅ Gemini API: {'Ready' if config.GEMINI_API_KEY else 'Not configured'}\n",
453
+ "✅ HuggingFace: {'Ready' if config.HUGGINGFACE_TOKEN else 'Not configured'}\n",
454
+ "✅ WebScraper API: Ready\n",
455
+ "✅ Directories: Created\n",
456
+ "\n",
457
+ "You can now proceed to the next notebook:\n",
458
+ " → 01_data_acquisition.ipynb\n",
459
+ "\"\"\")\n",
460
+ "print(\"=\" * 60)"
461
+ ]
462
+ }
463
+ ],
464
+ "metadata": {
465
+ "language_info": {
466
+ "name": "python"
467
+ }
468
+ },
469
+ "nbformat": 4,
470
+ "nbformat_minor": 5
471
+ }