Che237 committed on
Commit
0a285a3
·
verified ·
1 Parent(s): c1c0b2d

Delete notebooks/00_environment_setup.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. notebooks/00_environment_setup.ipynb +0 -495
notebooks/00_environment_setup.ipynb DELETED
@@ -1,495 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "17e1f6f6",
6
- "metadata": {},
7
- "source": [
8
- "# 00 - Environment Setup\n",
9
- "\n",
10
- "## CyberForge AI - ML Pipeline Environment Configuration\n",
11
- "\n",
12
- "This notebook sets up the complete environment for the CyberForge AI machine learning pipeline.\n",
13
- "\n",
14
- "### What this notebook does:\n",
15
- "1. Validates Python version and system requirements\n",
16
- "2. Installs and pins all dependencies\n",
17
- "3. Configures GPU/CPU detection\n",
18
- "4. Sets up Gemini API connectivity\n",
19
- "5. Validates Web Scraper API connection\n",
20
- "6. Creates necessary directories\n",
21
- "\n",
22
- "### Prerequisites:\n",
23
- "- Python 3.10+ (3.11 recommended)\n",
24
- "- Access to Gemini API (API key required)\n",
25
- "- Access to WebScrapper.live API"
26
- ]
27
- },
28
- {
29
- "cell_type": "markdown",
30
- "id": "33029fa4",
31
- "metadata": {},
32
- "source": [
33
- "## 1. System Validation"
34
- ]
35
- },
36
- {
37
- "cell_type": "code",
38
- "execution_count": null,
39
- "id": "076fa991",
40
- "metadata": {},
41
- "outputs": [],
42
- "source": [
43
- "import sys\n",
44
- "import platform\n",
45
- "import os\n",
46
- "from pathlib import Path\n",
47
- "\n",
48
- "print(\"=\" * 60)\n",
49
- "print(\"CYBERFORGE AI - ENVIRONMENT VALIDATION\")\n",
50
- "print(\"=\" * 60)\n",
51
- "\n",
52
- "# Python version check\n",
53
- "python_version = sys.version_info\n",
54
- "print(f\"\\n✓ Python Version: {python_version.major}.{python_version.minor}.{python_version.micro}\")\n",
55
- "\n",
56
- "if python_version.major < 3 or (python_version.major == 3 and python_version.minor < 10):\n",
57
- " raise EnvironmentError(\"Python 3.10+ is required. Please upgrade your Python installation.\")\n",
58
- "\n",
59
- "# System info\n",
60
- "print(f\"✓ Platform: {platform.system()} {platform.release()}\")\n",
61
- "print(f\"✓ Architecture: {platform.machine()}\")\n",
62
- "print(f\"✓ Processor: {platform.processor() or 'Unknown'}\")\n",
63
- "\n",
64
- "# Memory info\n",
65
- "try:\n",
66
- " import psutil\n",
67
- " memory = psutil.virtual_memory()\n",
68
- " print(f\"✓ Available Memory: {memory.available / (1024**3):.2f} GB / {memory.total / (1024**3):.2f} GB\")\n",
69
- "except ImportError:\n",
70
- " print(\"⚠ psutil not installed - memory check skipped\")\n",
71
- "\n",
72
- "print(\"\\n\" + \"=\" * 60)"
73
- ]
74
- },
75
- {
76
- "cell_type": "markdown",
77
- "id": "45e95831",
78
- "metadata": {},
79
- "source": [
80
- "## 2. Install Dependencies"
81
- ]
82
- },
83
- {
84
- "cell_type": "code",
85
- "execution_count": null,
86
- "id": "faa9b079",
87
- "metadata": {},
88
- "outputs": [],
89
- "source": [
90
# Cell: write the pinned dependency list to a requirements file so every
# notebook (and CI) installs the same versions.
from pathlib import Path

# FIX: removed the former `ipaddress>=1.0.23` pin — that PyPI package is a
# Python 2 backport; `ipaddress` has been in the standard library since 3.3
# and this notebook already requires Python 3.10+, so installing the backport
# is at best redundant and can shadow the stdlib module.
DEPENDENCIES = """
# Core ML/AI
numpy>=1.24.0,<2.0.0
pandas>=2.0.0
scikit-learn>=1.3.0
scipy>=1.11.0

# Deep Learning
torch>=2.0.0
transformers>=4.30.0

# Gemini API
google-generativeai>=0.3.0

# Data Processing
joblib>=1.3.0
tqdm>=4.65.0

# Feature Engineering
tldextract>=5.0.0
validators>=0.22.0

# Web/API
httpx>=0.25.0
aiohttp>=3.8.0
requests>=2.31.0

# Hugging Face
huggingface_hub>=0.19.0

# Utilities
python-dotenv>=1.0.0
pyyaml>=6.0.0
psutil>=5.9.0
"""

# Persist one level above the notebooks/ directory so sibling notebooks can reuse it.
requirements_path = Path("../requirements_notebooks.txt")
requirements_path.write_text(DEPENDENCIES.strip())
print(f"✓ Requirements written to: {requirements_path.absolute()}")
132
- ]
133
- },
134
- {
135
- "cell_type": "code",
136
- "execution_count": null,
137
- "id": "7dc8c6ca",
138
- "metadata": {},
139
- "outputs": [],
140
- "source": [
141
# Cell: install the pinned dependencies into the *current kernel's* environment
# (sys.executable -m pip, not a bare `pip`, so the right interpreter is targeted).
import sys
import subprocess
from pathlib import Path

# Same path as written by the previous cell; redefined here so this cell is
# idempotent and survives Restart-&-Run-All even if run independently.
requirements_path = Path("../requirements_notebooks.txt")

print("Installing dependencies... This may take a few minutes.")
result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "-r", str(requirements_path)],
    capture_output=True,
    text=True,
)

if result.returncode == 0:
    print("✓ All dependencies installed successfully!")
else:
    # FIX: a nonzero exit code is a real failure, not "installation warnings".
    # Report it as such and show the *tail* of stderr, where pip prints the
    # actual error (the old code truncated to the first 500 chars, which is
    # usually just deprecation noise).
    print(f"✗ pip failed with exit code {result.returncode}")
    print(result.stderr[-2000:] if result.stderr else "(no stderr output)")
155
- ]
156
- },
157
- {
158
- "cell_type": "markdown",
159
- "id": "c11760cc",
160
- "metadata": {},
161
- "source": [
162
- "## 3. GPU/CPU Detection"
163
- ]
164
- },
165
- {
166
- "cell_type": "code",
167
- "execution_count": null,
168
- "id": "d1b948c4",
169
- "metadata": {},
170
- "outputs": [],
171
- "source": [
172
# Cell: pick the best available compute device. Preference order as selected
# below: CUDA GPU, else CPU; Apple MPS overrides when present (in practice MPS
# and CUDA never coexist on one machine).
import torch

divider = "=" * 60
print(divider)
print("COMPUTE DEVICE DETECTION")
print(divider)

cuda_available = torch.cuda.is_available()
print(f"\n✓ PyTorch Version: {torch.__version__}")
print(f"✓ CUDA Available: {cuda_available}")

if not cuda_available:
    print("⚠ No GPU detected - using CPU for training")
    DEVICE = torch.device("cpu")
else:
    print(f"✓ CUDA Version: {torch.version.cuda}")
    print(f"✓ GPU Count: {torch.cuda.device_count()}")
    for gpu_index in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(gpu_index)
        print(f"  - GPU {gpu_index}: {props.name} ({props.total_memory / (1024**3):.2f} GB)")
    DEVICE = torch.device("cuda")

# Apple Silicon (Metal Performance Shaders) — guarded with hasattr for older torch builds.
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    print("✓ Apple MPS (Metal) available")
    DEVICE = torch.device("mps")

print(f"\n✓ Selected Device: {DEVICE}")
print(divider)
201
- ]
202
- },
203
- {
204
- "cell_type": "markdown",
205
- "id": "d39ddbf5",
206
- "metadata": {},
207
- "source": [
208
- "## 4. Environment Variables & API Configuration"
209
- ]
210
- },
211
- {
212
- "cell_type": "code",
213
- "execution_count": null,
214
- "id": "0f63a5ce",
215
- "metadata": {},
216
- "outputs": [],
217
- "source": [
218
# Cell: load runtime configuration (secrets, model name, paths, ML settings).
# Secret priority: notebook_config.json > environment variables. NEVER a
# hardcoded default.
import json
import os
from pathlib import Path

# Priority 1: notebook_config.json (used on HF Spaces deployments).
config_json_path = Path("notebook_config.json")
if config_json_path.exists():
    with open(config_json_path, "r") as f:
        loaded_config = json.load(f)
    print(f"✓ Loaded configuration from: {config_json_path.absolute()}")
else:
    loaded_config = {}
    print(f"⚠ No notebook_config.json found, using defaults")

# Priority 2: a local .env file (local-development fallback; dotenv is optional).
try:
    from dotenv import load_dotenv
    env_path = Path("../.env")
    if env_path.exists():
        load_dotenv(env_path)
        print(f"✓ Loaded environment from: {env_path.absolute()}")
except ImportError:
    pass  # python-dotenv not installed — rely on already-set env vars

class Config:
    """Central configuration shared by all notebooks in this pipeline.

    SECURITY FIX: the previous revision embedded live Gemini and WebScraper
    API keys as source-code defaults. Any key that was committed must be
    treated as compromised and rotated. Secrets are now read exclusively
    from notebook_config.json or environment variables; an empty string
    means "not configured".
    """
    # API keys — config file wins, then environment. No hardcoded fallbacks.
    GEMINI_API_KEY = loaded_config.get("gemini_api_key") or os.getenv("GEMINI_API_KEY", "")
    HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN", os.getenv("HF_TOKEN", ""))
    WEBSCRAPER_API_KEY = loaded_config.get("webscraper_api_key") or os.getenv("WEBSCRAPER_API_KEY", "")
    WEBSCRAPER_API_URL = loaded_config.get("webscraper_api_url", "http://webscrapper.live/api/scrape")

    # Gemini model name (overridable via config file).
    GEMINI_MODEL = loaded_config.get("gemini_model", "gemini-2.5-flash")

    # Project directory layout, anchored one level above notebooks/.
    BASE_DIR = Path("..").resolve()
    DATASETS_DIR = BASE_DIR / "datasets"
    MODELS_DIR = BASE_DIR / "models"
    ARTIFACTS_DIR = BASE_DIR / "artifacts"

    # ML settings with sane defaults.
    RANDOM_STATE = loaded_config.get("random_state", 42)
    TEST_SIZE = loaded_config.get("test_size", 0.2)
    CV_FOLDS = loaded_config.get("cv_folds", 5)

    # Device chosen by the detection cell (class body resolves the
    # notebook-global DEVICE at definition time).
    DEVICE = DEVICE

config = Config()

# Report status WITHOUT echoing any part of a secret (the old code printed the
# first 10 characters of the Gemini key — enough to aid an attacker).
print("\n" + "=" * 60)
print("API CONFIGURATION STATUS")
print("=" * 60)
print(f"✓ Gemini API Key: {'Configured' if config.GEMINI_API_KEY else '⚠ NOT SET'}")
print(f"✓ Gemini Model: {config.GEMINI_MODEL}")
print(f"✓ HuggingFace Token: {'Configured' if config.HUGGINGFACE_TOKEN else '⚠ NOT SET (optional)'}")
print(f"✓ WebScraper API Key: {'Configured' if config.WEBSCRAPER_API_KEY else '⚠ NOT SET'}")
277
- ]
278
- },
279
- {
280
- "cell_type": "markdown",
281
- "id": "126b5f7f",
282
- "metadata": {},
283
- "source": [
284
- "## 5. Gemini API Connectivity Test"
285
- ]
286
- },
287
- {
288
- "cell_type": "code",
289
- "execution_count": null,
290
- "id": "14cef3bc",
291
- "metadata": {},
292
- "outputs": [],
293
- "source": [
294
# Cell: probe Gemini API connectivity with the configured model, falling back
# to gemini-1.5-flash if the primary model is unavailable.
import google.generativeai as genai

def test_gemini_connection():
    """Attempt a minimal generation request; return (success, detail_message)."""
    if not config.GEMINI_API_KEY:
        return False, "API key not configured"

    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        # Primary attempt with the model named in the configuration.
        model = genai.GenerativeModel(config.GEMINI_MODEL)
        response = model.generate_content("Respond with only: OK")
        return True, f"Model: {config.GEMINI_MODEL}, Response: {response.text.strip()}"
    except Exception:
        # Primary model failed (e.g. not available on this key) — one retry
        # against the widely-available 1.5 flash model.
        try:
            model = genai.GenerativeModel('gemini-1.5-flash')
            response = model.generate_content("Respond with only: OK")
            return True, f"Model: gemini-1.5-flash (fallback), Response: {response.text.strip()}"
        except Exception as fallback_error:
            return False, str(fallback_error)

print("Testing Gemini API connection...")
success, message = test_gemini_connection()

if success:
    print(f"✓ Gemini API: {message}")
else:
    print(f"⚠ Gemini API: Connection failed - {message}")
323
- ]
324
- },
325
- {
326
- "cell_type": "markdown",
327
- "id": "628ac121",
328
- "metadata": {},
329
- "source": [
330
- "## 6. Web Scraper API Connectivity Test"
331
- ]
332
- },
333
- {
334
- "cell_type": "code",
335
- "execution_count": null,
336
- "id": "beb1b036",
337
- "metadata": {},
338
- "outputs": [],
339
- "source": [
340
# Cell: verify WebScrapper.live API connectivity.
import asyncio
import concurrent.futures

import httpx

async def test_webscraper_connection():
    """POST a probe request to the scraper API; return (success, detail)."""
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                config.WEBSCRAPER_API_URL,
                json={"url": "https://example.com"},
                headers={
                    "Content-Type": "application/json",
                    "X-API-Key": config.WEBSCRAPER_API_KEY,
                },
            )
        if response.status_code == 200:
            return True, "Connected"
        return False, f"Status {response.status_code}"
    except Exception as exc:
        return False, str(exc)

def run_coroutine(coro):
    """Run `coro` to completion, including inside Jupyter.

    FIX: the original used asyncio.get_event_loop() + run_until_complete,
    which raises "RuntimeError: This event loop is already running" in a
    Jupyter kernel (the kernel keeps its own loop running), and
    get_event_loop() is deprecated since Python 3.10 when no loop exists.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread (plain script) — safe to use asyncio.run.
        return asyncio.run(coro)
    # A loop is already running (typical Jupyter kernel): execute the coroutine
    # in its own event loop on a helper thread and block until it finishes.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()

print("Testing Web Scraper API connection...")
success, message = run_coroutine(test_webscraper_connection())

if success:
    print(f"✓ WebScraper API: Connected successfully")
else:
    print(f"⚠ WebScraper API: {message}")
377
- ]
378
- },
379
- {
380
- "cell_type": "markdown",
381
- "id": "75ee0f51",
382
- "metadata": {},
383
- "source": [
384
- "## 7. Create Directory Structure"
385
- ]
386
- },
387
- {
388
- "cell_type": "code",
389
- "execution_count": null,
390
- "id": "776236f8",
391
- "metadata": {},
392
- "outputs": [],
393
- "source": [
394
# Cell: ensure the directory layout used by downstream notebooks exists.
# mkdir(parents=True, exist_ok=True) makes this idempotent across re-runs.
required_dirs = [
    config.DATASETS_DIR,
    config.MODELS_DIR,
    config.ARTIFACTS_DIR,
    config.BASE_DIR / "logs",
    config.BASE_DIR / "cache",
]

print("Creating directory structure...")
for target_dir in required_dirs:
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"  ✓ {target_dir}")

print("\n✓ Directory structure ready!")
409
- ]
410
- },
411
- {
412
- "cell_type": "markdown",
413
- "id": "a6fe27eb",
414
- "metadata": {},
415
- "source": [
416
- "## 8. Save Configuration for Other Notebooks"
417
- ]
418
- },
419
- {
420
- "cell_type": "code",
421
- "execution_count": null,
422
- "id": "6b854bac",
423
- "metadata": {},
424
- "outputs": [],
425
- "source": [
426
# Cell: persist the detected environment facts so later notebooks
# (01_data_acquisition.ipynb, ...) can load them from one JSON file.
import json
from datetime import datetime, timezone

notebook_config = {
    "device": str(DEVICE),
    "python_version": f"{python_version.major}.{python_version.minor}.{python_version.micro}",
    "torch_version": torch.__version__,
    "cuda_available": cuda_available,
    "base_dir": str(config.BASE_DIR),
    "datasets_dir": str(config.DATASETS_DIR),
    "models_dir": str(config.MODELS_DIR),
    "artifacts_dir": str(config.ARTIFACTS_DIR),
    "random_state": config.RANDOM_STATE,
    "test_size": config.TEST_SIZE,
    "cv_folds": config.CV_FOLDS,
    "gemini_configured": bool(config.GEMINI_API_KEY),
    "huggingface_configured": bool(config.HUGGINGFACE_TOKEN),
    # BUG FIX: the original called pd.Timestamp.now(), but pandas was never
    # imported as `pd` anywhere in this notebook — NameError on a fresh
    # Restart-&-Run-All. Use stdlib datetime (UTC, ISO-8601) instead.
    "created_at": datetime.now(timezone.utc).isoformat(),
}

config_path = config.BASE_DIR / "notebook_config.json"
with open(config_path, "w") as f:
    json.dump(notebook_config, f, indent=2)

print(f"✓ Configuration saved to: {config_path}")
print("\n" + json.dumps(notebook_config, indent=2))
452
- ]
453
- },
454
- {
455
- "cell_type": "markdown",
456
- "id": "ac7ada25",
457
- "metadata": {},
458
- "source": [
459
- "## 9. Environment Summary"
460
- ]
461
- },
462
- {
463
- "cell_type": "code",
464
- "execution_count": null,
465
- "id": "f409be56",
466
- "metadata": {},
467
- "outputs": [],
468
- "source": [
469
- "print(\"\\n\" + \"=\" * 60)\n",
470
- "print(\"ENVIRONMENT SETUP COMPLETE\")\n",
471
- "print(\"=\" * 60)\n",
472
- "print(f\"\"\"\n",
473
- "✅ Python: {python_version.major}.{python_version.minor}.{python_version.micro}\n",
474
- "✅ Device: {DEVICE}\n",
475
- "✅ PyTorch: {torch.__version__}\n",
476
- "✅ Gemini API: {'Ready' if config.GEMINI_API_KEY else 'Not configured'}\n",
477
- "✅ HuggingFace: {'Ready' if config.HUGGINGFACE_TOKEN else 'Not configured'}\n",
478
- "✅ WebScraper API: Ready\n",
479
- "✅ Directories: Created\n",
480
- "\n",
481
- "You can now proceed to the next notebook:\n",
482
- " → 01_data_acquisition.ipynb\n",
483
- "\"\"\")\n",
484
- "print(\"=\" * 60)"
485
- ]
486
- }
487
- ],
488
- "metadata": {
489
- "language_info": {
490
- "name": "python"
491
- }
492
- },
493
- "nbformat": 4,
494
- "nbformat_minor": 5
495
- }