# %% [markdown]
# # 00 - Environment Setup
#
# ## CyberForge AI - ML Pipeline Environment Configuration
#
# This notebook sets up the complete environment for the CyberForge AI machine learning pipeline.
#
# ### What this notebook does:
# 1. Validates Python version and system requirements
# 2. Installs and pins all dependencies
# 3. Configures GPU/CPU detection
# 4. Sets up Gemini API connectivity
# 5. Validates Web Scraper API connection
# 6. Creates necessary directories
#
# ### Prerequisites:
# - Python 3.10+ (3.11 recommended)
# - Access to Gemini API (API key required)
# - Access to WebScrapper.live API

# %% [markdown]
# ## 1. System Validation

# %%
import sys
import platform
import os
from pathlib import Path

print("=" * 60)
print("CYBERFORGE AI - ENVIRONMENT VALIDATION")
print("=" * 60)

# Python version check
python_version = sys.version_info
print(f"\n✓ Python Version: {python_version.major}.{python_version.minor}.{python_version.micro}")

# sys.version_info compares like a tuple, so one comparison covers major and minor.
if python_version < (3, 10):
    raise EnvironmentError("Python 3.10+ is required. Please upgrade your Python installation.")

# System info
print(f"✓ Platform: {platform.system()} {platform.release()}")
print(f"✓ Architecture: {platform.machine()}")
print(f"✓ Processor: {platform.processor() or 'Unknown'}")

# Memory info (psutil is only installed in section 2, so a first run may skip this)
try:
    import psutil
    memory = psutil.virtual_memory()
    print(f"✓ Available Memory: {memory.available / (1024**3):.2f} GB / {memory.total / (1024**3):.2f} GB")
except ImportError:
    print("⚠ psutil not installed - memory check skipped")

print("\n" + "=" * 60)

# %% [markdown]
# ## 2. Install Dependencies

# %%
from pathlib import Path

# Core dependencies with minimum versions (lower bounds, not exact pins).
# NOTE: torch/transformers intentionally excluded - not needed for sklearn models
# and too heavy for HF Space Docker containers
DEPENDENCIES = """
# Core ML/AI
numpy>=1.24.0,<2.0.0
pandas>=2.0.0
scikit-learn>=1.3.0
scipy>=1.11.0

# Gemini API (new SDK)
google-genai>=1.0.0

# Data Processing
joblib>=1.3.0
tqdm>=4.65.0
pyarrow>=14.0.0

# Feature Engineering
tldextract>=5.0.0
validators>=0.22.0

# Web/API
httpx>=0.25.0
requests>=2.31.0

# Hugging Face
huggingface_hub>=0.19.0

# Utilities
python-dotenv>=1.0.0
pyyaml>=6.0.0
psutil>=5.9.0
"""

# Write requirements file. The trailing newline keeps it a well-formed text file,
# and the explicit encoding avoids depending on the platform default.
requirements_path = Path("../requirements_notebooks.txt")
requirements_path.write_text(DEPENDENCIES.strip() + "\n", encoding="utf-8")
print(f"✓ Requirements written to: {requirements_path.absolute()}")

# %%
import subprocess
import sys
from pathlib import Path

# Install dependencies into the interpreter that runs this kernel.
requirements_path = Path("../requirements_notebooks.txt")

if requirements_path.exists():
    print("Installing dependencies... This may take a few minutes.")
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "-r", str(requirements_path)],
        capture_output=True,
        text=True
    )

    if result.returncode == 0:
        print("✓ All dependencies installed successfully!")
    else:
        # A non-zero exit code is a failed install, not a warning. pip prints the
        # decisive error last, so show the tail of the captured output.
        error_tail = (result.stderr or result.stdout or "no output captured").strip()[-1500:]
        print(f"✗ Dependency installation failed (pip exit code {result.returncode}):")
        print(error_tail)
else:
    print("⚠ Requirements file not found. Run previous cell first or skip if deps installed.")

# %% [markdown]
# ## 3. GPU/CPU Detection

# %%
print("=" * 60)
print("COMPUTE DEVICE DETECTION")
print("=" * 60)

# CyberForge uses sklearn (CPU-only) — torch is optional
try:
    import torch
    cuda_available = torch.cuda.is_available()
    print(f"\n✓ PyTorch Version: {torch.__version__}")
    print(f"✓ CUDA Available: {cuda_available}")

    if cuda_available:
        print(f"✓ CUDA Version: {torch.version.cuda}")
        DEVICE = "cuda"
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        print("✓ Apple MPS (Metal) available")
        DEVICE = "mps"
    else:
        DEVICE = "cpu"
except ImportError:
    print("\n⚠ PyTorch not installed (not required — sklearn models use CPU)")
    DEVICE = "cpu"

print(f"\n✓ Selected Device: {DEVICE}")
print(" (Note: CyberForge models use scikit-learn which runs on CPU)")
print("=" * 60)
# %% [markdown]
# ## 4. Environment Variables & API Configuration

# %%
import json
import os
from pathlib import Path

# Load configuration from notebook_config.json first (for HF Spaces)
config_json_path = Path("notebook_config.json")
if config_json_path.exists():
    with open(config_json_path, "r", encoding="utf-8") as f:
        loaded_config = json.load(f)
    print(f"✓ Loaded configuration from: {config_json_path.absolute()}")
else:
    loaded_config = {}
    print("⚠ No notebook_config.json found, using defaults")

# Try loading .env file as fallback (for local dev)
try:
    from dotenv import load_dotenv
    env_path = Path("../.env")
    if env_path.exists():
        load_dotenv(env_path)
        print(f"✓ Loaded environment from: {env_path.absolute()}")
except ImportError:
    # python-dotenv is optional; plain environment variables still work.
    pass

# Detect device (torch is optional)
DEVICE = "cpu"
try:
    import torch
    if torch.cuda.is_available():
        DEVICE = "cuda"
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        DEVICE = "mps"
except ImportError:
    pass


# Configuration class
class Config:
    """Resolved settings for the pipeline.

    Lookup priority for every credential and endpoint is:
    notebook_config.json value > environment variable > default.
    ``or`` is used instead of ``dict.get(key, default)`` so that a null or empty
    value in the JSON file falls through to the next source instead of winning.
    """

    # API keys: never hardcode secrets in the notebook; a missing key resolves to "".
    GEMINI_API_KEY = loaded_config.get("gemini_api_key") or os.getenv("GEMINI_API_KEY", "")
    HUGGINGFACE_TOKEN = loaded_config.get("hf_token") or os.getenv("HF_TOKEN", "")
    WEBSCRAPER_API_KEY = loaded_config.get("webscraper_api_key") or os.getenv("WEBSCRAPER_API_KEY", "")
    # NOTE(review): the default endpoint is plain HTTP, so the API key header travels
    # unencrypted. Switch to https if the service supports it.
    WEBSCRAPER_API_URL = loaded_config.get("webscraper_api_url") or os.getenv(
        "WEBSCRAPER_API_URL", "http://webscrapper.live/api/scrape"
    )

    # Gemini model
    GEMINI_MODEL = loaded_config.get("gemini_model") or os.getenv("GEMINI_MODEL", "gemini-2.5-flash")

    # HF repos
    HF_REPO = loaded_config.get("hf_repo", "Che237/cyberforge-models")
    HF_DATASETS_REPO = loaded_config.get("hf_datasets_repo", "Che237/cyberforge-datasets")

    # Paths (resolved relative to the notebook's working directory)
    BASE_DIR = Path("..").resolve()
    DATASETS_DIR = BASE_DIR / "datasets"
    MODELS_DIR = BASE_DIR / "models"
    ARTIFACTS_DIR = BASE_DIR / "artifacts"

    # ML Settings
    RANDOM_STATE = loaded_config.get("random_state", 42)
    TEST_SIZE = loaded_config.get("test_size", 0.2)
    CV_FOLDS = loaded_config.get("cv_folds", 5)

    # Device (class-body lookup falls back to the module-level DEVICE set above)
    DEVICE = DEVICE


config = Config()

# Validate required API keys
print("\n" + "=" * 60)
print("API CONFIGURATION STATUS")
print("=" * 60)
print(f" Gemini API Key: {'✓ Set' if config.GEMINI_API_KEY else '✗ Missing'}")
hf_status = '✓ Set' if config.HUGGINGFACE_TOKEN else '⚠ Not set (models will not upload)'
print(f" HuggingFace Token: {hf_status}")
print(f" WebScraper API Key: {'✓ Set' if config.WEBSCRAPER_API_KEY else '✗ Missing'}")
print(f" Gemini Model: {config.GEMINI_MODEL}")
print(f" HF Model Repo: {config.HF_REPO}")
print(f" Device: {config.DEVICE}")
# %% [markdown]
# ## 5. Gemini API Connectivity Test

# %%
# Gemini Integration — using google-genai (new SDK)
import importlib
import json
import os
import subprocess
import sys
from pathlib import Path

try:
    from google import genai
except ImportError:
    # Self-contained fallback: install the SDK with the same lower bound used by
    # the requirements file written in section 2, then retry the import.
    install = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', 'google-genai>=1.0.0'],
        capture_output=True,
        text=True,
    )
    if install.returncode != 0:
        print(f'⚠ Could not install google-genai (pip exit code {install.returncode})')
    # Packages installed while the interpreter is running may be invisible to the
    # import system until its finder caches are invalidated.
    importlib.invalidate_caches()
    try:
        from google import genai
    except ImportError:
        genai = None  # reported by test_gemini_connection instead of crashing the cell

# Load config (self-contained)
config_json_path = Path('notebook_config.json')
if config_json_path.exists():
    with open(config_json_path, 'r') as f:
        loaded_config = json.load(f)
else:
    loaded_config = {}

GEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')
GEMINI_MODEL = loaded_config.get('gemini_model') or os.getenv('GEMINI_MODEL', 'gemini-2.5-flash')
FALLBACK_GEMINI_MODEL = 'gemini-2.5-flash'


def test_gemini_connection():
    """Send a one-line prompt to Gemini and report whether it answered.

    The configured model is tried first. The fallback model is tried only when it
    differs from the configured one, because retrying the identical model cannot
    produce a different result.

    Returns:
        tuple[bool, str]: ``(True, description)`` on the first successful reply,
        otherwise ``(False, reason)`` where ``reason`` lists the error of every
        model that was attempted.
    """
    if not GEMINI_API_KEY:
        return False, 'API key not configured'
    if genai is None:
        return False, 'google-genai SDK is not installed'

    try:
        client = genai.Client(api_key=GEMINI_API_KEY)
    except Exception as exc:
        return False, str(exc)

    candidates = [GEMINI_MODEL]
    if FALLBACK_GEMINI_MODEL != GEMINI_MODEL:
        candidates.append(FALLBACK_GEMINI_MODEL)

    errors = []
    for model_name in candidates:
        try:
            response = client.models.generate_content(
                model=model_name,
                contents='Respond with only: OK'
            )
        except Exception as exc:
            errors.append(f'{model_name}: {exc}')
            continue
        # response.text can be None when the reply has no text part; treat it as empty.
        reply = (response.text or '').strip()
        label = model_name if model_name == GEMINI_MODEL else f'{model_name} (fallback)'
        return True, f'Model: {label}, Response: {reply}'

    return False, '; '.join(errors)


print('Testing Gemini API connection...')
success, message = test_gemini_connection()
if success:
    print(f'✓ Gemini API: {message}')
else:
    print(f'⚠ Gemini API: Connection failed - {message}')

# %% [markdown]
# ## 6. Web Scraper API Connectivity Test

# %%
import httpx
import json
import os
from pathlib import Path

# Load config (self-contained)
config_json_path = Path('notebook_config.json')
if config_json_path.exists():
    with open(config_json_path, 'r') as f:
        loaded_config = json.load(f)
else:
    loaded_config = {}

# The key must come from notebook_config.json or the environment; secrets are
# never embedded in the notebook. A missing key resolves to ''.
WEBSCRAPER_API_KEY = loaded_config.get('webscraper_api_key') or os.getenv('WEBSCRAPER_API_KEY', '')
# NOTE(review): the default endpoint is plain HTTP, so the X-API-Key header is sent
# unencrypted. Switch to https if the service supports it.
WEBSCRAPER_API_URL = loaded_config.get('webscraper_api_url') or os.getenv(
    'WEBSCRAPER_API_URL', 'http://webscrapper.live/api/scrape'
)


def test_webscraper_connection_sync():
    """POST a scrape request for https://example.com and report the outcome.

    Returns:
        tuple[bool, str]: ``(True, 'Connected')`` on HTTP 200, otherwise
        ``(False, reason)`` where ``reason`` is the missing-key notice, the
        non-200 status, or the text of the raised exception.
    """
    if not WEBSCRAPER_API_KEY:
        return False, 'API key not configured'
    try:
        with httpx.Client(timeout=30.0) as client:
            response = client.post(
                WEBSCRAPER_API_URL,
                json={'url': 'https://example.com'},
                headers={'Content-Type': 'application/json', 'X-API-Key': WEBSCRAPER_API_KEY}
            )
    except Exception as e:
        return False, str(e)

    if response.status_code == 200:
        return True, 'Connected'
    return False, f'Status {response.status_code}'


print('Testing Web Scraper API connection...')
success, message = test_webscraper_connection_sync()
if success:
    print('✓ WebScraper API: Connected successfully')
else:
    print(f'⚠ WebScraper API: {message}')
# %% [markdown]
# ## 7. Create Directory Structure

# %%
from pathlib import Path

# Project root is the parent of the notebook's working directory (self-contained).
BASE_DIR = Path('..').resolve()
DATASETS_DIR, MODELS_DIR, ARTIFACTS_DIR = (
    BASE_DIR / 'datasets',
    BASE_DIR / 'models',
    BASE_DIR / 'artifacts',
)

# Every folder the pipeline writes to; creation below is idempotent.
directories = [DATASETS_DIR, MODELS_DIR, ARTIFACTS_DIR]
directories += [BASE_DIR / 'logs', BASE_DIR / 'cache']

print('Creating directory structure...')
for directory in directories:
    directory.mkdir(parents=True, exist_ok=True)
    print(f' ✓ {directory}')

print('\n✓ Directory structure ready!')

# %% [markdown]
# ## 8. Save Configuration for Other Notebooks

# %%
import json
import sys
import os
from pathlib import Path

# Gather runtime facts (self-contained: nothing is reused from earlier cells).
python_version = sys.version_info

# Defaults describe a machine without torch; they are overwritten when it imports.
DEVICE, torch_version, cuda_available = 'cpu', 'not installed (not required)', False
try:
    import torch
    torch_version = torch.__version__
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        DEVICE = 'cuda'
    else:
        mps_backend = getattr(torch.backends, 'mps', None)
        if mps_backend is not None and mps_backend.is_available():
            DEVICE = 'mps'
except ImportError:
    pass

# Optional user overrides from notebook_config.json
config_json_path = Path('notebook_config.json')
if not config_json_path.exists():
    loaded_config = {}
else:
    with open(config_json_path, 'r') as f:
        loaded_config = json.load(f)

BASE_DIR = Path('..').resolve()
DATASETS_DIR = BASE_DIR / 'datasets'
MODELS_DIR = BASE_DIR / 'models'
ARTIFACTS_DIR = BASE_DIR / 'artifacts'
RANDOM_STATE = loaded_config.get('random_state', 42)
TEST_SIZE = loaded_config.get('test_size', 0.2)
CV_FOLDS = loaded_config.get('cv_folds', 5)

# Snapshot handed to the other notebooks; key order is the order written to disk.
notebook_config = dict(
    device=str(DEVICE),
    python_version='.'.join(str(part) for part in python_version[:3]),
    torch_version=torch_version,
    cuda_available=cuda_available,
    base_dir=str(BASE_DIR),
    datasets_dir=str(DATASETS_DIR),
    models_dir=str(MODELS_DIR),
    artifacts_dir=str(ARTIFACTS_DIR),
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
    cv_folds=CV_FOLDS,
)

config_path = Path('notebook_runtime_config.json')
with open(config_path, 'w') as f:
    f.write(json.dumps(notebook_config, indent=2))

print(f'✓ Configuration exported to: {config_path.absolute()}')
print(json.dumps(notebook_config, indent=2))
# %% [markdown]
# ## 9. Environment Summary

# %%
import sys
import json
import os
from pathlib import Path


def resolve_setting(settings, key, env_var):
    """Return a setting from the config mapping, else the environment, else ''.

    Args:
        settings: Mapping loaded from notebook_config.json (may be empty).
        key: Key to look up in ``settings``.
        env_var: Environment variable consulted when the key is absent or falsy.

    Returns:
        str: The first non-empty value found, or '' when neither source has one.
    """
    return settings.get(key) or os.getenv(env_var, '')


python_version = sys.version_info

try:
    import torch
    torch_version = torch.__version__
    if torch.cuda.is_available():
        DEVICE = 'cuda'
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        DEVICE = 'mps'
    else:
        DEVICE = 'cpu'
except ImportError:
    torch_version = 'not installed'
    DEVICE = 'cpu'

# Load config
config_json_path = Path('notebook_config.json')
if config_json_path.exists():
    with open(config_json_path, 'r') as f:
        loaded_config = json.load(f)
else:
    loaded_config = {}

GEMINI_API_KEY = resolve_setting(loaded_config, 'gemini_api_key', 'GEMINI_API_KEY')
# Honour hf_token from notebook_config.json as well as HF_TOKEN, matching section 4.
HUGGINGFACE_TOKEN = resolve_setting(loaded_config, 'hf_token', 'HF_TOKEN')

# Check the folders on disk instead of assuming section 7 ran successfully.
BASE_DIR = Path('..').resolve()
missing_dirs = [
    name
    for name in ('datasets', 'models', 'artifacts', 'logs', 'cache')
    if not (BASE_DIR / name).is_dir()
]

gemini_mark = '✅' if GEMINI_API_KEY else '⚠'
dirs_mark = '⚠' if missing_dirs else '✅'
dirs_status = f"Missing: {', '.join(missing_dirs)}" if missing_dirs else 'Created'

print('\n' + '=' * 60)
print('ENVIRONMENT SETUP COMPLETE')
print('=' * 60)
# Connectivity is not re-tested here, so the WebScraper line points at the
# section that holds the real result instead of claiming success.
print(f'''
✅ Python: {python_version.major}.{python_version.minor}.{python_version.micro}
✅ Device: {DEVICE}
✅ PyTorch: {torch_version}
{gemini_mark} Gemini API: {'Ready' if GEMINI_API_KEY else 'Not configured'}
✅ HuggingFace: {'Ready' if HUGGINGFACE_TOKEN else 'Using public access'}
• WebScraper API: see the section 6 test result
{dirs_mark} Directories: {dirs_status}

You can now proceed to the next notebook:
 → 01_data_acquisition.ipynb
''')
print('=' * 60)