Che237 committed on
Commit
9cd197b
·
verified Β·
1 Parent(s): 83bd628

Upload notebooks/00_environment_setup.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. notebooks/00_environment_setup.ipynb +397 -38
notebooks/00_environment_setup.ipynb CHANGED
@@ -51,23 +51,23 @@
51
  "\n",
52
  "# Python version check\n",
53
  "python_version = sys.version_info\n",
54
- "print(f\"\\n\u2713 Python Version: {python_version.major}.{python_version.minor}.{python_version.micro}\")\n",
55
  "\n",
56
  "if python_version.major < 3 or (python_version.major == 3 and python_version.minor < 10):\n",
57
  " raise EnvironmentError(\"Python 3.10+ is required. Please upgrade your Python installation.\")\n",
58
  "\n",
59
  "# System info\n",
60
- "print(f\"\u2713 Platform: {platform.system()} {platform.release()}\")\n",
61
- "print(f\"\u2713 Architecture: {platform.machine()}\")\n",
62
- "print(f\"\u2713 Processor: {platform.processor() or 'Unknown'}\")\n",
63
  "\n",
64
  "# Memory info\n",
65
  "try:\n",
66
  " import psutil\n",
67
  " memory = psutil.virtual_memory()\n",
68
- " print(f\"\u2713 Available Memory: {memory.available / (1024**3):.2f} GB / {memory.total / (1024**3):.2f} GB\")\n",
69
  "except ImportError:\n",
70
- " print(\"\u26a0 psutil not installed - memory check skipped\")\n",
71
  "\n",
72
  "print(\"\\n\" + \"=\" * 60)"
73
  ]
@@ -86,7 +86,49 @@
86
  "id": "faa9b079",
87
  "metadata": {},
88
  "outputs": [],
89
- "source": "from pathlib import Path\n\n# Core dependencies with pinned versions for reproducibility\nDEPENDENCIES = \"\"\"\n# Core ML/AI\nnumpy>=1.24.0,<2.0.0\npandas>=2.0.0\nscikit-learn>=1.3.0\nscipy>=1.11.0\n\n# Deep Learning \ntorch>=2.0.0\ntransformers>=4.30.0\n\n# Gemini API\ngoogle-generativeai>=0.3.0\n\n# Data Processing\njoblib>=1.3.0\ntqdm>=4.65.0\n\n# Feature Engineering\ntldextract>=5.0.0\nvalidators>=0.22.0\n\n# Web/API\nhttpx>=0.25.0\nrequests>=2.31.0\n\n# Hugging Face\nhuggingface_hub>=0.19.0\n\n# Utilities\npython-dotenv>=1.0.0\npyyaml>=6.0.0\npsutil>=5.9.0\n\"\"\"\n\n# Write requirements file\nrequirements_path = Path(\"../requirements_notebooks.txt\")\nrequirements_path.write_text(DEPENDENCIES.strip())\nprint(f\"\u2713 Requirements written to: {requirements_path.absolute()}\")\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  },
91
  {
92
  "cell_type": "code",
@@ -94,7 +136,29 @@
94
  "id": "7dc8c6ca",
95
  "metadata": {},
96
  "outputs": [],
97
- "source": "import subprocess\nimport sys\nfrom pathlib import Path\n\n# Install dependencies\nrequirements_path = Path(\"../requirements_notebooks.txt\")\n\nif requirements_path.exists():\n print(\"Installing dependencies... This may take a few minutes.\")\n result = subprocess.run(\n [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-r\", str(requirements_path)],\n capture_output=True,\n text=True\n )\n\n if result.returncode == 0:\n print(\"\u2713 All dependencies installed successfully!\")\n else:\n print(f\"\u26a0 Installation warnings: {result.stderr[:500] if result.stderr else 'None'}\")\nelse:\n print(\"\u26a0 Requirements file not found. Run previous cell first or skip if deps installed.\")\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  },
99
  {
100
  "cell_type": "markdown",
@@ -111,35 +175,32 @@
111
  "metadata": {},
112
  "outputs": [],
113
  "source": [
114
- "import torch\n",
115
- "\n",
116
  "print(\"=\" * 60)\n",
117
  "print(\"COMPUTE DEVICE DETECTION\")\n",
118
  "print(\"=\" * 60)\n",
119
  "\n",
120
- "# Check CUDA availability\n",
121
- "cuda_available = torch.cuda.is_available()\n",
122
- "print(f\"\\n\u2713 PyTorch Version: {torch.__version__}\")\n",
123
- "print(f\"\u2713 CUDA Available: {cuda_available}\")\n",
124
- "\n",
125
- "if cuda_available:\n",
126
- " print(f\"\u2713 CUDA Version: {torch.version.cuda}\")\n",
127
- " print(f\"\u2713 GPU Count: {torch.cuda.device_count()}\")\n",
128
- " for i in range(torch.cuda.device_count()):\n",
129
- " props = torch.cuda.get_device_properties(i)\n",
130
- " print(f\" - GPU {i}: {props.name} ({props.total_memory / (1024**3):.2f} GB)\")\n",
131
- " DEVICE = torch.device(\"cuda\")\n",
132
- "else:\n",
133
- " print(\"\u26a0 No GPU detected - using CPU for training\")\n",
134
- " DEVICE = torch.device(\"cpu\")\n",
135
  "\n",
136
- "# Check MPS (Apple Silicon)\n",
137
- "if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
138
- " print(\"\u2713 Apple MPS (Metal) available\")\n",
139
- " DEVICE = torch.device(\"mps\")\n",
 
 
 
 
 
 
 
140
  "\n",
141
- "print(f\"\\n\u2713 Selected Device: {DEVICE}\")\n",
142
- "print(\"=\" * 60)"
 
143
  ]
144
  },
145
  {
@@ -156,7 +217,83 @@
156
  "id": "0f63a5ce",
157
  "metadata": {},
158
  "outputs": [],
159
- "source": "import json\nimport os\nfrom pathlib import Path\n\n# Load configuration from notebook_config.json first (for HF Spaces)\nconfig_json_path = Path(\"notebook_config.json\")\nif config_json_path.exists():\n with open(config_json_path, \"r\") as f:\n loaded_config = json.load(f)\n print(f\"\u2713 Loaded configuration from: {config_json_path.absolute()}\")\nelse:\n loaded_config = {}\n print(f\"\u26a0 No notebook_config.json found, using defaults\")\n\n# Try loading .env file as fallback (for local dev)\ntry:\n from dotenv import load_dotenv\n env_path = Path(\"../.env\")\n if env_path.exists():\n load_dotenv(env_path)\n print(f\"\u2713 Loaded environment from: {env_path.absolute()}\")\nexcept ImportError:\n pass\n\n# Detect device\ntry:\n import torch\n if torch.cuda.is_available():\n DEVICE = torch.device(\"cuda\")\n elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n DEVICE = torch.device(\"mps\")\n else:\n DEVICE = torch.device(\"cpu\")\nexcept ImportError:\n DEVICE = \"cpu\"\n\n# Configuration class\nclass Config:\n # API Keys - priority: config.json > env vars > defaults\n GEMINI_API_KEY = loaded_config.get(\"gemini_api_key\") or os.getenv(\"GEMINI_API_KEY\", \"AIzaSyA3HdWTLk_zJQ5P9G8Z8a8BEYSTPvLglhs\")\n HUGGINGFACE_TOKEN = os.getenv(\"HUGGINGFACE_API_TOKEN\", os.getenv(\"HF_TOKEN\", \"\"))\n WEBSCRAPER_API_KEY = loaded_config.get(\"webscraper_api_key\", \"sk-fd14eaa7bceb478db7afc7256e514d2b\")\n WEBSCRAPER_API_URL = loaded_config.get(\"webscraper_api_url\", \"http://webscrapper.live/api/scrape\")\n \n # Gemini model\n GEMINI_MODEL = loaded_config.get(\"gemini_model\", \"gemini-2.5-flash\")\n \n # Paths\n BASE_DIR = Path(\"..\").resolve()\n DATASETS_DIR = BASE_DIR / \"datasets\"\n MODELS_DIR = BASE_DIR / \"models\"\n ARTIFACTS_DIR = BASE_DIR / \"artifacts\"\n \n # ML Settings\n RANDOM_STATE = loaded_config.get(\"random_state\", 42)\n TEST_SIZE = loaded_config.get(\"test_size\", 0.2)\n CV_FOLDS = 
loaded_config.get(\"cv_folds\", 5)\n \n # Device\n DEVICE = DEVICE\n\nconfig = Config()\n\n# Validate required API keys\nprint(\"\\n\" + \"=\" * 60)\nprint(\"API CONFIGURATION STATUS\")\nprint(\"=\" * 60)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  },
161
  {
162
  "cell_type": "markdown",
@@ -172,7 +309,59 @@
172
  "id": "14cef3bc",
173
  "metadata": {},
174
  "outputs": [],
175
- "source": "import google.generativeai as genai\nimport json\nimport os\nfrom pathlib import Path\n\n# Load config (self-contained)\nconfig_json_path = Path('notebook_config.json')\nif config_json_path.exists():\n with open(config_json_path, 'r') as f:\n loaded_config = json.load(f)\nelse:\n loaded_config = {}\n\nGEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')\nGEMINI_MODEL = loaded_config.get('gemini_model', 'gemini-2.5-flash')\n\ndef test_gemini_connection():\n if not GEMINI_API_KEY:\n return False, 'API key not configured'\n try:\n genai.configure(api_key=GEMINI_API_KEY)\n model = genai.GenerativeModel(GEMINI_MODEL)\n response = model.generate_content('Respond with only: OK')\n return True, f'Model: {GEMINI_MODEL}, Response: {response.text.strip()}'\n except Exception as e:\n try:\n model = genai.GenerativeModel('gemini-1.5-flash')\n response = model.generate_content('Respond with only: OK')\n return True, f'Model: gemini-1.5-flash (fallback), Response: {response.text.strip()}'\n except Exception as e2:\n return False, str(e2)\n\nprint('Testing Gemini API connection...')\nsuccess, message = test_gemini_connection()\nif success:\n print(f'\u2713 Gemini API: {message}')\nelse:\n print(f'\u26a0 Gemini API: Connection failed - {message}')\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  },
177
  {
178
  "cell_type": "markdown",
@@ -188,7 +377,45 @@
188
  "id": "beb1b036",
189
  "metadata": {},
190
  "outputs": [],
191
- "source": "import httpx\nimport json\nimport os\nfrom pathlib import Path\n\n# Load config (self-contained)\nconfig_json_path = Path('notebook_config.json')\nif config_json_path.exists():\n with open(config_json_path, 'r') as f:\n loaded_config = json.load(f)\nelse:\n loaded_config = {}\n\nWEBSCRAPER_API_KEY = loaded_config.get('webscraper_api_key', 'sk-fd14eaa7bceb478db7afc7256e514d2b')\nWEBSCRAPER_API_URL = loaded_config.get('webscraper_api_url', 'http://webscrapper.live/api/scrape')\n\ndef test_webscraper_connection_sync():\n try:\n with httpx.Client(timeout=30.0) as client:\n response = client.post(\n WEBSCRAPER_API_URL,\n json={'url': 'https://example.com'},\n headers={'Content-Type': 'application/json', 'X-API-Key': WEBSCRAPER_API_KEY}\n )\n if response.status_code == 200:\n return True, 'Connected'\n else:\n return False, f'Status {response.status_code}'\n except Exception as e:\n return False, str(e)\n\nprint('Testing Web Scraper API connection...')\nsuccess, message = test_webscraper_connection_sync()\nif success:\n print(f'\u2713 WebScraper API: Connected successfully')\nelse:\n print(f'\u26a0 WebScraper API: {message}')\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  },
193
  {
194
  "cell_type": "markdown",
@@ -204,7 +431,31 @@
204
  "id": "776236f8",
205
  "metadata": {},
206
  "outputs": [],
207
- "source": "from pathlib import Path\n\n# Define directories (self-contained)\nBASE_DIR = Path('..').resolve()\nDATASETS_DIR = BASE_DIR / 'datasets'\nMODELS_DIR = BASE_DIR / 'models'\nARTIFACTS_DIR = BASE_DIR / 'artifacts'\n\n# Create necessary directories\ndirectories = [\n DATASETS_DIR,\n MODELS_DIR,\n ARTIFACTS_DIR,\n BASE_DIR / 'logs',\n BASE_DIR / 'cache',\n]\n\nprint('Creating directory structure...')\nfor directory in directories:\n directory.mkdir(parents=True, exist_ok=True)\n print(f' \u2713 {directory}')\n\nprint('\\n\u2713 Directory structure ready!')\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  },
209
  {
210
  "cell_type": "markdown",
@@ -220,7 +471,67 @@
220
  "id": "6b854bac",
221
  "metadata": {},
222
  "outputs": [],
223
- "source": "import json\nimport sys\nimport os\nfrom pathlib import Path\n\n# Get values (self-contained)\npython_version = sys.version_info\n\ntry:\n import torch\n torch_version = torch.__version__\n cuda_available = torch.cuda.is_available()\n if cuda_available:\n DEVICE = 'cuda'\n elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n DEVICE = 'mps'\n else:\n DEVICE = 'cpu'\nexcept ImportError:\n torch_version = 'not installed'\n cuda_available = False\n DEVICE = 'cpu'\n\n# Load config\nconfig_json_path = Path('notebook_config.json')\nif config_json_path.exists():\n with open(config_json_path, 'r') as f:\n loaded_config = json.load(f)\nelse:\n loaded_config = {}\n\nBASE_DIR = Path('..').resolve()\nDATASETS_DIR = BASE_DIR / 'datasets'\nMODELS_DIR = BASE_DIR / 'models'\nARTIFACTS_DIR = BASE_DIR / 'artifacts'\nRANDOM_STATE = loaded_config.get('random_state', 42)\nTEST_SIZE = loaded_config.get('test_size', 0.2)\nCV_FOLDS = loaded_config.get('cv_folds', 5)\n\n# Export configuration for other notebooks\nnotebook_config = {\n 'device': str(DEVICE),\n 'python_version': f'{python_version.major}.{python_version.minor}.{python_version.micro}',\n 'torch_version': torch_version,\n 'cuda_available': cuda_available,\n 'base_dir': str(BASE_DIR),\n 'datasets_dir': str(DATASETS_DIR),\n 'models_dir': str(MODELS_DIR),\n 'artifacts_dir': str(ARTIFACTS_DIR),\n 'random_state': RANDOM_STATE,\n 'test_size': TEST_SIZE,\n 'cv_folds': CV_FOLDS,\n}\n\nconfig_path = Path('notebook_runtime_config.json')\nwith open(config_path, 'w') as f:\n json.dump(notebook_config, f, indent=2)\n\nprint(f'\u2713 Configuration exported to: {config_path.absolute()}')\nprint(json.dumps(notebook_config, indent=2))\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  },
225
  {
226
  "cell_type": "markdown",
@@ -236,7 +547,55 @@
236
  "id": "f409be56",
237
  "metadata": {},
238
  "outputs": [],
239
- "source": "import sys\nimport json\nimport os\nfrom pathlib import Path\n\npython_version = sys.version_info\n\ntry:\n import torch\n torch_version = torch.__version__\n if torch.cuda.is_available():\n DEVICE = 'cuda'\n elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n DEVICE = 'mps'\n else:\n DEVICE = 'cpu'\nexcept ImportError:\n torch_version = 'not installed'\n DEVICE = 'cpu'\n\n# Load config\nconfig_json_path = Path('notebook_config.json')\nif config_json_path.exists():\n with open(config_json_path, 'r') as f:\n loaded_config = json.load(f)\nelse:\n loaded_config = {}\n\nGEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')\nHUGGINGFACE_TOKEN = os.getenv('HF_TOKEN', '')\n\nprint('\\n' + '=' * 60)\nprint('ENVIRONMENT SETUP COMPLETE')\nprint('=' * 60)\nprint(f'''\n\u2705 Python: {python_version.major}.{python_version.minor}.{python_version.micro}\n\u2705 Device: {DEVICE}\n\u2705 PyTorch: {torch_version}\n\u2705 Gemini API: {'Ready' if GEMINI_API_KEY else 'Not configured'}\n\u2705 HuggingFace: {'Ready' if HUGGINGFACE_TOKEN else 'Using public access'}\n\u2705 WebScraper API: Ready\n\u2705 Directories: Created\n\nYou can now proceed to the next notebook:\n \u2192 01_data_acquisition.ipynb\n''')\nprint('=' * 60)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  }
241
  ],
242
  "metadata": {
@@ -246,4 +605,4 @@
246
  },
247
  "nbformat": 4,
248
  "nbformat_minor": 5
249
- }
 
51
  "\n",
52
  "# Python version check\n",
53
  "python_version = sys.version_info\n",
54
+ "print(f\"\\nβœ“ Python Version: {python_version.major}.{python_version.minor}.{python_version.micro}\")\n",
55
  "\n",
56
  "if python_version.major < 3 or (python_version.major == 3 and python_version.minor < 10):\n",
57
  " raise EnvironmentError(\"Python 3.10+ is required. Please upgrade your Python installation.\")\n",
58
  "\n",
59
  "# System info\n",
60
+ "print(f\"βœ“ Platform: {platform.system()} {platform.release()}\")\n",
61
+ "print(f\"βœ“ Architecture: {platform.machine()}\")\n",
62
+ "print(f\"βœ“ Processor: {platform.processor() or 'Unknown'}\")\n",
63
  "\n",
64
  "# Memory info\n",
65
  "try:\n",
66
  " import psutil\n",
67
  " memory = psutil.virtual_memory()\n",
68
+ " print(f\"βœ“ Available Memory: {memory.available / (1024**3):.2f} GB / {memory.total / (1024**3):.2f} GB\")\n",
69
  "except ImportError:\n",
70
+ " print(\"⚠ psutil not installed - memory check skipped\")\n",
71
  "\n",
72
  "print(\"\\n\" + \"=\" * 60)"
73
  ]
 
86
  "id": "faa9b079",
87
  "metadata": {},
88
  "outputs": [],
89
+ "source": [
90
+ "from pathlib import Path\n",
91
+ "\n",
92
+ "# Core dependencies with pinned versions for reproducibility\n",
93
+ "# NOTE: torch/transformers intentionally excluded - not needed for sklearn models\n",
94
+ "# and too heavy for HF Space Docker containers\n",
95
+ "DEPENDENCIES = \"\"\"\n",
96
+ "# Core ML/AI\n",
97
+ "numpy>=1.24.0,<2.0.0\n",
98
+ "pandas>=2.0.0\n",
99
+ "scikit-learn>=1.3.0\n",
100
+ "scipy>=1.11.0\n",
101
+ "\n",
102
+ "# Gemini API (new SDK)\n",
103
+ "google-genai>=1.0.0\n",
104
+ "\n",
105
+ "# Data Processing\n",
106
+ "joblib>=1.3.0\n",
107
+ "tqdm>=4.65.0\n",
108
+ "pyarrow>=14.0.0\n",
109
+ "\n",
110
+ "# Feature Engineering\n",
111
+ "tldextract>=5.0.0\n",
112
+ "validators>=0.22.0\n",
113
+ "\n",
114
+ "# Web/API\n",
115
+ "httpx>=0.25.0\n",
116
+ "requests>=2.31.0\n",
117
+ "\n",
118
+ "# Hugging Face\n",
119
+ "huggingface_hub>=0.19.0\n",
120
+ "\n",
121
+ "# Utilities\n",
122
+ "python-dotenv>=1.0.0\n",
123
+ "pyyaml>=6.0.0\n",
124
+ "psutil>=5.9.0\n",
125
+ "\"\"\"\n",
126
+ "\n",
127
+ "# Write requirements file\n",
128
+ "requirements_path = Path(\"../requirements_notebooks.txt\")\n",
129
+ "requirements_path.write_text(DEPENDENCIES.strip())\n",
130
+ "print(f\"βœ“ Requirements written to: {requirements_path.absolute()}\")\n"
131
+ ]
132
  },
133
  {
134
  "cell_type": "code",
 
136
  "id": "7dc8c6ca",
137
  "metadata": {},
138
  "outputs": [],
139
+ "source": [
140
+ "import subprocess\n",
141
+ "import sys\n",
142
+ "from pathlib import Path\n",
143
+ "\n",
144
+ "# Install dependencies\n",
145
+ "requirements_path = Path(\"../requirements_notebooks.txt\")\n",
146
+ "\n",
147
+ "if requirements_path.exists():\n",
148
+ " print(\"Installing dependencies... This may take a few minutes.\")\n",
149
+ " result = subprocess.run(\n",
150
+ " [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-r\", str(requirements_path)],\n",
151
+ " capture_output=True,\n",
152
+ " text=True\n",
153
+ " )\n",
154
+ "\n",
155
+ " if result.returncode == 0:\n",
156
+ " print(\"βœ“ All dependencies installed successfully!\")\n",
157
+ " else:\n",
158
+ " print(f\"⚠ Installation warnings: {result.stderr[:500] if result.stderr else 'None'}\")\n",
159
+ "else:\n",
160
+ " print(\"⚠ Requirements file not found. Run previous cell first or skip if deps installed.\")\n"
161
+ ]
162
  },
163
  {
164
  "cell_type": "markdown",
 
175
  "metadata": {},
176
  "outputs": [],
177
  "source": [
 
 
178
  "print(\"=\" * 60)\n",
179
  "print(\"COMPUTE DEVICE DETECTION\")\n",
180
  "print(\"=\" * 60)\n",
181
  "\n",
182
+ "# CyberForge uses sklearn (CPU-only) β€” torch is optional\n",
183
+ "try:\n",
184
+ " import torch\n",
185
+ " cuda_available = torch.cuda.is_available()\n",
186
+ " print(f\"\\nβœ“ PyTorch Version: {torch.__version__}\")\n",
187
+ " print(f\"βœ“ CUDA Available: {cuda_available}\")\n",
 
 
 
 
 
 
 
 
 
188
  "\n",
189
+ " if cuda_available:\n",
190
+ " print(f\"βœ“ CUDA Version: {torch.version.cuda}\")\n",
191
+ " DEVICE = \"cuda\"\n",
192
+ " elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
193
+ " print(\"βœ“ Apple MPS (Metal) available\")\n",
194
+ " DEVICE = \"mps\"\n",
195
+ " else:\n",
196
+ " DEVICE = \"cpu\"\n",
197
+ "except ImportError:\n",
198
+ " print(\"\\n⚠ PyTorch not installed (not required β€” sklearn models use CPU)\")\n",
199
+ " DEVICE = \"cpu\"\n",
200
  "\n",
201
+ "print(f\"\\nβœ“ Selected Device: {DEVICE}\")\n",
202
+ "print(\" (Note: CyberForge models use scikit-learn which runs on CPU)\")\n",
203
+ "print(\"=\" * 60)\n"
204
  ]
205
  },
206
  {
 
217
  "id": "0f63a5ce",
218
  "metadata": {},
219
  "outputs": [],
220
+ "source": [
221
+ "import json\n",
222
+ "import os\n",
223
+ "from pathlib import Path\n",
224
+ "\n",
225
+ "# Load configuration from notebook_config.json first (for HF Spaces)\n",
226
+ "config_json_path = Path(\"notebook_config.json\")\n",
227
+ "if config_json_path.exists():\n",
228
+ " with open(config_json_path, \"r\") as f:\n",
229
+ " loaded_config = json.load(f)\n",
230
+ " print(f\"βœ“ Loaded configuration from: {config_json_path.absolute()}\")\n",
231
+ "else:\n",
232
+ " loaded_config = {}\n",
233
+ " print(f\"⚠ No notebook_config.json found, using defaults\")\n",
234
+ "\n",
235
+ "# Try loading .env file as fallback (for local dev)\n",
236
+ "try:\n",
237
+ " from dotenv import load_dotenv\n",
238
+ " env_path = Path(\"../.env\")\n",
239
+ " if env_path.exists():\n",
240
+ " load_dotenv(env_path)\n",
241
+ " print(f\"βœ“ Loaded environment from: {env_path.absolute()}\")\n",
242
+ "except ImportError:\n",
243
+ " pass\n",
244
+ "\n",
245
+ "# Detect device (torch is optional)\n",
246
+ "DEVICE = \"cpu\"\n",
247
+ "try:\n",
248
+ " import torch\n",
249
+ " if torch.cuda.is_available():\n",
250
+ " DEVICE = \"cuda\"\n",
251
+ " elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
252
+ " DEVICE = \"mps\"\n",
253
+ "except ImportError:\n",
254
+ " pass\n",
255
+ "\n",
256
+ "# Configuration class\n",
257
+ "class Config:\n",
258
+ " # API Keys - priority: config.json > env vars > defaults\n",
259
+ " GEMINI_API_KEY = loaded_config.get(\"gemini_api_key\") or os.getenv(\"GEMINI_API_KEY\", \"\")\n",
260
+ " HUGGINGFACE_TOKEN = loaded_config.get(\"hf_token\") or os.getenv(\"HF_TOKEN\", \"\")\n",
261
+ " WEBSCRAPER_API_KEY = loaded_config.get(\"webscraper_api_key\", \"sk-fd14eaa7bceb478db7afc7256e514d2b\")\n",
262
+ " WEBSCRAPER_API_URL = loaded_config.get(\"webscraper_api_url\", \"http://webscrapper.live/api/scrape\")\n",
263
+ " \n",
264
+ " # Gemini model\n",
265
+ " GEMINI_MODEL = loaded_config.get(\"gemini_model\", os.getenv(\"GEMINI_MODEL\", \"gemini-2.5-flash\"))\n",
266
+ " \n",
267
+ " # HF repos\n",
268
+ " HF_REPO = loaded_config.get(\"hf_repo\", \"Che237/cyberforge-models\")\n",
269
+ " HF_DATASETS_REPO = loaded_config.get(\"hf_datasets_repo\", \"Che237/cyberforge-datasets\")\n",
270
+ " \n",
271
+ " # Paths\n",
272
+ " BASE_DIR = Path(\"..\").resolve()\n",
273
+ " DATASETS_DIR = BASE_DIR / \"datasets\"\n",
274
+ " MODELS_DIR = BASE_DIR / \"models\"\n",
275
+ " ARTIFACTS_DIR = BASE_DIR / \"artifacts\"\n",
276
+ " \n",
277
+ " # ML Settings\n",
278
+ " RANDOM_STATE = loaded_config.get(\"random_state\", 42)\n",
279
+ " TEST_SIZE = loaded_config.get(\"test_size\", 0.2)\n",
280
+ " CV_FOLDS = loaded_config.get(\"cv_folds\", 5)\n",
281
+ " \n",
282
+ " # Device\n",
283
+ " DEVICE = DEVICE\n",
284
+ "\n",
285
+ "config = Config()\n",
286
+ "\n",
287
+ "# Validate required API keys\n",
288
+ "print(\"\\n\" + \"=\" * 60)\n",
289
+ "print(\"API CONFIGURATION STATUS\")\n",
290
+ "print(\"=\" * 60)\n",
291
+ "print(f\" Gemini API Key: {'βœ“ Set' if config.GEMINI_API_KEY else 'βœ— Missing'}\")\n",
292
+ "print(f\" HuggingFace Token: {'βœ“ Set' if config.HUGGINGFACE_TOKEN else '⚠ Not set (models won\\\\'t upload)'}\")\n",
293
+ "print(f\" Gemini Model: {config.GEMINI_MODEL}\")\n",
294
+ "print(f\" HF Model Repo: {config.HF_REPO}\")\n",
295
+ "print(f\" Device: {config.DEVICE}\")\n"
296
+ ]
297
  },
298
  {
299
  "cell_type": "markdown",
 
309
  "id": "14cef3bc",
310
  "metadata": {},
311
  "outputs": [],
312
+ "source": [
313
+ "# Gemini Integration β€” using google-genai (new SDK)\n",
314
+ "import json\n",
315
+ "import os\n",
316
+ "from pathlib import Path\n",
317
+ "\n",
318
+ "try:\n",
319
+ " from google import genai\n",
320
+ "except ImportError:\n",
321
+ " import subprocess, sys\n",
322
+ " subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-genai', '-q'])\n",
323
+ " from google import genai\n",
324
+ "\n",
325
+ "# Load config (self-contained)\n",
326
+ "config_json_path = Path('notebook_config.json')\n",
327
+ "if config_json_path.exists():\n",
328
+ " with open(config_json_path, 'r') as f:\n",
329
+ " loaded_config = json.load(f)\n",
330
+ "else:\n",
331
+ " loaded_config = {}\n",
332
+ "\n",
333
+ "GEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')\n",
334
+ "GEMINI_MODEL = loaded_config.get('gemini_model', os.getenv('GEMINI_MODEL', 'gemini-2.5-flash'))\n",
335
+ "\n",
336
+ "def test_gemini_connection():\n",
337
+ " if not GEMINI_API_KEY:\n",
338
+ " return False, 'API key not configured'\n",
339
+ " try:\n",
340
+ " client = genai.Client(api_key=GEMINI_API_KEY)\n",
341
+ " response = client.models.generate_content(\n",
342
+ " model=GEMINI_MODEL,\n",
343
+ " contents='Respond with only: OK'\n",
344
+ " )\n",
345
+ " return True, f'Model: {GEMINI_MODEL}, Response: {response.text.strip()}'\n",
346
+ " except Exception as e:\n",
347
+ " # Try fallback model\n",
348
+ " try:\n",
349
+ " client = genai.Client(api_key=GEMINI_API_KEY)\n",
350
+ " response = client.models.generate_content(\n",
351
+ " model='gemini-2.5-flash',\n",
352
+ " contents='Respond with only: OK'\n",
353
+ " )\n",
354
+ " return True, f'Model: gemini-2.5-flash (fallback), Response: {response.text.strip()}'\n",
355
+ " except Exception as e2:\n",
356
+ " return False, str(e2)\n",
357
+ "\n",
358
+ "print('Testing Gemini API connection...')\n",
359
+ "success, message = test_gemini_connection()\n",
360
+ "if success:\n",
361
+ " print(f'βœ“ Gemini API: {message}')\n",
362
+ "else:\n",
363
+ " print(f'⚠ Gemini API: Connection failed - {message}')\n"
364
+ ]
365
  },
366
  {
367
  "cell_type": "markdown",
 
377
  "id": "beb1b036",
378
  "metadata": {},
379
  "outputs": [],
380
+ "source": [
381
+ "import httpx\n",
382
+ "import json\n",
383
+ "import os\n",
384
+ "from pathlib import Path\n",
385
+ "\n",
386
+ "# Load config (self-contained)\n",
387
+ "config_json_path = Path('notebook_config.json')\n",
388
+ "if config_json_path.exists():\n",
389
+ " with open(config_json_path, 'r') as f:\n",
390
+ " loaded_config = json.load(f)\n",
391
+ "else:\n",
392
+ " loaded_config = {}\n",
393
+ "\n",
394
+ "WEBSCRAPER_API_KEY = loaded_config.get('webscraper_api_key', 'sk-fd14eaa7bceb478db7afc7256e514d2b')\n",
395
+ "WEBSCRAPER_API_URL = loaded_config.get('webscraper_api_url', 'http://webscrapper.live/api/scrape')\n",
396
+ "\n",
397
+ "def test_webscraper_connection_sync():\n",
398
+ " try:\n",
399
+ " with httpx.Client(timeout=30.0) as client:\n",
400
+ " response = client.post(\n",
401
+ " WEBSCRAPER_API_URL,\n",
402
+ " json={'url': 'https://example.com'},\n",
403
+ " headers={'Content-Type': 'application/json', 'X-API-Key': WEBSCRAPER_API_KEY}\n",
404
+ " )\n",
405
+ " if response.status_code == 200:\n",
406
+ " return True, 'Connected'\n",
407
+ " else:\n",
408
+ " return False, f'Status {response.status_code}'\n",
409
+ " except Exception as e:\n",
410
+ " return False, str(e)\n",
411
+ "\n",
412
+ "print('Testing Web Scraper API connection...')\n",
413
+ "success, message = test_webscraper_connection_sync()\n",
414
+ "if success:\n",
415
+ " print(f'βœ“ WebScraper API: Connected successfully')\n",
416
+ "else:\n",
417
+ " print(f'⚠ WebScraper API: {message}')\n"
418
+ ]
419
  },
420
  {
421
  "cell_type": "markdown",
 
431
  "id": "776236f8",
432
  "metadata": {},
433
  "outputs": [],
434
+ "source": [
435
+ "from pathlib import Path\n",
436
+ "\n",
437
+ "# Define directories (self-contained)\n",
438
+ "BASE_DIR = Path('..').resolve()\n",
439
+ "DATASETS_DIR = BASE_DIR / 'datasets'\n",
440
+ "MODELS_DIR = BASE_DIR / 'models'\n",
441
+ "ARTIFACTS_DIR = BASE_DIR / 'artifacts'\n",
442
+ "\n",
443
+ "# Create necessary directories\n",
444
+ "directories = [\n",
445
+ " DATASETS_DIR,\n",
446
+ " MODELS_DIR,\n",
447
+ " ARTIFACTS_DIR,\n",
448
+ " BASE_DIR / 'logs',\n",
449
+ " BASE_DIR / 'cache',\n",
450
+ "]\n",
451
+ "\n",
452
+ "print('Creating directory structure...')\n",
453
+ "for directory in directories:\n",
454
+ " directory.mkdir(parents=True, exist_ok=True)\n",
455
+ " print(f' βœ“ {directory}')\n",
456
+ "\n",
457
+ "print('\\nβœ“ Directory structure ready!')\n"
458
+ ]
459
  },
460
  {
461
  "cell_type": "markdown",
 
471
  "id": "6b854bac",
472
  "metadata": {},
473
  "outputs": [],
474
+ "source": [
475
+ "import json\n",
476
+ "import sys\n",
477
+ "import os\n",
478
+ "from pathlib import Path\n",
479
+ "\n",
480
+ "# Get values (self-contained)\n",
481
+ "python_version = sys.version_info\n",
482
+ "\n",
483
+ "DEVICE = 'cpu'\n",
484
+ "torch_version = 'not installed (not required)'\n",
485
+ "cuda_available = False\n",
486
+ "try:\n",
487
+ " import torch\n",
488
+ " torch_version = torch.__version__\n",
489
+ " cuda_available = torch.cuda.is_available()\n",
490
+ " if cuda_available:\n",
491
+ " DEVICE = 'cuda'\n",
492
+ " elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
493
+ " DEVICE = 'mps'\n",
494
+ "except ImportError:\n",
495
+ " pass\n",
496
+ "\n",
497
+ "# Load config\n",
498
+ "config_json_path = Path('notebook_config.json')\n",
499
+ "if config_json_path.exists():\n",
500
+ " with open(config_json_path, 'r') as f:\n",
501
+ " loaded_config = json.load(f)\n",
502
+ "else:\n",
503
+ " loaded_config = {}\n",
504
+ "\n",
505
+ "BASE_DIR = Path('..').resolve()\n",
506
+ "DATASETS_DIR = BASE_DIR / 'datasets'\n",
507
+ "MODELS_DIR = BASE_DIR / 'models'\n",
508
+ "ARTIFACTS_DIR = BASE_DIR / 'artifacts'\n",
509
+ "RANDOM_STATE = loaded_config.get('random_state', 42)\n",
510
+ "TEST_SIZE = loaded_config.get('test_size', 0.2)\n",
511
+ "CV_FOLDS = loaded_config.get('cv_folds', 5)\n",
512
+ "\n",
513
+ "# Export configuration for other notebooks\n",
514
+ "notebook_config = {\n",
515
+ " 'device': str(DEVICE),\n",
516
+ " 'python_version': f'{python_version.major}.{python_version.minor}.{python_version.micro}',\n",
517
+ " 'torch_version': torch_version,\n",
518
+ " 'cuda_available': cuda_available,\n",
519
+ " 'base_dir': str(BASE_DIR),\n",
520
+ " 'datasets_dir': str(DATASETS_DIR),\n",
521
+ " 'models_dir': str(MODELS_DIR),\n",
522
+ " 'artifacts_dir': str(ARTIFACTS_DIR),\n",
523
+ " 'random_state': RANDOM_STATE,\n",
524
+ " 'test_size': TEST_SIZE,\n",
525
+ " 'cv_folds': CV_FOLDS,\n",
526
+ "}\n",
527
+ "\n",
528
+ "config_path = Path('notebook_runtime_config.json')\n",
529
+ "with open(config_path, 'w') as f:\n",
530
+ " json.dump(notebook_config, f, indent=2)\n",
531
+ "\n",
532
+ "print(f'βœ“ Configuration exported to: {config_path.absolute()}')\n",
533
+ "print(json.dumps(notebook_config, indent=2))\n"
534
+ ]
535
  },
536
  {
537
  "cell_type": "markdown",
 
547
  "id": "f409be56",
548
  "metadata": {},
549
  "outputs": [],
550
+ "source": [
551
+ "import sys\n",
552
+ "import json\n",
553
+ "import os\n",
554
+ "from pathlib import Path\n",
555
+ "\n",
556
+ "python_version = sys.version_info\n",
557
+ "\n",
558
+ "try:\n",
559
+ " import torch\n",
560
+ " torch_version = torch.__version__\n",
561
+ " if torch.cuda.is_available():\n",
562
+ " DEVICE = 'cuda'\n",
563
+ " elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
564
+ " DEVICE = 'mps'\n",
565
+ " else:\n",
566
+ " DEVICE = 'cpu'\n",
567
+ "except ImportError:\n",
568
+ " torch_version = 'not installed'\n",
569
+ " DEVICE = 'cpu'\n",
570
+ "\n",
571
+ "# Load config\n",
572
+ "config_json_path = Path('notebook_config.json')\n",
573
+ "if config_json_path.exists():\n",
574
+ " with open(config_json_path, 'r') as f:\n",
575
+ " loaded_config = json.load(f)\n",
576
+ "else:\n",
577
+ " loaded_config = {}\n",
578
+ "\n",
579
+ "GEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')\n",
580
+ "HUGGINGFACE_TOKEN = os.getenv('HF_TOKEN', '')\n",
581
+ "\n",
582
+ "print('\\n' + '=' * 60)\n",
583
+ "print('ENVIRONMENT SETUP COMPLETE')\n",
584
+ "print('=' * 60)\n",
585
+ "print(f'''\n",
586
+ "βœ… Python: {python_version.major}.{python_version.minor}.{python_version.micro}\n",
587
+ "βœ… Device: {DEVICE}\n",
588
+ "βœ… PyTorch: {torch_version}\n",
589
+ "βœ… Gemini API: {'Ready' if GEMINI_API_KEY else 'Not configured'}\n",
590
+ "βœ… HuggingFace: {'Ready' if HUGGINGFACE_TOKEN else 'Using public access'}\n",
591
+ "βœ… WebScraper API: Ready\n",
592
+ "βœ… Directories: Created\n",
593
+ "\n",
594
+ "You can now proceed to the next notebook:\n",
595
+ " β†’ 01_data_acquisition.ipynb\n",
596
+ "''')\n",
597
+ "print('=' * 60)\n"
598
+ ]
599
  }
600
  ],
601
  "metadata": {
 
605
  },
606
  "nbformat": 4,
607
  "nbformat_minor": 5
608
+ }