Che237 committed on
Commit
0a285a3
·
verified ·
1 Parent(s): c1c0b2d

Delete notebooks/00_environment_setup.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. notebooks/00_environment_setup.ipynb +0 -495
notebooks/00_environment_setup.ipynb DELETED
@@ -1,495 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "17e1f6f6",
6
- "metadata": {},
7
- "source": [
8
- "# 00 - Environment Setup\n",
9
- "\n",
10
- "## CyberForge AI - ML Pipeline Environment Configuration\n",
11
- "\n",
12
- "This notebook sets up the complete environment for the CyberForge AI machine learning pipeline.\n",
13
- "\n",
14
- "### What this notebook does:\n",
15
- "1. Validates Python version and system requirements\n",
16
- "2. Installs and pins all dependencies\n",
17
- "3. Configures GPU/CPU detection\n",
18
- "4. Sets up Gemini API connectivity\n",
19
- "5. Validates Web Scraper API connection\n",
20
- "6. Creates necessary directories\n",
21
- "\n",
22
- "### Prerequisites:\n",
23
- "- Python 3.10+ (3.11 recommended)\n",
24
- "- Access to Gemini API (API key required)\n",
25
- "- Access to WebScrapper.live API"
26
- ]
27
- },
28
- {
29
- "cell_type": "markdown",
30
- "id": "33029fa4",
31
- "metadata": {},
32
- "source": [
33
- "## 1. System Validation"
34
- ]
35
- },
36
- {
37
- "cell_type": "code",
38
- "execution_count": null,
39
- "id": "076fa991",
40
- "metadata": {},
41
- "outputs": [],
42
- "source": [
43
- "import sys\n",
44
- "import platform\n",
45
- "import os\n",
46
- "from pathlib import Path\n",
47
- "\n",
48
- "print(\"=\" * 60)\n",
49
- "print(\"CYBERFORGE AI - ENVIRONMENT VALIDATION\")\n",
50
- "print(\"=\" * 60)\n",
51
- "\n",
52
- "# Python version check\n",
53
- "python_version = sys.version_info\n",
54
- "print(f\"\\n✓ Python Version: {python_version.major}.{python_version.minor}.{python_version.micro}\")\n",
55
- "\n",
56
- "if python_version.major < 3 or (python_version.major == 3 and python_version.minor < 10):\n",
57
- " raise EnvironmentError(\"Python 3.10+ is required. Please upgrade your Python installation.\")\n",
58
- "\n",
59
- "# System info\n",
60
- "print(f\"✓ Platform: {platform.system()} {platform.release()}\")\n",
61
- "print(f\"✓ Architecture: {platform.machine()}\")\n",
62
- "print(f\"✓ Processor: {platform.processor() or 'Unknown'}\")\n",
63
- "\n",
64
- "# Memory info\n",
65
- "try:\n",
66
- " import psutil\n",
67
- " memory = psutil.virtual_memory()\n",
68
- " print(f\"✓ Available Memory: {memory.available / (1024**3):.2f} GB / {memory.total / (1024**3):.2f} GB\")\n",
69
- "except ImportError:\n",
70
- " print(\"⚠ psutil not installed - memory check skipped\")\n",
71
- "\n",
72
- "print(\"\\n\" + \"=\" * 60)"
73
- ]
74
- },
75
- {
76
- "cell_type": "markdown",
77
- "id": "45e95831",
78
- "metadata": {},
79
- "source": [
80
- "## 2. Install Dependencies"
81
- ]
82
- },
83
- {
84
- "cell_type": "code",
85
- "execution_count": null,
86
- "id": "faa9b079",
87
- "metadata": {},
88
- "outputs": [],
89
- "source": [
90
# Cell: write the pinned dependency list to a requirements file so every
# notebook (and CI) installs the same versions.
from pathlib import Path

# FIX: removed the former `ipaddress>=1.0.23` pin — that PyPI package is a
# Python 2 backport; `ipaddress` has been in the standard library since 3.3
# and this notebook already requires Python 3.10+, so installing the backport
# is at best redundant and can shadow the stdlib module.
DEPENDENCIES = """
# Core ML/AI
numpy>=1.24.0,<2.0.0
pandas>=2.0.0
scikit-learn>=1.3.0
scipy>=1.11.0

# Deep Learning
torch>=2.0.0
transformers>=4.30.0

# Gemini API
google-generativeai>=0.3.0

# Data Processing
joblib>=1.3.0
tqdm>=4.65.0

# Feature Engineering
tldextract>=5.0.0
validators>=0.22.0

# Web/API
httpx>=0.25.0
aiohttp>=3.8.0
requests>=2.31.0

# Hugging Face
huggingface_hub>=0.19.0

# Utilities
python-dotenv>=1.0.0
pyyaml>=6.0.0
psutil>=5.9.0
"""

# Persist one level above the notebooks/ directory so sibling notebooks can reuse it.
requirements_path = Path("../requirements_notebooks.txt")
requirements_path.write_text(DEPENDENCIES.strip())
print(f"✓ Requirements written to: {requirements_path.absolute()}")
132
- ]
133
- },
134
- {
135
- "cell_type": "code",
136
- "execution_count": null,
137
- "id": "7dc8c6ca",
138
- "metadata": {},
139
- "outputs": [],
140
- "source": [
141
# Cell: install the pinned dependencies into the *current kernel's* environment
# (sys.executable -m pip, not a bare `pip`, so the right interpreter is targeted).
import sys
import subprocess
from pathlib import Path

# Same path as written by the previous cell; redefined here so this cell is
# idempotent and survives Restart-&-Run-All even if run independently.
requirements_path = Path("../requirements_notebooks.txt")

print("Installing dependencies... This may take a few minutes.")
result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "-r", str(requirements_path)],
    capture_output=True,
    text=True,
)

if result.returncode == 0:
    print("✓ All dependencies installed successfully!")
else:
    # FIX: a nonzero exit code is a real failure, not "installation warnings".
    # Report it as such and show the *tail* of stderr, where pip prints the
    # actual error (the old code truncated to the first 500 chars, which is
    # usually just deprecation noise).
    print(f"✗ pip failed with exit code {result.returncode}")
    print(result.stderr[-2000:] if result.stderr else "(no stderr output)")
155
- ]
156
- },
157
- {
158
- "cell_type": "markdown",
159
- "id": "c11760cc",
160
- "metadata": {},
161
- "source": [
162
- "## 3. GPU/CPU Detection"
163
- ]
164
- },
165
- {
166
- "cell_type": "code",
167
- "execution_count": null,
168
- "id": "d1b948c4",
169
- "metadata": {},
170
- "outputs": [],
171
- "source": [
172
# Cell: pick the best available compute device. Preference order as selected
# below: CUDA GPU, else CPU; Apple MPS overrides when present (in practice MPS
# and CUDA never coexist on one machine).
import torch

divider = "=" * 60
print(divider)
print("COMPUTE DEVICE DETECTION")
print(divider)

cuda_available = torch.cuda.is_available()
print(f"\n✓ PyTorch Version: {torch.__version__}")
print(f"✓ CUDA Available: {cuda_available}")

if not cuda_available:
    print("⚠ No GPU detected - using CPU for training")
    DEVICE = torch.device("cpu")
else:
    print(f"✓ CUDA Version: {torch.version.cuda}")
    print(f"✓ GPU Count: {torch.cuda.device_count()}")
    for gpu_index in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(gpu_index)
        print(f"  - GPU {gpu_index}: {props.name} ({props.total_memory / (1024**3):.2f} GB)")
    DEVICE = torch.device("cuda")

# Apple Silicon (Metal Performance Shaders) — guarded with hasattr for older torch builds.
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    print("✓ Apple MPS (Metal) available")
    DEVICE = torch.device("mps")

print(f"\n✓ Selected Device: {DEVICE}")
print(divider)
201
- ]
202
- },
203
- {
204
- "cell_type": "markdown",
205
- "id": "d39ddbf5",
206
- "metadata": {},
207
- "source": [
208
- "## 4. Environment Variables & API Configuration"
209
- ]
210
- },
211
- {
212
- "cell_type": "code",
213
- "execution_count": null,
214
- "id": "0f63a5ce",
215
- "metadata": {},
216
- "outputs": [],
217
- "source": [
218
# Cell: load runtime configuration (secrets, model name, paths, ML settings).
# Secret priority: notebook_config.json > environment variables. NEVER a
# hardcoded default.
import json
import os
from pathlib import Path

# Priority 1: notebook_config.json (used on HF Spaces deployments).
config_json_path = Path("notebook_config.json")
if config_json_path.exists():
    with open(config_json_path, "r") as f:
        loaded_config = json.load(f)
    print(f"✓ Loaded configuration from: {config_json_path.absolute()}")
else:
    loaded_config = {}
    print(f"⚠ No notebook_config.json found, using defaults")

# Priority 2: a local .env file (local-development fallback; dotenv is optional).
try:
    from dotenv import load_dotenv
    env_path = Path("../.env")
    if env_path.exists():
        load_dotenv(env_path)
        print(f"✓ Loaded environment from: {env_path.absolute()}")
except ImportError:
    pass  # python-dotenv not installed — rely on already-set env vars

class Config:
    """Central configuration shared by all notebooks in this pipeline.

    SECURITY FIX: the previous revision embedded live Gemini and WebScraper
    API keys as source-code defaults. Any key that was committed must be
    treated as compromised and rotated. Secrets are now read exclusively
    from notebook_config.json or environment variables; an empty string
    means "not configured".
    """
    # API keys — config file wins, then environment. No hardcoded fallbacks.
    GEMINI_API_KEY = loaded_config.get("gemini_api_key") or os.getenv("GEMINI_API_KEY", "")
    HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN", os.getenv("HF_TOKEN", ""))
    WEBSCRAPER_API_KEY = loaded_config.get("webscraper_api_key") or os.getenv("WEBSCRAPER_API_KEY", "")
    WEBSCRAPER_API_URL = loaded_config.get("webscraper_api_url", "http://webscrapper.live/api/scrape")

    # Gemini model name (overridable via config file).
    GEMINI_MODEL = loaded_config.get("gemini_model", "gemini-2.5-flash")

    # Project directory layout, anchored one level above notebooks/.
    BASE_DIR = Path("..").resolve()
    DATASETS_DIR = BASE_DIR / "datasets"
    MODELS_DIR = BASE_DIR / "models"
    ARTIFACTS_DIR = BASE_DIR / "artifacts"

    # ML settings with sane defaults.
    RANDOM_STATE = loaded_config.get("random_state", 42)
    TEST_SIZE = loaded_config.get("test_size", 0.2)
    CV_FOLDS = loaded_config.get("cv_folds", 5)

    # Device chosen by the detection cell (class body resolves the
    # notebook-global DEVICE at definition time).
    DEVICE = DEVICE

config = Config()

# Report status WITHOUT echoing any part of a secret (the old code printed the
# first 10 characters of the Gemini key — enough to aid an attacker).
print("\n" + "=" * 60)
print("API CONFIGURATION STATUS")
print("=" * 60)
print(f"✓ Gemini API Key: {'Configured' if config.GEMINI_API_KEY else '⚠ NOT SET'}")
print(f"✓ Gemini Model: {config.GEMINI_MODEL}")
print(f"✓ HuggingFace Token: {'Configured' if config.HUGGINGFACE_TOKEN else '⚠ NOT SET (optional)'}")
print(f"✓ WebScraper API Key: {'Configured' if config.WEBSCRAPER_API_KEY else '⚠ NOT SET'}")
277
- ]
278
- },
279
- {
280
- "cell_type": "markdown",
281
- "id": "126b5f7f",
282
- "metadata": {},
283
- "source": [
284
- "## 5. Gemini API Connectivity Test"
285
- ]
286
- },
287
- {
288
- "cell_type": "code",
289
- "execution_count": null,
290
- "id": "14cef3bc",
291
- "metadata": {},
292
- "outputs": [],
293
- "source": [
294
# Cell: probe Gemini API connectivity with the configured model, falling back
# to gemini-1.5-flash if the primary model is unavailable.
import google.generativeai as genai

def test_gemini_connection():
    """Attempt a minimal generation request; return (success, detail_message)."""
    if not config.GEMINI_API_KEY:
        return False, "API key not configured"

    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        # Primary attempt with the model named in the configuration.
        model = genai.GenerativeModel(config.GEMINI_MODEL)
        response = model.generate_content("Respond with only: OK")
        return True, f"Model: {config.GEMINI_MODEL}, Response: {response.text.strip()}"
    except Exception:
        # Primary model failed (e.g. not available on this key) — one retry
        # against the widely-available 1.5 flash model.
        try:
            model = genai.GenerativeModel('gemini-1.5-flash')
            response = model.generate_content("Respond with only: OK")
            return True, f"Model: gemini-1.5-flash (fallback), Response: {response.text.strip()}"
        except Exception as fallback_error:
            return False, str(fallback_error)

print("Testing Gemini API connection...")
success, message = test_gemini_connection()

if success:
    print(f"✓ Gemini API: {message}")
else:
    print(f"⚠ Gemini API: Connection failed - {message}")
323
- ]
324
- },
325
- {
326
- "cell_type": "markdown",
327
- "id": "628ac121",
328
- "metadata": {},
329
- "source": [
330
- "## 6. Web Scraper API Connectivity Test"
331
- ]
332
- },
333
- {
334
- "cell_type": "code",
335
- "execution_count": null,
336
- "id": "beb1b036",
337
- "metadata": {},
338
- "outputs": [],
339
- "source": [
340
# Cell: verify WebScrapper.live API connectivity.
import asyncio
import concurrent.futures

import httpx

async def test_webscraper_connection():
    """POST a probe request to the scraper API; return (success, detail)."""
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                config.WEBSCRAPER_API_URL,
                json={"url": "https://example.com"},
                headers={
                    "Content-Type": "application/json",
                    "X-API-Key": config.WEBSCRAPER_API_KEY,
                },
            )
        if response.status_code == 200:
            return True, "Connected"
        return False, f"Status {response.status_code}"
    except Exception as exc:
        return False, str(exc)

def run_coroutine(coro):
    """Run `coro` to completion, including inside Jupyter.

    FIX: the original used asyncio.get_event_loop() + run_until_complete,
    which raises "RuntimeError: This event loop is already running" in a
    Jupyter kernel (the kernel keeps its own loop running), and
    get_event_loop() is deprecated since Python 3.10 when no loop exists.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread (plain script) — safe to use asyncio.run.
        return asyncio.run(coro)
    # A loop is already running (typical Jupyter kernel): execute the coroutine
    # in its own event loop on a helper thread and block until it finishes.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()

print("Testing Web Scraper API connection...")
success, message = run_coroutine(test_webscraper_connection())

if success:
    print(f"✓ WebScraper API: Connected successfully")
else:
    print(f"⚠ WebScraper API: {message}")
377
- ]
378
- },
379
- {
380
- "cell_type": "markdown",
381
- "id": "75ee0f51",
382
- "metadata": {},
383
- "source": [
384
- "## 7. Create Directory Structure"
385
- ]
386
- },
387
- {
388
- "cell_type": "code",
389
- "execution_count": null,
390
- "id": "776236f8",
391
- "metadata": {},
392
- "outputs": [],
393
- "source": [
394
# Cell: ensure the directory layout used by downstream notebooks exists.
# mkdir(parents=True, exist_ok=True) makes this idempotent across re-runs.
required_dirs = [
    config.DATASETS_DIR,
    config.MODELS_DIR,
    config.ARTIFACTS_DIR,
    config.BASE_DIR / "logs",
    config.BASE_DIR / "cache",
]

print("Creating directory structure...")
for target_dir in required_dirs:
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"  ✓ {target_dir}")

print("\n✓ Directory structure ready!")
409
- ]
410
- },
411
- {
412
- "cell_type": "markdown",
413
- "id": "a6fe27eb",
414
- "metadata": {},
415
- "source": [
416
- "## 8. Save Configuration for Other Notebooks"
417
- ]
418
- },
419
- {
420
- "cell_type": "code",
421
- "execution_count": null,
422
- "id": "6b854bac",
423
- "metadata": {},
424
- "outputs": [],
425
- "source": [
426
# Cell: persist the detected environment facts so later notebooks
# (01_data_acquisition.ipynb, ...) can load them from one JSON file.
import json
from datetime import datetime, timezone

notebook_config = {
    "device": str(DEVICE),
    "python_version": f"{python_version.major}.{python_version.minor}.{python_version.micro}",
    "torch_version": torch.__version__,
    "cuda_available": cuda_available,
    "base_dir": str(config.BASE_DIR),
    "datasets_dir": str(config.DATASETS_DIR),
    "models_dir": str(config.MODELS_DIR),
    "artifacts_dir": str(config.ARTIFACTS_DIR),
    "random_state": config.RANDOM_STATE,
    "test_size": config.TEST_SIZE,
    "cv_folds": config.CV_FOLDS,
    "gemini_configured": bool(config.GEMINI_API_KEY),
    "huggingface_configured": bool(config.HUGGINGFACE_TOKEN),
    # BUG FIX: the original called pd.Timestamp.now(), but pandas was never
    # imported as `pd` anywhere in this notebook — NameError on a fresh
    # Restart-&-Run-All. Use stdlib datetime (UTC, ISO-8601) instead.
    "created_at": datetime.now(timezone.utc).isoformat(),
}

config_path = config.BASE_DIR / "notebook_config.json"
with open(config_path, "w") as f:
    json.dump(notebook_config, f, indent=2)

print(f"✓ Configuration saved to: {config_path}")
print("\n" + json.dumps(notebook_config, indent=2))
452
- ]
453
- },
454
- {
455
- "cell_type": "markdown",
456
- "id": "ac7ada25",
457
- "metadata": {},
458
- "source": [
459
- "## 9. Environment Summary"
460
- ]
461
- },
462
- {
463
- "cell_type": "code",
464
- "execution_count": null,
465
- "id": "f409be56",
466
- "metadata": {},
467
- "outputs": [],
468
- "source": [
469
- "print(\"\\n\" + \"=\" * 60)\n",
470
- "print(\"ENVIRONMENT SETUP COMPLETE\")\n",
471
- "print(\"=\" * 60)\n",
472
- "print(f\"\"\"\n",
473
- "✅ Python: {python_version.major}.{python_version.minor}.{python_version.micro}\n",
474
- "✅ Device: {DEVICE}\n",
475
- "✅ PyTorch: {torch.__version__}\n",
476
- "✅ Gemini API: {'Ready' if config.GEMINI_API_KEY else 'Not configured'}\n",
477
- "✅ HuggingFace: {'Ready' if config.HUGGINGFACE_TOKEN else 'Not configured'}\n",
478
- "✅ WebScraper API: Ready\n",
479
- "✅ Directories: Created\n",
480
- "\n",
481
- "You can now proceed to the next notebook:\n",
482
- " → 01_data_acquisition.ipynb\n",
483
- "\"\"\")\n",
484
- "print(\"=\" * 60)"
485
- ]
486
- }
487
- ],
488
- "metadata": {
489
- "language_info": {
490
- "name": "python"
491
- }
492
- },
493
- "nbformat": 4,
494
- "nbformat_minor": 5
495
- }