Che237 committed on
Commit
9cd197b
·
verified Β·
1 Parent(s): 83bd628

Upload notebooks/00_environment_setup.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. notebooks/00_environment_setup.ipynb +397 -38
notebooks/00_environment_setup.ipynb CHANGED
@@ -51,23 +51,23 @@
51
  "\n",
52
  "# Python version check\n",
53
  "python_version = sys.version_info\n",
54
- "print(f\"\\n\u2713 Python Version: {python_version.major}.{python_version.minor}.{python_version.micro}\")\n",
55
  "\n",
56
  "if python_version.major < 3 or (python_version.major == 3 and python_version.minor < 10):\n",
57
  " raise EnvironmentError(\"Python 3.10+ is required. Please upgrade your Python installation.\")\n",
58
  "\n",
59
  "# System info\n",
60
- "print(f\"\u2713 Platform: {platform.system()} {platform.release()}\")\n",
61
- "print(f\"\u2713 Architecture: {platform.machine()}\")\n",
62
- "print(f\"\u2713 Processor: {platform.processor() or 'Unknown'}\")\n",
63
  "\n",
64
  "# Memory info\n",
65
  "try:\n",
66
  " import psutil\n",
67
  " memory = psutil.virtual_memory()\n",
68
- " print(f\"\u2713 Available Memory: {memory.available / (1024**3):.2f} GB / {memory.total / (1024**3):.2f} GB\")\n",
69
  "except ImportError:\n",
70
- " print(\"\u26a0 psutil not installed - memory check skipped\")\n",
71
  "\n",
72
  "print(\"\\n\" + \"=\" * 60)"
73
  ]
@@ -86,7 +86,49 @@
86
  "id": "faa9b079",
87
  "metadata": {},
88
  "outputs": [],
89
- "source": "from pathlib import Path\n\n# Core dependencies with pinned versions for reproducibility\nDEPENDENCIES = \"\"\"\n# Core ML/AI\nnumpy>=1.24.0,<2.0.0\npandas>=2.0.0\nscikit-learn>=1.3.0\nscipy>=1.11.0\n\n# Deep Learning \ntorch>=2.0.0\ntransformers>=4.30.0\n\n# Gemini API\ngoogle-generativeai>=0.3.0\n\n# Data Processing\njoblib>=1.3.0\ntqdm>=4.65.0\n\n# Feature Engineering\ntldextract>=5.0.0\nvalidators>=0.22.0\n\n# Web/API\nhttpx>=0.25.0\nrequests>=2.31.0\n\n# Hugging Face\nhuggingface_hub>=0.19.0\n\n# Utilities\npython-dotenv>=1.0.0\npyyaml>=6.0.0\npsutil>=5.9.0\n\"\"\"\n\n# Write requirements file\nrequirements_path = Path(\"../requirements_notebooks.txt\")\nrequirements_path.write_text(DEPENDENCIES.strip())\nprint(f\"\u2713 Requirements written to: {requirements_path.absolute()}\")\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  },
91
  {
92
  "cell_type": "code",
@@ -94,7 +136,29 @@
94
  "id": "7dc8c6ca",
95
  "metadata": {},
96
  "outputs": [],
97
- "source": "import subprocess\nimport sys\nfrom pathlib import Path\n\n# Install dependencies\nrequirements_path = Path(\"../requirements_notebooks.txt\")\n\nif requirements_path.exists():\n print(\"Installing dependencies... This may take a few minutes.\")\n result = subprocess.run(\n [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-r\", str(requirements_path)],\n capture_output=True,\n text=True\n )\n\n if result.returncode == 0:\n print(\"\u2713 All dependencies installed successfully!\")\n else:\n print(f\"\u26a0 Installation warnings: {result.stderr[:500] if result.stderr else 'None'}\")\nelse:\n print(\"\u26a0 Requirements file not found. Run previous cell first or skip if deps installed.\")\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  },
99
  {
100
  "cell_type": "markdown",
@@ -111,35 +175,32 @@
111
  "metadata": {},
112
  "outputs": [],
113
  "source": [
114
- "import torch\n",
115
- "\n",
116
  "print(\"=\" * 60)\n",
117
  "print(\"COMPUTE DEVICE DETECTION\")\n",
118
  "print(\"=\" * 60)\n",
119
  "\n",
120
- "# Check CUDA availability\n",
121
- "cuda_available = torch.cuda.is_available()\n",
122
- "print(f\"\\n\u2713 PyTorch Version: {torch.__version__}\")\n",
123
- "print(f\"\u2713 CUDA Available: {cuda_available}\")\n",
124
- "\n",
125
- "if cuda_available:\n",
126
- " print(f\"\u2713 CUDA Version: {torch.version.cuda}\")\n",
127
- " print(f\"\u2713 GPU Count: {torch.cuda.device_count()}\")\n",
128
- " for i in range(torch.cuda.device_count()):\n",
129
- " props = torch.cuda.get_device_properties(i)\n",
130
- " print(f\" - GPU {i}: {props.name} ({props.total_memory / (1024**3):.2f} GB)\")\n",
131
- " DEVICE = torch.device(\"cuda\")\n",
132
- "else:\n",
133
- " print(\"\u26a0 No GPU detected - using CPU for training\")\n",
134
- " DEVICE = torch.device(\"cpu\")\n",
135
  "\n",
136
- "# Check MPS (Apple Silicon)\n",
137
- "if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
138
- " print(\"\u2713 Apple MPS (Metal) available\")\n",
139
- " DEVICE = torch.device(\"mps\")\n",
 
 
 
 
 
 
 
140
  "\n",
141
- "print(f\"\\n\u2713 Selected Device: {DEVICE}\")\n",
142
- "print(\"=\" * 60)"
 
143
  ]
144
  },
145
  {
@@ -156,7 +217,83 @@
156
  "id": "0f63a5ce",
157
  "metadata": {},
158
  "outputs": [],
159
- "source": "import json\nimport os\nfrom pathlib import Path\n\n# Load configuration from notebook_config.json first (for HF Spaces)\nconfig_json_path = Path(\"notebook_config.json\")\nif config_json_path.exists():\n with open(config_json_path, \"r\") as f:\n loaded_config = json.load(f)\n print(f\"\u2713 Loaded configuration from: {config_json_path.absolute()}\")\nelse:\n loaded_config = {}\n print(f\"\u26a0 No notebook_config.json found, using defaults\")\n\n# Try loading .env file as fallback (for local dev)\ntry:\n from dotenv import load_dotenv\n env_path = Path(\"../.env\")\n if env_path.exists():\n load_dotenv(env_path)\n print(f\"\u2713 Loaded environment from: {env_path.absolute()}\")\nexcept ImportError:\n pass\n\n# Detect device\ntry:\n import torch\n if torch.cuda.is_available():\n DEVICE = torch.device(\"cuda\")\n elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n DEVICE = torch.device(\"mps\")\n else:\n DEVICE = torch.device(\"cpu\")\nexcept ImportError:\n DEVICE = \"cpu\"\n\n# Configuration class\nclass Config:\n # API Keys - priority: config.json > env vars > defaults\n GEMINI_API_KEY = loaded_config.get(\"gemini_api_key\") or os.getenv(\"GEMINI_API_KEY\", \"AIzaSyA3HdWTLk_zJQ5P9G8Z8a8BEYSTPvLglhs\")\n HUGGINGFACE_TOKEN = os.getenv(\"HUGGINGFACE_API_TOKEN\", os.getenv(\"HF_TOKEN\", \"\"))\n WEBSCRAPER_API_KEY = loaded_config.get(\"webscraper_api_key\", \"sk-fd14eaa7bceb478db7afc7256e514d2b\")\n WEBSCRAPER_API_URL = loaded_config.get(\"webscraper_api_url\", \"http://webscrapper.live/api/scrape\")\n \n # Gemini model\n GEMINI_MODEL = loaded_config.get(\"gemini_model\", \"gemini-2.5-flash\")\n \n # Paths\n BASE_DIR = Path(\"..\").resolve()\n DATASETS_DIR = BASE_DIR / \"datasets\"\n MODELS_DIR = BASE_DIR / \"models\"\n ARTIFACTS_DIR = BASE_DIR / \"artifacts\"\n \n # ML Settings\n RANDOM_STATE = loaded_config.get(\"random_state\", 42)\n TEST_SIZE = loaded_config.get(\"test_size\", 0.2)\n CV_FOLDS = 
loaded_config.get(\"cv_folds\", 5)\n \n # Device\n DEVICE = DEVICE\n\nconfig = Config()\n\n# Validate required API keys\nprint(\"\\n\" + \"=\" * 60)\nprint(\"API CONFIGURATION STATUS\")\nprint(\"=\" * 60)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  },
161
  {
162
  "cell_type": "markdown",
@@ -172,7 +309,59 @@
172
  "id": "14cef3bc",
173
  "metadata": {},
174
  "outputs": [],
175
- "source": "import google.generativeai as genai\nimport json\nimport os\nfrom pathlib import Path\n\n# Load config (self-contained)\nconfig_json_path = Path('notebook_config.json')\nif config_json_path.exists():\n with open(config_json_path, 'r') as f:\n loaded_config = json.load(f)\nelse:\n loaded_config = {}\n\nGEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')\nGEMINI_MODEL = loaded_config.get('gemini_model', 'gemini-2.5-flash')\n\ndef test_gemini_connection():\n if not GEMINI_API_KEY:\n return False, 'API key not configured'\n try:\n genai.configure(api_key=GEMINI_API_KEY)\n model = genai.GenerativeModel(GEMINI_MODEL)\n response = model.generate_content('Respond with only: OK')\n return True, f'Model: {GEMINI_MODEL}, Response: {response.text.strip()}'\n except Exception as e:\n try:\n model = genai.GenerativeModel('gemini-1.5-flash')\n response = model.generate_content('Respond with only: OK')\n return True, f'Model: gemini-1.5-flash (fallback), Response: {response.text.strip()}'\n except Exception as e2:\n return False, str(e2)\n\nprint('Testing Gemini API connection...')\nsuccess, message = test_gemini_connection()\nif success:\n print(f'\u2713 Gemini API: {message}')\nelse:\n print(f'\u26a0 Gemini API: Connection failed - {message}')\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  },
177
  {
178
  "cell_type": "markdown",
@@ -188,7 +377,45 @@
188
  "id": "beb1b036",
189
  "metadata": {},
190
  "outputs": [],
191
- "source": "import httpx\nimport json\nimport os\nfrom pathlib import Path\n\n# Load config (self-contained)\nconfig_json_path = Path('notebook_config.json')\nif config_json_path.exists():\n with open(config_json_path, 'r') as f:\n loaded_config = json.load(f)\nelse:\n loaded_config = {}\n\nWEBSCRAPER_API_KEY = loaded_config.get('webscraper_api_key', 'sk-fd14eaa7bceb478db7afc7256e514d2b')\nWEBSCRAPER_API_URL = loaded_config.get('webscraper_api_url', 'http://webscrapper.live/api/scrape')\n\ndef test_webscraper_connection_sync():\n try:\n with httpx.Client(timeout=30.0) as client:\n response = client.post(\n WEBSCRAPER_API_URL,\n json={'url': 'https://example.com'},\n headers={'Content-Type': 'application/json', 'X-API-Key': WEBSCRAPER_API_KEY}\n )\n if response.status_code == 200:\n return True, 'Connected'\n else:\n return False, f'Status {response.status_code}'\n except Exception as e:\n return False, str(e)\n\nprint('Testing Web Scraper API connection...')\nsuccess, message = test_webscraper_connection_sync()\nif success:\n print(f'\u2713 WebScraper API: Connected successfully')\nelse:\n print(f'\u26a0 WebScraper API: {message}')\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  },
193
  {
194
  "cell_type": "markdown",
@@ -204,7 +431,31 @@
204
  "id": "776236f8",
205
  "metadata": {},
206
  "outputs": [],
207
- "source": "from pathlib import Path\n\n# Define directories (self-contained)\nBASE_DIR = Path('..').resolve()\nDATASETS_DIR = BASE_DIR / 'datasets'\nMODELS_DIR = BASE_DIR / 'models'\nARTIFACTS_DIR = BASE_DIR / 'artifacts'\n\n# Create necessary directories\ndirectories = [\n DATASETS_DIR,\n MODELS_DIR,\n ARTIFACTS_DIR,\n BASE_DIR / 'logs',\n BASE_DIR / 'cache',\n]\n\nprint('Creating directory structure...')\nfor directory in directories:\n directory.mkdir(parents=True, exist_ok=True)\n print(f' \u2713 {directory}')\n\nprint('\\n\u2713 Directory structure ready!')\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  },
209
  {
210
  "cell_type": "markdown",
@@ -220,7 +471,67 @@
220
  "id": "6b854bac",
221
  "metadata": {},
222
  "outputs": [],
223
- "source": "import json\nimport sys\nimport os\nfrom pathlib import Path\n\n# Get values (self-contained)\npython_version = sys.version_info\n\ntry:\n import torch\n torch_version = torch.__version__\n cuda_available = torch.cuda.is_available()\n if cuda_available:\n DEVICE = 'cuda'\n elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n DEVICE = 'mps'\n else:\n DEVICE = 'cpu'\nexcept ImportError:\n torch_version = 'not installed'\n cuda_available = False\n DEVICE = 'cpu'\n\n# Load config\nconfig_json_path = Path('notebook_config.json')\nif config_json_path.exists():\n with open(config_json_path, 'r') as f:\n loaded_config = json.load(f)\nelse:\n loaded_config = {}\n\nBASE_DIR = Path('..').resolve()\nDATASETS_DIR = BASE_DIR / 'datasets'\nMODELS_DIR = BASE_DIR / 'models'\nARTIFACTS_DIR = BASE_DIR / 'artifacts'\nRANDOM_STATE = loaded_config.get('random_state', 42)\nTEST_SIZE = loaded_config.get('test_size', 0.2)\nCV_FOLDS = loaded_config.get('cv_folds', 5)\n\n# Export configuration for other notebooks\nnotebook_config = {\n 'device': str(DEVICE),\n 'python_version': f'{python_version.major}.{python_version.minor}.{python_version.micro}',\n 'torch_version': torch_version,\n 'cuda_available': cuda_available,\n 'base_dir': str(BASE_DIR),\n 'datasets_dir': str(DATASETS_DIR),\n 'models_dir': str(MODELS_DIR),\n 'artifacts_dir': str(ARTIFACTS_DIR),\n 'random_state': RANDOM_STATE,\n 'test_size': TEST_SIZE,\n 'cv_folds': CV_FOLDS,\n}\n\nconfig_path = Path('notebook_runtime_config.json')\nwith open(config_path, 'w') as f:\n json.dump(notebook_config, f, indent=2)\n\nprint(f'\u2713 Configuration exported to: {config_path.absolute()}')\nprint(json.dumps(notebook_config, indent=2))\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  },
225
  {
226
  "cell_type": "markdown",
@@ -236,7 +547,55 @@
236
  "id": "f409be56",
237
  "metadata": {},
238
  "outputs": [],
239
- "source": "import sys\nimport json\nimport os\nfrom pathlib import Path\n\npython_version = sys.version_info\n\ntry:\n import torch\n torch_version = torch.__version__\n if torch.cuda.is_available():\n DEVICE = 'cuda'\n elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n DEVICE = 'mps'\n else:\n DEVICE = 'cpu'\nexcept ImportError:\n torch_version = 'not installed'\n DEVICE = 'cpu'\n\n# Load config\nconfig_json_path = Path('notebook_config.json')\nif config_json_path.exists():\n with open(config_json_path, 'r') as f:\n loaded_config = json.load(f)\nelse:\n loaded_config = {}\n\nGEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')\nHUGGINGFACE_TOKEN = os.getenv('HF_TOKEN', '')\n\nprint('\\n' + '=' * 60)\nprint('ENVIRONMENT SETUP COMPLETE')\nprint('=' * 60)\nprint(f'''\n\u2705 Python: {python_version.major}.{python_version.minor}.{python_version.micro}\n\u2705 Device: {DEVICE}\n\u2705 PyTorch: {torch_version}\n\u2705 Gemini API: {'Ready' if GEMINI_API_KEY else 'Not configured'}\n\u2705 HuggingFace: {'Ready' if HUGGINGFACE_TOKEN else 'Using public access'}\n\u2705 WebScraper API: Ready\n\u2705 Directories: Created\n\nYou can now proceed to the next notebook:\n \u2192 01_data_acquisition.ipynb\n''')\nprint('=' * 60)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  }
241
  ],
242
  "metadata": {
@@ -246,4 +605,4 @@
246
  },
247
  "nbformat": 4,
248
  "nbformat_minor": 5
249
- }
 
51
  "\n",
52
  "# Python version check\n",
53
  "python_version = sys.version_info\n",
54
+ "print(f\"\\nβœ“ Python Version: {python_version.major}.{python_version.minor}.{python_version.micro}\")\n",
55
  "\n",
56
  "if python_version.major < 3 or (python_version.major == 3 and python_version.minor < 10):\n",
57
  " raise EnvironmentError(\"Python 3.10+ is required. Please upgrade your Python installation.\")\n",
58
  "\n",
59
  "# System info\n",
60
+ "print(f\"βœ“ Platform: {platform.system()} {platform.release()}\")\n",
61
+ "print(f\"βœ“ Architecture: {platform.machine()}\")\n",
62
+ "print(f\"βœ“ Processor: {platform.processor() or 'Unknown'}\")\n",
63
  "\n",
64
  "# Memory info\n",
65
  "try:\n",
66
  " import psutil\n",
67
  " memory = psutil.virtual_memory()\n",
68
+ " print(f\"βœ“ Available Memory: {memory.available / (1024**3):.2f} GB / {memory.total / (1024**3):.2f} GB\")\n",
69
  "except ImportError:\n",
70
+ " print(\"⚠ psutil not installed - memory check skipped\")\n",
71
  "\n",
72
  "print(\"\\n\" + \"=\" * 60)"
73
  ]
 
86
  "id": "faa9b079",
87
  "metadata": {},
88
  "outputs": [],
89
+ "source": [
90
+ "from pathlib import Path\n",
91
+ "\n",
92
+ "# Core dependencies with pinned versions for reproducibility\n",
93
+ "# NOTE: torch/transformers intentionally excluded - not needed for sklearn models\n",
94
+ "# and too heavy for HF Space Docker containers\n",
95
+ "DEPENDENCIES = \"\"\"\n",
96
+ "# Core ML/AI\n",
97
+ "numpy>=1.24.0,<2.0.0\n",
98
+ "pandas>=2.0.0\n",
99
+ "scikit-learn>=1.3.0\n",
100
+ "scipy>=1.11.0\n",
101
+ "\n",
102
+ "# Gemini API (new SDK)\n",
103
+ "google-genai>=1.0.0\n",
104
+ "\n",
105
+ "# Data Processing\n",
106
+ "joblib>=1.3.0\n",
107
+ "tqdm>=4.65.0\n",
108
+ "pyarrow>=14.0.0\n",
109
+ "\n",
110
+ "# Feature Engineering\n",
111
+ "tldextract>=5.0.0\n",
112
+ "validators>=0.22.0\n",
113
+ "\n",
114
+ "# Web/API\n",
115
+ "httpx>=0.25.0\n",
116
+ "requests>=2.31.0\n",
117
+ "\n",
118
+ "# Hugging Face\n",
119
+ "huggingface_hub>=0.19.0\n",
120
+ "\n",
121
+ "# Utilities\n",
122
+ "python-dotenv>=1.0.0\n",
123
+ "pyyaml>=6.0.0\n",
124
+ "psutil>=5.9.0\n",
125
+ "\"\"\"\n",
126
+ "\n",
127
+ "# Write requirements file\n",
128
+ "requirements_path = Path(\"../requirements_notebooks.txt\")\n",
129
+ "requirements_path.write_text(DEPENDENCIES.strip())\n",
130
+ "print(f\"βœ“ Requirements written to: {requirements_path.absolute()}\")\n"
131
+ ]
132
  },
133
  {
134
  "cell_type": "code",
 
136
  "id": "7dc8c6ca",
137
  "metadata": {},
138
  "outputs": [],
139
+ "source": [
140
+ "import subprocess\n",
141
+ "import sys\n",
142
+ "from pathlib import Path\n",
143
+ "\n",
144
+ "# Install dependencies\n",
145
+ "requirements_path = Path(\"../requirements_notebooks.txt\")\n",
146
+ "\n",
147
+ "if requirements_path.exists():\n",
148
+ " print(\"Installing dependencies... This may take a few minutes.\")\n",
149
+ " result = subprocess.run(\n",
150
+ " [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-r\", str(requirements_path)],\n",
151
+ " capture_output=True,\n",
152
+ " text=True\n",
153
+ " )\n",
154
+ "\n",
155
+ " if result.returncode == 0:\n",
156
+ " print(\"βœ“ All dependencies installed successfully!\")\n",
157
+ " else:\n",
158
+ " print(f\"⚠ Installation warnings: {result.stderr[:500] if result.stderr else 'None'}\")\n",
159
+ "else:\n",
160
+ " print(\"⚠ Requirements file not found. Run previous cell first or skip if deps installed.\")\n"
161
+ ]
162
  },
163
  {
164
  "cell_type": "markdown",
 
175
  "metadata": {},
176
  "outputs": [],
177
  "source": [
 
 
178
  "print(\"=\" * 60)\n",
179
  "print(\"COMPUTE DEVICE DETECTION\")\n",
180
  "print(\"=\" * 60)\n",
181
  "\n",
182
+ "# CyberForge uses sklearn (CPU-only) β€” torch is optional\n",
183
+ "try:\n",
184
+ " import torch\n",
185
+ " cuda_available = torch.cuda.is_available()\n",
186
+ " print(f\"\\nβœ“ PyTorch Version: {torch.__version__}\")\n",
187
+ " print(f\"βœ“ CUDA Available: {cuda_available}\")\n",
 
 
 
 
 
 
 
 
 
188
  "\n",
189
+ " if cuda_available:\n",
190
+ " print(f\"βœ“ CUDA Version: {torch.version.cuda}\")\n",
191
+ " DEVICE = \"cuda\"\n",
192
+ " elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
193
+ " print(\"βœ“ Apple MPS (Metal) available\")\n",
194
+ " DEVICE = \"mps\"\n",
195
+ " else:\n",
196
+ " DEVICE = \"cpu\"\n",
197
+ "except ImportError:\n",
198
+ " print(\"\\n⚠ PyTorch not installed (not required β€” sklearn models use CPU)\")\n",
199
+ " DEVICE = \"cpu\"\n",
200
  "\n",
201
+ "print(f\"\\nβœ“ Selected Device: {DEVICE}\")\n",
202
+ "print(\" (Note: CyberForge models use scikit-learn which runs on CPU)\")\n",
203
+ "print(\"=\" * 60)\n"
204
  ]
205
  },
206
  {
 
217
  "id": "0f63a5ce",
218
  "metadata": {},
219
  "outputs": [],
220
+ "source": [
221
+ "import json\n",
222
+ "import os\n",
223
+ "from pathlib import Path\n",
224
+ "\n",
225
+ "# Load configuration from notebook_config.json first (for HF Spaces)\n",
226
+ "config_json_path = Path(\"notebook_config.json\")\n",
227
+ "if config_json_path.exists():\n",
228
+ " with open(config_json_path, \"r\") as f:\n",
229
+ " loaded_config = json.load(f)\n",
230
+ " print(f\"βœ“ Loaded configuration from: {config_json_path.absolute()}\")\n",
231
+ "else:\n",
232
+ " loaded_config = {}\n",
233
+ " print(f\"⚠ No notebook_config.json found, using defaults\")\n",
234
+ "\n",
235
+ "# Try loading .env file as fallback (for local dev)\n",
236
+ "try:\n",
237
+ " from dotenv import load_dotenv\n",
238
+ " env_path = Path(\"../.env\")\n",
239
+ " if env_path.exists():\n",
240
+ " load_dotenv(env_path)\n",
241
+ " print(f\"βœ“ Loaded environment from: {env_path.absolute()}\")\n",
242
+ "except ImportError:\n",
243
+ " pass\n",
244
+ "\n",
245
+ "# Detect device (torch is optional)\n",
246
+ "DEVICE = \"cpu\"\n",
247
+ "try:\n",
248
+ " import torch\n",
249
+ " if torch.cuda.is_available():\n",
250
+ " DEVICE = \"cuda\"\n",
251
+ " elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
252
+ " DEVICE = \"mps\"\n",
253
+ "except ImportError:\n",
254
+ " pass\n",
255
+ "\n",
256
+ "# Configuration class\n",
257
+ "class Config:\n",
258
+ " # API Keys - priority: config.json > env vars > defaults\n",
259
+ " GEMINI_API_KEY = loaded_config.get(\"gemini_api_key\") or os.getenv(\"GEMINI_API_KEY\", \"\")\n",
260
+ " HUGGINGFACE_TOKEN = loaded_config.get(\"hf_token\") or os.getenv(\"HF_TOKEN\", \"\")\n",
261
+ " WEBSCRAPER_API_KEY = loaded_config.get(\"webscraper_api_key\", \"sk-fd14eaa7bceb478db7afc7256e514d2b\")\n",
262
+ " WEBSCRAPER_API_URL = loaded_config.get(\"webscraper_api_url\", \"http://webscrapper.live/api/scrape\")\n",
263
+ " \n",
264
+ " # Gemini model\n",
265
+ " GEMINI_MODEL = loaded_config.get(\"gemini_model\", os.getenv(\"GEMINI_MODEL\", \"gemini-2.5-flash\"))\n",
266
+ " \n",
267
+ " # HF repos\n",
268
+ " HF_REPO = loaded_config.get(\"hf_repo\", \"Che237/cyberforge-models\")\n",
269
+ " HF_DATASETS_REPO = loaded_config.get(\"hf_datasets_repo\", \"Che237/cyberforge-datasets\")\n",
270
+ " \n",
271
+ " # Paths\n",
272
+ " BASE_DIR = Path(\"..\").resolve()\n",
273
+ " DATASETS_DIR = BASE_DIR / \"datasets\"\n",
274
+ " MODELS_DIR = BASE_DIR / \"models\"\n",
275
+ " ARTIFACTS_DIR = BASE_DIR / \"artifacts\"\n",
276
+ " \n",
277
+ " # ML Settings\n",
278
+ " RANDOM_STATE = loaded_config.get(\"random_state\", 42)\n",
279
+ " TEST_SIZE = loaded_config.get(\"test_size\", 0.2)\n",
280
+ " CV_FOLDS = loaded_config.get(\"cv_folds\", 5)\n",
281
+ " \n",
282
+ " # Device\n",
283
+ " DEVICE = DEVICE\n",
284
+ "\n",
285
+ "config = Config()\n",
286
+ "\n",
287
+ "# Validate required API keys\n",
288
+ "print(\"\\n\" + \"=\" * 60)\n",
289
+ "print(\"API CONFIGURATION STATUS\")\n",
290
+ "print(\"=\" * 60)\n",
291
+ "print(f\" Gemini API Key: {'βœ“ Set' if config.GEMINI_API_KEY else 'βœ— Missing'}\")\n",
292
+ "print(f\" HuggingFace Token: {'βœ“ Set' if config.HUGGINGFACE_TOKEN else '⚠ Not set (models won\\\\'t upload)'}\")\n",
293
+ "print(f\" Gemini Model: {config.GEMINI_MODEL}\")\n",
294
+ "print(f\" HF Model Repo: {config.HF_REPO}\")\n",
295
+ "print(f\" Device: {config.DEVICE}\")\n"
296
+ ]
297
  },
298
  {
299
  "cell_type": "markdown",
 
309
  "id": "14cef3bc",
310
  "metadata": {},
311
  "outputs": [],
312
+ "source": [
313
+ "# Gemini Integration β€” using google-genai (new SDK)\n",
314
+ "import json\n",
315
+ "import os\n",
316
+ "from pathlib import Path\n",
317
+ "\n",
318
+ "try:\n",
319
+ " from google import genai\n",
320
+ "except ImportError:\n",
321
+ " import subprocess, sys\n",
322
+ " subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-genai', '-q'])\n",
323
+ " from google import genai\n",
324
+ "\n",
325
+ "# Load config (self-contained)\n",
326
+ "config_json_path = Path('notebook_config.json')\n",
327
+ "if config_json_path.exists():\n",
328
+ " with open(config_json_path, 'r') as f:\n",
329
+ " loaded_config = json.load(f)\n",
330
+ "else:\n",
331
+ " loaded_config = {}\n",
332
+ "\n",
333
+ "GEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')\n",
334
+ "GEMINI_MODEL = loaded_config.get('gemini_model', os.getenv('GEMINI_MODEL', 'gemini-2.5-flash'))\n",
335
+ "\n",
336
+ "def test_gemini_connection():\n",
337
+ " if not GEMINI_API_KEY:\n",
338
+ " return False, 'API key not configured'\n",
339
+ " try:\n",
340
+ " client = genai.Client(api_key=GEMINI_API_KEY)\n",
341
+ " response = client.models.generate_content(\n",
342
+ " model=GEMINI_MODEL,\n",
343
+ " contents='Respond with only: OK'\n",
344
+ " )\n",
345
+ " return True, f'Model: {GEMINI_MODEL}, Response: {response.text.strip()}'\n",
346
+ " except Exception as e:\n",
347
+ " # Try fallback model\n",
348
+ " try:\n",
349
+ " client = genai.Client(api_key=GEMINI_API_KEY)\n",
350
+ " response = client.models.generate_content(\n",
351
+ " model='gemini-2.5-flash',\n",
352
+ " contents='Respond with only: OK'\n",
353
+ " )\n",
354
+ " return True, f'Model: gemini-2.5-flash (fallback), Response: {response.text.strip()}'\n",
355
+ " except Exception as e2:\n",
356
+ " return False, str(e2)\n",
357
+ "\n",
358
+ "print('Testing Gemini API connection...')\n",
359
+ "success, message = test_gemini_connection()\n",
360
+ "if success:\n",
361
+ " print(f'βœ“ Gemini API: {message}')\n",
362
+ "else:\n",
363
+ " print(f'⚠ Gemini API: Connection failed - {message}')\n"
364
+ ]
365
  },
366
  {
367
  "cell_type": "markdown",
 
377
  "id": "beb1b036",
378
  "metadata": {},
379
  "outputs": [],
380
+ "source": [
381
+ "import httpx\n",
382
+ "import json\n",
383
+ "import os\n",
384
+ "from pathlib import Path\n",
385
+ "\n",
386
+ "# Load config (self-contained)\n",
387
+ "config_json_path = Path('notebook_config.json')\n",
388
+ "if config_json_path.exists():\n",
389
+ " with open(config_json_path, 'r') as f:\n",
390
+ " loaded_config = json.load(f)\n",
391
+ "else:\n",
392
+ " loaded_config = {}\n",
393
+ "\n",
394
+ "WEBSCRAPER_API_KEY = loaded_config.get('webscraper_api_key', 'sk-fd14eaa7bceb478db7afc7256e514d2b')\n",
395
+ "WEBSCRAPER_API_URL = loaded_config.get('webscraper_api_url', 'http://webscrapper.live/api/scrape')\n",
396
+ "\n",
397
+ "def test_webscraper_connection_sync():\n",
398
+ " try:\n",
399
+ " with httpx.Client(timeout=30.0) as client:\n",
400
+ " response = client.post(\n",
401
+ " WEBSCRAPER_API_URL,\n",
402
+ " json={'url': 'https://example.com'},\n",
403
+ " headers={'Content-Type': 'application/json', 'X-API-Key': WEBSCRAPER_API_KEY}\n",
404
+ " )\n",
405
+ " if response.status_code == 200:\n",
406
+ " return True, 'Connected'\n",
407
+ " else:\n",
408
+ " return False, f'Status {response.status_code}'\n",
409
+ " except Exception as e:\n",
410
+ " return False, str(e)\n",
411
+ "\n",
412
+ "print('Testing Web Scraper API connection...')\n",
413
+ "success, message = test_webscraper_connection_sync()\n",
414
+ "if success:\n",
415
+ " print(f'βœ“ WebScraper API: Connected successfully')\n",
416
+ "else:\n",
417
+ " print(f'⚠ WebScraper API: {message}')\n"
418
+ ]
419
  },
420
  {
421
  "cell_type": "markdown",
 
431
  "id": "776236f8",
432
  "metadata": {},
433
  "outputs": [],
434
+ "source": [
435
+ "from pathlib import Path\n",
436
+ "\n",
437
+ "# Define directories (self-contained)\n",
438
+ "BASE_DIR = Path('..').resolve()\n",
439
+ "DATASETS_DIR = BASE_DIR / 'datasets'\n",
440
+ "MODELS_DIR = BASE_DIR / 'models'\n",
441
+ "ARTIFACTS_DIR = BASE_DIR / 'artifacts'\n",
442
+ "\n",
443
+ "# Create necessary directories\n",
444
+ "directories = [\n",
445
+ " DATASETS_DIR,\n",
446
+ " MODELS_DIR,\n",
447
+ " ARTIFACTS_DIR,\n",
448
+ " BASE_DIR / 'logs',\n",
449
+ " BASE_DIR / 'cache',\n",
450
+ "]\n",
451
+ "\n",
452
+ "print('Creating directory structure...')\n",
453
+ "for directory in directories:\n",
454
+ " directory.mkdir(parents=True, exist_ok=True)\n",
455
+ " print(f' βœ“ {directory}')\n",
456
+ "\n",
457
+ "print('\\nβœ“ Directory structure ready!')\n"
458
+ ]
459
  },
460
  {
461
  "cell_type": "markdown",
 
471
  "id": "6b854bac",
472
  "metadata": {},
473
  "outputs": [],
474
+ "source": [
475
+ "import json\n",
476
+ "import sys\n",
477
+ "import os\n",
478
+ "from pathlib import Path\n",
479
+ "\n",
480
+ "# Get values (self-contained)\n",
481
+ "python_version = sys.version_info\n",
482
+ "\n",
483
+ "DEVICE = 'cpu'\n",
484
+ "torch_version = 'not installed (not required)'\n",
485
+ "cuda_available = False\n",
486
+ "try:\n",
487
+ " import torch\n",
488
+ " torch_version = torch.__version__\n",
489
+ " cuda_available = torch.cuda.is_available()\n",
490
+ " if cuda_available:\n",
491
+ " DEVICE = 'cuda'\n",
492
+ " elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
493
+ " DEVICE = 'mps'\n",
494
+ "except ImportError:\n",
495
+ " pass\n",
496
+ "\n",
497
+ "# Load config\n",
498
+ "config_json_path = Path('notebook_config.json')\n",
499
+ "if config_json_path.exists():\n",
500
+ " with open(config_json_path, 'r') as f:\n",
501
+ " loaded_config = json.load(f)\n",
502
+ "else:\n",
503
+ " loaded_config = {}\n",
504
+ "\n",
505
+ "BASE_DIR = Path('..').resolve()\n",
506
+ "DATASETS_DIR = BASE_DIR / 'datasets'\n",
507
+ "MODELS_DIR = BASE_DIR / 'models'\n",
508
+ "ARTIFACTS_DIR = BASE_DIR / 'artifacts'\n",
509
+ "RANDOM_STATE = loaded_config.get('random_state', 42)\n",
510
+ "TEST_SIZE = loaded_config.get('test_size', 0.2)\n",
511
+ "CV_FOLDS = loaded_config.get('cv_folds', 5)\n",
512
+ "\n",
513
+ "# Export configuration for other notebooks\n",
514
+ "notebook_config = {\n",
515
+ " 'device': str(DEVICE),\n",
516
+ " 'python_version': f'{python_version.major}.{python_version.minor}.{python_version.micro}',\n",
517
+ " 'torch_version': torch_version,\n",
518
+ " 'cuda_available': cuda_available,\n",
519
+ " 'base_dir': str(BASE_DIR),\n",
520
+ " 'datasets_dir': str(DATASETS_DIR),\n",
521
+ " 'models_dir': str(MODELS_DIR),\n",
522
+ " 'artifacts_dir': str(ARTIFACTS_DIR),\n",
523
+ " 'random_state': RANDOM_STATE,\n",
524
+ " 'test_size': TEST_SIZE,\n",
525
+ " 'cv_folds': CV_FOLDS,\n",
526
+ "}\n",
527
+ "\n",
528
+ "config_path = Path('notebook_runtime_config.json')\n",
529
+ "with open(config_path, 'w') as f:\n",
530
+ " json.dump(notebook_config, f, indent=2)\n",
531
+ "\n",
532
+ "print(f'βœ“ Configuration exported to: {config_path.absolute()}')\n",
533
+ "print(json.dumps(notebook_config, indent=2))\n"
534
+ ]
535
  },
536
  {
537
  "cell_type": "markdown",
 
547
  "id": "f409be56",
548
  "metadata": {},
549
  "outputs": [],
550
+ "source": [
551
+ "import sys\n",
552
+ "import json\n",
553
+ "import os\n",
554
+ "from pathlib import Path\n",
555
+ "\n",
556
+ "python_version = sys.version_info\n",
557
+ "\n",
558
+ "try:\n",
559
+ " import torch\n",
560
+ " torch_version = torch.__version__\n",
561
+ " if torch.cuda.is_available():\n",
562
+ " DEVICE = 'cuda'\n",
563
+ " elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():\n",
564
+ " DEVICE = 'mps'\n",
565
+ " else:\n",
566
+ " DEVICE = 'cpu'\n",
567
+ "except ImportError:\n",
568
+ " torch_version = 'not installed'\n",
569
+ " DEVICE = 'cpu'\n",
570
+ "\n",
571
+ "# Load config\n",
572
+ "config_json_path = Path('notebook_config.json')\n",
573
+ "if config_json_path.exists():\n",
574
+ " with open(config_json_path, 'r') as f:\n",
575
+ " loaded_config = json.load(f)\n",
576
+ "else:\n",
577
+ " loaded_config = {}\n",
578
+ "\n",
579
+ "GEMINI_API_KEY = loaded_config.get('gemini_api_key') or os.getenv('GEMINI_API_KEY', '')\n",
580
+ "HUGGINGFACE_TOKEN = os.getenv('HF_TOKEN', '')\n",
581
+ "\n",
582
+ "print('\\n' + '=' * 60)\n",
583
+ "print('ENVIRONMENT SETUP COMPLETE')\n",
584
+ "print('=' * 60)\n",
585
+ "print(f'''\n",
586
+ "βœ… Python: {python_version.major}.{python_version.minor}.{python_version.micro}\n",
587
+ "βœ… Device: {DEVICE}\n",
588
+ "βœ… PyTorch: {torch_version}\n",
589
+ "βœ… Gemini API: {'Ready' if GEMINI_API_KEY else 'Not configured'}\n",
590
+ "βœ… HuggingFace: {'Ready' if HUGGINGFACE_TOKEN else 'Using public access'}\n",
591
+ "βœ… WebScraper API: Ready\n",
592
+ "βœ… Directories: Created\n",
593
+ "\n",
594
+ "You can now proceed to the next notebook:\n",
595
+ " β†’ 01_data_acquisition.ipynb\n",
596
+ "''')\n",
597
+ "print('=' * 60)\n"
598
+ ]
599
  }
600
  ],
601
  "metadata": {
 
605
  },
606
  "nbformat": 4,
607
  "nbformat_minor": 5
608
+ }