Spaces:
Sleeping
Sleeping
Hackathon submission: new README (3-5 min read), BLOG.md narrative, frontier baselines, design-principles framing
40de84e verified | """Kaggle T4 single-cell eval helper for devops-pipeline-gym. | |
| Designed to run in ONE Kaggle notebook cell. Free T4 GPU. No HF Jobs cost. | |
| Usage in Kaggle notebook (paste this entire script as one cell, or use | |
| the 4-line shim below that imports and calls run_eval()): | |
| # Single-cell shim | |
| !git clone https://github.com/Yashash4/devops-pipeline-gym /kaggle/working/dpg 2>/dev/null | |
| %cd /kaggle/working/dpg | |
| !pip install -q -e . | |
| import scripts.kaggle_eval as ke; ke.run_eval(mode="base") # or "sft" or "grpo" | |
| Modes: | |
| base : Qwen3-1.7B-bnb-4bit, no adapter | |
| sft : + SFT adapter (yashash045/devops-pipeline-gym-sft-adapter) | |
| grpo : + SFT + GRPO adapter (yashash045/devops-pipeline-gym-trained) | |
| Set HF_TOKEN in Kaggle Add-ons → Secrets before running. Results are always | |
| written to /kaggle/working/eval_<mode>.json and, when HF_TOKEN is set, are | |
| also uploaded to the Hub model repo as eval_<mode>.json. | |
| """ | |
| import os | |
| import signal | |
| import subprocess | |
| import sys | |
| import time | |
| import urllib.request | |
| from pathlib import Path | |
def boot_env_server(port: int = 8000, timeout_s: int = 105,
                    initial_delay_s: float = 15.0,
                    poll_interval_s: float = 1.5,
                    log_path: str = "/tmp/env_server.log"):
    """Boot uvicorn on localhost in the background and wait for /reset to return 200.

    Args:
        port: TCP port uvicorn binds to on 127.0.0.1.
        timeout_s: Seconds to keep polling (counted after the initial delay).
        initial_delay_s: Grace period before the first health probe.
        poll_interval_s: Pause between consecutive health probes.
        log_path: File that receives the server's stdout and stderr.

    Returns:
        The ``subprocess.Popen`` handle of the running server.

    Raises:
        RuntimeError: If the server process exits early, or never answers
            ``POST /reset`` with HTTP 200 before the deadline. In both cases
            the child process is stopped before the error propagates.
    """
    # The child receives its own copy of the log descriptor, so the parent's
    # handle can be closed as soon as Popen returns.
    with open(log_path, "w") as log_fh:
        proc = subprocess.Popen(
            [sys.executable, "-m", "uvicorn", "server.app:app",
             "--host", "127.0.0.1", "--port", str(port),
             "--log-level", "info"],
            stdout=log_fh, stderr=subprocess.STDOUT,
        )
    try:
        if initial_delay_s > 0:
            time.sleep(initial_delay_s)
        # Monotonic clock: the deadline must not move if the wall clock jumps.
        deadline = time.monotonic() + timeout_s
        last_error = None
        while time.monotonic() < deadline:
            if proc.poll() is not None:
                with open(log_path) as f:
                    tail = f.read()[-2000:]
                raise RuntimeError(f"env-server died:\n{tail}")
            try:
                req = urllib.request.Request(
                    f"http://localhost:{port}/reset", method="POST",
                    data=b"{}", headers={"Content-Type": "application/json"},
                )
                with urllib.request.urlopen(req, timeout=5) as r:
                    if r.status == 200:
                        print(f"env-server healthy (PID {proc.pid})", flush=True)
                        return proc
                    last_error = f"HTTP {r.status}"
            except Exception as exc:
                # Probing is best-effort (the server may simply not be up yet);
                # remember the failure so a timeout can report it.
                last_error = exc
            time.sleep(poll_interval_s)
        detail = f" (last error: {last_error})" if last_error is not None else ""
        raise RuntimeError(
            f"env-server failed health check in {timeout_s}s{detail}"
        )
    except BaseException:
        # Never leave an orphaned server behind, including on KeyboardInterrupt
        # from an interrupted notebook cell.
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()
        raise
def _ensure_kaggle_deps():
    """Upgrade bitsandbytes to >=0.46.1 with pip, on a best-effort basis.

    Kaggle's default image ships bitsandbytes <0.46, which can't 4-bit
    quantize Qwen3 models with the API our eval_baseline.py uses.
    Idempotent — safe to call multiple times. Costs ~10s on first call.

    A failing pip run does not raise (bitsandbytes may already be new enough),
    but the non-zero exit code is reported so a later quantization error can
    be traced back to it.
    """
    print("[deps] Upgrading bitsandbytes>=0.46.1 (Kaggle ships an older version)...",
          flush=True)
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "-U",
         "bitsandbytes>=0.46.1"],
        check=False,  # don't crash the whole eval if pip flakes
    )
    if result.returncode != 0:
        print(f"[deps] WARNING: pip exited with code {result.returncode}; "
              "continuing with the bitsandbytes version already installed.",
              flush=True)
def run_eval(mode: str = "base", n_seeds: int = 5, temperature: float = 0.3,
             upload_to_hub: bool = True):
    """Run multi-seed eval on T4. Saves to /kaggle/working/eval_<mode>.json.

    Args:
        mode: "base" (no adapter), "sft" or "grpo"; selects which adapter repo
            is downloaded and handed to training/eval_baseline.py.
        n_seeds: Forwarded to eval_baseline.py as ``--n-seeds``.
        temperature: Forwarded to eval_baseline.py as ``--temperature``.
        upload_to_hub: When true and HF_TOKEN is set, upload the result JSON
            to the Hub; an upload failure is reported but not raised.

    Raises:
        ValueError: If ``mode`` is not one of base/sft/grpo.
        RuntimeError: If the env-server cannot be booted.
        subprocess.CalledProcessError: If eval_baseline.py exits non-zero.
    """
    adapters = {
        "base": None,
        "sft": "yashash045/devops-pipeline-gym-sft-adapter",
        "grpo": "yashash045/devops-pipeline-gym-trained",
    }
    # Explicit raise rather than assert: asserts are stripped under ``python -O``.
    if mode not in adapters:
        raise ValueError(f"mode must be base/sft/grpo, got {mode}")
    print(f"=== Eval mode={mode} n_seeds={n_seeds} temp={temperature} ===", flush=True)

    # 0. Ensure bnb >= 0.46 (Kaggle image fix)
    _ensure_kaggle_deps()

    # 1. Boot env-server
    print("[1/4] Booting env-server...", flush=True)
    env_proc = boot_env_server()
    try:
        # 2. Download adapter if needed
        model_arg = "unsloth/Qwen3-1.7B-bnb-4bit"
        if adapters[mode]:
            print(f"[2/4] Downloading adapter {adapters[mode]}...", flush=True)
            from huggingface_hub import snapshot_download
            model_arg = snapshot_download(
                repo_id=adapters[mode],
                local_dir=f"/kaggle/working/{mode}_adapter",
            )
            print(f"  adapter local: {model_arg}", flush=True)

        # 3. Run eval_baseline.py
        output_json = f"/kaggle/working/eval_{mode}.json"
        print(f"[3/4] Running eval (output: {output_json})...", flush=True)
        cmd = [
            sys.executable, "training/eval_baseline.py",
            "--model", model_arg,
            "--env-url", "http://localhost:8000",
            "--output", output_json,
            "--n-seeds", str(n_seeds),
            # Forward the sampling temperature so the run matches the banner
            # printed above instead of the script's own default.
            "--temperature", str(temperature),
        ]
        subprocess.run(cmd, check=True, env={
            **os.environ,
            "DEVOPS_EVAL_SEED_BASE": "5000",  # avoid training seeds (6000+)
        })

        # 4. Optional Hub upload
        if upload_to_hub and os.environ.get("HF_TOKEN"):
            print("[4/4] Uploading to Hub...", flush=True)
            try:
                from huggingface_hub import HfApi
                api = HfApi(token=os.environ["HF_TOKEN"])
                api.upload_file(
                    path_or_fileobj=output_json,
                    path_in_repo=f"eval_{mode}.json",
                    repo_id="yashash045/devops-pipeline-gym-sft-adapter",
                    repo_type="model",
                    commit_message=f"Kaggle eval: mode={mode}, n_seeds={n_seeds}",
                )
                print(f"  uploaded: https://huggingface.co/yashash045/"
                      f"devops-pipeline-gym-sft-adapter/blob/main/eval_{mode}.json",
                      flush=True)
            except Exception as e:
                # Best-effort: the JSON is already on disk, so only report.
                print(f"  upload failed (saved locally): {e}", flush=True)
        else:
            print(f"[4/4] Saved locally: {output_json} (set HF_TOKEN to auto-upload)",
                  flush=True)
    finally:
        # Always stop the env-server, even when the eval itself failed.
        env_proc.send_signal(signal.SIGTERM)
        try:
            env_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            env_proc.kill()
            env_proc.wait()  # reap the killed child so it does not linger as a zombie
    print(f"\n=== EVAL {mode} DONE ===\n", flush=True)
def run_frontier(models=None, n_seeds: int = 3, temperature: float = 0.3,
                 max_tokens: int = 300):
    """Frontier-model baselines via HF Router. CPU-only (no GPU needed).

    Args:
        models: Iterable of ``(model_id, tag)`` pairs; ``None`` selects the
            built-in list. ``tag`` names the output file
            ``/kaggle/working/eval_frontier_<tag>.json``.
        n_seeds: Forwarded to eval_baseline.py as ``--n-seeds``.
        temperature: Forwarded to eval_baseline.py as ``--temperature``.
        max_tokens: Forwarded to eval_baseline.py as ``--max-tokens``.

    A failing model (non-zero exit, 30-minute timeout) is reported and the
    loop moves on to the next one. Results are uploaded to the Hub only when
    HF_TOKEN is set; an upload failure is reported separately from an eval
    failure and is not raised.

    Raises:
        RuntimeError: If the env-server cannot be booted.
    """
    if models is None:
        models = [
            ("Qwen/Qwen2.5-72B-Instruct", "qwen25_72b"),
            ("meta-llama/Llama-3.3-70B-Instruct", "llama33_70b"),
            ("deepseek-ai/DeepSeek-V3.1", "deepseek_v31"),
            ("mistralai/Mistral-Large-Instruct-2411", "mistral_large"),
            ("openai/gpt-oss-120b", "gpt_oss_120b"),
        ]
    if not models:
        # Nothing to evaluate: don't pay for an env-server boot.
        print("No frontier models requested; nothing to do.", flush=True)
        return

    env_proc = boot_env_server()
    try:
        from huggingface_hub import HfApi
        hf_token = os.environ.get("HF_TOKEN")
        api = HfApi(token=hf_token)
        for model_id, tag in models:
            output_json = f"/kaggle/working/eval_frontier_{tag}.json"
            print(f"\n=== Frontier: {model_id} ===", flush=True)
            try:
                subprocess.run(
                    [sys.executable, "training/eval_baseline.py",
                     "--model", model_id,
                     "--use-hf-router",
                     "--env-url", "http://localhost:8000",
                     "--output", output_json,
                     "--n-seeds", str(n_seeds),
                     "--temperature", str(temperature),
                     "--max-tokens", str(max_tokens)],
                    check=True, timeout=1800,
                )
            except Exception as e:
                # Best-effort per model: one failure must not abort the rest.
                print(f"  {model_id} FAILED: {e}", flush=True)
                continue
            if not hf_token:
                continue
            try:
                api.upload_file(
                    path_or_fileobj=output_json,
                    path_in_repo=f"eval_frontier_{tag}.json",
                    repo_id="yashash045/devops-pipeline-gym-sft-adapter",
                    repo_type="model",
                    commit_message=f"Kaggle frontier eval: {tag}",
                )
            except Exception as e:
                # The eval succeeded and its JSON is on disk; only the upload
                # failed, so say that instead of reporting the model as failed.
                print(f"  upload failed (saved locally): {e}", flush=True)
    finally:
        # Always stop the env-server, even when something above raised.
        env_proc.send_signal(signal.SIGTERM)
        try:
            env_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            env_proc.kill()
            env_proc.wait()  # reap the killed child so it does not linger as a zombie
    print("\n=== FRONTIER BASELINES DONE ===\n", flush=True)