# Source: devops-pipeline-gym/scripts/kaggle_eval.py (uploaded by yashash045)
# Commit 40de84e (verified): Hackathon submission: new README (3-5 min read),
# BLOG.md narrative, frontier baselines, design-principles framing
"""Kaggle T4 single-cell eval helper for devops-pipeline-gym.
Designed to run in ONE Kaggle notebook cell. Free T4 GPU. No HF Jobs cost.
Usage in Kaggle notebook (paste this entire script as one cell, or use
the 4-line shim below that imports and calls run_eval()):
# Single-cell shim
!git clone https://github.com/Yashash4/devops-pipeline-gym /kaggle/working/dpg 2>/dev/null
%cd /kaggle/working/dpg
!pip install -q -e .
import scripts.kaggle_eval as ke; ke.run_eval(mode="base") # or "sft" or "grpo"
Modes:
base : Qwen3-1.7B-bnb-4bit, no adapter
sft : + SFT adapter (yashash045/devops-pipeline-gym-sft-adapter)
grpo : + SFT + GRPO adapter (yashash045/devops-pipeline-gym-trained)
Set HF_TOKEN in Kaggle Add-ons → Secrets before running. Results upload
to the Hub model repo as eval_<mode>.json (or saved locally if HF_TOKEN
absent).
"""
import os
import signal
import subprocess
import sys
import time
import urllib.request
from pathlib import Path
def boot_env_server(port: int = 8000, timeout_s: int = 105):
    """Boot uvicorn on localhost in the background and wait for /reset to return 200.

    Args:
        port: TCP port uvicorn binds on 127.0.0.1.
        timeout_s: Seconds to keep polling POST /reset, counted after the
            initial fixed 15 s warm-up sleep.

    Returns:
        The ``subprocess.Popen`` handle of the running server. The caller owns
        it and is responsible for terminating it.

    Raises:
        RuntimeError: If the server process exits during startup (the last
            2000 characters of its log are included), or if POST /reset never
            answers 200 within ``timeout_s``. In the timeout case the server
            process is stopped before raising, because the caller never
            receives a handle it could use to clean up.
    """
    log_path = "/tmp/env_server.log"
    # The child process gets its own copy of the log descriptor, so the
    # parent's handle can be closed as soon as Popen has returned.
    with open(log_path, "w") as log_fh:
        proc = subprocess.Popen(
            [sys.executable, "-m", "uvicorn", "server.app:app",
             "--host", "127.0.0.1", "--port", str(port),
             "--log-level", "info"],
            stdout=log_fh, stderr=subprocess.STDOUT,
        )
    time.sleep(15)
    deadline = time.time() + timeout_s
    last_error = None  # most recent probe failure, reported on timeout
    while time.time() < deadline:
        if proc.poll() is not None:
            with open(log_path) as f:
                tail = f.read()[-2000:]
            raise RuntimeError(f"env-server died:\n{tail}")
        try:
            req = urllib.request.Request(
                f"http://localhost:{port}/reset", method="POST",
                data=b"{}", headers={"Content-Type": "application/json"},
            )
            with urllib.request.urlopen(req, timeout=5) as r:
                if r.status == 200:
                    print(f"env-server healthy (PID {proc.pid})", flush=True)
                    return proc
                last_error = f"HTTP {r.status}"
        except Exception as exc:
            # Probe failures (e.g. connection refused) are expected while the
            # server is still starting; keep the latest one for diagnostics.
            last_error = exc
        time.sleep(1.5)
    # Health check never passed: stop the orphaned server before reporting.
    proc.terminate()
    try:
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()
    raise RuntimeError(
        f"env-server failed health check in {timeout_s}s"
        f" (last error: {last_error!r}; log: {log_path})"
    )
def _ensure_kaggle_deps():
    """Upgrade bitsandbytes to >=0.46.1 with pip, using the current interpreter.

    Per the original author's note, the default Kaggle image ships a
    bitsandbytes older than 0.46, which cannot 4-bit quantize Qwen3 models
    with the API eval_baseline.py uses. Safe to call repeatedly; the first
    call is the slow one (~10s).
    """
    print(
        "[deps] Upgrading bitsandbytes>=0.46.1 (Kaggle ships an older version)...",
        flush=True,
    )
    pip_upgrade = [sys.executable, "-m", "pip", "install", "-q", "-U"]
    pip_upgrade.append("bitsandbytes>=0.46.1")
    # check=False: a flaky pip must not abort the whole eval, since the
    # installed bitsandbytes may already be new enough.
    subprocess.run(pip_upgrade, check=False)
def run_eval(mode: str = "base", n_seeds: int = 5, temperature: float = 0.3,
             upload_to_hub: bool = True):
    """Run multi-seed eval on T4. Saves to /kaggle/working/eval_<mode>.json.

    Args:
        mode: "base" (no adapter), "sft" or "grpo"; selects which adapter
            repo, if any, is downloaded and passed as ``--model``.
        n_seeds: Forwarded to eval_baseline.py as ``--n-seeds``.
        temperature: Forwarded to eval_baseline.py as ``--temperature``.
        upload_to_hub: When true and HF_TOKEN is set in the environment, the
            result JSON is uploaded to the Hub; upload errors are reported
            but do not fail the run.

    Raises:
        ValueError: If ``mode`` is not one of base/sft/grpo.
        RuntimeError: If the env-server fails to boot (from boot_env_server).
        subprocess.CalledProcessError: If eval_baseline.py exits non-zero.
    """
    adapters = {
        "base": None,
        "sft": "yashash045/devops-pipeline-gym-sft-adapter",
        "grpo": "yashash045/devops-pipeline-gym-trained",
    }
    # Explicit raise instead of assert: asserts are stripped under -O, which
    # would turn a bad mode into an opaque KeyError much later.
    if mode not in adapters:
        raise ValueError(f"mode must be base/sft/grpo, got {mode}")
    print(f"=== Eval mode={mode} n_seeds={n_seeds} temp={temperature} ===", flush=True)

    # 0. Ensure bnb >= 0.46 (Kaggle image fix)
    _ensure_kaggle_deps()

    # 1. Boot env-server
    print("[1/4] Booting env-server...", flush=True)
    env_proc = boot_env_server()
    try:
        # 2. Download adapter if needed
        model_arg = "unsloth/Qwen3-1.7B-bnb-4bit"
        if adapters[mode]:
            print(f"[2/4] Downloading adapter {adapters[mode]}...", flush=True)
            from huggingface_hub import snapshot_download
            model_arg = snapshot_download(
                repo_id=adapters[mode],
                local_dir=f"/kaggle/working/{mode}_adapter",
            )
            print(f" adapter local: {model_arg}", flush=True)

        # 3. Run eval_baseline.py
        output_json = f"/kaggle/working/eval_{mode}.json"
        print(f"[3/4] Running eval (output: {output_json})...", flush=True)
        cmd = [
            sys.executable, "training/eval_baseline.py",
            "--model", model_arg,
            "--env-url", "http://localhost:8000",
            "--output", output_json,
            "--n-seeds", str(n_seeds),
            # Previously the temperature argument was printed but never
            # forwarded, so the eval silently ran with the script's default.
            "--temperature", str(temperature),
        ]
        subprocess.run(cmd, check=True, env={
            **os.environ,
            "DEVOPS_EVAL_SEED_BASE": "5000",  # avoid training seeds (6000+)
        })

        # 4. Optional Hub upload
        if upload_to_hub and os.environ.get("HF_TOKEN"):
            print("[4/4] Uploading to Hub...", flush=True)
            try:
                from huggingface_hub import HfApi
                api = HfApi(token=os.environ["HF_TOKEN"])
                api.upload_file(
                    path_or_fileobj=output_json,
                    path_in_repo=f"eval_{mode}.json",
                    repo_id="yashash045/devops-pipeline-gym-sft-adapter",
                    repo_type="model",
                    commit_message=f"Kaggle eval: mode={mode}, n_seeds={n_seeds}",
                )
                print(f" uploaded: https://huggingface.co/yashash045/"
                      f"devops-pipeline-gym-sft-adapter/blob/main/eval_{mode}.json",
                      flush=True)
            except Exception as e:
                # Best-effort: the result file is already on disk.
                print(f" upload failed (saved locally): {e}", flush=True)
        else:
            print(f"[4/4] Saved locally: {output_json} (set HF_TOKEN to auto-upload)",
                  flush=True)
    finally:
        # Always stop the env-server, even when the eval or download failed.
        env_proc.send_signal(signal.SIGTERM)
        try:
            env_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            env_proc.kill()
            env_proc.wait()  # reap the killed process so it does not linger as a zombie
    print(f"\n=== EVAL {mode} DONE ===\n", flush=True)
def run_frontier(models=None, n_seeds: int = 3, temperature: float = 0.3,
                 max_tokens: int = 300):
    """Frontier-model baselines via HF Router. CPU-only (no GPU needed).

    Each model is evaluated by running training/eval_baseline.py with
    ``--use-hf-router`` against a locally booted env-server. A failure of one
    model is printed and does not stop the remaining models.

    Args:
        models: Iterable of ``(model_id, tag)`` pairs; ``tag`` names the output
            file ``eval_frontier_<tag>.json``. Defaults to a built-in list of
            five models when None.
        n_seeds: Forwarded to eval_baseline.py as ``--n-seeds``.
        temperature: Forwarded as ``--temperature`` (default matches the
            previously hard-coded 0.3).
        max_tokens: Forwarded as ``--max-tokens`` (default matches the
            previously hard-coded 300).

    Raises:
        RuntimeError: If the env-server fails to boot (from boot_env_server).
    """
    if models is None:
        models = [
            ("Qwen/Qwen2.5-72B-Instruct", "qwen25_72b"),
            ("meta-llama/Llama-3.3-70B-Instruct", "llama33_70b"),
            ("deepseek-ai/DeepSeek-V3.1", "deepseek_v31"),
            ("mistralai/Mistral-Large-Instruct-2411", "mistral_large"),
            ("openai/gpt-oss-120b", "gpt_oss_120b"),
        ]
    env_proc = boot_env_server()
    try:
        from huggingface_hub import HfApi
        api = HfApi(token=os.environ.get("HF_TOKEN"))
        for model_id, tag in models:
            output_json = f"/kaggle/working/eval_frontier_{tag}.json"
            print(f"\n=== Frontier: {model_id} ===", flush=True)
            try:
                subprocess.run(
                    [sys.executable, "training/eval_baseline.py",
                     "--model", model_id,
                     "--use-hf-router",
                     "--env-url", "http://localhost:8000",
                     "--output", output_json,
                     "--n-seeds", str(n_seeds),
                     "--temperature", str(temperature),
                     "--max-tokens", str(max_tokens)],
                    check=True, timeout=1800,
                )
            except Exception as e:
                # Best-effort per model: report and move on to the next one.
                print(f" {model_id} FAILED: {e}", flush=True)
                continue
            if not api.token:
                continue  # no token: results stay on local disk only
            try:
                api.upload_file(
                    path_or_fileobj=output_json,
                    path_in_repo=f"eval_frontier_{tag}.json",
                    repo_id="yashash045/devops-pipeline-gym-sft-adapter",
                    repo_type="model",
                    commit_message=f"Kaggle frontier eval: {tag}",
                )
            except Exception as e:
                # The eval itself succeeded; only the upload failed, so do not
                # report the model as FAILED.
                print(f" upload failed (saved locally): {e}", flush=True)
    finally:
        # Always stop the env-server, even if an unexpected error escaped.
        env_proc.send_signal(signal.SIGTERM)
        try:
            env_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            env_proc.kill()
            env_proc.wait()  # reap the killed process so it does not linger as a zombie
    print("\n=== FRONTIER BASELINES DONE ===\n", flush=True)