# Hugging Face Spaces: yashash045/devops-pipeline-gym (Sleeping)
# File size: 7,933 Bytes
# 40de84e

"""Kaggle T4 single-cell eval helper for devops-pipeline-gym.

Designed to run in ONE Kaggle notebook cell. Free T4 GPU. No HF Jobs cost.

Usage in Kaggle notebook (paste this entire script as one cell, or use
the 4-line shim below that imports and calls run_eval()):

    # Single-cell shim
    !git clone https://github.com/Yashash4/devops-pipeline-gym /kaggle/working/dpg 2>/dev/null
    %cd /kaggle/working/dpg
    !pip install -q -e .
    import scripts.kaggle_eval as ke; ke.run_eval(mode="base")  # or "sft" or "grpo"

Modes:
    base : Qwen3-1.7B-bnb-4bit, no adapter
    sft  : + SFT adapter (yashash045/devops-pipeline-gym-sft-adapter)
    grpo : + SFT + GRPO adapter (yashash045/devops-pipeline-gym-trained)

Set HF_TOKEN in Kaggle Add-ons → Secrets before running. Results are always
written to /kaggle/working/eval_<mode>.json and, when HF_TOKEN is set, are
also uploaded to the yashash045/devops-pipeline-gym-sft-adapter model repo
as eval_<mode>.json.
"""

import os
import signal
import subprocess
import sys
import time
import urllib.request
from pathlib import Path


def boot_env_server(port: int = 8000, timeout_s: int = 105):
    """Boot uvicorn on localhost in the background and wait for /reset to return 200.

    The server's stdout/stderr are redirected to ``/tmp/env_server.log``.
    After a fixed 15 s warm-up sleep, ``POST /reset`` is polled every 1.5 s
    until it answers 200 or ``timeout_s`` elapses.

    Args:
        port: TCP port uvicorn binds on 127.0.0.1.
        timeout_s: Seconds to keep polling after the warm-up sleep.

    Returns:
        The ``subprocess.Popen`` handle of the running server. The caller is
        responsible for terminating it.

    Raises:
        RuntimeError: If the server process exits during startup (the last
            2000 characters of its log are included), or if the health check
            does not succeed within ``timeout_s``.
    """
    log_path = "/tmp/env_server.log"
    # The child gets its own copy of the descriptor, so the parent's handle
    # can be closed as soon as Popen returns instead of leaking it.
    with open(log_path, "w") as log_fh:
        proc = subprocess.Popen(
            [sys.executable, "-m", "uvicorn", "server.app:app",
             "--host", "127.0.0.1", "--port", str(port),
             "--log-level", "info"],
            stdout=log_fh, stderr=subprocess.STDOUT,
        )
    try:
        time.sleep(15)
        deadline = time.time() + timeout_s
        last_error = None  # most recent probe failure, reported on timeout
        while time.time() < deadline:
            if proc.poll() is not None:
                with open(log_path) as f:
                    tail = f.read()[-2000:]
                raise RuntimeError(f"env-server died:\n{tail}")
            try:
                req = urllib.request.Request(
                    f"http://localhost:{port}/reset", method="POST",
                    data=b"{}", headers={"Content-Type": "application/json"},
                )
                with urllib.request.urlopen(req, timeout=5) as r:
                    if r.status == 200:
                        print(f"env-server healthy (PID {proc.pid})", flush=True)
                        return proc
                    last_error = f"HTTP {r.status}"
            except Exception as exc:
                # Connection refused / HTTP errors are expected while the
                # server is still starting; remember the latest one.
                last_error = exc
            time.sleep(1.5)
        raise RuntimeError(
            f"env-server failed health check in {timeout_s}s"
            f" (last error: {last_error!r}; log: {log_path})"
        )
    except BaseException:
        # Startup failed or was interrupted: don't leave an orphaned server
        # holding the port for the next attempt.
        if proc.poll() is None:
            proc.kill()
            proc.wait()
        raise


def _ensure_kaggle_deps():
    """Upgrade bitsandbytes on Kaggle (default image ships <0.46 which can't 4-bit
    quantize Qwen3 models with the API our eval_baseline.py uses).

    Idempotent — safe to call multiple times. Costs ~10s on first call.
    Best-effort: a failing pip run is reported with its exit code but does not
    raise, because the installed bitsandbytes may already be new enough.
    """
    print("[deps] Upgrading bitsandbytes>=0.46.1 (Kaggle ships an older version)...",
          flush=True)
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "-U",
         "bitsandbytes>=0.46.1"],
        check=False,  # don't crash the whole eval if pip flakes; bnb may already be new enough
    )
    if result.returncode != 0:
        # Surface the failure instead of ignoring it silently, so a later
        # quantization error can be traced back to this step.
        print(f"[deps] WARNING: pip exited with code {result.returncode}; "
              "continuing with the installed bitsandbytes", flush=True)


def run_eval(mode: str = "base", n_seeds: int = 5, temperature: float = 0.3,
             upload_to_hub: bool = True):
    """Run multi-seed eval on T4. Saves to /kaggle/working/eval_<mode>.json.

    Steps: upgrade bitsandbytes, boot the env-server, download the adapter for
    ``mode`` (if any), run ``training/eval_baseline.py`` as a subprocess, then
    optionally upload the result JSON to the Hub. The env-server is always
    stopped on exit.

    Args:
        mode: One of ``"base"``, ``"sft"`` or ``"grpo"``; selects the adapter.
        n_seeds: Forwarded to the eval script as ``--n-seeds``.
        temperature: Forwarded to the eval script as ``--temperature``.
        upload_to_hub: Upload the result file when ``HF_TOKEN`` is set.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
        RuntimeError: If the env-server fails to start.
        subprocess.CalledProcessError: If the eval script exits non-zero.
    """
    # Explicit raise rather than `assert`, which is stripped under `python -O`.
    if mode not in ("base", "sft", "grpo"):
        raise ValueError(f"mode must be base/sft/grpo, got {mode}")
    print(f"=== Eval mode={mode} n_seeds={n_seeds} temp={temperature} ===", flush=True)

    # 0. Ensure bnb >= 0.46 (Kaggle image fix)
    _ensure_kaggle_deps()

    adapters = {
        "base": None,
        "sft": "yashash045/devops-pipeline-gym-sft-adapter",
        "grpo": "yashash045/devops-pipeline-gym-trained",
    }

    # 1. Boot env-server
    print("[1/4] Booting env-server...", flush=True)
    env_proc = boot_env_server()

    try:
        # 2. Download adapter if needed
        model_arg = "unsloth/Qwen3-1.7B-bnb-4bit"
        if adapters[mode]:
            print(f"[2/4] Downloading adapter {adapters[mode]}...", flush=True)
            from huggingface_hub import snapshot_download
            model_arg = snapshot_download(
                repo_id=adapters[mode],
                local_dir=f"/kaggle/working/{mode}_adapter",
            )
            print(f"    adapter local: {model_arg}", flush=True)

        # 3. Run eval_baseline.py
        output_json = f"/kaggle/working/eval_{mode}.json"
        print(f"[3/4] Running eval (output: {output_json})...", flush=True)
        cmd = [
            sys.executable, "training/eval_baseline.py",
            "--model", model_arg,
            "--env-url", "http://localhost:8000",
            "--output", output_json,
            "--n-seeds", str(n_seeds),
            # Previously the `temperature` argument was logged but never
            # passed on, so the script ran with its own default.
            "--temperature", str(temperature),
        ]
        subprocess.run(cmd, check=True, env={
            **os.environ,
            "DEVOPS_EVAL_SEED_BASE": "5000",  # avoid training seeds (6000+)
        })

        # 4. Optional Hub upload
        if upload_to_hub and os.environ.get("HF_TOKEN"):
            print("[4/4] Uploading to Hub...", flush=True)
            try:
                from huggingface_hub import HfApi
                api = HfApi(token=os.environ["HF_TOKEN"])
                api.upload_file(
                    path_or_fileobj=output_json,
                    path_in_repo=f"eval_{mode}.json",
                    repo_id="yashash045/devops-pipeline-gym-sft-adapter",
                    repo_type="model",
                    commit_message=f"Kaggle eval: mode={mode}, n_seeds={n_seeds}",
                )
                print(f"    uploaded: https://huggingface.co/yashash045/"
                      f"devops-pipeline-gym-sft-adapter/blob/main/eval_{mode}.json",
                      flush=True)
            except Exception as e:
                # Upload is best-effort: the result file is already on disk.
                print(f"    upload failed (saved locally): {e}", flush=True)
        else:
            print(f"[4/4] Saved locally: {output_json} (set HF_TOKEN to auto-upload)",
                  flush=True)
    finally:
        # Ask the server to shut down; force-kill if it ignores SIGTERM.
        env_proc.send_signal(signal.SIGTERM)
        try:
            env_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            env_proc.kill()
            env_proc.wait()  # reap the child so it doesn't linger as a zombie

    print(f"\n=== EVAL {mode} DONE ===\n", flush=True)


def run_frontier(models=None, n_seeds: int = 3, temperature: float = 0.3,
                 max_tokens: int = 300):
    """Frontier-model baselines via HF Router. CPU-only (no GPU needed).

    Boots the env-server once, then runs ``training/eval_baseline.py`` with
    ``--use-hf-router`` for each model. Each result is written to
    ``/kaggle/working/eval_frontier_<tag>.json`` and, when ``HF_TOKEN`` is
    set, uploaded to the Hub. A failure for one model is printed and the loop
    continues with the next one.

    Args:
        models: Iterable of ``(model_id, tag)`` pairs; ``None`` selects the
            built-in list of five models.
        n_seeds: Forwarded to the eval script as ``--n-seeds``.
        temperature: Forwarded as ``--temperature`` (default matches the
            previously hard-coded 0.3).
        max_tokens: Forwarded as ``--max-tokens`` (default matches the
            previously hard-coded 300).

    Raises:
        RuntimeError: If the env-server fails to start.
    """
    if models is None:
        models = [
            ("Qwen/Qwen2.5-72B-Instruct", "qwen25_72b"),
            ("meta-llama/Llama-3.3-70B-Instruct", "llama33_70b"),
            ("deepseek-ai/DeepSeek-V3.1", "deepseek_v31"),
            ("mistralai/Mistral-Large-Instruct-2411", "mistral_large"),
            ("openai/gpt-oss-120b", "gpt_oss_120b"),
        ]

    env_proc = boot_env_server()
    try:
        from huggingface_hub import HfApi
        api = HfApi(token=os.environ.get("HF_TOKEN"))

        for model_id, tag in models:
            output_json = f"/kaggle/working/eval_frontier_{tag}.json"
            print(f"\n=== Frontier: {model_id} ===", flush=True)
            # NOTE(review): run_eval sets DEVOPS_EVAL_SEED_BASE=5000 for its
            # subprocess; this call inherits the environment unchanged, so
            # the seed base may differ between the two — confirm intended.
            try:
                subprocess.run(
                    [sys.executable, "training/eval_baseline.py",
                     "--model", model_id,
                     "--use-hf-router",
                     "--env-url", "http://localhost:8000",
                     "--output", output_json,
                     "--n-seeds", str(n_seeds),
                     "--temperature", str(temperature),
                     "--max-tokens", str(max_tokens)],
                    check=True, timeout=1800,
                )
            except Exception as e:
                print(f"    {model_id} FAILED: {e}", flush=True)
                continue

            if not api.token:
                continue
            # Report upload problems separately so a finished eval is not
            # mislabelled as a failed model run.
            try:
                api.upload_file(
                    path_or_fileobj=output_json,
                    path_in_repo=f"eval_frontier_{tag}.json",
                    repo_id="yashash045/devops-pipeline-gym-sft-adapter",
                    repo_type="model",
                    commit_message=f"Kaggle frontier eval: {tag}",
                )
            except Exception as e:
                print(f"    {model_id} upload failed (saved locally): {e}",
                      flush=True)
    finally:
        # Ask the server to shut down; force-kill if it ignores SIGTERM.
        env_proc.send_signal(signal.SIGTERM)
        try:
            env_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            env_proc.kill()
            env_proc.wait()  # reap the child so it doesn't linger as a zombie

    print("\n=== FRONTIER BASELINES DONE ===\n", flush=True)