File size: 7,933 Bytes
40de84e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""Kaggle T4 single-cell eval helper for devops-pipeline-gym.

Designed to run in ONE Kaggle notebook cell. Free T4 GPU. No HF Jobs cost.

Usage in Kaggle notebook (paste this entire script as one cell, or use
the 4-line shim below that imports and calls run_eval()):

    # Single-cell shim
    !git clone https://github.com/Yashash4/devops-pipeline-gym /kaggle/working/dpg 2>/dev/null
    %cd /kaggle/working/dpg
    !pip install -q -e .
    import scripts.kaggle_eval as ke; ke.run_eval(mode="base")  # or "sft" or "grpo"

Modes:
    base : Qwen3-1.7B-bnb-4bit, no adapter
    sft  : + SFT adapter (yashash045/devops-pipeline-gym-sft-adapter)
    grpo : + SFT + GRPO adapter (yashash045/devops-pipeline-gym-trained)

Set HF_TOKEN in Kaggle Add-ons → Secrets before running. Results are always
written to /kaggle/working/eval_<mode>.json; when HF_TOKEN is set they are
also uploaded to the yashash045/devops-pipeline-gym-sft-adapter model repo
as eval_<mode>.json.
"""

import os
import signal
import subprocess
import sys
import time
import urllib.request
from pathlib import Path


def boot_env_server(port: int = 8000, timeout_s: int = 105) -> subprocess.Popen:
    """Start the env server with uvicorn in the background and wait until it answers.

    Launches ``uvicorn server.app:app`` bound to 127.0.0.1 on ``port`` with its
    stdout/stderr redirected to /tmp/env_server.log, sleeps 15 s, then polls
    ``POST /reset`` until it returns HTTP 200.

    Args:
        port: TCP port the server listens on.
        timeout_s: How long to keep polling after the initial 15 s wait.

    Returns:
        The running ``subprocess.Popen`` handle; the caller is responsible for
        stopping it.

    Raises:
        RuntimeError: If the server process exits before becoming healthy (the
            message carries the last 2000 characters of the log), or if no
            HTTP 200 is seen within ``timeout_s`` seconds.
    """
    log_path = "/tmp/env_server.log"
    # The child gets its own copy of the descriptor, so the parent's handle can
    # be released as soon as Popen returns instead of leaking for the session.
    with open(log_path, "w") as log_fh:
        proc = subprocess.Popen(
            [sys.executable, "-m", "uvicorn", "server.app:app",
             "--host", "127.0.0.1", "--port", str(port),
             "--log-level", "info"],
            stdout=log_fh, stderr=subprocess.STDOUT,
        )
    try:
        time.sleep(15)
        deadline = time.time() + timeout_s
        while time.time() < deadline:
            if proc.poll() is not None:
                with open(log_path) as f:
                    tail = f.read()[-2000:]
                raise RuntimeError(f"env-server died:\n{tail}")
            try:
                req = urllib.request.Request(
                    f"http://localhost:{port}/reset", method="POST",
                    data=b"{}", headers={"Content-Type": "application/json"},
                )
                with urllib.request.urlopen(req, timeout=5) as r:
                    if r.status == 200:
                        print(f"env-server healthy (PID {proc.pid})", flush=True)
                        return proc
            except Exception:
                # Best-effort probe: connection refused / timeouts / non-2xx
                # responses simply mean "not ready yet", so keep polling.
                pass
            time.sleep(1.5)
        raise RuntimeError(f"env-server failed health check in {timeout_s}s")
    except BaseException:
        # Startup failed or was interrupted: do not leave an orphaned server
        # holding the port, otherwise the next attempt cannot bind to it.
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                proc.kill()
        raise


def _ensure_kaggle_deps():
    """Pip-upgrade bitsandbytes to >=0.46.1 in the current interpreter's environment.

    Per the original author's note, the default Kaggle image ships a release
    older than 0.46 that cannot 4-bit quantize Qwen3 models through the API
    eval_baseline.py relies on. Calling this repeatedly is harmless; the first
    call reportedly takes about 10 s.
    """
    pip_upgrade = [
        sys.executable, "-m", "pip", "install", "-q", "-U",
        "bitsandbytes>=0.46.1",
    ]
    print("[deps] Upgrading bitsandbytes>=0.46.1 (Kaggle ships an older version)...",
          flush=True)
    # check=False on purpose: a flaky pip run must not abort the whole eval,
    # since the installed bitsandbytes may already be recent enough.
    subprocess.run(pip_upgrade, check=False)


def run_eval(mode: str = "base", n_seeds: int = 5, temperature: float = 0.3,
             upload_to_hub: bool = True):
    """Run the multi-seed eval for one model variant and optionally upload it.

    Steps: upgrade bitsandbytes, boot the local env server, download the
    adapter for ``mode`` (skipped for "base"), run ``training/eval_baseline.py``
    as a subprocess writing /kaggle/working/eval_<mode>.json, then upload that
    file to the Hub. The env server is always stopped afterwards.

    Args:
        mode: "base" (no adapter), "sft" or "grpo"; selects which adapter repo
            is downloaded and passed to the eval script as ``--model``.
        n_seeds: Forwarded to the eval script as ``--n-seeds``.
        temperature: Forwarded to the eval script as ``--temperature``.
        upload_to_hub: Upload the result JSON when HF_TOKEN is set in the
            environment; upload errors are printed, not raised.

    Raises:
        AssertionError: If ``mode`` is not one of base/sft/grpo.
        RuntimeError: If the env server fails to start.
        subprocess.CalledProcessError: If the eval script exits non-zero.
    """
    adapters = {
        "base": None,
        "sft": "yashash045/devops-pipeline-gym-sft-adapter",
        "grpo": "yashash045/devops-pipeline-gym-trained",
    }

    # Explicit raise (rather than an `assert` statement) so the check is not
    # stripped under `python -O`; the exception type is kept for compatibility.
    if mode not in adapters:
        raise AssertionError(f"mode must be base/sft/grpo, got {mode}")
    print(f"=== Eval mode={mode} n_seeds={n_seeds} temp={temperature} ===", flush=True)

    # 0. Ensure bnb >= 0.46 (Kaggle image fix)
    _ensure_kaggle_deps()

    # 1. Boot env-server
    print("[1/4] Booting env-server...", flush=True)
    env_proc = boot_env_server()

    try:
        # 2. Download adapter if needed; for "base" the Hub model id is used as-is.
        model_arg = "unsloth/Qwen3-1.7B-bnb-4bit"
        if adapters[mode]:
            print(f"[2/4] Downloading adapter {adapters[mode]}...", flush=True)
            from huggingface_hub import snapshot_download
            model_arg = snapshot_download(
                repo_id=adapters[mode],
                local_dir=f"/kaggle/working/{mode}_adapter",
            )
            print(f"    adapter local: {model_arg}", flush=True)

        # 3. Run eval_baseline.py
        output_json = f"/kaggle/working/eval_{mode}.json"
        print(f"[3/4] Running eval (output: {output_json})...", flush=True)
        cmd = [
            sys.executable, "training/eval_baseline.py",
            "--model", model_arg,
            "--env-url", "http://localhost:8000",
            "--output", output_json,
            "--n-seeds", str(n_seeds),
            # Previously the temperature was printed in the banner but never
            # reached the eval script; forward it so the run matches the banner.
            "--temperature", str(temperature),
        ]
        subprocess.run(cmd, check=True, env={
            **os.environ,
            "DEVOPS_EVAL_SEED_BASE": "5000",  # avoid training seeds (6000+)
        })

        # 4. Optional Hub upload
        if upload_to_hub and os.environ.get("HF_TOKEN"):
            print("[4/4] Uploading to Hub...", flush=True)
            try:
                from huggingface_hub import HfApi
                api = HfApi(token=os.environ["HF_TOKEN"])
                # All modes upload to the SFT adapter repo, keyed by file name.
                api.upload_file(
                    path_or_fileobj=output_json,
                    path_in_repo=f"eval_{mode}.json",
                    repo_id="yashash045/devops-pipeline-gym-sft-adapter",
                    repo_type="model",
                    commit_message=f"Kaggle eval: mode={mode}, n_seeds={n_seeds}",
                )
                print(f"    uploaded: https://huggingface.co/yashash045/"
                      f"devops-pipeline-gym-sft-adapter/blob/main/eval_{mode}.json",
                      flush=True)
            except Exception as e:
                # The JSON is already on disk, so an upload problem is reported
                # without failing the eval.
                print(f"    upload failed (saved locally): {e}", flush=True)
        else:
            print(f"[4/4] Saved locally: {output_json} (set HF_TOKEN to auto-upload)",
                  flush=True)
    finally:
        # Ask the server to stop; force-kill if it has not exited within 10 s.
        env_proc.send_signal(signal.SIGTERM)
        try:
            env_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            env_proc.kill()

    print(f"\n=== EVAL {mode} DONE ===\n", flush=True)


def run_frontier(models=None, n_seeds: int = 3):
    """Run frontier-model baselines through the HF Router (no GPU needed).

    Boots the local env server, then for each ``(model_id, tag)`` pair runs
    ``training/eval_baseline.py --use-hf-router`` writing
    /kaggle/working/eval_frontier_<tag>.json. When HF_TOKEN is set, each result
    is uploaded to the yashash045/devops-pipeline-gym-sft-adapter model repo.
    A failure for one model is printed and the loop moves on to the next one.
    The env server is always stopped afterwards.

    Args:
        models: Iterable of ``(hub_model_id, short_tag)`` pairs; defaults to a
            built-in list of five models when None.
        n_seeds: Forwarded to the eval script as ``--n-seeds``.

    Raises:
        RuntimeError: If the env server fails to start.
    """
    if models is None:
        models = [
            ("Qwen/Qwen2.5-72B-Instruct", "qwen25_72b"),
            ("meta-llama/Llama-3.3-70B-Instruct", "llama33_70b"),
            ("deepseek-ai/DeepSeek-V3.1", "deepseek_v31"),
            ("mistralai/Mistral-Large-Instruct-2411", "mistral_large"),
            ("openai/gpt-oss-120b", "gpt_oss_120b"),
        ]

    env_proc = boot_env_server()
    try:
        from huggingface_hub import HfApi
        api = HfApi(token=os.environ.get("HF_TOKEN"))

        for model_id, tag in models:
            output_json = f"/kaggle/working/eval_frontier_{tag}.json"
            print(f"\n=== Frontier: {model_id} ===", flush=True)
            # NOTE(review): unlike run_eval, DEVOPS_EVAL_SEED_BASE is not set
            # here, so these runs may use different seeds — confirm intended.
            try:
                subprocess.run(
                    [sys.executable, "training/eval_baseline.py",
                     "--model", model_id,
                     "--use-hf-router",
                     "--env-url", "http://localhost:8000",
                     "--output", output_json,
                     "--n-seeds", str(n_seeds),
                     "--temperature", "0.3",
                     "--max-tokens", "300"],
                    check=True, timeout=1800,
                )
            except Exception as e:
                # Best-effort sweep: one model failing (non-zero exit, 30 min
                # timeout, ...) must not stop the remaining baselines.
                print(f"    {model_id} FAILED: {e}", flush=True)
                continue

            if not api.token:
                continue
            # Handled separately from the eval so that an upload problem is not
            # misreported as the model's eval having failed.
            try:
                api.upload_file(
                    path_or_fileobj=output_json,
                    path_in_repo=f"eval_frontier_{tag}.json",
                    repo_id="yashash045/devops-pipeline-gym-sft-adapter",
                    repo_type="model",
                    commit_message=f"Kaggle frontier eval: {tag}",
                )
            except Exception as e:
                print(f"    upload failed (saved locally): {e}", flush=True)
    finally:
        # Ask the server to stop; force-kill if it has not exited within 10 s.
        env_proc.send_signal(signal.SIGTERM)
        try:
            env_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            env_proc.kill()

    print("\n=== FRONTIER BASELINES DONE ===\n", flush=True)