File size: 13,133 Bytes

0a55ff6

#!/usr/bin/env python3
"""
eval_local.py — run the spec_rl RL-eval loop OFFLINE against the local stub.

Purpose
-------
Prove the *shape* of the RL evaluation loop with NO Prime Intellect credits and
NO GPU: drive the spec_rl HumanEval code task's rollouts against the local,
stdlib OpenAI-compatible stub (scripts/stub_server.py) and compute the SAME
reward the verifiers environment computes (`@vf.reward code_reward`) — run the
model's candidate code against the problem's unit tests and return the FRACTION
of assertions that pass (dense RL signal; the pass@1 eval stays binary).

At the venue the same loop points at the DFlash-speculated vLLM endpoint instead
of the stub. Because greedy speculative decoding is lossless, the reward curve is
identical; only the cost per rollout drops. This script lets us validate the loop
end-to-end before any credits are spent.

Reward logic is NOT reimplemented here — it is imported verbatim from
`environments/spec_rl/spec_rl.py` (`fraction_passing`, `passes`, `STOP`,
`load_problems`), so what runs locally is byte-identical to what the verifiers
env scores at the venue.

Two execution paths (auto-selected, reported in the output)
-----------------------------------------------------------
  1. "verifiers"  — if `verifiers` imports AND `spec_rl.load_environment()`
     constructs cleanly AND the endpoint exposes /v1/chat/completions, drive the
     real `vf.SingleTurnEnv.evaluate(...)`. This is the true RL-eval API.
  2. "manual"     — otherwise, a minimal hand-rolled rollout loop: build the same
     prompt, call the endpoint, trim at STOP, score with
     spec_rl.fraction_passing. This is the path that actually runs against the
     canned-completion stub (which serves only /v1/completions), and it is
     reported as such.

Note on the stub: it returns a fixed canned completion for EVERY prompt, so the
real HumanEval tests will almost always fail (reward 0.0). That is expected and
correct — the point here is to prove the loop runs end-to-end offline without
erroring, not to produce a real pass@1. Real rewards come from Laguna at the venue.

SAFETY: scoring executes model-generated code in a timed subprocess (see
spec_rl.passes). Locally the "code" is the stub's harmless canned snippet. Run RL
rollouts only in the disposable venue sandbox, never against real data.

Usage
-----
  # start a stub first:  make stub        (baseline, :8000)
  #                  or:  make stub-b      (dflash,   :8001)
  python scripts/eval_local.py --base-url http://localhost:8000 --model laguna --n 5
"""
from __future__ import annotations

import argparse
import json
import os
import sys
import urllib.error
import urllib.request
from pathlib import Path

# ---------------------------------------------------------------------------
# Import the spec_rl env module so reward logic is shared, not duplicated. The
# env lives in a sibling tree (environments/spec_rl/spec_rl.py); add it to the
# path. spec_rl is import-safe even when `verifiers` is absent (its vf import is
# guarded), so this works on the Mac with no GPU and no verifiers.
# ---------------------------------------------------------------------------
# Absolute, symlink-resolved path of this script; the other locations below are
# derived from it by walking up the directory tree.
_HERE = Path(__file__).resolve()
_REPO = _HERE.parents[1]                      # .../laguna-hack
_GPU_HW = _HERE.parents[2]                    # .../gpu_and_inference_hw
# Directory that is put on sys.path so `import spec_rl` below can resolve.
_SPEC_RL_DIR = _GPU_HW / "environments" / "spec_rl"
# Insert at the front so this directory wins over any other `spec_rl` on the
# path; the membership check avoids adding a duplicate entry.
if str(_SPEC_RL_DIR) not in sys.path:
    sys.path.insert(0, str(_SPEC_RL_DIR))

import spec_rl  # noqa: E402  — shared reward core (fraction_passing, STOP, load_problems, ...)

# Default destination of the JSON summary written by main(); override with --out.
DEFAULT_OUT = _REPO / "results" / "eval_local.json"

# System prompt mirrors spec_rl.load_environment so the manual loop sends the
# exact same instruction the verifiers env would.
# NOTE(review): the text is duplicated here rather than imported — confirm it
# still matches spec_rl whenever either side changes.
# Only complete_chat() sends this (as the "system" message); the
# /v1/completions path in complete_text() sends the bare prompt without it.
SYSTEM_PROMPT = (
    "You are an expert Python programmer. You will be given a function "
    "signature and docstring. Complete the function body only. Do not repeat "
    "the signature, do not add explanations, and do not wrap the code in "
    "markdown fences. Output only the indented function body."
)


# ---------------------------------------------------------------------------
# Endpoint helpers (stdlib urllib only — matches the rest of the harness).
# ---------------------------------------------------------------------------
def _post_json(url: str, payload: dict, timeout: int = 600) -> dict:
    data = json.dumps(payload).encode()
    req = urllib.request.Request(
        url, data=data, headers={"Content-Type": "application/json"}
    )
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return json.loads(r.read().decode())


def _endpoint_has_chat(base_url: str) -> bool:
    """True if the endpoint answers /v1/chat/completions (vLLM does; stub does not)."""
    url = base_url.rstrip("/") + "/v1/chat/completions"
    probe = {
        "model": "probe",
        "messages": [{"role": "user", "content": "ping"}],
        "max_tokens": 1,
        "temperature": 0.0,
    }
    try:
        _post_json(url, probe, timeout=10)
        return True
    except urllib.error.HTTPError as e:
        # 4xx/5xx still means the route exists and parsed our body; only a
        # 404 means "no chat endpoint here" (the stub returns 404 for it).
        return e.code != 404
    except Exception:
        return False


def complete_chat(base_url: str, model: str, user_content: str, max_tokens: int) -> str:
    """Request one greedy chat completion and return the assistant text.

    The request carries SYSTEM_PROMPT as the system message, *user_content* as
    the user message, temperature 0.0 and spec_rl.STOP as stop sequences.
    A missing/empty ``content`` in the first choice is returned as "".
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    response = _post_json(
        base_url.rstrip("/") + "/v1/chat/completions",
        {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            # Greedy decoding keeps runs deterministic and comparable.
            "temperature": 0.0,
            "stop": spec_rl.STOP,
        },
    )
    content = response["choices"][0]["message"]["content"]
    return content if content else ""


def complete_text(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request one greedy text completion from /v1/completions and return its text.

    Sends the bare *prompt* (no system prompt) with temperature 0.0 and
    spec_rl.STOP as stop sequences. A missing/empty ``text`` in the first
    choice is returned as "".
    """
    response = _post_json(
        base_url.rstrip("/") + "/v1/completions",
        {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": 0.0,
            "stop": spec_rl.STOP,
        },
    )
    generated = response["choices"][0]["text"]
    return generated if generated else ""


def _trim_at_stop(text: str) -> str:
    """Cut at the first STOP sequence, mirroring the env's code_passes reward."""
    for stop in spec_rl.STOP:
        idx = text.find(stop)
        if idx != -1:
            text = text[:idx]
    return text


# ---------------------------------------------------------------------------
# Path 1 — drive the real verifiers env, if (and only if) it constructs cleanly
# AND the endpoint speaks chat. Returns a results dict, or None to fall back.
# ---------------------------------------------------------------------------
def try_verifiers(base_url: str, model: str, n: int) -> dict | None:
    """Run the evaluation through the real verifiers environment, if possible.

    Every precondition or runtime failure returns None so the caller can fall
    back to the manual rollout loop instead of crashing.

    Args:
        base_url: OpenAI-compatible endpoint root.
        model: Model name passed to ``env.evaluate``.
        n: Number of examples to evaluate.

    Returns:
        ``{"driver": "verifiers", "mean_reward": float, "per_example": [...]}``
        on success, or None when verifiers/openai cannot be imported, the
        environment does not construct, the endpoint lacks
        /v1/chat/completions, ``evaluate`` raises, or it yields no rewards.
    """
    try:
        import verifiers as vf  # noqa: F401
    except Exception:
        return None
    # load_environment() builds a vf.SingleTurnEnv. In some verifiers versions
    # the symbols spec_rl references (e.g. vf.Dataset) may not exist; guard the
    # whole construction so a mismatch falls back to the manual loop instead of
    # crashing the eval.
    try:
        env = spec_rl.load_environment(num_examples=n)
    except Exception as e:  # AttributeError/ImportError/etc. -> manual fallback
        print(f"[eval_local] verifiers env did not construct ({type(e).__name__}: {e});"
              " falling back to manual rollout loop.")
        return None
    if not _endpoint_has_chat(base_url):
        print("[eval_local] endpoint has no /v1/chat/completions (the local stub "
              "serves only /v1/completions); using manual rollout loop instead.")
        return None
    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        print("[eval_local] openai client not available; using manual rollout loop.")
        return None

    client = OpenAI(base_url=base_url.rstrip("/") + "/v1", api_key="EMPTY")
    # The evaluate() signature and client requirements can differ between
    # verifiers versions; honour the "None means fall back" contract here too.
    try:
        out = env.evaluate(client=client, model=model, num_examples=n, save_results=False)
    except Exception as e:
        print(f"[eval_local] verifiers evaluate() failed ({type(e).__name__}: {e});"
              " falling back to manual rollout loop.")
        return None

    # Normalize verifiers' GenerateOutputs into our flat per-example shape.
    rewards = list(getattr(out, "reward", []) or [])
    if not rewards:
        # An output without rewards would otherwise be reported as a
        # "verifiers" run with n=0 and mean 0.0, which hides the failure.
        print("[eval_local] verifiers returned no rewards; "
              "falling back to manual rollout loop.")
        return None
    completions = list(getattr(out, "completion", []) or [])
    infos = list(getattr(out, "info", []) or [])
    per_example = []
    for i, r in enumerate(rewards):
        info = infos[i] if i < len(infos) else {}
        per_example.append({
            "index": i,
            "task_id": (info or {}).get("task_id", f"example_{i}"),
            "score": float(r),
            "completion": completions[i] if i < len(completions) else "",
        })
    mean = sum(p["score"] for p in per_example) / len(per_example)
    return {"driver": "verifiers", "mean_reward": mean, "per_example": per_example}


# ---------------------------------------------------------------------------
# Path 2 — manual rollout loop (the offline / stub path). Reuses
# spec_rl.fraction_passing and spec_rl.STOP so the reward is identical to the
# env's @vf.reward.
# ---------------------------------------------------------------------------
def manual_rollouts(base_url: str, model: str, n: int, max_tokens: int) -> dict:
    """Hand-rolled rollout loop: generate, trim, and score each problem.

    Loads *n* problems via ``spec_rl.load_problems``, picks the chat or the
    text-completion transport with a single probe of the endpoint, and scores
    every trimmed completion with ``spec_rl.fraction_passing``.

    Returns:
        A dict with "driver" ("manual"), "transport" ("chat" or
        "completions"), "mean_reward" (0.0 when there are no problems) and
        "per_example" (index, task_id, score, completion per problem).
    """
    problems = spec_rl.load_problems(n)
    total = len(problems)

    # One probe decides the transport for the whole run. On the completions
    # (stub) path the bare code prompt is sent; the chat path sends the same
    # prompt as the user message.
    if _endpoint_has_chat(base_url):
        transport, generate = "chat", complete_chat
    else:
        transport, generate = "completions", complete_text
    print(f"[eval_local] manual loop: {total} examples via /v1/{transport} "
          f"at {base_url} (model={model})")

    records = []
    for position, prob in enumerate(problems, start=1):
        completion = _trim_at_stop(generate(base_url, model, prob["prompt"], max_tokens))

        # Score against the problem's own fields, never against anything the
        # model echoed back.
        scoring_input = {key: prob[key] for key in ("prompt", "test", "entry_point")}
        score = spec_rl.fraction_passing(scoring_input, completion)

        records.append({
            "index": position - 1,
            "task_id": prob["task_id"],
            "score": score,
            "completion": completion,
        })
        print(f"  [{position}/{total}] {prob['task_id']}: reward={score:.3f}")

    if records:
        mean = sum(rec["score"] for rec in records) / len(records)
    else:
        mean = 0.0
    return {
        "driver": "manual",
        "transport": transport,
        "mean_reward": mean,
        "per_example": records,
    }


def main() -> int:
    """CLI entry point: run the eval, print a summary, and write it as JSON.

    Tries the verifiers driver first unless --force-manual is given, and falls
    back to the manual rollout loop whenever that driver returns None.

    Returns:
        0 on completion; failures surface as exceptions.
    """
    parser = argparse.ArgumentParser(
        description="Run the spec_rl RL-eval loop offline against the local stub "
                    "(or any OpenAI-compatible endpoint) and compute the reward."
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8000",
        help="OpenAI-compatible endpoint (stub :8000 / dflash stub :8001 / vLLM).",
    )
    parser.add_argument("--model", default="laguna")
    parser.add_argument(
        "--n", type=int, default=5,
        help="Number of HumanEval problems (rollouts).",
    )
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument(
        "--out", default=str(DEFAULT_OUT),
        help="Where to write the small JSON summary.",
    )
    parser.add_argument(
        "--force-manual", action="store_true",
        help="Skip the verifiers path; always use the manual rollout loop.",
    )
    args = parser.parse_args()

    result = None if args.force_manual else try_verifiers(args.base_url, args.model, args.n)
    if result is None:
        result = manual_rollouts(args.base_url, args.model, args.n, args.max_tokens)

    rows = result["per_example"]
    summary = {
        "base_url": args.base_url,
        "model": args.model,
        "n": len(rows),
        "driver": result["driver"],
        # The verifiers driver reports no transport; it always uses chat.
        "transport": result.get("transport", "chat"),
        "mean_reward": result["mean_reward"],
        "scores": [row["score"] for row in rows],
        "per_example": [
            {"task_id": row["task_id"], "score": row["score"]} for row in rows
        ],
    }

    # Console output omits the per-example list; the file keeps it.
    headline = dict(summary)
    del headline["per_example"]
    print(json.dumps(headline, indent=2))
    print(f"[eval_local] driver={summary['driver']}  "
          f"mean_reward={summary['mean_reward']:.3f}  n={summary['n']}")

    destination = Path(args.out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(summary, indent=2))
    print(f"[eval_local] wrote {destination}")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())