"""Run MLEvolve on a GraphTestbed task, routed through CLIProxyAPI.

Usage:
    python -m agents.mlevolve.runner --task figraph
    python -m agents.mlevolve.runner --task figraph \\
        --model gpt-5.3-codex-spark --steps 100
    python -m agents.mlevolve.runner --task figraph \\
        --submit mlevolve-codex-spark

What this does:
    1. Build an mle-bench-shaped tree from the GraphTestbed task data
       (val-as-test for v1; see adapter.py for why).
    2. Render config.yaml into _vendor/MLEvolve/config/, with the proxy
       endpoint + model wired into agent.code and agent.feedback.
    3. Invoke `python run.py …` from inside _vendor/MLEvolve/ with Hydra
       overrides for paths and run-budget.
    4. Harvest the latest submission.csv from runs/, normalize its column
       names, validate against the testbed schema, and (optionally) submit.

Known v1 limitation: the produced submission contains VAL-set predictions,
not TEST-set ones. To score on the real test set, rerun the best runfile.py
against <workspace>/mlebench-tree/<task>/REAL_TEST_FEATURES.csv before
submitting.
"""

from __future__ import annotations

import argparse
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd

from agents.cliproxyapi import (
    ProxyEndpoint,
    openai_yaml_block,
    wait_until_ready,
)
from agents.common.submit import finalize
from agents.common.workspace import make_workspace
from agents.mlevolve.adapter import stage as stage_mlebench
from graphtestbed._manifest import task_config

DEFAULT_MODEL = "gpt-5.3-codex-spark"


def _resolve_mlevolve_dir() -> Path:
    explicit = os.environ.get("MLEVOLVE_DIR")
    if explicit:
        p = Path(explicit)
        if not (p / "run.py").exists():
            raise SystemExit(f"MLEVOLVE_DIR={p} does not contain run.py")
        return p
    vendored = Path(__file__).parent / "_vendor" / "MLEvolve"
    if (vendored / "run.py").exists():
        return vendored
    raise SystemExit(
        "Cannot locate MLEvolve.\n"
        "  Install: bash agents/mlevolve/install.sh\n"
        "  Or set MLEVOLVE_DIR to your existing clone."
    )
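
# Example (hypothetical path): point the runner at an out-of-tree clone with
#   MLEVOLVE_DIR=/opt/MLEvolve python -m agents.mlevolve.runner --task figraph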


def _hydra_overrides(
    task: str, mlebench_root: Path, prepared: Path, ep: ProxyEndpoint,
    model: str, steps: int, time_limit_s: int, num_gpus: int,
) -> list[str]:
    """Build Hydra-style key=value overrides for run.py."""
    public = prepared / "public"
    block = openai_yaml_block(ep, model)
    cfg_metric = task_config(task)["metric"]["primary"]

    overrides = [
        f"exp_id={task}",
        f"exp_name={task}",
        f"dataset_dir={mlebench_root}",
        f"data_dir={public}",
        f"desc_file={public / 'description.md'}",
        f"start_cpu_id=0",
        f"cpu_number=4",
        # LLM routing β†’ proxy
        f"agent.code.model={block['model']}",
        f"agent.code.base_url={block['base_url']}",
        f"agent.code.api_key={block['api_key']}",
        f"agent.feedback.model={block['model']}",
        f"agent.feedback.base_url={block['base_url']}",
        f"agent.feedback.api_key={block['api_key']}",
        # Run budget overrides
        f"agent.steps={steps}",
        f"agent.time_limit={time_limit_s}",
        f"agent.memory_embedding_device={'cuda' if num_gpus > 0 else 'cpu'}",
        f"agent.search.num_gpus={num_gpus}",
        f"use_grading_server=false",
        # Goal hint
        f"goal=Maximize {cfg_metric} on the test set",
        f"eval={cfg_metric}",
    ]
    return overrides
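
# For illustration only: with hypothetical paths and endpoint values, the
# list above renders into a plain argv sequence such as
#
#   python run.py exp_id=figraph exp_name=figraph \
#       dataset_dir=/ws/mlebench-tree \
#       data_dir=/ws/mlebench-tree/figraph/public \
#       agent.code.model=gpt-5.3-codex-spark \
#       agent.code.base_url=http://localhost:8080/v1 \
#       agent.steps=100 agent.time_limit=7200 ...
#
# Each element is one Hydra key=value override; because the list is handed
# to subprocess without a shell, space-containing values like goal=...
# survive as single overrides.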


def _harvest_submission(
    task: str, mlevolve_dir: Path, dst: Path,
) -> Path:
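    """Pick the newest submission.csv under runs/, normalize its columns to
    the testbed schema, and write it to dst/val_submission.csv."""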
    schema = task_config(task)["submission_schema"]
    runs = mlevolve_dir / "runs"
    if not runs.exists():
        raise SystemExit(f"No runs/ dir under {mlevolve_dir}")
    candidates = sorted(runs.rglob("submission.csv"),
                        key=lambda p: p.stat().st_mtime)
    if not candidates:
        raise SystemExit(
            f"No submission.csv produced under {runs}. "
            f"Inspect {dst / 'agent.log'} for the failure mode."
        )
    chosen = candidates[-1]
    df = pd.read_csv(chosen)
    expected = [schema["id_col"], schema["pred_col"]]
    if list(df.columns) != expected:
        if len(df.columns) == 2:
            print(f"  (renaming columns {list(df.columns)} β†’ {expected})")
            df.columns = expected
        else:
            raise SystemExit(
                f"Cannot normalize {chosen}: got {list(df.columns)}, expected {expected}"
            )
    out = dst / "val_submission.csv"
    df.to_csv(out, index=False)
    print(f"βœ“ Picked {chosen.relative_to(mlevolve_dir)}")
    return out
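
# For reference, submission_schema is expected to be a two-key mapping, e.g.
# (column names hypothetical): {"id_col": "node_id", "pred_col": "score"}.
# Two-column frames with the wrong headers are renamed positionally; anything
# else is rejected rather than guessed at.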


def _print_followup(task: str, ws: Path, val_sub: Path) -> None:
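    """Print the manual steps for rescoring on the real test set (the v1
    limitation described in the module docstring)."""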
    real_test = ws / "mlebench-tree" / task / "REAL_TEST_FEATURES.csv"
    print()
    print("⚠  v1 limitation: the file above scores VAL predictions.")
    print("   To score on the actual test set:")
    print(f"     1. Find the best runfile.py under "
          f"{Path('_vendor/MLEvolve/runs')}/<latest>/")
    print(f"     2. Re-run it with test.csv replaced by:")
    print(f"        {real_test}")
    print(f"     3. Submit the resulting CSV via:")
    print(f"        gtb submit {task} --file <path> --agent <name>")


def main() -> None:
    ap = argparse.ArgumentParser(prog="agents.mlevolve.runner")
    ap.add_argument("--task", required=True)
    ap.add_argument("--model", default=DEFAULT_MODEL,
                    help=f"default: {DEFAULT_MODEL}")
    ap.add_argument("--steps", type=int, default=100,
                    help="agent.steps (default: 100, upstream default 500 β€” "
                         "MCGS exploration count)")
    ap.add_argument("--time-limit-min", type=int, default=120,
                    help="agent.time_limit in minutes (default: 120)")
    ap.add_argument("--gpus", type=int, default=0,
                    help="search.num_gpus (default: 0 β€” CPU only)")
    ap.add_argument("--submit", default=None, metavar="AGENT_ID",
                    help="POST val-set submission to scoring API as this name. "
                         "Note: scores VAL not test (see runner docstring).")
    ap.add_argument("--workspace-root", type=Path, default=None)
    args = ap.parse_args()

    mlevolve_dir = _resolve_mlevolve_dir()
    ep = ProxyEndpoint.from_env()
    wait_until_ready(ep)
    print(f"βœ“ Proxy ready at {ep.base_url()}")
    print(f"βœ“ MLEvolve at {mlevolve_dir}")

    ws = make_workspace("mlevolve", args.task, args.workspace_root)
    mlebench_root = ws / "mlebench-tree"
    prepared = stage_mlebench(args.task, mlebench_root)
    print(f"βœ“ mle-bench tree staged at {mlebench_root}")

    overrides = _hydra_overrides(
        task=args.task,
        mlebench_root=mlebench_root,
        prepared=prepared,
        ep=ep,
        model=args.model,
        steps=args.steps,
        time_limit_s=args.time_limit_min * 60,
        num_gpus=args.gpus,
    )
    cmd = [sys.executable, "run.py", *overrides]

    print(f"β†’ Launching MLEvolve  task={args.task}  model={args.model}")
    print(f"  workspace: {ws}")
    log = ws / "agent.log"
    with log.open("wb") as lf:
        rc = subprocess.call(cmd, cwd=mlevolve_dir, stdout=lf, stderr=subprocess.STDOUT)
    print(f"  exit={rc}  log={log}")
    if rc != 0:
        raise SystemExit(rc)

    val_sub = _harvest_submission(args.task, mlevolve_dir, ws)
    _print_followup(args.task, ws, val_sub)

    # Note: don't auto-finalize against the `test_features.csv` schema, since
    # this is a val-set submission. Print, and only POST if --submit was
    # explicitly set.
    print()
    print(f"  val_submission: {val_sub}")
    if args.submit:
        print(f"  --submit was set; posting val-set predictions as "
              f"`{args.submit}` (will score 0 against test GT).")
        finalize(args.task, val_sub, args.submit)


if __name__ == "__main__":
    main()