"""Run MLEvolve on a GraphTestbed task, routed through CLIProxyAPI.
Usage:
python -m agents.mlevolve.runner --task figraph
python -m agents.mlevolve.runner --task figraph \\
--model gpt-5.3-codex-spark --steps 100
python -m agents.mlevolve.runner --task figraph \\
--submit mlevolve-codex-spark
What this does:
1. Build an mle-bench-shaped tree from the GraphTestbed task data
   (val-as-test for v1; see adapter.py for why).
2. Wire the proxy endpoint + model into agent.code and agent.feedback
   (passed as Hydra overrides rather than a rendered config.yaml).
3. Invoke `python run.py …` from inside _vendor/MLEvolve/ with those
   overrides plus the path and run-budget settings.
4. Harvest the latest submission.csv from runs/, normalize its column
names, validate against the testbed schema, and (optionally) submit.
Known v1 limitation: the produced submission scores VAL-set predictions,
not TEST-set. To score on test, rerun the best runfile.py against
<workspace>/mlebench-tree/<task>/REAL_TEST_FEATURES.csv before submitting.
"""
from __future__ import annotations
import argparse
import os
import subprocess
import sys
from pathlib import Path
import pandas as pd
from agents.cliproxyapi import (
ProxyEndpoint,
openai_yaml_block,
wait_until_ready,
)
from agents.common.submit import finalize
from agents.common.workspace import make_workspace
from agents.mlevolve.adapter import stage as stage_mlebench
from graphtestbed._manifest import task_config
DEFAULT_MODEL = "gpt-5.3-codex-spark"

def _resolve_mlevolve_dir() -> Path:
    """Locate MLEvolve: honor $MLEVOLVE_DIR, else fall back to the vendored copy."""
explicit = os.environ.get("MLEVOLVE_DIR")
if explicit:
p = Path(explicit)
if not (p / "run.py").exists():
raise SystemExit(f"MLEVOLVE_DIR={p} does not contain run.py")
return p
vendored = Path(__file__).parent / "_vendor" / "MLEvolve"
if (vendored / "run.py").exists():
return vendored
raise SystemExit(
"Cannot locate MLEvolve.\n"
" Install: bash agents/mlevolve/install.sh\n"
" Or set MLEVOLVE_DIR to your existing clone."
)
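
# e.g. to use an existing checkout instead of the vendored copy (the path below
# is illustrative):
#   MLEVOLVE_DIR=~/src/MLEvolve python -m agents.mlevolve.runner --task figraph
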
def _hydra_overrides(
task: str, mlebench_root: Path, prepared: Path, ep: ProxyEndpoint,
model: str, steps: int, time_limit_s: int, num_gpus: int,
) -> list[str]:
"""Build Hydra-style key=value overrides for run.py."""
public = prepared / "public"
block = openai_yaml_block(ep, model)
cfg_metric = task_config(task)["metric"]["primary"]
overrides = [
f"exp_id={task}",
f"exp_name={task}",
f"dataset_dir={mlebench_root}",
f"data_dir={public}",
f"desc_file={public / 'description.md'}",
f"start_cpu_id=0",
f"cpu_number=4",
# LLM routing β†’ proxy
f"agent.code.model={block['model']}",
f"agent.code.base_url={block['base_url']}",
f"agent.code.api_key={block['api_key']}",
f"agent.feedback.model={block['model']}",
f"agent.feedback.base_url={block['base_url']}",
f"agent.feedback.api_key={block['api_key']}",
# Run budget overrides
f"agent.steps={steps}",
f"agent.time_limit={time_limit_s}",
f"agent.memory_embedding_device={'cuda' if num_gpus > 0 else 'cpu'}",
f"agent.search.num_gpus={num_gpus}",
f"use_grading_server=false",
# Goal hint
f"goal=Maximize {cfg_metric} on the test set",
f"eval={cfg_metric}",
]
return overrides
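
# For a concrete feel: `--task figraph --steps 100 --time-limit-min 120` yields
# roughly the argv tail below (placeholders in <>; the real model / base_url /
# api_key come from openai_yaml_block and ProxyEndpoint):
#
#   exp_id=figraph exp_name=figraph
#   dataset_dir=<ws>/mlebench-tree data_dir=<ws>/mlebench-tree/.../public
#   agent.code.model=gpt-5.3-codex-spark agent.code.base_url=<proxy-url> ...
#   agent.steps=100 agent.time_limit=7200 use_grading_server=false
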
def _harvest_submission(
task: str, mlevolve_dir: Path, dst: Path,
) -> Path:
    """Pick the newest runs/**/submission.csv, normalize columns, write to dst."""
schema = task_config(task)["submission_schema"]
runs = mlevolve_dir / "runs"
if not runs.exists():
raise SystemExit(f"No runs/ dir under {mlevolve_dir}")
candidates = sorted(runs.rglob("submission.csv"),
key=lambda p: p.stat().st_mtime)
if not candidates:
raise SystemExit(
f"No submission.csv produced under {runs}. "
f"Inspect {dst / 'agent.log'} for the failure mode."
)
chosen = candidates[-1]
df = pd.read_csv(chosen)
expected = [schema["id_col"], schema["pred_col"]]
if list(df.columns) != expected:
if len(df.columns) == 2:
print(f" (renaming columns {list(df.columns)} β†’ {expected})")
df.columns = expected
else:
raise SystemExit(
f"Cannot normalize {chosen}: got {list(df.columns)}, expected {expected}"
)
out = dst / "val_submission.csv"
df.to_csv(out, index=False)
print(f"βœ“ Picked {chosen.relative_to(mlevolve_dir)}")
return out
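
# Worked example of the normalization above: with a (hypothetical) schema of
# {"id_col": "node_id", "pred_col": "pred"}, a submission written as
# `id,prediction` is renamed to `node_id,pred`; anything that is not exactly
# two columns is rejected rather than guessed at.
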
def _print_followup(task: str, ws: Path, val_sub: Path, mlevolve_dir: Path) -> None:
    """Tell the user how to turn the val-set submission into a test-set one."""
    real_test = ws / "mlebench-tree" / task / "REAL_TEST_FEATURES.csv"
    print()
    print(f"⚠ v1 limitation: {val_sub.name} holds VAL-set predictions.")
    print("  To score on the actual test set:")
    print(f"  1. Find the best runfile.py under {mlevolve_dir / 'runs'}/<latest>/")
    print("  2. Re-run it with test.csv replaced by:")
    print(f"     {real_test}")
    print("  3. Submit the resulting CSV via:")
    print(f"     gtb submit {task} --file <path> --agent <name>")

def main() -> None:
ap = argparse.ArgumentParser(prog="agents.mlevolve.runner")
ap.add_argument("--task", required=True)
ap.add_argument("--model", default=DEFAULT_MODEL,
help=f"default: {DEFAULT_MODEL}")
ap.add_argument("--steps", type=int, default=100,
help="agent.steps (default: 100, upstream default 500 β€” "
"MCGS exploration count)")
ap.add_argument("--time-limit-min", type=int, default=120,
help="agent.time_limit in minutes (default: 120)")
ap.add_argument("--gpus", type=int, default=0,
help="search.num_gpus (default: 0 β€” CPU only)")
ap.add_argument("--submit", default=None, metavar="AGENT_ID",
help="POST val-set submission to scoring API as this name. "
"Note: scores VAL not test (see runner docstring).")
ap.add_argument("--workspace-root", type=Path, default=None)
args = ap.parse_args()
mlevolve_dir = _resolve_mlevolve_dir()
ep = ProxyEndpoint.from_env()
wait_until_ready(ep)
print(f"βœ“ Proxy ready at {ep.base_url()}")
print(f"βœ“ MLEvolve at {mlevolve_dir}")
ws = make_workspace("mlevolve", args.task, args.workspace_root)
mlebench_root = ws / "mlebench-tree"
prepared = stage_mlebench(args.task, mlebench_root)
print(f"βœ“ mle-bench tree staged at {mlebench_root}")
overrides = _hydra_overrides(
task=args.task,
mlebench_root=mlebench_root,
prepared=prepared,
ep=ep,
model=args.model,
steps=args.steps,
time_limit_s=args.time_limit_min * 60,
num_gpus=args.gpus,
)
cmd = [sys.executable, "run.py", *overrides]
print(f"β†’ Launching MLEvolve task={args.task} model={args.model}")
print(f" workspace: {ws}")
log = ws / "agent.log"
with log.open("wb") as lf:
rc = subprocess.call(cmd, cwd=mlevolve_dir, stdout=lf, stderr=subprocess.STDOUT)
print(f" exit={rc} log={log}")
if rc != 0:
raise SystemExit(rc)
val_sub = _harvest_submission(args.task, mlevolve_dir, ws)
    _print_followup(args.task, ws, val_sub, mlevolve_dir)
    # Note: don't auto-finalize against the `test_features.csv` schema since
    # this is a val-set submission; just print and stop unless --submit was
    # passed explicitly.
print()
print(f" val_submission: {val_sub}")
if args.submit:
print(f" --submit was set; posting val-set predictions as "
f"`{args.submit}` (will score 0 against test GT).")
finalize(args.task, val_sub, args.submit)
if __name__ == "__main__":
main()