"""Run MLEvolve on a GraphTestbed task, routed through CLIProxyAPI. Usage: python -m agents.mlevolve.runner --task figraph python -m agents.mlevolve.runner --task figraph \\ --model gpt-5.3-codex-spark --steps 100 python -m agents.mlevolve.runner --task figraph \\ --submit mlevolve-codex-spark What this does: 1. Build an mle-bench-shaped tree from the GraphTestbed task data (val-as-test for v1 — see adapter.py for why). 2. Render config.yaml into _vendor/MLEvolve/config/, with the proxy endpoint + model wired into agent.code and agent.feedback. 3. Invoke `python run.py …` from inside _vendor/MLEvolve/ with Hydra overrides for paths and run-budget. 4. Harvest the latest submission.csv from runs/, normalize its column names, validate against the testbed schema, and (optionally) submit. Known v1 limitation: the produced submission scores VAL-set predictions, not TEST-set. To score on test, rerun the best runfile.py against /mlebench-tree//REAL_TEST_FEATURES.csv before submitting. """ from __future__ import annotations import argparse import os import subprocess import sys from pathlib import Path import pandas as pd from agents.cliproxyapi import ( ProxyEndpoint, openai_yaml_block, wait_until_ready, ) from agents.common.submit import finalize from agents.common.workspace import make_workspace from agents.mlevolve.adapter import stage as stage_mlebench from graphtestbed._manifest import task_config DEFAULT_MODEL = "gpt-5.3-codex-spark" def _resolve_mlevolve_dir() -> Path: explicit = os.environ.get("MLEVOLVE_DIR") if explicit: p = Path(explicit) if not (p / "run.py").exists(): raise SystemExit(f"MLEVOLVE_DIR={p} does not contain run.py") return p vendored = Path(__file__).parent / "_vendor" / "MLEvolve" if (vendored / "run.py").exists(): return vendored raise SystemExit( "Cannot locate MLEvolve.\n" " Install: bash agents/mlevolve/install.sh\n" " Or set MLEVOLVE_DIR to your existing clone." ) def _hydra_overrides( task: str, mlebench_root: Path, prepared: Path, ep: ProxyEndpoint, model: str, steps: int, time_limit_s: int, num_gpus: int, ) -> list[str]: """Build Hydra-style key=value overrides for run.py.""" public = prepared / "public" block = openai_yaml_block(ep, model) cfg_metric = task_config(task)["metric"]["primary"] overrides = [ f"exp_id={task}", f"exp_name={task}", f"dataset_dir={mlebench_root}", f"data_dir={public}", f"desc_file={public / 'description.md'}", f"start_cpu_id=0", f"cpu_number=4", # LLM routing → proxy f"agent.code.model={block['model']}", f"agent.code.base_url={block['base_url']}", f"agent.code.api_key={block['api_key']}", f"agent.feedback.model={block['model']}", f"agent.feedback.base_url={block['base_url']}", f"agent.feedback.api_key={block['api_key']}", # Run budget overrides f"agent.steps={steps}", f"agent.time_limit={time_limit_s}", f"agent.memory_embedding_device={'cuda' if num_gpus > 0 else 'cpu'}", f"agent.search.num_gpus={num_gpus}", f"use_grading_server=false", # Goal hint f"goal=Maximize {cfg_metric} on the test set", f"eval={cfg_metric}", ] return overrides def _harvest_submission( task: str, mlevolve_dir: Path, dst: Path, ) -> Path: schema = task_config(task)["submission_schema"] runs = mlevolve_dir / "runs" if not runs.exists(): raise SystemExit(f"No runs/ dir under {mlevolve_dir}") candidates = sorted(runs.rglob("submission.csv"), key=lambda p: p.stat().st_mtime) if not candidates: raise SystemExit( f"No submission.csv produced under {runs}. " f"Inspect {dst / 'agent.log'} for the failure mode." 


def _harvest_submission(
    task: str,
    mlevolve_dir: Path,
    dst: Path,
) -> Path:
    """Pick the newest submission.csv under runs/ and normalize its columns."""
    schema = task_config(task)["submission_schema"]
    runs = mlevolve_dir / "runs"
    if not runs.exists():
        raise SystemExit(f"No runs/ dir under {mlevolve_dir}")
    candidates = sorted(runs.rglob("submission.csv"),
                        key=lambda p: p.stat().st_mtime)
    if not candidates:
        raise SystemExit(
            f"No submission.csv produced under {runs}. "
            f"Inspect {dst / 'agent.log'} for the failure mode."
        )
    chosen = candidates[-1]
    df = pd.read_csv(chosen)
    expected = [schema["id_col"], schema["pred_col"]]
    if list(df.columns) != expected:
        if len(df.columns) == 2:
            print(f"  (renaming columns {list(df.columns)} → {expected})")
            df.columns = expected
        else:
            raise SystemExit(
                f"Cannot normalize {chosen}: "
                f"got {list(df.columns)}, expected {expected}"
            )
    out = dst / "val_submission.csv"
    df.to_csv(out, index=False)
    print(f"✓ Picked {chosen.relative_to(mlevolve_dir)}")
    return out


def _print_followup(task: str, ws: Path, val_sub: Path) -> None:
    real_test = ws / "mlebench-tree" / task / "REAL_TEST_FEATURES.csv"
    print()
    print(f"⚠ v1 limitation: {val_sub.name} scores VAL predictions.")
    print("  To score on the actual test set:")
    print(f"  1. Find the best runfile.py under "
          f"{Path('_vendor/MLEvolve/runs')}/<exp_id>/<run>/")
    print("  2. Re-run it with test.csv replaced by:")
    print(f"       {real_test}")
    print("  3. Submit the resulting CSV via:")
    print(f"       gtb submit {task} --file <csv> --agent <agent-id>")


def main() -> None:
    ap = argparse.ArgumentParser(prog="agents.mlevolve.runner")
    ap.add_argument("--task", required=True)
    ap.add_argument("--model", default=DEFAULT_MODEL,
                    help=f"default: {DEFAULT_MODEL}")
    ap.add_argument("--steps", type=int, default=100,
                    help="agent.steps (default: 100, upstream default 500 — "
                         "MCGS exploration count)")
    ap.add_argument("--time-limit-min", type=int, default=120,
                    help="agent.time_limit in minutes (default: 120)")
    ap.add_argument("--gpus", type=int, default=0,
                    help="agent.search.num_gpus (default: 0 — CPU only)")
    ap.add_argument("--submit", default=None, metavar="AGENT_ID",
                    help="POST the val-set submission to the scoring API "
                         "under this name. Note: scores VAL, not test "
                         "(see runner docstring).")
    ap.add_argument("--workspace-root", type=Path, default=None)
    args = ap.parse_args()

    mlevolve_dir = _resolve_mlevolve_dir()
    ep = ProxyEndpoint.from_env()
    wait_until_ready(ep)
    print(f"✓ Proxy ready at {ep.base_url()}")
    print(f"✓ MLEvolve at {mlevolve_dir}")

    ws = make_workspace("mlevolve", args.task, args.workspace_root)
    mlebench_root = ws / "mlebench-tree"
    prepared = stage_mlebench(args.task, mlebench_root)
    print(f"✓ mle-bench tree staged at {mlebench_root}")

    overrides = _hydra_overrides(
        task=args.task,
        mlebench_root=mlebench_root,
        prepared=prepared,
        ep=ep,
        model=args.model,
        steps=args.steps,
        time_limit_s=args.time_limit_min * 60,
        num_gpus=args.gpus,
    )
    cmd = [sys.executable, "run.py", *overrides]
    print(f"→ Launching MLEvolve task={args.task} model={args.model}")
    print(f"  workspace: {ws}")
    log = ws / "agent.log"
    with log.open("wb") as lf:
        rc = subprocess.call(cmd, cwd=mlevolve_dir,
                             stdout=lf, stderr=subprocess.STDOUT)
    print(f"  exit={rc} log={log}")
    if rc != 0:
        raise SystemExit(rc)

    val_sub = _harvest_submission(args.task, mlevolve_dir, ws)
    _print_followup(args.task, ws, val_sub)

    # Don't auto-finalize against the `test_features.csv` schema: this is a
    # val-set submission. Print the path and stop unless --submit was given
    # explicitly.
    print()
    print(f"  val_submission: {val_sub}")
    if args.submit:
        print(f"  --submit was set; posting val-set predictions as "
              f"`{args.submit}` (will score 0 against test GT).")
        finalize(args.task, val_sub, args.submit)


if __name__ == "__main__":
    main()
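

# Worked example of the manual val→test rescore that _print_followup()
# describes. Hedged sketch: the runs/ layout and the prepared/public/test.csv
# location are assumptions about the vendored MLEvolve version, and the
# placeholders are illustrative, not produced by this module.
#
#   # pick the best candidate (by val score) under the runs tree
#   ls _vendor/MLEvolve/runs/<exp_id>/
#   # swap the staged val-as-test split for the real test features
#   cp <ws>/mlebench-tree/figraph/REAL_TEST_FEATURES.csv \
#      <prepared>/public/test.csv
#   # rerun the chosen runfile unchanged, then submit its output
#   python _vendor/MLEvolve/runs/<exp_id>/<run>/runfile.py
#   gtb submit figraph --file <csv> --agent <agent-id>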