# camera-motion-ab-eval / run_space_eval.py
# (HF page residue, kept as comments) kaier111's picture
# Pin huggingface_hub<1 for gradio compatibility
# dc95314 verified
#!/usr/bin/env python3
"""Call HF Space app API for remote A/B evaluation.
This lets us run experiments remotely (for faster iteration workflow) and
store returned JSON/CSV artifacts locally.
"""
from __future__ import annotations

import argparse
import os
import shutil
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple

from gradio_client import Client, handle_file
# Default Space to call and default LLM judge model; both can be overridden
# via CLI flags (--space-id, --judge-model) or the JUDGE_MODEL env var.
DEFAULT_SPACE = "kaier111/camera-motion-ab-eval"
DEFAULT_JUDGE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
@dataclass(frozen=True)
class RunResult:
    """Artifacts produced by one remote Space evaluation run."""
    # Markdown summary text returned by the Space.
    summary_md: str
    # Local filesystem path to the downloaded JSON report.
    json_path: str
    # Local filesystem path to the downloaded CSV report.
    csv_path: str
    # Captured run logs as plain text.
    logs: str
def _as_file_output_path(value: object) -> Optional[str]:
if isinstance(value, str):
return value
if isinstance(value, dict):
# gradio may return {"path": "..."} in some versions.
p = value.get("path")
return str(p) if p else None
return None
def call_space(
    *,
    space_id: str,
    mode: str,
    hf_token: str,
    enable_llm_judge: bool,
    judge_model: str,
    judge_token: str,
    builtin_cases: str,
    max_shots: int,
    video: str,
    shots_jsonl: str,
    gt_json: str,
    sample_ids: str,
    timeout_sec: int,
) -> RunResult:
    """Invoke the Space's ``/run_eval`` endpoint and return its artifacts.

    Empty-string file arguments (video/shots_jsonl/gt_json) are replaced by
    a throwaway stub upload so the request validates even when the Space
    does not consume them.

    Returns a RunResult with the summary markdown, local paths to the JSON
    and CSV reports, and the run logs.

    Raises RuntimeError when the Space returns an unexpected payload or
    omits an output file.
    """
    # Floor the HTTP timeout: Space cold starts alone can exceed short
    # client defaults.
    httpx_timeout = max(120, int(timeout_sec))
    client = Client(space_id, httpx_kwargs={"timeout": httpx_timeout})

    # Some Gradio versions mark File params as required at the API layer
    # even when builtin mode does not consume them; provide stubs as a
    # fallback. Use tempfile instead of the previous fixed
    # "/tmp/space_eval_stub.txt": a fixed, world-shared path can collide
    # with (or be pre-created by) another user — raising PermissionError —
    # and a predictable /tmp name is symlink-attackable.
    stub_fd, stub_path = tempfile.mkstemp(prefix="space_eval_stub_", suffix=".txt")
    try:
        with os.fdopen(stub_fd, "w", encoding="utf-8") as f:
            f.write("stub")

        video_arg = handle_file(video) if video else handle_file(stub_path)
        shots_arg = handle_file(shots_jsonl) if shots_jsonl else handle_file(stub_path)
        gt_arg = handle_file(gt_json) if gt_json else handle_file(stub_path)

        # Use positional args to avoid name-mapping drift across
        # gradio-client versions.
        job = client.submit(
            mode,
            hf_token,
            enable_llm_judge,
            judge_model,
            judge_token,
            builtin_cases,
            float(max_shots),  # gradio Number components expect float
            video_arg,
            shots_arg,
            gt_arg,
            sample_ids,
            api_name="/run_eval",
        )
        out = job.result(timeout=timeout_sec)
    finally:
        # Best-effort cleanup; by this point the stub (if used) has been
        # uploaded, so removing the local copy is safe.
        try:
            os.remove(stub_path)
        except OSError:
            pass

    if not isinstance(out, (list, tuple)) or len(out) != 4:
        raise RuntimeError(f"Unexpected space output: {type(out)} -> {out}")
    summary_md = str(out[0])
    json_path = _as_file_output_path(out[1])
    csv_path = _as_file_output_path(out[2])
    logs = str(out[3])
    if not json_path or not csv_path:
        raise RuntimeError(f"Space did not return output files. json={out[1]} csv={out[2]}")
    return RunResult(summary_md=summary_md, json_path=json_path, csv_path=csv_path, logs=logs)
def main() -> int:
    """CLI entry point: run a remote eval and persist returned artifacts.

    Returns 0 on success; raises RuntimeError on missing required options.
    """
    cli = argparse.ArgumentParser(description="Run remote eval against HF Space app API")
    cli.add_argument("--space-id", default=DEFAULT_SPACE)
    cli.add_argument("--mode", choices=("builtin", "custom"), default="custom")
    cli.add_argument("--hf-token", default=os.environ.get("HF_TOKEN", ""))
    cli.add_argument("--enable-llm-judge", action="store_true")
    cli.add_argument("--judge-model", default=os.environ.get("JUDGE_MODEL", DEFAULT_JUDGE_MODEL))
    cli.add_argument("--judge-token", default=os.environ.get("JUDGE_TOKEN", ""))
    cli.add_argument("--builtin-cases", default="baseus,runner,vertical")
    cli.add_argument("--max-shots", type=int, default=1)
    cli.add_argument("--video", default="")
    cli.add_argument("--shots-jsonl", default="")
    cli.add_argument("--gt-json", default="")
    cli.add_argument("--sample-ids", default="")
    cli.add_argument("--timeout-sec", type=int, default=3600)
    cli.add_argument("--out-json", default="space_ab_report.json")
    cli.add_argument("--out-csv", default="space_ab_report.csv")
    cli.add_argument("--out-log", default="space_ab_report.log")
    args = cli.parse_args()

    # Validate up front so we fail before any network call.
    if not args.hf_token:
        raise RuntimeError("HF token required: --hf-token or HF_TOKEN")
    if args.mode == "custom" and not args.video:
        raise RuntimeError("--video is required in custom mode")
    if args.mode == "custom" and not args.gt_json:
        raise RuntimeError("--gt-json is required in custom mode")

    result = call_space(
        space_id=args.space_id,
        mode=args.mode,
        hf_token=args.hf_token,
        enable_llm_judge=bool(args.enable_llm_judge),
        judge_model=args.judge_model,
        judge_token=args.judge_token,
        builtin_cases=args.builtin_cases,
        max_shots=max(0, int(args.max_shots)),
        video=args.video,
        shots_jsonl=args.shots_jsonl,
        gt_json=args.gt_json,
        sample_ids=args.sample_ids,
        timeout_sec=max(1, int(args.timeout_sec)),
    )

    out_json = Path(args.out_json).resolve()
    out_csv = Path(args.out_csv).resolve()
    out_log = Path(args.out_log).resolve()
    for destination in (out_json, out_csv, out_log):
        destination.parent.mkdir(parents=True, exist_ok=True)

    # Copy the Space's artifacts from gradio_client's cache to stable paths.
    shutil.copyfile(result.json_path, str(out_json))
    shutil.copyfile(result.csv_path, str(out_csv))
    out_log.write_text(result.logs, encoding="utf-8")

    print("[SPACE] summary")
    print(result.summary_md)
    print(f"[SPACE] json -> {out_json}")
    print(f"[SPACE] csv -> {out_csv}")
    print(f"[SPACE] log -> {out_log}")
    return 0
# Script entry point: the process exit status mirrors main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())