VLAwithVariousSpeed / scripts /aggregate_eval_results.py

Upload folder using huggingface_hub

08ff31f verified about 1 month ago

5.26 kB

	#!/usr/bin/env python3
	"""Aggregate LIBERO eval JSONs into a per-(speed, suite) CSV.

	Layout expected:
	<results_dir>/speed_<tag>/{spatial,goal,object,long_t_}_<tag>.json

	For each speed_<tag> subdirectory, computes per-suite (spatial / goal / object /
	long) and overall success rate, mean_steps_success and mean_steps_all by
	concatenating the `episodes` lists from the relevant json files. The five
	`long_t_.json` shards are merged into a single `long` row.
	"""

	from __future__ import annotations

	import argparse
	import csv
	import json
	import math
	from pathlib import Path

	SUITE_LABELS = {
	"spatial": ["spatial"],
	"goal": ["goal"],
	"object": ["object"],
	"long": ["long_t0_1", "long_t2_3", "long_t4_5", "long_t6_7", "long_t8_9"],
	}


	def _label_from_stem(stem: str) -> str:
	# filename like "long_t0_1_0p75x" or "spatial_1x" -> drop the trailing speed tag
	return "_".join(stem.split("_")[:-1])


	def _aggregate(episodes: list[dict]) -> dict:
	n = len(episodes)
	succ = [e for e in episodes if e.get("success")]
	steps_all = [e["steps"] for e in episodes]
	steps_succ = [e["steps"] for e in succ]
	return {
	"n_episodes": n,
	"n_success": len(succ),
	"success_rate": len(succ) / n if n else math.nan,
	"mean_steps_success": (sum(steps_succ) / len(steps_succ)) if steps_succ else math.nan,
	"mean_steps_all": (sum(steps_all) / n) if n else math.nan,
	}


	def _collect_speed_dir(speed_dir: Path) -> dict[str, dict]:
	"""Return {suite_name: aggregate_dict} for one speed_<tag> directory."""
	by_label: dict[str, list[dict]] = {}
	for fp in sorted(speed_dir.glob("*.json")):
	label = _label_from_stem(fp.stem)
	with fp.open() as f:
	data = json.load(f)
	by_label[label] = data.get("episodes", [])

	rows: dict[str, dict] = {}
	all_eps: list[dict] = []
	for suite, labels in SUITE_LABELS.items():
	eps: list[dict] = []
	missing = [lbl for lbl in labels if lbl not in by_label]
	for lbl in labels:
	eps.extend(by_label.get(lbl, []))
	if not eps:
	print(f" [warn] {speed_dir.name}: no episodes for suite={suite} (missing={missing})")
	continue
	if missing:
	print(f" [warn] {speed_dir.name}: suite={suite} missing shards {missing}")
	rows[suite] = _aggregate(eps)
	all_eps.extend(eps)

	if all_eps:
	rows["overall"] = _aggregate(all_eps)
	return rows


	def _speed_from_dirname(name: str) -> str:
	# "speed_0p75x" -> "0.75", "speed_1x" -> "1.0"
	tag = name.removeprefix("speed_").removesuffix("x")
	return tag.replace("p", ".") if "p" in tag else f"{float(tag):.1f}"


	def main() -> None:
	ap = argparse.ArgumentParser(description=__doc__)
	ap.add_argument(
	"results_dir",
	type=Path,
	help="Directory containing speed_<tag>/ subdirectories with eval JSONs.",
	)
	ap.add_argument(
	"-o",
	"--output",
	type=Path,
	default=None,
	help="Output CSV path (default: <results_dir>/eval_summary.csv)",
	)
	args = ap.parse_args()

	if not args.results_dir.is_dir():
	ap.error(f"results_dir does not exist: {args.results_dir}")

	out_path = args.output or (args.results_dir / "eval_summary.csv")

	speed_dirs = sorted(p for p in args.results_dir.glob("speed_*") if p.is_dir())
	if not speed_dirs:
	ap.error(f"no speed_*/ subdirectories under {args.results_dir}")

	rows: list[dict] = []
	suite_order = list(SUITE_LABELS.keys()) + ["overall"]

	for sd in speed_dirs:
	speed = _speed_from_dirname(sd.name)
	suite_rows = _collect_speed_dir(sd)
	for suite in suite_order:
	if suite not in suite_rows:
	continue
	agg = suite_rows[suite]
	rows.append(
	{
	"speed": speed,
	"speed_tag": sd.name.removeprefix("speed_"),
	"suite": suite,
	"n_episodes": agg["n_episodes"],
	"n_success": agg["n_success"],
	"success_rate": round(agg["success_rate"], 4),
	"mean_steps_success": round(agg["mean_steps_success"], 2),
	"mean_steps_all": round(agg["mean_steps_all"], 2),
	}
	)

	fieldnames = [
	"speed",
	"speed_tag",
	"suite",
	"n_episodes",
	"n_success",
	"success_rate",
	"mean_steps_success",
	"mean_steps_all",
	]
	with out_path.open("w", newline="") as f:
	w = csv.DictWriter(f, fieldnames=fieldnames)
	w.writeheader()
	w.writerows(rows)

	print(f"\nWrote {len(rows)} rows -> {out_path}")
	# also print a quick console table
	print()
	print(f"{'speed':<6} {'suite':<8} {'success':>10} {'sr':>7} {'steps_succ':>12} {'steps_all':>11}")
	for r in rows:
	print(
	f"{r['speed']:<6} {r['suite']:<8} "
	f"{r['n_success']:>4}/{r['n_episodes']:<5} "
	f"{r['success_rate']*100:>6.1f}% "
	f"{r['mean_steps_success']:>12.1f} "
	f"{r['mean_steps_all']:>11.1f}"
	)


	if __name__ == "__main__":
	main()