PandaEval12 Heatmaps

#!/usr/bin/env python3
"""Aggregate PandaEval12 result files into template x config heatmap data.

For every config directory in ``CONFIGS`` and every dataset in ``DATASETS``
the script scans ``ROOT/<config>/<dataset dir>`` recursively for
``*_results.json`` files, derives a template name from each file name,
computes per-step accuracy and invalid-rate values, averages them per
(step, template, config) cell and finally writes ``OUT_HTML``.
"""
import json
import math
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

ROOT = Path("/workspace/v121rc_exp1")
OUT_HTML = ROOT / "heatmap.html"
CONFIGS = list("ABCDEFGHI")
DATASETS = {
    "PandaEval12_1": "PandaEval12_1_results",
    "PandaEval12_2": "PandaEval12_2_results",
}

# The named groups are required: parse_template_from_stem() reads
# m.group('kind') and m.group('idx').
TEMPLATE_SUFFIX_RE = re.compile(r"_(?P<kind>[PRA])(?P<idx>\d+)$")

_RESULTS_SUFFIX = "_results.json"
_TEMPLATE_NAME_RE = re.compile(r"([PRA])(\d+)$")
_KIND_OFFSET = {"P": 100, "R": 200, "A": 300}

_CellKey = Tuple[int, str, str]  # (step, template, config)
_Matrix = List[List[Optional[float]]]  # rows: templates, columns: CONFIGS


def template_sort_key(t: str) -> int:
    """Return the ordering key for template name *t*.

    ``"BASE"`` sorts first (0); ``P<n>``, ``R<n>`` and ``A<n>`` sort in that
    order by index (100 + n, 200 + n, 300 + n); anything else sorts last
    (9999).
    """
    if t == "BASE":
        return 0
    m = _TEMPLATE_NAME_RE.match(t)
    if m is None:
        return 9999
    return _KIND_OFFSET[m.group(1)] + int(m.group(2))


def parse_template_from_stem(stem: str) -> str:
    """Extract the template name from a result-file stem.

    A stem ending in ``_P<n>``, ``_R<n>`` or ``_A<n>`` yields ``"P<n>"`` etc.
    with leading zeros of the index dropped; any other stem yields ``"BASE"``.
    """
    m = TEMPLATE_SUFFIX_RE.search(stem)
    if m is None:
        return "BASE"
    return f"{m.group('kind')}{int(m.group('idx'))}"


def safe_load_json(path: Path) -> Optional[Any]:
    """Return the parsed JSON content of *path*, or ``None`` on failure.

    Unreadable files and invalid JSON/UTF-8 are deliberately tolerated so a
    single bad result file does not abort the whole aggregation.
    """
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return None


def infer_steps(entries: List[Dict[str, Any]]) -> List[int]:
    """Return the sorted step numbers found in ``step_<n>`` keys of *entries*.

    Non-dict entries and keys whose suffix is not an integer are ignored.
    """
    steps = set()
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        for key in entry:
            if not key.startswith("step_"):
                continue
            try:
                steps.add(int(key.split("_", 1)[1]))
            except ValueError:
                continue
    return sorted(steps)


def compute_acc_invalid(entries: List[Dict[str, Any]], step: int) -> Tuple[float, float]:
    """Return ``(accuracy, invalid_rate)`` of *entries* at *step*.

    Accuracy is the sum of each entry's ``step_<step>["accuracy"]`` divided
    by the total number of entries; entries lacking the step, or with a
    non-numeric accuracy, contribute 0. The invalid rate is the fraction of
    entries whose ``label`` is neither ``"Yes"`` nor ``"No"``. Both values
    are NaN when *entries* is empty.
    """
    n = len(entries)
    if n == 0:
        return float("nan"), float("nan")

    key = f"step_{step}"
    acc_sum = 0.0
    valid_cnt = 0
    for entry in entries:
        record = entry.get(key) if isinstance(entry, dict) else None
        if not isinstance(record, dict):
            # Missing, null or malformed step record: counts as invalid
            # with zero accuracy instead of raising on .get().
            record = {}
        if record.get("label", "") in ("Yes", "No"):
            valid_cnt += 1
        try:
            acc_sum += float(record.get("accuracy", 0.0))
        except (TypeError, ValueError):
            pass  # non-numeric accuracy contributes nothing

    return acc_sum / n, 1.0 - (valid_cnt / n)


def collect_dataset(ds_dirname: str) -> Dict[str, Any]:
    """Aggregate every result file of one dataset directory name.

    Returns a dict with ``"templates"`` (sorted template names),
    ``"steps"`` (sorted step numbers) and ``"by_step"``, which maps each
    step to ``{"accuracy": matrix, "invalid_rate": matrix}``. Matrices are
    indexed ``[template][config]`` and hold the mean over all files that
    fell into the cell, or ``None`` where no file contributed.
    """
    acc_sum: Dict[_CellKey, float] = {}
    acc_cnt: Dict[_CellKey, int] = {}
    inv_sum: Dict[_CellKey, float] = {}
    inv_cnt: Dict[_CellKey, int] = {}
    templates_set = set()
    steps_set = set()

    for cfg in CONFIGS:
        base = ROOT / cfg / ds_dirname
        if not base.exists():
            continue
        # Sorted so float accumulation order (and thus output) is reproducible.
        for p in sorted(base.rglob("*_results.json")):
            stem = p.name[: -len(_RESULTS_SUFFIX)]
            template = parse_template_from_stem(stem)
            # Registered before loading: a template keeps its row even if
            # the file turns out to be unusable.
            templates_set.add(template)
            data = safe_load_json(p)
            if not isinstance(data, list):
                continue
            for step in infer_steps(data):
                steps_set.add(step)
                acc, inv = compute_acc_invalid(data, step)
                k = (step, template, cfg)
                if not math.isnan(acc):
                    acc_sum[k] = acc_sum.get(k, 0.0) + acc
                    acc_cnt[k] = acc_cnt.get(k, 0) + 1
                if not math.isnan(inv):
                    inv_sum[k] = inv_sum.get(k, 0.0) + inv
                    inv_cnt[k] = inv_cnt.get(k, 0) + 1

    templates = sorted(templates_set, key=template_sort_key)
    steps = sorted(steps_set)

    by_step: Dict[int, Dict[str, _Matrix]] = {}
    for step in steps:
        acc_mat: _Matrix = [[None for _ in CONFIGS] for _ in templates]
        inv_mat: _Matrix = [[None for _ in CONFIGS] for _ in templates]
        for ti, t in enumerate(templates):
            for ci, cfg in enumerate(CONFIGS):
                k = (step, t, cfg)
                if acc_cnt.get(k, 0) > 0:
                    acc_mat[ti][ci] = acc_sum[k] / acc_cnt[k]
                if inv_cnt.get(k, 0) > 0:
                    inv_mat[ti][ci] = inv_sum[k] / inv_cnt[k]
        by_step[step] = {"accuracy": acc_mat, "invalid_rate": inv_mat}

    return {"templates": templates, "steps": steps, "by_step": by_step}


def main() -> None:
    """Collect all datasets, align them on shared axes and write ``OUT_HTML``."""
    collected = {}
    all_steps = set()
    all_templates = set()
    for ds_name, ds_dir in DATASETS.items():
        collected[ds_name] = collect_dataset(ds_dir)
        all_steps.update(collected[ds_name]["steps"])
        all_templates.update(collected[ds_name]["templates"])

    templates = sorted(all_templates, key=template_sort_key)
    steps = sorted(all_steps)

    # Re-index every dataset onto the union of templates and steps so all
    # matrices share the same shape; cells a dataset lacks stay None.
    datasets_payload = {}
    for ds_name in DATASETS:
        ds = collected[ds_name]
        ds_template_to_idx = {t: i for i, t in enumerate(ds["templates"])}
        by_step_unified = {}
        for step in steps:
            acc_mat = [[None for _ in CONFIGS] for _ in templates]
            inv_mat = [[None for _ in CONFIGS] for _ in templates]
            old = ds["by_step"].get(step)
            if old is not None:
                for ti, t in enumerate(templates):
                    oti = ds_template_to_idx.get(t)
                    if oti is None:
                        continue
                    acc_mat[ti] = list(old["accuracy"][oti])
                    inv_mat[ti] = list(old["invalid_rate"][oti])
            by_step_unified[step] = {"accuracy": acc_mat, "invalid_rate": inv_mat}
        datasets_payload[ds_name] = {"by_step": by_step_unified}

    payload = {
        "configs": CONFIGS,
        "templates": templates,
        "steps": steps,
        "datasets": datasets_payload,
    }

    # NOTE(review): the template below contains no markup and never
    # interpolates `payload`; the original HTML/JS appears to have been lost
    # from this copy of the file. Restore it before relying on the output.
    html = f""" PandaEval12 Heatmaps

Accuracy PandaEval12_1

Accuracy PandaEval12_2

Invalid rate PandaEval12_1

Invalid rate PandaEval12_2

Step: —

—

"""
    OUT_HTML.write_text(html, encoding="utf-8")
    print(f"Wrote: {OUT_HTML}")


if __name__ == "__main__":
    main()

PandaEval12 — Template × Config Heatmaps