#!/usr/bin/env python3
"""
Rebuild the frozen taxonomy JSON from the current annotations_v3/ state.

Run this *once* after annotation is complete to lock the 28+ noun list.
Later experiments load the frozen list via taxonomy.py, so class indices
don't drift if more annotations are ever added.

Usage:
    python3 experiments/build_taxonomy.py
    python3 experiments/build_taxonomy.py --threshold 50 --out experiments/taxonomy_v3.json
"""
import argparse
import glob
import json
import os
from collections import Counter
from pathlib import Path

REPO = Path(__file__).resolve().parents[1]


def _tally_labels(paths, verb_vocab, hand_vocab, canonicalize):
    """Count verb / noun / hand labels over every segment in *paths*.

    A segment is considered only when its ``action_name``, ``object_name``
    and ``hand_type`` are all truthy.  Segments whose verb is not in
    *verb_vocab*, or whose hand is not in *hand_vocab*, are counted as
    dropped and contribute to none of the label counters.  Files that
    cannot be read or parsed are reported with a warning and skipped.

    Returns:
        ``(verbs, nouns, hands, total, dropped_unknown_verb,
        dropped_unknown_hand)`` where the first three are ``Counter``
        objects (nouns keyed by ``canonicalize(object_name)``) and
        ``total`` is the number of fully-labelled segments seen, including
        the dropped ones.
    """
    verbs, nouns, hands = Counter(), Counter(), Counter()
    total = 0
    dropped_unknown_verb = 0
    dropped_unknown_hand = 0

    for p in paths:
        try:
            with open(p, encoding="utf-8") as f:
                d = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError; one bad file must not abort the rebuild.
            print(f"  WARN: could not parse {p}: {e}")
            continue

        # ``or`` also handles keys that are present but explicitly null.
        for s in d.get("segments") or []:
            a = s.get("action_annotation") or {}
            v = a.get("action_name")
            n = a.get("object_name")
            h = a.get("hand_type")
            if not (v and n and h):
                continue
            total += 1
            if v not in verb_vocab:
                dropped_unknown_verb += 1
                continue
            if h not in hand_vocab:
                dropped_unknown_hand += 1
                continue
            verbs[v] += 1
            nouns[canonicalize(n)] += 1
            hands[h] += 1

    return verbs, nouns, hands, total, dropped_unknown_verb, dropped_unknown_hand


def main():
    """Scan the annotation files and write the frozen taxonomy JSON.

    Command-line options: ``--annotations_dir`` (directory holding
    ``v*/s*.json``), ``--threshold`` (minimum noun frequency to keep) and
    ``--out`` (output path).

    Raises:
        SystemExit: if no annotation file matches under the directory.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--annotations_dir",
        default=str(REPO / "annotations_v3"),
        help="Directory containing v*/s*.json annotation files",
    )
    ap.add_argument("--threshold", type=int, default=50,
                    help="Minimum noun frequency to keep (Strategy A drops the rest)")
    ap.add_argument(
        "--out",
        default=str(REPO / "experiments" / "taxonomy_v3.json"),
        help="Output frozen taxonomy JSON",
    )
    args = ap.parse_args()

    # Late import so building the list doesn't depend on the frozen file
    # being present yet.
    import sys
    sys.path.insert(0, str(REPO))
    from experiments.taxonomy import (
        VERB_FINE, VERB_COMPOSITE, HAND, canonical_noun,
    )

    paths = sorted(glob.glob(os.path.join(args.annotations_dir, "v*", "s*.json")))
    if not paths:
        raise SystemExit(f"No json files under {args.annotations_dir}")

    (verbs, nouns, hands, total,
     dropped_unknown_verb, dropped_unknown_hand) = _tally_labels(
        paths, VERB_FINE, HAND, canonical_noun)

    # Order by descending frequency with an alphabetical tie-break, so the
    # resulting class indices are deterministic even when counts are equal.
    kept = sorted(
        (n for n, c in nouns.items() if c >= args.threshold),
        key=lambda n: (-nouns[n], n),
    )

    # ``nouns`` only counts segments that already passed the verb and hand
    # filters, so the segments surviving the noun threshold are exactly the
    # sum of the kept nouns' counts -- no second pass over the files needed.
    surviving_segs = sum(nouns[n] for n in kept)

    out = {
        "threshold": args.threshold,
        "annotation_file_count": len(paths),
        "total_segments": total,
        "dropped_unknown_verb": dropped_unknown_verb,
        "dropped_unknown_hand": dropped_unknown_hand,
        "surviving_segments": surviving_segs,
        "verbs": VERB_FINE,
        "verb_composite": VERB_COMPOSITE,
        "hand": HAND,
        "nouns": kept,
        "noun_counts": {n: nouns[n] for n in kept},
        "verb_counts": dict(verbs),
        "hand_counts": dict(hands),
    }
    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: ensure_ascii=False may emit non-ASCII noun labels.
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Scanned {len(paths)} files, {total} segments")
    print(f"Dropped (unknown verb / hand): {dropped_unknown_verb} / "
          f"{dropped_unknown_hand}")
    print(f"Kept {len(kept)} nouns (>= {args.threshold}):")
    for n in kept:
        print(f"  {n}: {nouns[n]}")
    print(f"Surviving segments (Strategy A): "
          f"{surviving_segs} / {total} "
          f"({100 * surviving_segs / max(1, total):.1f}%)")
    print(f"Wrote {args.out}")


if __name__ == "__main__":
    main()