| """ |
| Taxonomy for T10 Next-Action Triplet Prediction on DailyAct-5M. |
| |
| Design decisions (fixed per user): |
| * VERB_FINE: 17 primitives observed in annotations_v3 (Strategy: keep all) |
| * VERB_COMPOSITE: 6 classes by manual rollup |
| * NOUN: keep nouns with >=50 segments (Strategy A: drop others entirely) |
| * HAND: 3 classes {left, right, both} |
| |
| The noun list is *frozen* in taxonomy_v3.json so class indices stay stable even |
| as more annotations are added. Regenerate with `build_taxonomy.py` when you are |
| ready to lock the final list. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| from pathlib import Path |
| from typing import Dict, List, Optional |
|
|
| |
| |
| |
|
|
# Fine-grained verb vocabulary: the 17 action primitives observed in
# annotations_v3. Order is frozen — it defines the class indices.
VERB_FINE: List[str] = [
    "grasp", "move", "place",
    "adjust", "pick_up", "hold",
    "pull", "put_down", "close",
    "release", "rotate", "open",
    "insert", "push", "align",
    "remove", "stabilize",
]
NUM_VERB_FINE = len(VERB_FINE)
# Reverse lookup: verb string -> class index.
VERB_FINE_IDX: Dict[str, int] = {name: idx for idx, name in enumerate(VERB_FINE)}
|
|
|
|
| |
| |
| |
|
|
# Composite verb vocabulary: 6 classes produced by manual rollup of the fine
# verbs. Order is frozen — it defines the class indices.
VERB_COMPOSITE: List[str] = [
    "grasp-family",
    "place-family",
    "transport",
    "adjust",
    "state-change",
    "release",
]
NUM_VERB_COMPOSITE = len(VERB_COMPOSITE)
# Reverse lookup: composite name -> class index.
VERB_COMPOSITE_IDX: Dict[str, int] = {name: idx for idx, name in enumerate(VERB_COMPOSITE)}
|
|
# Manual rollup: fine verb -> composite class name. Every fine verb must
# appear exactly once as a key; validated at import time below.
_FINE_TO_COMPOSITE: Dict[str, str] = {
    "grasp": "grasp-family",
    "pick_up": "grasp-family",
    "hold": "grasp-family",
    "place": "place-family",
    "put_down": "place-family",
    "move": "transport",
    "pull": "transport",
    "push": "transport",
    "adjust": "adjust",
    "align": "adjust",
    "stabilize": "adjust",
    "open": "state-change",
    "close": "state-change",
    "rotate": "state-change",
    "insert": "state-change",
    "remove": "state-change",
    "release": "release",
}

# Import-time sanity checks. A plain `assert` would be silently skipped under
# `python -O`, so raise explicitly to guarantee the invariants always hold.
_missing = set(VERB_FINE) - set(_FINE_TO_COMPOSITE)
_extra = set(_FINE_TO_COMPOSITE) - set(VERB_FINE)
if _missing or _extra:
    raise RuntimeError(
        "Verb rollup must cover every fine verb exactly once "
        f"(missing={sorted(_missing)}, extra={sorted(_extra)})"
    )
_unknown_targets = set(_FINE_TO_COMPOSITE.values()) - set(VERB_COMPOSITE)
if _unknown_targets:
    raise RuntimeError(
        f"Verb rollup maps to unknown composite classes: {sorted(_unknown_targets)}"
    )
del _missing, _extra, _unknown_targets
|
|
|
|
def verb_fine_to_composite_idx(verb_fine: str) -> int:
    """Return the composite class index (0..5) for a fine verb string.

    Raises KeyError if *verb_fine* is not one of the 17 known primitives.
    """
    return VERB_COMPOSITE_IDX[_FINE_TO_COMPOSITE[verb_fine]]
|
|
|
|
| |
| |
| |
|
|
# Laterality vocabulary: which hand performs the action. Order is frozen.
HAND: List[str] = ["left", "right", "both"]
NUM_HAND = len(HAND)
# Reverse lookup: hand label -> class index.
HAND_IDX: Dict[str, int] = {label: pos for pos, label in enumerate(HAND)}
|
|
|
|
| |
| |
| |
|
|
# Alias table: raw annotation noun -> canonical English name. Covers the CJK
# string that leaked into the annotations plus plain vocabulary renames.
NOUN_CANONICAL: Dict[str, str] = {
    "折叠雨伞": "folding umbrella",
    "mouse": "wired mouse",
}


def canonical_noun(n: str) -> str:
    """Return the canonical name for raw noun *n* (identity when no alias)."""
    alias = NOUN_CANONICAL.get(n)
    return n if alias is None else alias
|
|
|
|
| |
| |
| |
|
|
| TAXONOMY_FROZEN_PATH = Path(__file__).parent / "taxonomy_v3.json" |
| NOUN_KEEP_THRESHOLD = 50 |
|
|
|
|
| def _load_frozen() -> Optional[dict]: |
| if not TAXONOMY_FROZEN_PATH.exists(): |
| return None |
| with open(TAXONOMY_FROZEN_PATH) as f: |
| return json.load(f) |
|
|
|
|
_frozen = _load_frozen()

if _frozen is None:
    # No frozen file yet: fall back to the provisional noun list from the last
    # taxonomy-building run (nouns with >=50 segments; Strategy A drops the rest).
    NOUN: List[str] = [
        "towel", "sealed jar", "box", "tablecloth", "pot", "tape", "rice bowl",
        "pants", "spoon", "marker", "cloth", "plate", "laptop",
        "toothbrush case", "tea canister", "hanger", "wired keyboard",
        "wired mouse", "laptop power adapter", "seasoning bottle", "mug",
        "seasoning jar", "tray", "document", "coat", "tea bag", "water cup",
        "shirt",
    ]
    FROZEN_ANNOTATION_COUNT: int = 167
    FROZEN_SEGMENT_COUNT: int = 4140
else:
    # Frozen taxonomy wins so class indices stay stable as annotations grow.
    NOUN = list(_frozen["nouns"])
    FROZEN_ANNOTATION_COUNT = _frozen.get("annotation_file_count", -1)
    FROZEN_SEGMENT_COUNT = _frozen.get("total_segments", -1)

NUM_NOUN = len(NOUN)
# Reverse lookup: canonical noun -> class index.
NOUN_IDX: Dict[str, int] = {name: pos for pos, name in enumerate(NOUN)}
|
|
|
|
def noun_to_idx(raw_noun: str) -> Optional[int]:
    """Return the class index for *raw_noun* after canonicalisation.

    Returns None when the noun is not in the kept list, meaning the segment
    should be dropped entirely (Strategy A).
    """
    return NOUN_IDX.get(canonical_noun(raw_noun))
|
|
|
|
| |
| |
| |
|
|
def classify_segment(action_annotation: dict) -> Optional[dict]:
    """Convert one raw annotation dict into triplet label indices.

    Reads the keys "action_name", "object_name" and "hand_type". Returns a
    dict of class indices {"verb_fine", "verb_composite", "noun", "hand"},
    or None when any field is missing/empty, the verb or hand is unknown,
    or the noun falls outside the kept list (Strategy A: drop the segment).
    """
    verb = action_annotation.get("action_name")
    noun = action_annotation.get("object_name")
    hand = action_annotation.get("hand_type")
    # Guard clauses: reject missing/empty fields and unknown labels early.
    if not verb or not noun or not hand:
        return None
    if verb not in VERB_FINE_IDX or hand not in HAND_IDX:
        return None
    n_idx = noun_to_idx(noun)
    if n_idx is None:  # `is None`, not truthiness: index 0 is a valid noun class
        return None
    return {
        "verb_fine": VERB_FINE_IDX[verb],
        "verb_composite": verb_fine_to_composite_idx(verb),
        "noun": n_idx,
        "hand": HAND_IDX[hand],
    }
|
|
|
|
| |
| |
| |
|
|
def summary() -> str:
    """Return a short human-readable report of all taxonomy class counts."""
    parts = [
        f"Verb fine : {NUM_VERB_FINE}",
        f"Verb composite : {NUM_VERB_COMPOSITE}",
        f"Noun : {NUM_NOUN} (kept at >= {NOUN_KEEP_THRESHOLD} segments)",
        f"Hand : {NUM_HAND}",
        f"Frozen from : {FROZEN_ANNOTATION_COUNT} files, {FROZEN_SEGMENT_COUNT} segments",
    ]
    return "\n".join(parts)
|
|
|
|
if __name__ == "__main__":
    # Smoke report: dump the class counts followed by every label list.
    print(summary())
    print()
    for label, values in (
        ("Verb fine list:", VERB_FINE),
        ("Composite: ", VERB_COMPOSITE),
        ("Noun list: ", NOUN),
        ("Hand list: ", HAND),
    ):
        print(label, values)
|
|