"""Shared serialization for the tiny-vocab physics MoE. Reuses physics_core.fmt_header / fmt_frame, but reduces every frame's free-text description to a tiny controlled keyword set so the learned vocab stays simulation-only. Controlled description set (after the `Frame N:` token): - "at rest" <- "All objects are at rest." - "in motion" <- "All objects are in motion." - "settling" <- "K of N objects are moving." (partial motion) Anything else -> dropped (description omitted; frame still emitted). """ from __future__ import annotations import re import physics_core as pc _AT_REST = re.compile(r"all objects are at rest", re.I) _IN_MOTION = re.compile(r"all objects are in motion", re.I) _PARTIAL = re.compile(r"\d+\s+of\s+\d+\s+objects are moving", re.I) def reduce_desc(raw: str) -> str: """Map a frame's free-text description to a controlled keyword (or '').""" if _AT_REST.search(raw): return "at rest" if _IN_MOTION.search(raw): return "in motion" if _PARTIAL.search(raw): return "settling" return "" def fmt_frame_reduced(fr: dict) -> str: """Like pc.fmt_frame but with the description replaced by a keyword.""" fr2 = dict(fr) fr2["description"] = reduce_desc(fr.get("description", "")) return pc.fmt_frame(fr2) def fmt_header_reduced(header: dict) -> str: """pc.fmt_header with the free-text Scene description blanked out. Keeps every structural line (Gravity / Timestep / Type / Difficulty / Static / Constraints) so the categorical `Type:` token survives, but the `Scene:` line carries no English prose -> vocab stays sim-only. """ h2 = dict(header) h2["description"] = "" return pc.fmt_header(h2) def serialize_scene(header: dict, frames: list) -> str: """Full scene text: reduced header + reduced frames (no trailing BOS/EOS).""" txt = fmt_header_reduced(header) txt += "".join(fmt_frame_reduced(fr) for fr in frames) return txt