AlexWortega committed on
Commit
970e1f2
·
unverified ·
1 Parent(s): 36a90c6

Add /evaluate endpoint: numerical position MSE model vs Pymunk

Browse files

Adds a @spaces.GPU function that runs the model rollout and a fresh Pymunk
rollout from the same seed state, then returns the per-frame mean and max
Euclidean distance between model and Pymunk positions, plus the overall mean
(raw and as a percentage of the scene diagonal). Note that, despite the "MSE"
label, the metric is a mean Euclidean distance, not a mean squared error.
Exposed as api_name=/evaluate so that all featured demos can be benchmarked
from a script, and wired to a small 'Compute position MSE vs Pymunk' accordion
in the UI.

Files changed (1) hide show
  1. app.py +85 -0
app.py CHANGED
@@ -721,6 +721,84 @@ def scene_loaded(name: str) -> tuple[str, str]:
721
  return editor_html(bundle), json.dumps(bundle)
722
 
723
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724
  # -----------------------------------------------------------------------------
725
  # Simulation (streamed)
726
  # -----------------------------------------------------------------------------
@@ -927,5 +1005,12 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
927
  run.click(simulate, [scene_state, scenario, n_frames, temperature],
928
  [view, view_truth, gif, status])
929
 
 
 
 
 
 
 
 
930
  if __name__ == "__main__":
931
  demo.launch()
 
721
  return editor_html(bundle), json.dumps(bundle)
722
 
723
 
724
+ # -----------------------------------------------------------------------------
725
+ # Numerical evaluation: model rollout vs Pymunk ground truth (position MSE).
726
+ # Exposed via the api_name="/evaluate" endpoint so we can benchmark featured
727
+ # demos from a script without scraping the UI.
728
+ # -----------------------------------------------------------------------------
729
def _position_errors(model_objs: dict, gt_frame: dict) -> list[float]:
    """Return per-object Euclidean distances between model and ground truth.

    Args:
        model_objs: Mapping of object id -> object dict; each object carries a
            ``"position"`` dict with ``"x"`` and ``"y"`` entries.
        gt_frame: Ground-truth frame dict with an ``"objects"`` list whose
            entries carry ``"id"`` and ``"position"``.

    Returns:
        One distance per object id present in both inputs. Ids that are missing
        from the ground-truth frame are skipped.
    """
    gt_pos = {o["id"]: o["position"] for o in gt_frame["objects"]}
    errs: list[float] = []
    for oid, obj in model_objs.items():
        if oid in gt_pos:
            dx = gt_pos[oid]["x"] - obj["position"]["x"]
            dy = gt_pos[oid]["y"] - obj["position"]["y"]
            errs.append((dx * dx + dy * dy) ** 0.5)
    return errs


@gpu(duration=300)
def evaluate(scene_json: str, scenario_name: str, n_frames: int) -> str:
    """Roll the model and Pymunk forward from the same seed and compare positions.

    The reported metric is the mean Euclidean distance between the model's and
    Pymunk's object positions (not a mean *squared* error, despite the "MSE"
    wording used in the UI label).

    Args:
        scene_json: JSON-encoded scene bundle with a ``"header"`` entry and an
            ``"initial_frames"`` list; the last initial frame seeds both rollouts.
        scenario_name: Label echoed back in the result under ``"scenario"``.
        n_frames: Number of frames to roll out (coerced with ``int``).

    Returns:
        A JSON string with the keys ``scenario``, ``n_obj``, ``scene_diag``,
        ``frames_done``, ``frames_held_avg``, ``mean_dist``,
        ``mean_dist_pct_diag``, ``elapsed`` and ``per_frame``. Distances are
        ``None`` when no object could be matched against the ground truth.

    Raises:
        ValueError: If the bundle contains no initial frames, since neither
            rollout can be seeded without one.
    """
    bundle = json.loads(scene_json)
    header = bundle["header"]
    initial = bundle.get("initial_frames") or []
    if not initial:
        # The seed frame is required below for both rollouts; fail with a clear
        # message rather than an opaque IndexError on initial[-1].
        raise ValueError(
            "evaluate() requires at least one frame in 'initial_frames' to seed the rollouts"
        )
    n_frames = int(n_frames)

    # Object count: prefer the header's explicit value, then its object list,
    # then the number of objects in the first initial frame.
    n_obj = (header.get("object_count")
             or len(header.get("objects", []))
             or len(initial[0]["objects"]))
    x0, x1, y0, y1 = scene_bounds(header)
    diag = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5

    # Ground truth, indexed by frame number for lookup during the model rollout.
    gt_frames = pymunk_rollout(header, initial[-1], n_frames)
    gt_by_frame = {f["frame"]: f for f in gt_frames}

    from llama_cpp import Llama  # noqa: F401  (preload may be required)
    llm = get_llm(lambda s: None)
    budget = int(min(2000, n_obj * 36 + 100))

    rolled: list[dict] = list(initial)
    last_idx = initial[-1]["frame"]
    per_frame: list[dict] = []
    # Timer starts here, so "elapsed" covers only the model rollout loop.
    t0 = time.time()

    for _ in range(n_frames):
        prompt, _ = fit_prompt(llm, header, rolled, budget)
        # The frame being generated is last_idx + 1; stop as soon as the model
        # begins emitting a header for any of the four frames after it.
        first_stop = last_idx + 2
        stops = [f"Frame {first_stop + d}:" for d in range(4)]
        out = llm.create_completion(prompt, max_tokens=budget, temperature=0.0,
                                    top_p=0.95, stop=stops)
        text = out["choices"][0]["text"]
        parsed = parse_frame(split_first_frame(text), n_obj)
        modeled = len(parsed)

        # Objects the model failed to emit are held at their previous state.
        prev_objs = {o["id"]: o for o in rolled[-1]["objects"]}
        new_objs = dict(parsed) if parsed else dict(prev_objs)
        if modeled < n_obj:
            for oid, obj in prev_objs.items():
                new_objs.setdefault(oid, obj)

        last_idx += 1
        rolled.append({
            "frame": last_idx,
            "description": emitted_description(text) or f"Frame {last_idx}: simulation in progress.",
            "objects": list(new_objs.values()),
        })

        gt = gt_by_frame.get(last_idx)
        if gt:
            errs = _position_errors(new_objs, gt)
            per_frame.append({
                "frame": last_idx, "modeled": modeled,
                "mean_dist": (sum(errs) / len(errs)) if errs else None,
                "max_dist": max(errs) if errs else None,
            })

    valid = [p for p in per_frame if p["mean_dist"] is not None]
    mean_dist = (sum(p["mean_dist"] for p in valid) / len(valid)) if valid else None
    # Compare against None explicitly: a perfect rollout has mean_dist == 0.0,
    # which must be reported as 0.0 % rather than dropped as "no data".
    pct_diag = (mean_dist / diag * 100.0) if (mean_dist is not None and diag) else None
    return json.dumps({
        "scenario": scenario_name,
        "n_obj": n_obj,
        "scene_diag": diag,
        "frames_done": len(per_frame),
        "frames_held_avg": sum(n_obj - p["modeled"] for p in per_frame) / max(1, len(per_frame)),
        "mean_dist": mean_dist,
        "mean_dist_pct_diag": pct_diag,
        "elapsed": round(time.time() - t0, 2),
        "per_frame": per_frame,
    })
800
+
801
+
802
  # -----------------------------------------------------------------------------
803
  # Simulation (streamed)
804
  # -----------------------------------------------------------------------------
 
1005
  run.click(simulate, [scene_state, scenario, n_frames, temperature],
1006
  [view, view_truth, gif, status])
1007
 
1008
+ with gr.Accordion("📊 Compute position MSE vs Pymunk (numerical)", open=False):
1009
+ with gr.Row():
1010
+ eval_frames = gr.Slider(5, 30, value=15, step=1, label="Frames to evaluate")
1011
+ eval_btn = gr.Button("Run evaluation", scale=1)
1012
+ eval_out = gr.Code(language="json", lines=12, label="Result")
1013
+ eval_btn.click(evaluate, [scene_state, scenario, eval_frames], [eval_out])
1014
+
1015
  if __name__ == "__main__":
1016
  demo.launch()