Spaces:
Running on Zero
Running on Zero
Add /evaluate endpoint: numerical position MSE model vs Pymunk
Browse files
Adds a @spaces.GPU function that runs the model rollout and a fresh Pymunk rollout
from the same seed state, then returns the per-frame mean Euclidean distance
between model and Pymunk positions (raw and normalized as a percentage of the scene diagonal).
Exposed as api_name=/evaluate so we can benchmark all featured demos with a
script. Wired to a small 'Compute position MSE vs Pymunk' accordion in the UI.
app.py
CHANGED
|
@@ -721,6 +721,84 @@ def scene_loaded(name: str) -> tuple[str, str]:
|
|
| 721 |
return editor_html(bundle), json.dumps(bundle)
|
| 722 |
|
| 723 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 724 |
# -----------------------------------------------------------------------------
|
| 725 |
# Simulation (streamed)
|
| 726 |
# -----------------------------------------------------------------------------
|
|
@@ -927,5 +1005,12 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
|
|
| 927 |
run.click(simulate, [scene_state, scenario, n_frames, temperature],
|
| 928 |
[view, view_truth, gif, status])
|
| 929 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 930 |
if __name__ == "__main__":
|
| 931 |
demo.launch()
|
|
|
|
| 721 |
return editor_html(bundle), json.dumps(bundle)
|
| 722 |
|
| 723 |
|
| 724 |
+
# -----------------------------------------------------------------------------
|
| 725 |
+
# Numerical evaluation: model rollout vs Pymunk ground truth (mean Euclidean position error).
|
| 726 |
+
# Exposed via the api_name="/evaluate" endpoint so we can benchmark featured
|
| 727 |
+
# demos from a script without scraping the UI.
|
| 728 |
+
# -----------------------------------------------------------------------------
|
| 729 |
+
@gpu(duration=300)
def evaluate(scene_json: str, scenario_name: str, n_frames: int) -> str:
    """Roll the model and Pymunk forward from the same seed and compare positions.

    The scene bundle is decoded from ``scene_json``. Its last initial frame
    seeds a Pymunk rollout of ``n_frames`` frames, and the language model is
    then asked for one frame at a time. After every model frame, the
    Euclidean distance between the model's and Pymunk's position is computed
    for each object id present in both, and the per-frame mean and max are
    recorded. Frames for which the Pymunk rollout has no entry with the same
    frame index are not recorded.

    Args:
        scene_json: JSON text with a ``"header"`` object and a non-empty
            ``"initial_frames"`` list.
        scenario_name: Label echoed back in the result under ``"scenario"``.
        n_frames: Number of frames to generate; coerced with ``int()``.

    Returns:
        A JSON string with the keys ``scenario``, ``n_obj``, ``scene_diag``,
        ``frames_done``, ``frames_held_avg`` (average per recorded frame of
        ``n_obj - modeled``, i.e. objects carried over from the previous
        frame), ``mean_dist``, ``mean_dist_pct_diag``, ``elapsed`` (seconds)
        and ``per_frame`` (a list of ``frame`` / ``modeled`` / ``mean_dist``
        / ``max_dist`` records).

    Raises:
        ValueError: If the bundle has no initial frames to seed the rollouts.
    """
    bundle = json.loads(scene_json)
    header = bundle["header"]
    initial = bundle.get("initial_frames") or []
    if not initial:
        # Both rollouts start from the last initial frame, so without one
        # there is nothing to compare. Fail with a readable message instead
        # of an IndexError further down.
        raise ValueError(
            "evaluate() needs at least one entry in 'initial_frames' to seed the rollouts"
        )

    total_frames = int(n_frames)
    n_obj = (header.get("object_count")
             or len(header.get("objects", []))
             or len(initial[0]["objects"]))
    x0, x1, y0, y1 = scene_bounds(header)
    diag = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5

    gt_frames = pymunk_rollout(header, initial[-1], total_frames)
    gt_by_frame = {f["frame"]: f for f in gt_frames}

    from llama_cpp import Llama  # noqa: F401  (preload may be required)
    llm = get_llm(lambda s: None)
    budget = int(min(2000, n_obj * 36 + 100))

    rolled: list[dict] = list(initial)
    last_idx = initial[-1]["frame"]
    per_frame: list[dict] = []
    t0 = time.perf_counter()

    for _ in range(total_frames):
        prompt, _ctx_frames = fit_prompt(llm, header, rolled, budget)
        # The frame being generated is last_idx + 1. Generation is cut off as
        # soon as the model starts the header of any of the four frames
        # after it.
        first_stop = last_idx + 2
        stops = [f"Frame {first_stop + d}:" for d in range(0, 4)]
        out = llm.create_completion(
            prompt,
            max_tokens=budget,
            temperature=0.0,
            top_p=0.95,
            stop=stops,
        )
        text = out["choices"][0]["text"]
        # NOTE(review): parse_frame appears to return a mapping of object id
        # to object dict -- confirm against its definition.
        parsed = parse_frame(split_first_frame(text), n_obj)
        modeled = len(parsed)
        prev_objs = {o["id"]: o for o in rolled[-1]["objects"]}
        new_objs = dict(parsed) if parsed else dict(prev_objs)
        if modeled < n_obj:
            # Objects the model did not emit keep their previous state.
            for oid, o in prev_objs.items():
                new_objs.setdefault(oid, o)
        last_idx += 1
        rolled.append({
            "frame": last_idx,
            "description": emitted_description(text) or f"Frame {last_idx}: simulation in progress.",
            "objects": list(new_objs.values()),
        })

        gt = gt_by_frame.get(last_idx)
        if gt:
            gt_pos = {o["id"]: o["position"] for o in gt["objects"]}
            errs = []
            for oid, o in new_objs.items():
                if oid in gt_pos:
                    dx = gt_pos[oid]["x"] - o["position"]["x"]
                    dy = gt_pos[oid]["y"] - o["position"]["y"]
                    errs.append((dx * dx + dy * dy) ** 0.5)
            per_frame.append({
                "frame": last_idx, "modeled": modeled,
                "mean_dist": (sum(errs) / len(errs)) if errs else None,
                "max_dist": max(errs) if errs else None,
            })

    valid = [p for p in per_frame if p["mean_dist"] is not None]
    mean_dist = (sum(p["mean_dist"] for p in valid) / len(valid)) if valid else None
    # Compare against None explicitly: a perfect score of 0.0 is falsy and
    # must still be reported as 0.0 %, not as missing. A zero diagonal would
    # divide by zero, so it yields None.
    if mean_dist is not None and diag > 0:
        mean_dist_pct_diag = mean_dist / diag * 100.0
    else:
        mean_dist_pct_diag = None
    return json.dumps({
        "scenario": scenario_name,
        "n_obj": n_obj,
        "scene_diag": diag,
        "frames_done": len(per_frame),
        "frames_held_avg": sum(n_obj - p["modeled"] for p in per_frame) / max(1, len(per_frame)),
        "mean_dist": mean_dist,
        "mean_dist_pct_diag": mean_dist_pct_diag,
        "elapsed": round(time.perf_counter() - t0, 2),
        "per_frame": per_frame,
    })
|
| 800 |
+
|
| 801 |
+
|
| 802 |
# -----------------------------------------------------------------------------
|
| 803 |
# Simulation (streamed)
|
| 804 |
# -----------------------------------------------------------------------------
|
|
|
|
| 1005 |
run.click(simulate, [scene_state, scenario, n_frames, temperature],
|
| 1006 |
[view, view_truth, gif, status])
|
| 1007 |
|
| 1008 |
+
with gr.Accordion("📊 Compute position MSE vs Pymunk (numerical)", open=False):
|
| 1009 |
+
with gr.Row():
|
| 1010 |
+
eval_frames = gr.Slider(5, 30, value=15, step=1, label="Frames to evaluate")
|
| 1011 |
+
eval_btn = gr.Button("Run evaluation", scale=1)
|
| 1012 |
+
eval_out = gr.Code(language="json", lines=12, label="Result")
|
| 1013 |
+
eval_btn.click(evaluate, [scene_state, scenario, eval_frames], [eval_out])
|
| 1014 |
+
|
| 1015 |
if __name__ == "__main__":
|
| 1016 |
demo.launch()
|