Spaces:

AlexWortega
/

physics-llm

Running on Zero

App Files Files Community

AlexWortega committed 2 days ago

Commit

970e1f2

unverified ·

1 Parent(s): 36a90c6

Add /evaluate endpoint: numerical position MSE model vs Pymunk

Browse files

Adds a @spaces.GPU function that runs the model rollout and a fresh Pymunk rollout
from the same seed state, then returns the per-frame mean Euclidean distance
between model and Pymunk positions (raw per frame, plus the overall mean as a % of the scene diagonal).
It is exposed as api_name=/evaluate so we can benchmark all featured demos with a
script, and is wired to a small 'Compute position MSE vs Pymunk' accordion in the UI.

Files changed (1) hide show

app.py +85 -0

app.py CHANGED Viewed

@@ -721,6 +721,84 @@ def scene_loaded(name: str) -> tuple[str, str]:
     return editor_html(bundle), json.dumps(bundle)
 # -----------------------------------------------------------------------------
 # Simulation (streamed)
 # -----------------------------------------------------------------------------
@@ -927,5 +1005,12 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
     run.click(simulate, [scene_state, scenario, n_frames, temperature],
               [view, view_truth, gif, status])
 if __name__ == "__main__":
     demo.launch()

     return editor_html(bundle), json.dumps(bundle)
+# -----------------------------------------------------------------------------
+# Numerical evaluation: model rollout vs Pymunk ground truth (position MSE).
+# Exposed via the api_name="/evaluate" endpoint so we can benchmark featured
+# demos from a script without scraping the UI.
+# -----------------------------------------------------------------------------
@gpu(duration=300)
def evaluate(scene_json: str, scenario_name: str, n_frames: int) -> str:
    """Roll the model out next to a Pymunk rollout and report position error.

    Both rollouts start from the last frame in the scene's ``initial_frames``.
    For every generated frame that also exists in the Pymunk rollout, the
    Euclidean distance between the model's and Pymunk's position is computed
    per object id and summarised as a mean and a max.

    NOTE: despite the "MSE" wording used around this endpoint, the metric is
    a mean Euclidean distance (the squared error is square-rooted per object).

    Args:
        scene_json: JSON-encoded scene bundle with a ``"header"`` entry and an
            ``"initial_frames"`` list.
        scenario_name: Label echoed back unchanged as ``"scenario"``.
        n_frames: Number of frames to generate; coerced with ``int()``.

    Returns:
        A JSON string with the keys ``scenario``, ``n_obj``, ``scene_diag``,
        ``frames_done`` (frames that could be scored), ``frames_held_avg``
        (average of ``n_obj - modeled`` over scored frames), ``mean_dist``
        (mean of the per-frame means, or ``None``), ``mean_dist_pct_diag``
        (``mean_dist`` as a percentage of the scene diagonal, or ``None``),
        ``elapsed`` (seconds spent in the generation loop only) and
        ``per_frame`` (list of ``frame`` / ``modeled`` / ``mean_dist`` /
        ``max_dist`` records).

    Raises:
        ValueError: If the bundle has no initial frames, since there is then
            no seed state for either rollout.
    """
    bundle = json.loads(scene_json)
    header = bundle["header"]
    initial = bundle.get("initial_frames") or []
    if not initial:
        # Both rollouts are seeded from initial[-1]; fail with a clear message
        # instead of an IndexError further down.
        raise ValueError("scene has no initial_frames to seed the rollout")
    seed = initial[-1]
    steps = int(n_frames)

    # Object count: explicit header field, else header object list, else the
    # number of objects in the first initial frame.
    n_obj = (header.get("object_count")
             or len(header.get("objects", []))
             or len(initial[0]["objects"]))

    x0, x1, y0, y1 = scene_bounds(header)
    diag = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5

    # Reference trajectory, indexed by frame number for lookup in the loop.
    gt_frames = pymunk_rollout(header, seed, steps)
    gt_by_frame = {f["frame"]: f for f in gt_frames}

    from llama_cpp import Llama  # noqa: F401  (preload may be required)
    llm = get_llm(lambda s: None)
    budget = int(min(2000, n_obj * 36 + 100))

    rolled: list[dict] = list(initial)
    last_idx = seed["frame"]
    per_frame: list[dict] = []
    t0 = time.time()  # started after model load and the Pymunk rollout
    for _ in range(steps):
        prompt, _ctx_frames = fit_prompt(llm, header, rolled, budget)
        # The frame being generated is last_idx + 1; the stop strings are the
        # headers of the frames after it (presumably so that one completion
        # yields a single frame -- confirm against the prompt format).
        next_idx = last_idx + 2
        stops = [f"Frame {next_idx+d}:" for d in range(0, 4)]
        out = llm.create_completion(prompt, max_tokens=budget, temperature=0.0, top_p=0.95, stop=stops)
        text = out["choices"][0]["text"]
        parsed = parse_frame(split_first_frame(text), n_obj)
        modeled = len(parsed)

        # Objects the model did not emit keep their previous-frame state.
        prev_objs = {o["id"]: o for o in rolled[-1]["objects"]}
        new_objs = dict(parsed) if parsed else dict(prev_objs)
        if modeled < n_obj:
            for oid, o in prev_objs.items():
                new_objs.setdefault(oid, o)

        last_idx += 1
        rolled.append({
            "frame": last_idx,
            "description": emitted_description(text) or f"Frame {last_idx}: simulation in progress.",
            "objects": list(new_objs.values()),
        })

        gt = gt_by_frame.get(last_idx)
        if not gt:
            # No reference frame with this index: nothing to score.
            continue
        gt_pos = {o["id"]: o["position"] for o in gt["objects"]}
        errs = []
        for oid, o in new_objs.items():
            if oid in gt_pos:
                dx = gt_pos[oid]["x"] - o["position"]["x"]
                dy = gt_pos[oid]["y"] - o["position"]["y"]
                errs.append((dx * dx + dy * dy) ** 0.5)
        per_frame.append({
            "frame": last_idx, "modeled": modeled,
            "mean_dist": (sum(errs) / len(errs)) if errs else None,
            "max_dist": max(errs) if errs else None,
        })

    valid = [p for p in per_frame if p["mean_dist"] is not None]
    mean_dist = (sum(p["mean_dist"] for p in valid) / len(valid)) if valid else None
    # Compare against None explicitly: a perfect rollout has mean_dist == 0.0,
    # which must be reported as 0.0 % rather than as "no result".
    pct_diag = (mean_dist / diag * 100.0) if (mean_dist is not None and diag) else None
    return json.dumps({
        "scenario": scenario_name,
        "n_obj": n_obj,
        "scene_diag": diag,
        "frames_done": len(per_frame),
        "frames_held_avg": sum(n_obj - p["modeled"] for p in per_frame) / max(1, len(per_frame)),
        "mean_dist": mean_dist,
        "mean_dist_pct_diag": pct_diag,
        "elapsed": round(time.time() - t0, 2),
        "per_frame": per_frame,
    })
 # -----------------------------------------------------------------------------
 # Simulation (streamed)
 # -----------------------------------------------------------------------------
     run.click(simulate, [scene_state, scenario, n_frames, temperature],
               [view, view_truth, gif, status])
+    with gr.Accordion("📊 Compute position MSE vs Pymunk (numerical)", open=False):
+        with gr.Row():
+            eval_frames = gr.Slider(5, 30, value=15, step=1, label="Frames to evaluate")
+            eval_btn = gr.Button("Run evaluation", scale=1)
+        eval_out = gr.Code(language="json", lines=12, label="Result")
+        eval_btn.click(evaluate, [scene_state, scenario, eval_frames], [eval_out])
 if __name__ == "__main__":
     demo.launch()