Spaces:

AlexWortega
/

physics-llm

Running on Zero

App Files Files Community

AlexWortega committed 10 days ago

Commit

f9ff64c

unverified ·

1 Parent(s): b5308c5

Fix CPU perf: pin n_threads=2 (cpu-basic), tighter token budget

Browse files

Root cause of ~2000s/frame: n_threads=None made llama.cpp read the host CPU
count and oversubscribe the 2 real vCPUs. Pin n_threads and n_threads_batch to 2,
trim n_ctx from 4096 to 2048, keep 3 frames of context (was 4), cap max_tokens at
min(420, n_obj*30+50) (was min(600, n_obj*45+80)), and lower the slider default
from 30 to 15 frames.

Files changed (1) hide show

app.py +12 -5

app.py CHANGED Viewed

@@ -31,8 +31,12 @@ EXAMPLES_DIR = HERE / "backend" / "examples"
 GGUF_REPO = "AlexWortega/lfm2-scenarios-GGUF"
 GGUF_FILE = "lfm2-scenarios-Q4_K_M.gguf"
-N_CTX = 4096
-KEEP_FRAMES = 4  # header + last N frames kept in the prompt (CPU speed)
 # -----------------------------------------------------------------------------
 # Prompt format (ported from PhysicsLLMEngine/browser_demo/src/promptFormat.ts)
@@ -156,7 +160,8 @@ def get_llm(log=lambda s: None):
     log("Downloading model (≈216 MB, first run only)…")
     path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
     log("Loading LFM2-350M into llama.cpp…")
-    _LLM = Llama(model_path=path, n_ctx=N_CTX, n_threads=None, verbose=False)
     log("Model ready.")
     return _LLM
@@ -273,7 +278,9 @@ def simulate(scenario: str, n_frames: int, temperature: float):
         yield (gif_frames[-1] if gif_frames else None), None, f"Model load failed: {exc}"
         return
-    budget = int(min(600, n_obj * 45 + 80))
     t0 = time.time()
     for step in range(int(n_frames)):
@@ -341,7 +348,7 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
                 value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
                 label="Scenario",
             )
-            n_frames = gr.Slider(5, 60, value=30, step=1, label="Frames to predict")
             temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
                                     label="Temperature (0 = greedy)")
             run = gr.Button("▶ Simulate", variant="primary")

 GGUF_REPO = "AlexWortega/lfm2-scenarios-GGUF"
 GGUF_FILE = "lfm2-scenarios-Q4_K_M.gguf"
+N_CTX = 2048
+KEEP_FRAMES = 3  # header + last N frames kept in the prompt (CPU speed)
+# cpu-basic Spaces have 2 vCPUs, but the container reports the host's (large)
+# CPU count — letting llama.cpp auto-pick n_threads oversubscribes the 2 real
+# cores and is ~100x slower. Pin to the real core count.
+N_THREADS = 2
 # -----------------------------------------------------------------------------
 # Prompt format (ported from PhysicsLLMEngine/browser_demo/src/promptFormat.ts)
     log("Downloading model (≈216 MB, first run only)…")
     path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
     log("Loading LFM2-350M into llama.cpp…")
+    _LLM = Llama(model_path=path, n_ctx=N_CTX, n_threads=N_THREADS,
+                 n_threads_batch=N_THREADS, verbose=False)
     log("Model ready.")
     return _LLM
         yield (gif_frames[-1] if gif_frames else None), None, f"Model load failed: {exc}"
         return
+    # One object line ≈ 24 tokens; cap tightly so we don't generate deep into
+    # the next frame (we only keep the first frame anyway).
+    budget = int(min(420, n_obj * 30 + 50))
     t0 = time.time()
     for step in range(int(n_frames)):
                 value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
                 label="Scenario",
             )
+            n_frames = gr.Slider(5, 60, value=15, step=1, label="Frames to predict")
             temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
                                     label="Temperature (0 = greedy)")
             run = gr.Button("▶ Simulate", variant="primary")