Spaces:
Running on Zero
Running on Zero
Fix CPU perf: pin n_threads=2 (cpu-basic), tighter token budget
Browse files
Root cause of ~2000 s/frame: n_threads=None made llama.cpp read the host CPU
count and oversubscribe the 2 real vCPUs. Pin n_threads and n_threads_batch to 2,
trim n_ctx to 2048, keep 3 frames of context, cap max_tokens at ~n_obj*30+50
(never more than 420), and default the slider to 15 frames.
app.py
CHANGED
|
@@ -31,8 +31,12 @@ EXAMPLES_DIR = HERE / "backend" / "examples"
|
|
| 31 |
|
| 32 |
GGUF_REPO = "AlexWortega/lfm2-scenarios-GGUF"
|
| 33 |
GGUF_FILE = "lfm2-scenarios-Q4_K_M.gguf"
|
| 34 |
-
N_CTX =
|
| 35 |
-
KEEP_FRAMES =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
# -----------------------------------------------------------------------------
|
| 38 |
# Prompt format (ported from PhysicsLLMEngine/browser_demo/src/promptFormat.ts)
|
|
@@ -156,7 +160,8 @@ def get_llm(log=lambda s: None):
|
|
| 156 |
log("Downloading model (≈216 MB, first run only)…")
|
| 157 |
path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
|
| 158 |
log("Loading LFM2-350M into llama.cpp…")
|
| 159 |
-
_LLM = Llama(model_path=path, n_ctx=N_CTX, n_threads=
|
|
|
|
| 160 |
log("Model ready.")
|
| 161 |
return _LLM
|
| 162 |
|
|
@@ -273,7 +278,9 @@ def simulate(scenario: str, n_frames: int, temperature: float):
|
|
| 273 |
yield (gif_frames[-1] if gif_frames else None), None, f"Model load failed: {exc}"
|
| 274 |
return
|
| 275 |
|
| 276 |
-
|
|
|
|
|
|
|
| 277 |
t0 = time.time()
|
| 278 |
|
| 279 |
for step in range(int(n_frames)):
|
|
@@ -341,7 +348,7 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
|
|
| 341 |
value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
|
| 342 |
label="Scenario",
|
| 343 |
)
|
| 344 |
-
n_frames = gr.Slider(5, 60, value=
|
| 345 |
temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
|
| 346 |
label="Temperature (0 = greedy)")
|
| 347 |
run = gr.Button("▶ Simulate", variant="primary")
|
|
|
|
| 31 |
|
| 32 |
GGUF_REPO = "AlexWortega/lfm2-scenarios-GGUF"
|
| 33 |
GGUF_FILE = "lfm2-scenarios-Q4_K_M.gguf"
|
| 34 |
+
N_CTX = 2048
|
| 35 |
+
KEEP_FRAMES = 3 # header + last N frames kept in the prompt (CPU speed)
|
| 36 |
+
# cpu-basic Spaces have 2 vCPUs, but the container reports the host's (large)
|
| 37 |
+
# CPU count — letting llama.cpp auto-pick n_threads oversubscribes the 2 real
|
| 38 |
+
# cores and is ~100x slower. Pin to the real core count.
|
| 39 |
+
N_THREADS = 2
|
| 40 |
|
| 41 |
# -----------------------------------------------------------------------------
|
| 42 |
# Prompt format (ported from PhysicsLLMEngine/browser_demo/src/promptFormat.ts)
|
|
|
|
| 160 |
log("Downloading model (≈216 MB, first run only)…")
|
| 161 |
path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
|
| 162 |
log("Loading LFM2-350M into llama.cpp…")
|
| 163 |
+
_LLM = Llama(model_path=path, n_ctx=N_CTX, n_threads=N_THREADS,
|
| 164 |
+
n_threads_batch=N_THREADS, verbose=False)
|
| 165 |
log("Model ready.")
|
| 166 |
return _LLM
|
| 167 |
|
|
|
|
| 278 |
yield (gif_frames[-1] if gif_frames else None), None, f"Model load failed: {exc}"
|
| 279 |
return
|
| 280 |
|
| 281 |
+
# One object line ≈ 24 tokens; cap tightly so we don't generate deep into
|
| 282 |
+
# the next frame (we only keep the first frame anyway).
|
| 283 |
+
budget = int(min(420, n_obj * 30 + 50))
|
| 284 |
t0 = time.time()
|
| 285 |
|
| 286 |
for step in range(int(n_frames)):
|
|
|
|
| 348 |
value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
|
| 349 |
label="Scenario",
|
| 350 |
)
|
| 351 |
+
n_frames = gr.Slider(5, 60, value=15, step=1, label="Frames to predict")
|
| 352 |
temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
|
| 353 |
label="Temperature (0 = greedy)")
|
| 354 |
run = gr.Button("▶ Simulate", variant="primary")
|