AlexWortega committed on
Commit
f9ff64c
·
unverified ·
1 Parent(s): b5308c5

Fix CPU perf: pin n_threads=2 (cpu-basic), tighter token budget

Browse files

Root cause of the ~2000 s/frame slowdown: n_threads=None made llama.cpp read
the host CPU count and oversubscribe the 2 real vCPUs. Pin
n_threads/n_threads_batch=2, trim n_ctx from 4096 to 2048, keep 3 frames of
context (was 4), cap max_tokens at min(420, n_obj*30+50) (was
min(600, n_obj*45+80)), and lower the slider default from 30 to 15 frames.

Files changed (1) hide show
  1. app.py +12 -5
app.py CHANGED
@@ -31,8 +31,12 @@ EXAMPLES_DIR = HERE / "backend" / "examples"
31
 
32
  GGUF_REPO = "AlexWortega/lfm2-scenarios-GGUF"
33
  GGUF_FILE = "lfm2-scenarios-Q4_K_M.gguf"
34
- N_CTX = 4096
35
- KEEP_FRAMES = 4 # header + last N frames kept in the prompt (CPU speed)
 
 
 
 
36
 
37
  # -----------------------------------------------------------------------------
38
  # Prompt format (ported from PhysicsLLMEngine/browser_demo/src/promptFormat.ts)
@@ -156,7 +160,8 @@ def get_llm(log=lambda s: None):
156
  log("Downloading model (≈216 MB, first run only)…")
157
  path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
158
  log("Loading LFM2-350M into llama.cpp…")
159
- _LLM = Llama(model_path=path, n_ctx=N_CTX, n_threads=None, verbose=False)
 
160
  log("Model ready.")
161
  return _LLM
162
 
@@ -273,7 +278,9 @@ def simulate(scenario: str, n_frames: int, temperature: float):
273
  yield (gif_frames[-1] if gif_frames else None), None, f"Model load failed: {exc}"
274
  return
275
 
276
- budget = int(min(600, n_obj * 45 + 80))
 
 
277
  t0 = time.time()
278
 
279
  for step in range(int(n_frames)):
@@ -341,7 +348,7 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
341
  value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
342
  label="Scenario",
343
  )
344
- n_frames = gr.Slider(5, 60, value=30, step=1, label="Frames to predict")
345
  temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
346
  label="Temperature (0 = greedy)")
347
  run = gr.Button("▶ Simulate", variant="primary")
 
31
 
32
  GGUF_REPO = "AlexWortega/lfm2-scenarios-GGUF"
33
  GGUF_FILE = "lfm2-scenarios-Q4_K_M.gguf"
34
+ N_CTX = 2048
35
+ KEEP_FRAMES = 3 # header + last N frames kept in the prompt (CPU speed)
36
+ # cpu-basic Spaces have 2 vCPUs, but the container reports the host's (large)
37
+ # CPU count — letting llama.cpp auto-pick n_threads oversubscribes the 2 real
38
+ # cores and is ~100x slower. Pin to the real core count.
39
+ N_THREADS = 2
40
 
41
  # -----------------------------------------------------------------------------
42
  # Prompt format (ported from PhysicsLLMEngine/browser_demo/src/promptFormat.ts)
 
160
  log("Downloading model (≈216 MB, first run only)…")
161
  path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
162
  log("Loading LFM2-350M into llama.cpp…")
163
+ _LLM = Llama(model_path=path, n_ctx=N_CTX, n_threads=N_THREADS,
164
+ n_threads_batch=N_THREADS, verbose=False)
165
  log("Model ready.")
166
  return _LLM
167
 
 
278
  yield (gif_frames[-1] if gif_frames else None), None, f"Model load failed: {exc}"
279
  return
280
 
281
+ # One object line is ~24 tokens; cap tightly so we don't generate deep into
282
+ # the next frame (we only keep the first frame anyway).
283
+ budget = int(min(420, n_obj * 30 + 50))
284
  t0 = time.time()
285
 
286
  for step in range(int(n_frames)):
 
348
  value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
349
  label="Scenario",
350
  )
351
+ n_frames = gr.Slider(5, 60, value=15, step=1, label="Frames to predict")
352
  temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
353
  label="Temperature (0 = greedy)")
354
  run = gr.Button("▶ Simulate", variant="primary")