Spaces:
Running on Zero
Running on Zero
Julien Simon Claude Opus 4.6 committed on
Commit ·
eceb156
1
Parent(s): 2c9d67a
Fix GPU task aborted: call @spaces.GPU synchronously, not via to_thread
Browse files
asyncio.to_thread loses Gradio's request context, causing ZeroGPU to
abort GPU allocation. Now Nano runs synchronously in the main thread.
Mini+Large API calls are launched first so they fly in parallel while
Nano blocks on GPU.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- app.py +20 -4
- openrouter.py +9 -5
- test_openrouter.py +1 -1
app.py
CHANGED
|
@@ -162,8 +162,11 @@ async def handle_comparison(prompt: str, request: gr.Request):
|
|
| 162 |
model_results = {"nano": None, "mini": None, "large": None}
|
| 163 |
yield _build_ui_state(model_results, None)
|
| 164 |
|
| 165 |
-
# Phase 2:
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
| 167 |
m = MODELS[key]
|
| 168 |
return key, await call_model(
|
| 169 |
model_id=m["id"], prompt=prompt,
|
|
@@ -171,9 +174,22 @@ async def handle_comparison(prompt: str, request: gr.Request):
|
|
| 171 |
provider=m.get("provider", "openrouter"),
|
| 172 |
)
|
| 173 |
|
| 174 |
-
|
|
|
|
| 175 |
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
key, result = await coro
|
| 178 |
model_results[key] = result
|
| 179 |
costs = compute_costs({k: v for k, v in model_results.items() if v is not None})
|
|
|
|
| 162 |
model_results = {"nano": None, "mini": None, "large": None}
|
| 163 |
yield _build_ui_state(model_results, None)
|
| 164 |
|
| 165 |
+
# Phase 2: Start Mini+Large async, then run Nano synchronously.
|
| 166 |
+
# @spaces.GPU needs Gradio's request context (main thread), so Nano
|
| 167 |
+
# cannot use asyncio.to_thread. We launch API calls first so they
|
| 168 |
+
# fly in parallel while Nano blocks on GPU.
|
| 169 |
+
async def call_api(key):
|
| 170 |
m = MODELS[key]
|
| 171 |
return key, await call_model(
|
| 172 |
model_id=m["id"], prompt=prompt,
|
|
|
|
| 174 |
provider=m.get("provider", "openrouter"),
|
| 175 |
)
|
| 176 |
|
| 177 |
+
mini_task = asyncio.create_task(call_api("mini"))
|
| 178 |
+
large_task = asyncio.create_task(call_api("large"))
|
| 179 |
|
| 180 |
+
# Nano runs synchronously (blocks event loop, but ZeroGPU requires it)
|
| 181 |
+
nano_m = MODELS["nano"]
|
| 182 |
+
nano_result = await call_model(
|
| 183 |
+
model_id=nano_m["id"], prompt=prompt,
|
| 184 |
+
max_tokens=nano_m["max_tokens"], timeout=MODEL_TIMEOUT,
|
| 185 |
+
provider=nano_m.get("provider", "openrouter"),
|
| 186 |
+
)
|
| 187 |
+
model_results["nano"] = nano_result
|
| 188 |
+
costs = compute_costs({"nano": nano_result})
|
| 189 |
+
yield _build_ui_state(model_results, costs)
|
| 190 |
+
|
| 191 |
+
# Collect Mini and Large as they complete
|
| 192 |
+
for coro in asyncio.as_completed([mini_task, large_task]):
|
| 193 |
key, result = await coro
|
| 194 |
model_results[key] = result
|
| 195 |
costs = compute_costs({k: v for k, v in model_results.items() if v is not None})
|
openrouter.py
CHANGED
|
@@ -63,15 +63,18 @@ async def _call_openrouter(
|
|
| 63 |
}
|
| 64 |
|
| 65 |
|
| 66 |
-
|
| 67 |
model_id: str, prompt: str, max_tokens: int, timeout: int
|
| 68 |
) -> dict:
|
| 69 |
-
"""Call Trinity Nano locally via ZeroGPU.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
try:
|
| 71 |
if _nano_generate is None:
|
| 72 |
raise RuntimeError("Local model not available (spaces package not installed)")
|
| 73 |
-
|
| 74 |
-
return result
|
| 75 |
except Exception as e:
|
| 76 |
print(f"[Nano error] {type(e).__name__}: {e}")
|
| 77 |
return {
|
|
@@ -87,7 +90,8 @@ async def call_model(
|
|
| 87 |
provider: str = "openrouter",
|
| 88 |
) -> dict:
|
| 89 |
if provider == "local":
|
| 90 |
-
|
|
|
|
| 91 |
return await _call_openrouter(model_id, prompt, max_tokens, timeout)
|
| 92 |
|
| 93 |
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
|
| 66 |
+
def _call_local_sync(
|
| 67 |
model_id: str, prompt: str, max_tokens: int, timeout: int
|
| 68 |
) -> dict:
|
| 69 |
+
"""Call Trinity Nano locally via ZeroGPU (synchronous).
|
| 70 |
+
|
| 71 |
+
Must be called from the main thread — @spaces.GPU needs Gradio's
|
| 72 |
+
request context to allocate GPU, which is lost in thread pools.
|
| 73 |
+
"""
|
| 74 |
try:
|
| 75 |
if _nano_generate is None:
|
| 76 |
raise RuntimeError("Local model not available (spaces package not installed)")
|
| 77 |
+
return _nano_generate(prompt, max_tokens)
|
|
|
|
| 78 |
except Exception as e:
|
| 79 |
print(f"[Nano error] {type(e).__name__}: {e}")
|
| 80 |
return {
|
|
|
|
| 90 |
provider: str = "openrouter",
|
| 91 |
) -> dict:
|
| 92 |
if provider == "local":
|
| 93 |
+
# Synchronous — @spaces.GPU needs Gradio's request context
|
| 94 |
+
return _call_local_sync(model_id, prompt, max_tokens, timeout)
|
| 95 |
return await _call_openrouter(model_id, prompt, max_tokens, timeout)
|
| 96 |
|
| 97 |
|
test_openrouter.py
CHANGED
|
@@ -84,7 +84,7 @@ async def test_call_models_parallel():
|
|
| 84 |
}
|
| 85 |
|
| 86 |
with patch("openrouter.httpx.AsyncClient") as MockClient, \
|
| 87 |
-
patch("openrouter.
|
| 88 |
mock_client = AsyncMock()
|
| 89 |
mock_client.post.return_value = mock_response
|
| 90 |
MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client)
|
|
|
|
| 84 |
}
|
| 85 |
|
| 86 |
with patch("openrouter.httpx.AsyncClient") as MockClient, \
|
| 87 |
+
patch("openrouter._call_local_sync", return_value=mock_local_result):
|
| 88 |
mock_client = AsyncMock()
|
| 89 |
mock_client.post.return_value = mock_response
|
| 90 |
MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client)
|