Spaces:
Running on Zero
Running on Zero
Julien Simon Claude Opus 4.6 committed on
Commit ·
eceb156
1
Parent(s): 2c9d67a
Fix GPU task aborted: call @spaces.GPU synchronously, not via to_thread
Browse files
asyncio.to_thread loses Gradio's request context, causing ZeroGPU to
abort GPU allocation. Now Nano runs synchronously in the main thread.
Mini+Large API calls are launched first so they fly in parallel while
Nano blocks on GPU.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- app.py +20 -4
- openrouter.py +9 -5
- test_openrouter.py +1 -1
app.py
CHANGED
|
@@ -162,8 +162,11 @@ async def handle_comparison(prompt: str, request: gr.Request):
|
|
| 162 |
model_results = {"nano": None, "mini": None, "large": None}
|
| 163 |
yield _build_ui_state(model_results, None)
|
| 164 |
|
| 165 |
-
# Phase 2:
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
| 167 |
m = MODELS[key]
|
| 168 |
return key, await call_model(
|
| 169 |
model_id=m["id"], prompt=prompt,
|
|
@@ -171,9 +174,22 @@ async def handle_comparison(prompt: str, request: gr.Request):
|
|
| 171 |
provider=m.get("provider", "openrouter"),
|
| 172 |
)
|
| 173 |
|
| 174 |
-
|
|
|
|
| 175 |
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
key, result = await coro
|
| 178 |
model_results[key] = result
|
| 179 |
costs = compute_costs({k: v for k, v in model_results.items() if v is not None})
|
|
|
|
| 162 |
model_results = {"nano": None, "mini": None, "large": None}
|
| 163 |
yield _build_ui_state(model_results, None)
|
| 164 |
|
| 165 |
+
# Phase 2: Start Mini+Large async, then run Nano synchronously.
|
| 166 |
+
# @spaces.GPU needs Gradio's request context (main thread), so Nano
|
| 167 |
+
# cannot use asyncio.to_thread. We launch API calls first so they
|
| 168 |
+
# fly in parallel while Nano blocks on GPU.
|
| 169 |
+
async def call_api(key):
|
| 170 |
m = MODELS[key]
|
| 171 |
return key, await call_model(
|
| 172 |
model_id=m["id"], prompt=prompt,
|
|
|
|
| 174 |
provider=m.get("provider", "openrouter"),
|
| 175 |
)
|
| 176 |
|
| 177 |
+
mini_task = asyncio.create_task(call_api("mini"))
|
| 178 |
+
large_task = asyncio.create_task(call_api("large"))
|
| 179 |
|
| 180 |
+
# Nano runs synchronously (blocks event loop, but ZeroGPU requires it)
|
| 181 |
+
nano_m = MODELS["nano"]
|
| 182 |
+
nano_result = await call_model(
|
| 183 |
+
model_id=nano_m["id"], prompt=prompt,
|
| 184 |
+
max_tokens=nano_m["max_tokens"], timeout=MODEL_TIMEOUT,
|
| 185 |
+
provider=nano_m.get("provider", "openrouter"),
|
| 186 |
+
)
|
| 187 |
+
model_results["nano"] = nano_result
|
| 188 |
+
costs = compute_costs({"nano": nano_result})
|
| 189 |
+
yield _build_ui_state(model_results, costs)
|
| 190 |
+
|
| 191 |
+
# Collect Mini and Large as they complete
|
| 192 |
+
for coro in asyncio.as_completed([mini_task, large_task]):
|
| 193 |
key, result = await coro
|
| 194 |
model_results[key] = result
|
| 195 |
costs = compute_costs({k: v for k, v in model_results.items() if v is not None})
|
openrouter.py
CHANGED
|
@@ -63,15 +63,18 @@ async def _call_openrouter(
|
|
| 63 |
}
|
| 64 |
|
| 65 |
|
| 66 |
-
|
| 67 |
model_id: str, prompt: str, max_tokens: int, timeout: int
|
| 68 |
) -> dict:
|
| 69 |
-
"""Call Trinity Nano locally via ZeroGPU.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
try:
|
| 71 |
if _nano_generate is None:
|
| 72 |
raise RuntimeError("Local model not available (spaces package not installed)")
|
| 73 |
-
|
| 74 |
-
return result
|
| 75 |
except Exception as e:
|
| 76 |
print(f"[Nano error] {type(e).__name__}: {e}")
|
| 77 |
return {
|
|
@@ -87,7 +90,8 @@ async def call_model(
|
|
| 87 |
provider: str = "openrouter",
|
| 88 |
) -> dict:
|
| 89 |
if provider == "local":
|
| 90 |
-
|
|
|
|
| 91 |
return await _call_openrouter(model_id, prompt, max_tokens, timeout)
|
| 92 |
|
| 93 |
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
|
| 66 |
+
def _call_local_sync(
|
| 67 |
model_id: str, prompt: str, max_tokens: int, timeout: int
|
| 68 |
) -> dict:
|
| 69 |
+
"""Call Trinity Nano locally via ZeroGPU (synchronous).
|
| 70 |
+
|
| 71 |
+
Must be called from the main thread — @spaces.GPU needs Gradio's
|
| 72 |
+
request context to allocate GPU, which is lost in thread pools.
|
| 73 |
+
"""
|
| 74 |
try:
|
| 75 |
if _nano_generate is None:
|
| 76 |
raise RuntimeError("Local model not available (spaces package not installed)")
|
| 77 |
+
return _nano_generate(prompt, max_tokens)
|
|
|
|
| 78 |
except Exception as e:
|
| 79 |
print(f"[Nano error] {type(e).__name__}: {e}")
|
| 80 |
return {
|
|
|
|
| 90 |
provider: str = "openrouter",
|
| 91 |
) -> dict:
|
| 92 |
if provider == "local":
|
| 93 |
+
# Synchronous — @spaces.GPU needs Gradio's request context
|
| 94 |
+
return _call_local_sync(model_id, prompt, max_tokens, timeout)
|
| 95 |
return await _call_openrouter(model_id, prompt, max_tokens, timeout)
|
| 96 |
|
| 97 |
|
test_openrouter.py
CHANGED
|
@@ -84,7 +84,7 @@ async def test_call_models_parallel():
|
|
| 84 |
}
|
| 85 |
|
| 86 |
with patch("openrouter.httpx.AsyncClient") as MockClient, \
|
| 87 |
-
patch("openrouter.
|
| 88 |
mock_client = AsyncMock()
|
| 89 |
mock_client.post.return_value = mock_response
|
| 90 |
MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client)
|
|
|
|
| 84 |
}
|
| 85 |
|
| 86 |
with patch("openrouter.httpx.AsyncClient") as MockClient, \
|
| 87 |
+
patch("openrouter._call_local_sync", return_value=mock_local_result):
|
| 88 |
mock_client = AsyncMock()
|
| 89 |
mock_client.post.return_value = mock_response
|
| 90 |
MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client)
|