Julien Simon and Claude Opus 4.6 committed on
Commit eceb156 · 1 Parent(s): 2c9d67a

Fix GPU task aborted: call @spaces.GPU synchronously, not via to_thread


asyncio.to_thread loses Gradio's request context, causing ZeroGPU to
abort GPU allocation. Now Nano runs synchronously in the main thread.
Mini+Large API calls are launched first so they fly in parallel while
Nano blocks on GPU.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
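
A minimal sketch of the handler shape this commit moves to (function names such as fetch_remote and run_on_gpu are illustrative stand-ins, not the app's real call_model or _nano_generate): the API-backed models are scheduled as asyncio tasks up front, the ZeroGPU-bound call runs synchronously on the main thread so Gradio's request context is preserved, and the remote results are then awaited as they complete.

import asyncio

async def fetch_remote(key: str) -> tuple[str, str]:
    # Illustrative stand-in for an awaited HTTP call (e.g. OpenRouter).
    await asyncio.sleep(0.1)
    return key, f"{key} result"

def run_on_gpu(prompt: str) -> str:
    # Illustrative stand-in for the @spaces.GPU-decorated local generation;
    # it is called directly (no asyncio.to_thread) so it stays on the main
    # thread, where Gradio's request context is available.
    return f"nano: {prompt}"

async def handle(prompt: str) -> dict:
    results: dict[str, str] = {}
    # Schedule the API-backed models as tasks before the blocking GPU call.
    tasks = [asyncio.create_task(fetch_remote(k)) for k in ("mini", "large")]
    # Run the local model synchronously on the main thread.
    results["nano"] = run_on_gpu(prompt)
    # Await the remote results as each task completes.
    for coro in asyncio.as_completed(tasks):
        key, value = await coro
        results[key] = value
    return results

# Example: asyncio.run(handle("hello"))

The event loop is blocked while the local model runs, a cost the new in-code comment in app.py acknowledges as the price of keeping @spaces.GPU on the main thread.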

Files changed (3)
  1. app.py +20 -4
  2. openrouter.py +9 -5
  3. test_openrouter.py +1 -1
app.py CHANGED
@@ -162,8 +162,11 @@ async def handle_comparison(prompt: str, request: gr.Request):
     model_results = {"nano": None, "mini": None, "large": None}
     yield _build_ui_state(model_results, None)
 
-    # Phase 2: Fire all models in parallel, yield as each completes
-    async def call_one(key):
+    # Phase 2: Start Mini+Large async, then run Nano synchronously.
+    # @spaces.GPU needs Gradio's request context (main thread), so Nano
+    # cannot use asyncio.to_thread. We launch API calls first so they
+    # fly in parallel while Nano blocks on GPU.
+    async def call_api(key):
         m = MODELS[key]
         return key, await call_model(
             model_id=m["id"], prompt=prompt,
@@ -171,9 +174,22 @@ async def handle_comparison(prompt: str, request: gr.Request):
             provider=m.get("provider", "openrouter"),
         )
 
-    tasks = [asyncio.create_task(call_one(k)) for k in ["nano", "mini", "large"]]
-
-    for coro in asyncio.as_completed(tasks):
+    mini_task = asyncio.create_task(call_api("mini"))
+    large_task = asyncio.create_task(call_api("large"))
+
+    # Nano runs synchronously (blocks event loop, but ZeroGPU requires it)
+    nano_m = MODELS["nano"]
+    nano_result = await call_model(
+        model_id=nano_m["id"], prompt=prompt,
+        max_tokens=nano_m["max_tokens"], timeout=MODEL_TIMEOUT,
+        provider=nano_m.get("provider", "openrouter"),
+    )
+    model_results["nano"] = nano_result
+    costs = compute_costs({"nano": nano_result})
+    yield _build_ui_state(model_results, costs)
+
+    # Collect Mini and Large as they complete
+    for coro in asyncio.as_completed([mini_task, large_task]):
         key, result = await coro
         model_results[key] = result
         costs = compute_costs({k: v for k, v in model_results.items() if v is not None})
openrouter.py CHANGED
@@ -63,15 +63,18 @@ async def _call_openrouter(
     }
 
 
-async def _call_local(
+def _call_local_sync(
     model_id: str, prompt: str, max_tokens: int, timeout: int
 ) -> dict:
-    """Call Trinity Nano locally via ZeroGPU."""
+    """Call Trinity Nano locally via ZeroGPU (synchronous).
+
+    Must be called from the main thread — @spaces.GPU needs Gradio's
+    request context to allocate GPU, which is lost in thread pools.
+    """
     try:
         if _nano_generate is None:
             raise RuntimeError("Local model not available (spaces package not installed)")
-        result = await asyncio.to_thread(_nano_generate, prompt, max_tokens)
-        return result
+        return _nano_generate(prompt, max_tokens)
     except Exception as e:
         print(f"[Nano error] {type(e).__name__}: {e}")
         return {
@@ -87,7 +90,8 @@ async def call_model(
     provider: str = "openrouter",
 ) -> dict:
     if provider == "local":
-        return await _call_local(model_id, prompt, max_tokens, timeout)
+        # Synchronous @spaces.GPU needs Gradio's request context
+        return _call_local_sync(model_id, prompt, max_tokens, timeout)
     return await _call_openrouter(model_id, prompt, max_tokens, timeout)
 
 
test_openrouter.py CHANGED
@@ -84,7 +84,7 @@ async def test_call_models_parallel():
     }
 
     with patch("openrouter.httpx.AsyncClient") as MockClient, \
-            patch("openrouter._call_local", new_callable=AsyncMock, return_value=mock_local_result):
+            patch("openrouter._call_local_sync", return_value=mock_local_result):
         mock_client = AsyncMock()
         mock_client.post.return_value = mock_response
         MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client)