serenichron committed on
Commit
6a11a38
·
1 Parent(s): 67f3d72

Revert to FastAPI-first with Gradio at /ui path

Browse files

- FastAPI app at root for OpenAI-compatible API routes
- Gradio UI mounted at /ui path
- With Gradio 5.16.1+, this should work with ZeroGPU
- API routes: /health, /v1/chat/completions, /v1/models

Files changed (1) hide show
  1. app.py +111 -106
app.py CHANGED
@@ -18,8 +18,8 @@ from typing import Optional
18
 
19
  import gradio as gr
20
  import httpx
21
- from fastapi import Header, HTTPException, Request
22
- from fastapi.responses import StreamingResponse, JSONResponse
23
  from huggingface_hub import HfApi
24
 
25
  from config import get_config, get_quota_tracker
@@ -167,10 +167,7 @@ async def serverless_generate(
167
  )
168
 
169
  if response.status_code != 200:
170
- raise HTTPException(
171
- status_code=response.status_code,
172
- detail=f"HF Serverless error: {response.text}",
173
- )
174
 
175
  result = response.json()
176
 
@@ -179,10 +176,7 @@ async def serverless_generate(
179
  if "generated_text" in result[0]:
180
  return result[0]["generated_text"]
181
 
182
- raise HTTPException(
183
- status_code=500,
184
- detail=f"Unexpected response format from HF Serverless: {result}",
185
- )
186
 
187
 
188
  # --- Gradio Chat Function (GPU decorated for ZeroGPU) ---
@@ -230,100 +224,17 @@ def gradio_chat(
230
  return f"Error generating response: {str(e)}"
231
 
232
 
233
- # --- Build Gradio Interface ---
234
-
235
- with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
236
- gr.Markdown(
237
- """
238
- # ZeroGPU OpenCode Provider
239
-
240
- OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
241
 
242
- **API Endpoint:** `/v1/chat/completions`
243
-
244
- ## Usage with opencode
245
-
246
- Configure in `~/.config/opencode/opencode.json`:
247
-
248
- ```json
249
- {
250
- "providers": {
251
- "zerogpu": {
252
- "npm": "@ai-sdk/openai-compatible",
253
- "options": {
254
- "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
255
- "headers": {
256
- "Authorization": "Bearer hf_YOUR_TOKEN"
257
- }
258
- },
259
- "models": {
260
- "llama-8b": {
261
- "name": "meta-llama/Llama-3.1-8B-Instruct"
262
- }
263
- }
264
- }
265
- }
266
- }
267
- ```
268
-
269
- ---
270
- """
271
- )
272
-
273
- with gr.Row():
274
- with gr.Column(scale=1):
275
- model_dropdown = gr.Dropdown(
276
- label="Model",
277
- choices=[
278
- "meta-llama/Llama-3.1-8B-Instruct",
279
- "mistralai/Mistral-7B-Instruct-v0.3",
280
- "Qwen/Qwen2.5-7B-Instruct",
281
- "Qwen/Qwen2.5-14B-Instruct",
282
- ],
283
- value="meta-llama/Llama-3.1-8B-Instruct",
284
- allow_custom_value=True,
285
- )
286
- temperature_slider = gr.Slider(
287
- label="Temperature",
288
- minimum=0.0,
289
- maximum=2.0,
290
- value=0.7,
291
- step=0.1,
292
- )
293
- max_tokens_slider = gr.Slider(
294
- label="Max Tokens",
295
- minimum=64,
296
- maximum=4096,
297
- value=512,
298
- step=64,
299
- )
300
-
301
- gr.Markdown(
302
- f"""
303
- ### Status
304
- - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
305
- - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
306
- """
307
- )
308
-
309
- with gr.Column(scale=3):
310
- chatbot = gr.ChatInterface(
311
- fn=gradio_chat,
312
- additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
313
- title="",
314
- )
315
-
316
-
317
- # --- Add OpenAI-compatible API routes to Gradio's internal FastAPI app ---
318
-
319
- # Get the underlying FastAPI app from Gradio
320
- app = demo.app
321
 
322
 
323
- @app.post("/v1/chat/completions")
324
- async def chat_completions(
325
- request: Request,
326
- ):
327
  """
328
  OpenAI-compatible chat completions endpoint.
329
 
@@ -471,7 +382,7 @@ async def chat_completions(
471
  )
472
 
473
 
474
- @app.get("/v1/models")
475
  async def list_models(request: Request):
476
  """List available models (returns info about current model if loaded)."""
477
  authorization = request.headers.get("authorization")
@@ -502,7 +413,7 @@ async def list_models(request: Request):
502
  return {"object": "list", "data": models}
503
 
504
 
505
- @app.get("/health")
506
  async def health_check():
507
  """Health check endpoint."""
508
  return {
@@ -513,10 +424,104 @@ async def health_check():
513
  }
514
 
515
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  # --- Launch the application ---
517
  # On HuggingFace Spaces, the runtime handles the launch automatically
518
- # The demo object is exposed for the Gradio SDK to use
519
 
520
  if __name__ == "__main__":
521
- # Local development
522
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
18
 
19
  import gradio as gr
20
  import httpx
21
+ from fastapi import FastAPI, Request
22
+ from fastapi.responses import StreamingResponse, JSONResponse, RedirectResponse
23
  from huggingface_hub import HfApi
24
 
25
  from config import get_config, get_quota_tracker
 
167
  )
168
 
169
  if response.status_code != 200:
170
+ raise Exception(f"HF Serverless error: {response.text}")
 
 
 
171
 
172
  result = response.json()
173
 
 
176
  if "generated_text" in result[0]:
177
  return result[0]["generated_text"]
178
 
179
+ raise Exception(f"Unexpected response format from HF Serverless: {result}")
 
 
 
180
 
181
 
182
  # --- Gradio Chat Function (GPU decorated for ZeroGPU) ---
 
224
  return f"Error generating response: {str(e)}"
225
 
226
 
227
+ # --- Create FastAPI app for API routes ---
 
 
 
 
 
 
 
228
 
229
+ api_app = FastAPI(
230
+ title="ZeroGPU OpenCode Provider",
231
+ description="OpenAI-compatible API for HuggingFace models on ZeroGPU",
232
+ version="1.0.0",
233
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
 
236
+ @api_app.post("/v1/chat/completions")
237
+ async def chat_completions(request: Request):
 
 
238
  """
239
  OpenAI-compatible chat completions endpoint.
240
 
 
382
  )
383
 
384
 
385
+ @api_app.get("/v1/models")
386
  async def list_models(request: Request):
387
  """List available models (returns info about current model if loaded)."""
388
  authorization = request.headers.get("authorization")
 
413
  return {"object": "list", "data": models}
414
 
415
 
416
+ @api_app.get("/health")
417
  async def health_check():
418
  """Health check endpoint."""
419
  return {
 
424
  }
425
 
426
 
427
+ @api_app.get("/")
428
+ async def root_redirect():
429
+ """Redirect root to Gradio UI."""
430
+ return RedirectResponse(url="/ui/")
431
+
432
+
433
+ # --- Build Gradio Interface ---
434
+
435
+ with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
436
+ gr.Markdown(
437
+ """
438
+ # ZeroGPU OpenCode Provider
439
+
440
+ OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
441
+
442
+ **API Endpoint:** `/v1/chat/completions`
443
+
444
+ ## Usage with opencode
445
+
446
+ Configure in `~/.config/opencode/opencode.json`:
447
+
448
+ ```json
449
+ {
450
+ "providers": {
451
+ "zerogpu": {
452
+ "npm": "@ai-sdk/openai-compatible",
453
+ "options": {
454
+ "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
455
+ "headers": {
456
+ "Authorization": "Bearer hf_YOUR_TOKEN"
457
+ }
458
+ },
459
+ "models": {
460
+ "llama-8b": {
461
+ "name": "meta-llama/Llama-3.1-8B-Instruct"
462
+ }
463
+ }
464
+ }
465
+ }
466
+ }
467
+ ```
468
+
469
+ ---
470
+ """
471
+ )
472
+
473
+ with gr.Row():
474
+ with gr.Column(scale=1):
475
+ model_dropdown = gr.Dropdown(
476
+ label="Model",
477
+ choices=[
478
+ "meta-llama/Llama-3.1-8B-Instruct",
479
+ "mistralai/Mistral-7B-Instruct-v0.3",
480
+ "Qwen/Qwen2.5-7B-Instruct",
481
+ "Qwen/Qwen2.5-14B-Instruct",
482
+ ],
483
+ value="meta-llama/Llama-3.1-8B-Instruct",
484
+ allow_custom_value=True,
485
+ )
486
+ temperature_slider = gr.Slider(
487
+ label="Temperature",
488
+ minimum=0.0,
489
+ maximum=2.0,
490
+ value=0.7,
491
+ step=0.1,
492
+ )
493
+ max_tokens_slider = gr.Slider(
494
+ label="Max Tokens",
495
+ minimum=64,
496
+ maximum=4096,
497
+ value=512,
498
+ step=64,
499
+ )
500
+
501
+ gr.Markdown(
502
+ f"""
503
+ ### Status
504
+ - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
505
+ - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
506
+ """
507
+ )
508
+
509
+ with gr.Column(scale=3):
510
+ chatbot = gr.ChatInterface(
511
+ fn=gradio_chat,
512
+ additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
513
+ title="",
514
+ )
515
+
516
+
517
+ # --- Mount Gradio on FastAPI at /ui, keeping API routes at root ---
518
+ # This is the key: mount Gradio ONTO our FastAPI app, not the other way around
519
+ app = gr.mount_gradio_app(api_app, demo, path="/ui")
520
+
521
+
522
  # --- Launch the application ---
523
  # On HuggingFace Spaces, the runtime handles the launch automatically
 
524
 
525
  if __name__ == "__main__":
526
+ import uvicorn
527
+ uvicorn.run(app, host="0.0.0.0", port=7860)