serenichron committed on
Commit
b759464
·
1 Parent(s): ae2483e

Fix route mounting: use gr.mount_gradio_app properly

Browse files

- Define FastAPI routes on api_app before mounting
- Mount Gradio demo into FastAPI app at root path
- Remove duplicate route definitions
- API endpoints now available at /v1/*, /health

Files changed (1) hide show
  1. app.py +99 -87
app.py CHANGED
@@ -242,93 +242,17 @@ def gradio_chat(
242
  yield f"Error generating response: {str(e)}"
243
 
244
 
245
- # Build Gradio Blocks interface
246
- with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
247
- gr.Markdown(
248
- """
249
- # ZeroGPU OpenCode Provider
250
-
251
- OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
252
-
253
- **API Endpoint:** `/v1/chat/completions`
254
-
255
- ## Usage with opencode
256
-
257
- Configure in `~/.config/opencode/opencode.json`:
258
-
259
- ```json
260
- {
261
- "providers": {
262
- "zerogpu": {
263
- "npm": "@ai-sdk/openai-compatible",
264
- "options": {
265
- "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
266
- "headers": {
267
- "Authorization": "Bearer hf_YOUR_TOKEN"
268
- }
269
- },
270
- "models": {
271
- "llama-8b": {
272
- "name": "meta-llama/Llama-3.1-8B-Instruct"
273
- }
274
- }
275
- }
276
- }
277
- }
278
- ```
279
-
280
- ---
281
- """
282
- )
283
-
284
- with gr.Row():
285
- with gr.Column(scale=1):
286
- model_dropdown = gr.Dropdown(
287
- label="Model",
288
- choices=[
289
- "meta-llama/Llama-3.1-8B-Instruct",
290
- "mistralai/Mistral-7B-Instruct-v0.3",
291
- "Qwen/Qwen2.5-7B-Instruct",
292
- "Qwen/Qwen2.5-14B-Instruct",
293
- ],
294
- value="meta-llama/Llama-3.1-8B-Instruct",
295
- allow_custom_value=True,
296
- )
297
- temperature_slider = gr.Slider(
298
- label="Temperature",
299
- minimum=0.0,
300
- maximum=2.0,
301
- value=0.7,
302
- step=0.1,
303
- )
304
- max_tokens_slider = gr.Slider(
305
- label="Max Tokens",
306
- minimum=64,
307
- maximum=4096,
308
- value=512,
309
- step=64,
310
- )
311
-
312
- gr.Markdown(
313
- f"""
314
- ### Status
315
- - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
316
- - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
317
- """
318
- )
319
 
320
- with gr.Column(scale=3):
321
- chatbot = gr.ChatInterface(
322
- fn=gradio_chat,
323
- additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
324
- title="",
325
- )
326
-
327
-
328
- # --- Add OpenAI-compatible API routes to Gradio's FastAPI app ---
329
 
330
 
331
- @demo.app.post("/v1/chat/completions")
332
  async def chat_completions(
333
  request: ChatCompletionRequest,
334
  authorization: Optional[str] = Header(None),
@@ -464,7 +388,7 @@ async def chat_completions(
464
  )
465
 
466
 
467
- @demo.app.get("/v1/models")
468
  async def list_models(authorization: Optional[str] = Header(None)):
469
  """List available models (returns info about current model if loaded)."""
470
  token = extract_token(authorization)
@@ -494,7 +418,7 @@ async def list_models(authorization: Optional[str] = Header(None)):
494
  return {"object": "list", "data": models}
495
 
496
 
497
- @demo.app.get("/health")
498
  async def health_check():
499
  """Health check endpoint."""
500
  return {
@@ -505,5 +429,93 @@ async def health_check():
505
  }
506
 
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  if __name__ == "__main__":
509
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
242
  yield f"Error generating response: {str(e)}"
243
 
244
 
245
+ # --- FastAPI app for OpenAI-compatible routes ---
246
+ from fastapi import FastAPI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
+ api_app = FastAPI(
249
+ title="ZeroGPU OpenCode Provider",
250
+ description="OpenAI-compatible API for HuggingFace models on ZeroGPU",
251
+ version="1.0.0",
252
+ )
 
 
 
 
253
 
254
 
255
+ @api_app.post("/v1/chat/completions")
256
  async def chat_completions(
257
  request: ChatCompletionRequest,
258
  authorization: Optional[str] = Header(None),
 
388
  )
389
 
390
 
391
+ @api_app.get("/v1/models")
392
  async def list_models(authorization: Optional[str] = Header(None)):
393
  """List available models (returns info about current model if loaded)."""
394
  token = extract_token(authorization)
 
418
  return {"object": "list", "data": models}
419
 
420
 
421
+ @api_app.get("/health")
422
  async def health_check():
423
  """Health check endpoint."""
424
  return {
 
429
  }
430
 
431
 
432
+ # Build Gradio Blocks interface
433
+ with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
434
+ gr.Markdown(
435
+ """
436
+ # ZeroGPU OpenCode Provider
437
+
438
+ OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
439
+
440
+ **API Endpoint:** `/v1/chat/completions`
441
+
442
+ ## Usage with opencode
443
+
444
+ Configure in `~/.config/opencode/opencode.json`:
445
+
446
+ ```json
447
+ {
448
+ "providers": {
449
+ "zerogpu": {
450
+ "npm": "@ai-sdk/openai-compatible",
451
+ "options": {
452
+ "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
453
+ "headers": {
454
+ "Authorization": "Bearer hf_YOUR_TOKEN"
455
+ }
456
+ },
457
+ "models": {
458
+ "llama-8b": {
459
+ "name": "meta-llama/Llama-3.1-8B-Instruct"
460
+ }
461
+ }
462
+ }
463
+ }
464
+ }
465
+ ```
466
+
467
+ ---
468
+ """
469
+ )
470
+
471
+ with gr.Row():
472
+ with gr.Column(scale=1):
473
+ model_dropdown = gr.Dropdown(
474
+ label="Model",
475
+ choices=[
476
+ "meta-llama/Llama-3.1-8B-Instruct",
477
+ "mistralai/Mistral-7B-Instruct-v0.3",
478
+ "Qwen/Qwen2.5-7B-Instruct",
479
+ "Qwen/Qwen2.5-14B-Instruct",
480
+ ],
481
+ value="meta-llama/Llama-3.1-8B-Instruct",
482
+ allow_custom_value=True,
483
+ )
484
+ temperature_slider = gr.Slider(
485
+ label="Temperature",
486
+ minimum=0.0,
487
+ maximum=2.0,
488
+ value=0.7,
489
+ step=0.1,
490
+ )
491
+ max_tokens_slider = gr.Slider(
492
+ label="Max Tokens",
493
+ minimum=64,
494
+ maximum=4096,
495
+ value=512,
496
+ step=64,
497
+ )
498
+
499
+ gr.Markdown(
500
+ f"""
501
+ ### Status
502
+ - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
503
+ - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
504
+ """
505
+ )
506
+
507
+ with gr.Column(scale=3):
508
+ chatbot = gr.ChatInterface(
509
+ fn=gradio_chat,
510
+ additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
511
+ title="",
512
+ )
513
+
514
+
515
+ # Mount Gradio into FastAPI app - Gradio UI at root, API at /v1/*
516
+ app = gr.mount_gradio_app(api_app, demo, path="/")
517
+
518
+
519
  if __name__ == "__main__":
520
+ import uvicorn
521
+ uvicorn.run(app, host="0.0.0.0", port=7860)