serenichron committed on
Commit
6a11a38
·
1 Parent(s): 67f3d72

Revert to FastAPI-first with Gradio at /ui path

Browse files

- FastAPI app at root for OpenAI-compatible API routes
- Gradio UI mounted at /ui path
- With Gradio 5.16.1+, this should work with ZeroGPU
- API routes: /health, /v1/chat/completions, /v1/models

Files changed (1) hide show
  1. app.py +111 -106
app.py CHANGED
@@ -18,8 +18,8 @@ from typing import Optional
18
 
19
  import gradio as gr
20
  import httpx
21
- from fastapi import Header, HTTPException, Request
22
- from fastapi.responses import StreamingResponse, JSONResponse
23
  from huggingface_hub import HfApi
24
 
25
  from config import get_config, get_quota_tracker
@@ -167,10 +167,7 @@ async def serverless_generate(
167
  )
168
 
169
  if response.status_code != 200:
170
- raise HTTPException(
171
- status_code=response.status_code,
172
- detail=f"HF Serverless error: {response.text}",
173
- )
174
 
175
  result = response.json()
176
 
@@ -179,10 +176,7 @@ async def serverless_generate(
179
  if "generated_text" in result[0]:
180
  return result[0]["generated_text"]
181
 
182
- raise HTTPException(
183
- status_code=500,
184
- detail=f"Unexpected response format from HF Serverless: {result}",
185
- )
186
 
187
 
188
  # --- Gradio Chat Function (GPU decorated for ZeroGPU) ---
@@ -230,100 +224,17 @@ def gradio_chat(
230
  return f"Error generating response: {str(e)}"
231
 
232
 
233
- # --- Build Gradio Interface ---
234
-
235
- with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
236
- gr.Markdown(
237
- """
238
- # ZeroGPU OpenCode Provider
239
-
240
- OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
241
 
242
- **API Endpoint:** `/v1/chat/completions`
243
-
244
- ## Usage with opencode
245
-
246
- Configure in `~/.config/opencode/opencode.json`:
247
-
248
- ```json
249
- {
250
- "providers": {
251
- "zerogpu": {
252
- "npm": "@ai-sdk/openai-compatible",
253
- "options": {
254
- "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
255
- "headers": {
256
- "Authorization": "Bearer hf_YOUR_TOKEN"
257
- }
258
- },
259
- "models": {
260
- "llama-8b": {
261
- "name": "meta-llama/Llama-3.1-8B-Instruct"
262
- }
263
- }
264
- }
265
- }
266
- }
267
- ```
268
-
269
- ---
270
- """
271
- )
272
-
273
- with gr.Row():
274
- with gr.Column(scale=1):
275
- model_dropdown = gr.Dropdown(
276
- label="Model",
277
- choices=[
278
- "meta-llama/Llama-3.1-8B-Instruct",
279
- "mistralai/Mistral-7B-Instruct-v0.3",
280
- "Qwen/Qwen2.5-7B-Instruct",
281
- "Qwen/Qwen2.5-14B-Instruct",
282
- ],
283
- value="meta-llama/Llama-3.1-8B-Instruct",
284
- allow_custom_value=True,
285
- )
286
- temperature_slider = gr.Slider(
287
- label="Temperature",
288
- minimum=0.0,
289
- maximum=2.0,
290
- value=0.7,
291
- step=0.1,
292
- )
293
- max_tokens_slider = gr.Slider(
294
- label="Max Tokens",
295
- minimum=64,
296
- maximum=4096,
297
- value=512,
298
- step=64,
299
- )
300
-
301
- gr.Markdown(
302
- f"""
303
- ### Status
304
- - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
305
- - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
306
- """
307
- )
308
-
309
- with gr.Column(scale=3):
310
- chatbot = gr.ChatInterface(
311
- fn=gradio_chat,
312
- additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
313
- title="",
314
- )
315
-
316
-
317
- # --- Add OpenAI-compatible API routes to Gradio's internal FastAPI app ---
318
-
319
- # Get the underlying FastAPI app from Gradio
320
- app = demo.app
321
 
322
 
323
- @app.post("/v1/chat/completions")
324
- async def chat_completions(
325
- request: Request,
326
- ):
327
  """
328
  OpenAI-compatible chat completions endpoint.
329
 
@@ -471,7 +382,7 @@ async def chat_completions(
471
  )
472
 
473
 
474
- @app.get("/v1/models")
475
  async def list_models(request: Request):
476
  """List available models (returns info about current model if loaded)."""
477
  authorization = request.headers.get("authorization")
@@ -502,7 +413,7 @@ async def list_models(request: Request):
502
  return {"object": "list", "data": models}
503
 
504
 
505
- @app.get("/health")
506
  async def health_check():
507
  """Health check endpoint."""
508
  return {
@@ -513,10 +424,104 @@ async def health_check():
513
  }
514
 
515
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  # --- Launch the application ---
517
  # On HuggingFace Spaces, the runtime handles the launch automatically
518
- # The demo object is exposed for the Gradio SDK to use
519
 
520
  if __name__ == "__main__":
521
- # Local development
522
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
18
 
19
  import gradio as gr
20
  import httpx
21
+ from fastapi import FastAPI, Request
22
+ from fastapi.responses import StreamingResponse, JSONResponse, RedirectResponse
23
  from huggingface_hub import HfApi
24
 
25
  from config import get_config, get_quota_tracker
 
167
  )
168
 
169
  if response.status_code != 200:
170
+ raise Exception(f"HF Serverless error: {response.text}")
 
 
 
171
 
172
  result = response.json()
173
 
 
176
  if "generated_text" in result[0]:
177
  return result[0]["generated_text"]
178
 
179
+ raise Exception(f"Unexpected response format from HF Serverless: {result}")
 
 
 
180
 
181
 
182
  # --- Gradio Chat Function (GPU decorated for ZeroGPU) ---
 
224
  return f"Error generating response: {str(e)}"
225
 
226
 
227
+ # --- Create FastAPI app for API routes ---
 
 
 
 
 
 
 
228
 
229
+ api_app = FastAPI(
230
+ title="ZeroGPU OpenCode Provider",
231
+ description="OpenAI-compatible API for HuggingFace models on ZeroGPU",
232
+ version="1.0.0",
233
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
 
236
+ @api_app.post("/v1/chat/completions")
237
+ async def chat_completions(request: Request):
 
 
238
  """
239
  OpenAI-compatible chat completions endpoint.
240
 
 
382
  )
383
 
384
 
385
+ @api_app.get("/v1/models")
386
  async def list_models(request: Request):
387
  """List available models (returns info about current model if loaded)."""
388
  authorization = request.headers.get("authorization")
 
413
  return {"object": "list", "data": models}
414
 
415
 
416
+ @api_app.get("/health")
417
  async def health_check():
418
  """Health check endpoint."""
419
  return {
 
424
  }
425
 
426
 
427
+ @api_app.get("/")
428
+ async def root_redirect():
429
+ """Redirect root to Gradio UI."""
430
+ return RedirectResponse(url="/ui/")
431
+
432
+
433
+ # --- Build Gradio Interface ---
434
+
435
+ with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
436
+ gr.Markdown(
437
+ """
438
+ # ZeroGPU OpenCode Provider
439
+
440
+ OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
441
+
442
+ **API Endpoint:** `/v1/chat/completions`
443
+
444
+ ## Usage with opencode
445
+
446
+ Configure in `~/.config/opencode/opencode.json`:
447
+
448
+ ```json
449
+ {
450
+ "providers": {
451
+ "zerogpu": {
452
+ "npm": "@ai-sdk/openai-compatible",
453
+ "options": {
454
+ "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
455
+ "headers": {
456
+ "Authorization": "Bearer hf_YOUR_TOKEN"
457
+ }
458
+ },
459
+ "models": {
460
+ "llama-8b": {
461
+ "name": "meta-llama/Llama-3.1-8B-Instruct"
462
+ }
463
+ }
464
+ }
465
+ }
466
+ }
467
+ ```
468
+
469
+ ---
470
+ """
471
+ )
472
+
473
+ with gr.Row():
474
+ with gr.Column(scale=1):
475
+ model_dropdown = gr.Dropdown(
476
+ label="Model",
477
+ choices=[
478
+ "meta-llama/Llama-3.1-8B-Instruct",
479
+ "mistralai/Mistral-7B-Instruct-v0.3",
480
+ "Qwen/Qwen2.5-7B-Instruct",
481
+ "Qwen/Qwen2.5-14B-Instruct",
482
+ ],
483
+ value="meta-llama/Llama-3.1-8B-Instruct",
484
+ allow_custom_value=True,
485
+ )
486
+ temperature_slider = gr.Slider(
487
+ label="Temperature",
488
+ minimum=0.0,
489
+ maximum=2.0,
490
+ value=0.7,
491
+ step=0.1,
492
+ )
493
+ max_tokens_slider = gr.Slider(
494
+ label="Max Tokens",
495
+ minimum=64,
496
+ maximum=4096,
497
+ value=512,
498
+ step=64,
499
+ )
500
+
501
+ gr.Markdown(
502
+ f"""
503
+ ### Status
504
+ - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
505
+ - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
506
+ """
507
+ )
508
+
509
+ with gr.Column(scale=3):
510
+ chatbot = gr.ChatInterface(
511
+ fn=gradio_chat,
512
+ additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
513
+ title="",
514
+ )
515
+
516
+
517
+ # --- Mount Gradio on FastAPI at /ui, keeping API routes at root ---
518
+ # This is the key: mount Gradio ONTO our FastAPI app, not the other way around
519
+ app = gr.mount_gradio_app(api_app, demo, path="/ui")
520
+
521
+
522
  # --- Launch the application ---
523
  # On HuggingFace Spaces, the runtime handles the launch automatically
 
524
 
525
  if __name__ == "__main__":
526
+ import uvicorn
527
+ uvicorn.run(app, host="0.0.0.0", port=7860)