serenichron committed on
Commit
b759464
·
1 Parent(s): ae2483e

Fix route mounting: use gr.mount_gradio_app properly

Browse files

- Define FastAPI routes on api_app before mounting
- Mount Gradio demo into FastAPI app at root path
- Remove duplicate route definitions
- API endpoints now available at /v1/*, /health

Files changed (1) hide show
  1. app.py +99 -87
app.py CHANGED
@@ -242,93 +242,17 @@ def gradio_chat(
242
  yield f"Error generating response: {str(e)}"
243
 
244
 
245
- # Build Gradio Blocks interface
246
- with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
247
- gr.Markdown(
248
- """
249
- # ZeroGPU OpenCode Provider
250
-
251
- OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
252
-
253
- **API Endpoint:** `/v1/chat/completions`
254
-
255
- ## Usage with opencode
256
-
257
- Configure in `~/.config/opencode/opencode.json`:
258
-
259
- ```json
260
- {
261
- "providers": {
262
- "zerogpu": {
263
- "npm": "@ai-sdk/openai-compatible",
264
- "options": {
265
- "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
266
- "headers": {
267
- "Authorization": "Bearer hf_YOUR_TOKEN"
268
- }
269
- },
270
- "models": {
271
- "llama-8b": {
272
- "name": "meta-llama/Llama-3.1-8B-Instruct"
273
- }
274
- }
275
- }
276
- }
277
- }
278
- ```
279
-
280
- ---
281
- """
282
- )
283
-
284
- with gr.Row():
285
- with gr.Column(scale=1):
286
- model_dropdown = gr.Dropdown(
287
- label="Model",
288
- choices=[
289
- "meta-llama/Llama-3.1-8B-Instruct",
290
- "mistralai/Mistral-7B-Instruct-v0.3",
291
- "Qwen/Qwen2.5-7B-Instruct",
292
- "Qwen/Qwen2.5-14B-Instruct",
293
- ],
294
- value="meta-llama/Llama-3.1-8B-Instruct",
295
- allow_custom_value=True,
296
- )
297
- temperature_slider = gr.Slider(
298
- label="Temperature",
299
- minimum=0.0,
300
- maximum=2.0,
301
- value=0.7,
302
- step=0.1,
303
- )
304
- max_tokens_slider = gr.Slider(
305
- label="Max Tokens",
306
- minimum=64,
307
- maximum=4096,
308
- value=512,
309
- step=64,
310
- )
311
-
312
- gr.Markdown(
313
- f"""
314
- ### Status
315
- - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
316
- - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
317
- """
318
- )
319
 
320
- with gr.Column(scale=3):
321
- chatbot = gr.ChatInterface(
322
- fn=gradio_chat,
323
- additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
324
- title="",
325
- )
326
-
327
-
328
- # --- Add OpenAI-compatible API routes to Gradio's FastAPI app ---
329
 
330
 
331
- @demo.app.post("/v1/chat/completions")
332
  async def chat_completions(
333
  request: ChatCompletionRequest,
334
  authorization: Optional[str] = Header(None),
@@ -464,7 +388,7 @@ async def chat_completions(
464
  )
465
 
466
 
467
- @demo.app.get("/v1/models")
468
  async def list_models(authorization: Optional[str] = Header(None)):
469
  """List available models (returns info about current model if loaded)."""
470
  token = extract_token(authorization)
@@ -494,7 +418,7 @@ async def list_models(authorization: Optional[str] = Header(None)):
494
  return {"object": "list", "data": models}
495
 
496
 
497
- @demo.app.get("/health")
498
  async def health_check():
499
  """Health check endpoint."""
500
  return {
@@ -505,5 +429,93 @@ async def health_check():
505
  }
506
 
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  if __name__ == "__main__":
509
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
242
  yield f"Error generating response: {str(e)}"
243
 
244
 
245
+ # --- FastAPI app for OpenAI-compatible routes ---
246
+ from fastapi import FastAPI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
+ api_app = FastAPI(
249
+ title="ZeroGPU OpenCode Provider",
250
+ description="OpenAI-compatible API for HuggingFace models on ZeroGPU",
251
+ version="1.0.0",
252
+ )
 
 
 
 
253
 
254
 
255
+ @api_app.post("/v1/chat/completions")
256
  async def chat_completions(
257
  request: ChatCompletionRequest,
258
  authorization: Optional[str] = Header(None),
 
388
  )
389
 
390
 
391
+ @api_app.get("/v1/models")
392
  async def list_models(authorization: Optional[str] = Header(None)):
393
  """List available models (returns info about current model if loaded)."""
394
  token = extract_token(authorization)
 
418
  return {"object": "list", "data": models}
419
 
420
 
421
+ @api_app.get("/health")
422
  async def health_check():
423
  """Health check endpoint."""
424
  return {
 
429
  }
430
 
431
 
432
+ # Build Gradio Blocks interface
433
+ with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
434
+ gr.Markdown(
435
+ """
436
+ # ZeroGPU OpenCode Provider
437
+
438
+ OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
439
+
440
+ **API Endpoint:** `/v1/chat/completions`
441
+
442
+ ## Usage with opencode
443
+
444
+ Configure in `~/.config/opencode/opencode.json`:
445
+
446
+ ```json
447
+ {
448
+ "providers": {
449
+ "zerogpu": {
450
+ "npm": "@ai-sdk/openai-compatible",
451
+ "options": {
452
+ "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
453
+ "headers": {
454
+ "Authorization": "Bearer hf_YOUR_TOKEN"
455
+ }
456
+ },
457
+ "models": {
458
+ "llama-8b": {
459
+ "name": "meta-llama/Llama-3.1-8B-Instruct"
460
+ }
461
+ }
462
+ }
463
+ }
464
+ }
465
+ ```
466
+
467
+ ---
468
+ """
469
+ )
470
+
471
+ with gr.Row():
472
+ with gr.Column(scale=1):
473
+ model_dropdown = gr.Dropdown(
474
+ label="Model",
475
+ choices=[
476
+ "meta-llama/Llama-3.1-8B-Instruct",
477
+ "mistralai/Mistral-7B-Instruct-v0.3",
478
+ "Qwen/Qwen2.5-7B-Instruct",
479
+ "Qwen/Qwen2.5-14B-Instruct",
480
+ ],
481
+ value="meta-llama/Llama-3.1-8B-Instruct",
482
+ allow_custom_value=True,
483
+ )
484
+ temperature_slider = gr.Slider(
485
+ label="Temperature",
486
+ minimum=0.0,
487
+ maximum=2.0,
488
+ value=0.7,
489
+ step=0.1,
490
+ )
491
+ max_tokens_slider = gr.Slider(
492
+ label="Max Tokens",
493
+ minimum=64,
494
+ maximum=4096,
495
+ value=512,
496
+ step=64,
497
+ )
498
+
499
+ gr.Markdown(
500
+ f"""
501
+ ### Status
502
+ - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
503
+ - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
504
+ """
505
+ )
506
+
507
+ with gr.Column(scale=3):
508
+ chatbot = gr.ChatInterface(
509
+ fn=gradio_chat,
510
+ additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
511
+ title="",
512
+ )
513
+
514
+
515
+ # Mount Gradio into FastAPI app - Gradio UI at root, API at /v1/*
516
+ app = gr.mount_gradio_app(api_app, demo, path="/")
517
+
518
+
519
  if __name__ == "__main__":
520
+ import uvicorn
521
+ uvicorn.run(app, host="0.0.0.0", port=7860)