serenichron committed on
Commit
ae2483e
·
1 Parent(s): 5ea35f6

Fix Gradio + FastAPI integration for HuggingFace Spaces

Browse files

- Use demo.app to add FastAPI routes to Gradio's internal app
- Remove examples from ChatInterface to avoid caching issues
- Add error handling in gradio_chat for model loading failures
- Simplify app structure for better HF Spaces compatibility

Files changed (1) hide show
  1. app.py +132 -152
app.py CHANGED
@@ -11,12 +11,11 @@ This Gradio app provides:
11
 
12
  import logging
13
  import time
14
- from contextlib import asynccontextmanager
15
  from typing import Optional
16
 
17
  import gradio as gr
18
  import httpx
19
- from fastapi import FastAPI, Header, HTTPException, Request
20
  from fastapi.responses import StreamingResponse, JSONResponse
21
  from huggingface_hub import HfApi
22
 
@@ -195,28 +194,141 @@ async def serverless_generate(
195
  )
196
 
197
 
198
- # --- FastAPI App ---
199
 
200
 
201
- @asynccontextmanager
202
- async def lifespan(app: FastAPI):
203
- """Application lifespan events."""
204
- logger.info("Starting ZeroGPU OpenCode Provider")
205
- logger.info(f"ZeroGPU available: {ZEROGPU_AVAILABLE}")
206
- logger.info(f"Fallback enabled: {config.fallback_enabled}")
207
- yield
208
- logger.info("Shutting down ZeroGPU OpenCode Provider")
 
 
 
 
209
 
 
 
 
 
 
 
 
210
 
211
- api = FastAPI(
212
- title="ZeroGPU OpenCode Provider",
213
- description="OpenAI-compatible API for HuggingFace models on ZeroGPU",
214
- version="1.0.0",
215
- lifespan=lifespan,
216
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
- @api.post("/v1/chat/completions")
 
220
  async def chat_completions(
221
  request: ChatCompletionRequest,
222
  authorization: Optional[str] = Header(None),
@@ -352,7 +464,7 @@ async def chat_completions(
352
  )
353
 
354
 
355
- @api.get("/v1/models")
356
  async def list_models(authorization: Optional[str] = Header(None)):
357
  """List available models (returns info about current model if loaded)."""
358
  token = extract_token(authorization)
@@ -382,7 +494,7 @@ async def list_models(authorization: Optional[str] = Header(None)):
382
  return {"object": "list", "data": models}
383
 
384
 
385
- @api.get("/health")
386
  async def health_check():
387
  """Health check endpoint."""
388
  return {
@@ -393,137 +505,5 @@ async def health_check():
393
  }
394
 
395
 
396
- # --- Gradio Interface ---
397
-
398
-
399
- def gradio_chat(
400
- message: str,
401
- history: list[list[str]],
402
- model_id: str,
403
- temperature: float,
404
- max_tokens: int,
405
- ):
406
- """Gradio chat interface handler."""
407
- # Validate model_id
408
- if not model_id:
409
- yield "Please select a model first."
410
- return
411
-
412
- # Build messages from history
413
- messages = []
414
- for user_msg, assistant_msg in history:
415
- messages.append({"role": "user", "content": user_msg})
416
- if assistant_msg:
417
- messages.append({"role": "assistant", "content": assistant_msg})
418
- messages.append({"role": "user", "content": message})
419
-
420
- # Apply chat template
421
- prompt = apply_chat_template(model_id, messages)
422
-
423
- # Generate response (streaming)
424
- response = ""
425
- for token in zerogpu_generate_stream(
426
- model_id=model_id,
427
- prompt=prompt,
428
- max_new_tokens=max_tokens,
429
- temperature=temperature,
430
- top_p=0.95,
431
- stop_sequences=None,
432
- ):
433
- response += token
434
- yield response
435
-
436
-
437
- # Gradio Blocks interface
438
- with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
439
- gr.Markdown(
440
- """
441
- # ZeroGPU OpenCode Provider
442
-
443
- OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
444
-
445
- **API Endpoint:** `/v1/chat/completions`
446
-
447
- ## Usage with opencode
448
-
449
- Configure in `~/.config/opencode/opencode.json`:
450
-
451
- ```json
452
- {
453
- "providers": {
454
- "zerogpu": {
455
- "npm": "@ai-sdk/openai-compatible",
456
- "options": {
457
- "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
458
- "headers": {
459
- "Authorization": "Bearer hf_YOUR_TOKEN"
460
- }
461
- },
462
- "models": {
463
- "llama-8b": {
464
- "name": "meta-llama/Llama-3.1-8B-Instruct"
465
- }
466
- }
467
- }
468
- }
469
- }
470
- ```
471
-
472
- ---
473
- """
474
- )
475
-
476
- with gr.Row():
477
- with gr.Column(scale=1):
478
- model_dropdown = gr.Dropdown(
479
- label="Model",
480
- choices=[
481
- "meta-llama/Llama-3.1-8B-Instruct",
482
- "mistralai/Mistral-7B-Instruct-v0.3",
483
- "Qwen/Qwen2.5-7B-Instruct",
484
- "Qwen/Qwen2.5-14B-Instruct",
485
- ],
486
- value="meta-llama/Llama-3.1-8B-Instruct",
487
- allow_custom_value=True,
488
- )
489
- temperature_slider = gr.Slider(
490
- label="Temperature",
491
- minimum=0.0,
492
- maximum=2.0,
493
- value=0.7,
494
- step=0.1,
495
- )
496
- max_tokens_slider = gr.Slider(
497
- label="Max Tokens",
498
- minimum=64,
499
- maximum=4096,
500
- value=512,
501
- step=64,
502
- )
503
-
504
- gr.Markdown(
505
- f"""
506
- ### Status
507
- - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
508
- - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
509
- """
510
- )
511
-
512
- with gr.Column(scale=3):
513
- chatbot = gr.ChatInterface(
514
- fn=gradio_chat,
515
- additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
516
- title="",
517
- examples=[
518
- ["Hello! How are you?"],
519
- ["Explain quantum computing in simple terms."],
520
- ["Write a Python function to calculate fibonacci numbers."],
521
- ],
522
- )
523
-
524
- # Mount FastAPI to Gradio
525
- demo = gr.mount_gradio_app(demo, api, path="/")
526
-
527
-
528
  if __name__ == "__main__":
529
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
11
 
12
  import logging
13
  import time
 
14
  from typing import Optional
15
 
16
  import gradio as gr
17
  import httpx
18
+ from fastapi import Header, HTTPException
19
  from fastapi.responses import StreamingResponse, JSONResponse
20
  from huggingface_hub import HfApi
21
 
 
194
  )
195
 
196
 
197
+ # --- Gradio Interface ---
198
 
199
 
200
+ def gradio_chat(
201
+ message: str,
202
+ history: list[list[str]],
203
+ model_id: str,
204
+ temperature: float,
205
+ max_tokens: int,
206
+ ):
207
+ """Gradio chat interface handler."""
208
+ # Validate model_id
209
+ if not model_id:
210
+ yield "Please select a model first."
211
+ return
212
 
213
+ # Build messages from history
214
+ messages = []
215
+ for user_msg, assistant_msg in history:
216
+ messages.append({"role": "user", "content": user_msg})
217
+ if assistant_msg:
218
+ messages.append({"role": "assistant", "content": assistant_msg})
219
+ messages.append({"role": "user", "content": message})
220
 
221
+ # Apply chat template
222
+ try:
223
+ prompt = apply_chat_template(model_id, messages)
224
+ except Exception as e:
225
+ yield f"Error loading model: {str(e)}"
226
+ return
227
+
228
+ # Generate response (streaming)
229
+ response = ""
230
+ try:
231
+ for token in zerogpu_generate_stream(
232
+ model_id=model_id,
233
+ prompt=prompt,
234
+ max_new_tokens=max_tokens,
235
+ temperature=temperature,
236
+ top_p=0.95,
237
+ stop_sequences=None,
238
+ ):
239
+ response += token
240
+ yield response
241
+ except Exception as e:
242
+ yield f"Error generating response: {str(e)}"
243
+
244
+
245
+ # Build Gradio Blocks interface
246
+ with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
247
+ gr.Markdown(
248
+ """
249
+ # ZeroGPU OpenCode Provider
250
+
251
+ OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
252
+
253
+ **API Endpoint:** `/v1/chat/completions`
254
+
255
+ ## Usage with opencode
256
+
257
+ Configure in `~/.config/opencode/opencode.json`:
258
+
259
+ ```json
260
+ {
261
+ "providers": {
262
+ "zerogpu": {
263
+ "npm": "@ai-sdk/openai-compatible",
264
+ "options": {
265
+ "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
266
+ "headers": {
267
+ "Authorization": "Bearer hf_YOUR_TOKEN"
268
+ }
269
+ },
270
+ "models": {
271
+ "llama-8b": {
272
+ "name": "meta-llama/Llama-3.1-8B-Instruct"
273
+ }
274
+ }
275
+ }
276
+ }
277
+ }
278
+ ```
279
+
280
+ ---
281
+ """
282
+ )
283
+
284
+ with gr.Row():
285
+ with gr.Column(scale=1):
286
+ model_dropdown = gr.Dropdown(
287
+ label="Model",
288
+ choices=[
289
+ "meta-llama/Llama-3.1-8B-Instruct",
290
+ "mistralai/Mistral-7B-Instruct-v0.3",
291
+ "Qwen/Qwen2.5-7B-Instruct",
292
+ "Qwen/Qwen2.5-14B-Instruct",
293
+ ],
294
+ value="meta-llama/Llama-3.1-8B-Instruct",
295
+ allow_custom_value=True,
296
+ )
297
+ temperature_slider = gr.Slider(
298
+ label="Temperature",
299
+ minimum=0.0,
300
+ maximum=2.0,
301
+ value=0.7,
302
+ step=0.1,
303
+ )
304
+ max_tokens_slider = gr.Slider(
305
+ label="Max Tokens",
306
+ minimum=64,
307
+ maximum=4096,
308
+ value=512,
309
+ step=64,
310
+ )
311
 
312
+ gr.Markdown(
313
+ f"""
314
+ ### Status
315
+ - **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
316
+ - **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
317
+ """
318
+ )
319
+
320
+ with gr.Column(scale=3):
321
+ chatbot = gr.ChatInterface(
322
+ fn=gradio_chat,
323
+ additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
324
+ title="",
325
+ )
326
+
327
+
328
+ # --- Add OpenAI-compatible API routes to Gradio's FastAPI app ---
329
 
330
+
331
+ @demo.app.post("/v1/chat/completions")
332
  async def chat_completions(
333
  request: ChatCompletionRequest,
334
  authorization: Optional[str] = Header(None),
 
464
  )
465
 
466
 
467
+ @demo.app.get("/v1/models")
468
  async def list_models(authorization: Optional[str] = Header(None)):
469
  """List available models (returns info about current model if loaded)."""
470
  token = extract_token(authorization)
 
494
  return {"object": "list", "data": models}
495
 
496
 
497
+ @demo.app.get("/health")
498
  async def health_check():
499
  """Health check endpoint."""
500
  return {
 
505
  }
506
 
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  if __name__ == "__main__":
509
  demo.launch(server_name="0.0.0.0", server_port=7860)