serenichron committed on
Commit
217c046
·
1 Parent(s): 728a4ac

Use Gradio's native gr.api() for custom endpoints

Browse files

- Remove all FastAPI/Starlette code
- Use gr.api() to register health and chat_completions endpoints
- Endpoints available at /api/health and /api/chat_completions
- Pure Gradio approach for ZeroGPU compatibility
- OpenAI-compatible response format maintained

Files changed (1) hide show
  1. app.py +85 -255
app.py CHANGED
@@ -2,7 +2,7 @@
2
  HuggingFace ZeroGPU Space - OpenAI-compatible inference provider for opencode.
3
 
4
  This Gradio app provides:
5
- - OpenAI-compatible /v1/chat/completions endpoint
6
  - Pass-through model selection (any HF model ID)
7
  - ZeroGPU H200 inference with HF Serverless fallback
8
  - HF Token authentication
@@ -14,16 +14,11 @@ import spaces
14
 
15
  import logging
16
  import time
17
- import json
18
  from typing import Optional
19
 
20
  import gradio as gr
21
  import httpx
22
  from huggingface_hub import HfApi
23
- from starlette.applications import Starlette
24
- from starlette.routing import Route, Mount
25
- from starlette.responses import JSONResponse, StreamingResponse, RedirectResponse
26
- from starlette.requests import Request
27
 
28
  from config import get_config, get_quota_tracker
29
  from models import (
@@ -38,7 +33,6 @@ from openai_compat import (
38
  create_chat_response,
39
  create_error_response,
40
  estimate_tokens,
41
- stream_response_generator,
42
  )
43
 
44
  logger = logging.getLogger(__name__)
@@ -67,17 +61,6 @@ def validate_hf_token(token: str) -> bool:
67
  return False
68
 
69
 
70
- def extract_token(authorization: Optional[str]) -> Optional[str]:
71
- """Extract the token from the Authorization header."""
72
- if not authorization:
73
- return None
74
-
75
- if authorization.startswith("Bearer "):
76
- return authorization[7:]
77
-
78
- return authorization
79
-
80
-
81
  # --- ZeroGPU Inference Functions ---
82
  # These MUST be decorated with @spaces.GPU for ZeroGPU detection
83
 
@@ -89,7 +72,6 @@ def zerogpu_generate(
89
  max_new_tokens: int,
90
  temperature: float,
91
  top_p: float,
92
- stop_sequences: Optional[list[str]],
93
  ) -> str:
94
  """Generate text using ZeroGPU (H200 GPU)."""
95
  start_time = time.time()
@@ -100,7 +82,7 @@ def zerogpu_generate(
100
  max_new_tokens=max_new_tokens,
101
  temperature=temperature,
102
  top_p=top_p,
103
- stop_sequences=stop_sequences,
104
  )
105
 
106
  # Track quota usage
@@ -110,37 +92,10 @@ def zerogpu_generate(
110
  return result
111
 
112
 
113
- @spaces.GPU(duration=120)
114
- def zerogpu_generate_stream(
115
- model_id: str,
116
- prompt: str,
117
- max_new_tokens: int,
118
- temperature: float,
119
- top_p: float,
120
- stop_sequences: Optional[list[str]],
121
- ):
122
- """Generate text with streaming using ZeroGPU (H200 GPU)."""
123
- start_time = time.time()
124
-
125
- for token in generate_text_stream(
126
- model_id=model_id,
127
- prompt=prompt,
128
- max_new_tokens=max_new_tokens,
129
- temperature=temperature,
130
- top_p=top_p,
131
- stop_sequences=stop_sequences,
132
- ):
133
- yield token
134
-
135
- # Track quota usage
136
- duration = time.time() - start_time
137
- quota_tracker.add_usage(duration)
138
-
139
-
140
  # --- HF Serverless Fallback ---
141
 
142
 
143
- async def serverless_generate(
144
  model_id: str,
145
  prompt: str,
146
  max_new_tokens: int,
@@ -148,7 +103,7 @@ async def serverless_generate(
148
  top_p: float,
149
  token: str,
150
  ) -> str:
151
- """Generate text using HuggingFace Serverless Inference API."""
152
  url = f"https://api-inference.huggingface.co/models/{model_id}"
153
 
154
  payload = {
@@ -161,8 +116,8 @@ async def serverless_generate(
161
  },
162
  }
163
 
164
- async with httpx.AsyncClient() as client:
165
- response = await client.post(
166
  url,
167
  json=payload,
168
  headers={"Authorization": f"Bearer {token}"},
@@ -227,99 +182,59 @@ def gradio_chat(
227
  return f"Error generating response: {str(e)}"
228
 
229
 
230
- # --- API Route Handlers (Starlette) ---
231
 
232
 
233
- async def health_check(request: Request):
234
  """Health check endpoint."""
235
- return JSONResponse({
236
  "status": "healthy",
237
  "zerogpu_available": ZEROGPU_AVAILABLE,
238
  "quota_remaining_minutes": quota_tracker.remaining_minutes(),
239
  "fallback_enabled": config.fallback_enabled,
240
- })
241
-
242
-
243
- async def list_models(request: Request):
244
- """List available models (returns info about current model if loaded)."""
245
- authorization = request.headers.get("authorization")
246
- token = extract_token(authorization)
247
- if not token or not validate_hf_token(token):
248
- return JSONResponse(
249
- create_error_response(
250
- message="Invalid or missing HuggingFace token",
251
- error_type="authentication_error",
252
- code="invalid_api_key",
253
- ).model_dump(),
254
- status_code=401,
255
- )
256
-
257
- current = get_current_model()
258
- models = []
259
-
260
- if current:
261
- models.append(
262
- {
263
- "id": current.model_id,
264
- "object": "model",
265
- "created": int(time.time()),
266
- "owned_by": "huggingface",
267
- }
268
- )
269
-
270
- return JSONResponse({"object": "list", "data": models})
271
 
272
 
273
- async def chat_completions(request: Request):
 
 
 
 
 
 
 
274
  """
275
- OpenAI-compatible chat completions endpoint.
276
-
277
- Supports both streaming and non-streaming responses.
 
 
 
 
 
 
 
 
 
278
  """
279
- # Get authorization header
280
- authorization = request.headers.get("authorization")
281
-
282
  # Validate authentication
283
- token = extract_token(authorization)
284
  if not token or not validate_hf_token(token):
285
- return JSONResponse(
286
- create_error_response(
287
- message="Invalid or missing HuggingFace token",
288
- error_type="authentication_error",
289
- code="invalid_api_key",
290
- ).model_dump(),
291
- status_code=401,
292
- )
293
-
294
- # Parse request body
295
- try:
296
- body = await request.json()
297
- chat_request = ChatCompletionRequest(**body)
298
- except Exception as e:
299
- return JSONResponse(
300
- create_error_response(
301
- message=f"Invalid request body: {str(e)}",
302
- error_type="invalid_request_error",
303
- ).model_dump(),
304
- status_code=400,
305
- )
306
-
307
- # Extract inference parameters
308
- params = InferenceParams.from_request(chat_request)
309
 
310
  # Apply chat template
311
  try:
312
- prompt = apply_chat_template(params.model_id, params.messages)
313
  except Exception as e:
314
  logger.error(f"Failed to apply chat template: {e}")
315
- return JSONResponse(
316
- create_error_response(
317
- message=f"Failed to load model or apply chat template: {str(e)}",
318
- error_type="invalid_request_error",
319
- param="model",
320
- ).model_dump(),
321
- status_code=400,
322
- )
323
 
324
  prompt_tokens = estimate_tokens(prompt)
325
 
@@ -327,96 +242,48 @@ async def chat_completions(request: Request):
327
  use_zerogpu = ZEROGPU_AVAILABLE and not quota_tracker.quota_exhausted
328
 
329
  if not use_zerogpu and not config.fallback_enabled:
330
- return JSONResponse(
331
- create_error_response(
332
- message="ZeroGPU quota exhausted and fallback is disabled",
333
- error_type="server_error",
334
- code="quota_exhausted",
335
- ).model_dump(),
336
- status_code=503,
337
- )
338
 
339
  try:
340
- if params.stream:
341
- # Streaming response
342
- if use_zerogpu:
343
- token_gen = zerogpu_generate_stream(
344
- model_id=params.model_id,
345
- prompt=prompt,
346
- max_new_tokens=params.max_new_tokens,
347
- temperature=params.temperature,
348
- top_p=params.top_p,
349
- stop_sequences=params.stop_sequences,
350
- )
351
- else:
352
- # Fallback doesn't support streaming, so generate full response
353
- # and simulate streaming
354
- logger.info("Using HF Serverless fallback (no streaming)")
355
- full_response = await serverless_generate(
356
- model_id=params.model_id,
357
- prompt=prompt,
358
- max_new_tokens=params.max_new_tokens,
359
- temperature=params.temperature,
360
- top_p=params.top_p,
361
- token=token,
362
- )
363
-
364
- def simulate_stream():
365
- # Yield the full response as a single chunk
366
- yield full_response
367
-
368
- token_gen = simulate_stream()
369
-
370
- return StreamingResponse(
371
- stream_response_generator(params.model_id, token_gen),
372
- media_type="text/event-stream",
373
- headers={
374
- "Cache-Control": "no-cache",
375
- "Connection": "keep-alive",
376
- "X-Accel-Buffering": "no",
377
- },
378
  )
379
  else:
380
- # Non-streaming response
381
- if use_zerogpu:
382
- response_text = zerogpu_generate(
383
- model_id=params.model_id,
384
- prompt=prompt,
385
- max_new_tokens=params.max_new_tokens,
386
- temperature=params.temperature,
387
- top_p=params.top_p,
388
- stop_sequences=params.stop_sequences,
389
- )
390
- else:
391
- logger.info("Using HF Serverless fallback")
392
- response_text = await serverless_generate(
393
- model_id=params.model_id,
394
- prompt=prompt,
395
- max_new_tokens=params.max_new_tokens,
396
- temperature=params.temperature,
397
- top_p=params.top_p,
398
- token=token,
399
- )
400
-
401
- completion_tokens = estimate_tokens(response_text)
402
-
403
- response = create_chat_response(
404
- model=params.model_id,
405
- content=response_text,
406
- prompt_tokens=prompt_tokens,
407
- completion_tokens=completion_tokens,
408
  )
409
- return JSONResponse(response.model_dump())
 
 
 
 
 
 
 
 
410
 
411
  except Exception as e:
412
  logger.exception(f"Inference error: {e}")
413
- return JSONResponse(
414
- create_error_response(
415
- message=f"Inference failed: {str(e)}",
416
- error_type="server_error",
417
- ).model_dump(),
418
- status_code=500,
419
- )
420
 
421
 
422
  # --- Build Gradio Interface ---
@@ -428,7 +295,9 @@ with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
428
 
429
  OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
430
 
431
- **API Endpoint:** `/v1/chat/completions`
 
 
432
 
433
  ## Usage with opencode
434
 
@@ -440,7 +309,7 @@ with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
440
  "zerogpu": {
441
  "npm": "@ai-sdk/openai-compatible",
442
  "options": {
443
- "baseURL": "https://serenichron-opencode-zerogpu.hf.space/v1",
444
  "headers": {
445
  "Authorization": "Bearer hf_YOUR_TOKEN"
446
  }
@@ -502,54 +371,15 @@ with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
502
  title="",
503
  )
504
 
505
-
506
- # --- Create combined ASGI app with API routes BEFORE Gradio ---
507
- # This ensures our API routes take precedence over Gradio's catch-all
508
-
509
- api_routes = [
510
- Route("/health", health_check, methods=["GET"]),
511
- Route("/v1/models", list_models, methods=["GET"]),
512
- Route("/v1/chat/completions", chat_completions, methods=["POST"]),
513
- ]
514
-
515
- # Create a Starlette app for API routes
516
- api_app = Starlette(routes=api_routes)
517
-
518
-
519
- # Custom ASGI middleware that routes API paths to our handlers
520
- class APIRoutingMiddleware:
521
- def __init__(self, app, api_app, api_paths):
522
- self.app = app # Gradio app
523
- self.api_app = api_app # Starlette app with API routes
524
- self.api_paths = api_paths # Paths to route to API
525
-
526
- async def __call__(self, scope, receive, send):
527
- if scope["type"] == "http":
528
- path = scope["path"]
529
- # Check if this path should go to our API
530
- for api_path in self.api_paths:
531
- if path == api_path or path.startswith(api_path + "/"):
532
- await self.api_app(scope, receive, send)
533
- return
534
- # Otherwise, let Gradio handle it
535
- await self.app(scope, receive, send)
536
-
537
-
538
- # Get Gradio's ASGI app and wrap it with our middleware
539
- gradio_app = demo.app
540
-
541
- # Wrap Gradio with our API routing middleware
542
- app = APIRoutingMiddleware(
543
- gradio_app,
544
- api_app,
545
- api_paths=["/health", "/v1"]
546
- )
547
 
548
 
549
  # --- Launch the application ---
550
  # On HuggingFace Spaces, the runtime handles the launch automatically
551
- # The demo object is exposed for ZeroGPU detection
552
 
553
  if __name__ == "__main__":
554
- import uvicorn
555
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
2
  HuggingFace ZeroGPU Space - OpenAI-compatible inference provider for opencode.
3
 
4
  This Gradio app provides:
5
+ - OpenAI-compatible API via Gradio's native API system
6
  - Pass-through model selection (any HF model ID)
7
  - ZeroGPU H200 inference with HF Serverless fallback
8
  - HF Token authentication
 
14
 
15
  import logging
16
  import time
 
17
  from typing import Optional
18
 
19
  import gradio as gr
20
  import httpx
21
  from huggingface_hub import HfApi
 
 
 
 
22
 
23
  from config import get_config, get_quota_tracker
24
  from models import (
 
33
  create_chat_response,
34
  create_error_response,
35
  estimate_tokens,
 
36
  )
37
 
38
  logger = logging.getLogger(__name__)
 
61
  return False
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
64
  # --- ZeroGPU Inference Functions ---
65
  # These MUST be decorated with @spaces.GPU for ZeroGPU detection
66
 
 
72
  max_new_tokens: int,
73
  temperature: float,
74
  top_p: float,
 
75
  ) -> str:
76
  """Generate text using ZeroGPU (H200 GPU)."""
77
  start_time = time.time()
 
82
  max_new_tokens=max_new_tokens,
83
  temperature=temperature,
84
  top_p=top_p,
85
+ stop_sequences=None,
86
  )
87
 
88
  # Track quota usage
 
92
  return result
93
 
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  # --- HF Serverless Fallback ---
96
 
97
 
98
+ def serverless_generate_sync(
99
  model_id: str,
100
  prompt: str,
101
  max_new_tokens: int,
 
103
  top_p: float,
104
  token: str,
105
  ) -> str:
106
+ """Generate text using HuggingFace Serverless Inference API (sync version)."""
107
  url = f"https://api-inference.huggingface.co/models/{model_id}"
108
 
109
  payload = {
 
116
  },
117
  }
118
 
119
+ with httpx.Client() as client:
120
+ response = client.post(
121
  url,
122
  json=payload,
123
  headers={"Authorization": f"Bearer {token}"},
 
182
  return f"Error generating response: {str(e)}"
183
 
184
 
185
+ # --- API Functions for Gradio's gr.api() ---
186
 
187
 
188
+ def api_health() -> dict:
189
  """Health check endpoint."""
190
+ return {
191
  "status": "healthy",
192
  "zerogpu_available": ZEROGPU_AVAILABLE,
193
  "quota_remaining_minutes": quota_tracker.remaining_minutes(),
194
  "fallback_enabled": config.fallback_enabled,
195
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
 
198
+ def api_chat_completions(
199
+ token: str,
200
+ model: str,
201
+ messages: list[dict],
202
+ temperature: float = 0.7,
203
+ max_tokens: int = 512,
204
+ top_p: float = 0.95,
205
+ ) -> dict:
206
  """
207
+ OpenAI-compatible chat completions.
208
+
209
+ Args:
210
+ token: HuggingFace API token (hf_xxx)
211
+ model: HuggingFace model ID (e.g., "meta-llama/Llama-3.1-8B-Instruct")
212
+ messages: List of message dicts with "role" and "content"
213
+ temperature: Sampling temperature (0.0-2.0)
214
+ max_tokens: Maximum tokens to generate
215
+ top_p: Nucleus sampling probability
216
+
217
+ Returns:
218
+ OpenAI-compatible response dict
219
  """
 
 
 
220
  # Validate authentication
 
221
  if not token or not validate_hf_token(token):
222
+ return create_error_response(
223
+ message="Invalid or missing HuggingFace token",
224
+ error_type="authentication_error",
225
+ code="invalid_api_key",
226
+ ).model_dump()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
  # Apply chat template
229
  try:
230
+ prompt = apply_chat_template(model, messages)
231
  except Exception as e:
232
  logger.error(f"Failed to apply chat template: {e}")
233
+ return create_error_response(
234
+ message=f"Failed to load model or apply chat template: {str(e)}",
235
+ error_type="invalid_request_error",
236
+ param="model",
237
+ ).model_dump()
 
 
 
238
 
239
  prompt_tokens = estimate_tokens(prompt)
240
 
 
242
  use_zerogpu = ZEROGPU_AVAILABLE and not quota_tracker.quota_exhausted
243
 
244
  if not use_zerogpu and not config.fallback_enabled:
245
+ return create_error_response(
246
+ message="ZeroGPU quota exhausted and fallback is disabled",
247
+ error_type="server_error",
248
+ code="quota_exhausted",
249
+ ).model_dump()
 
 
 
250
 
251
  try:
252
+ # Non-streaming response
253
+ if use_zerogpu:
254
+ response_text = zerogpu_generate(
255
+ model_id=model,
256
+ prompt=prompt,
257
+ max_new_tokens=max_tokens,
258
+ temperature=temperature,
259
+ top_p=top_p,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  )
261
  else:
262
+ logger.info("Using HF Serverless fallback")
263
+ response_text = serverless_generate_sync(
264
+ model_id=model,
265
+ prompt=prompt,
266
+ max_new_tokens=max_tokens,
267
+ temperature=temperature,
268
+ top_p=top_p,
269
+ token=token,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  )
271
+
272
+ completion_tokens = estimate_tokens(response_text)
273
+
274
+ return create_chat_response(
275
+ model=model,
276
+ content=response_text,
277
+ prompt_tokens=prompt_tokens,
278
+ completion_tokens=completion_tokens,
279
+ ).model_dump()
280
 
281
  except Exception as e:
282
  logger.exception(f"Inference error: {e}")
283
+ return create_error_response(
284
+ message=f"Inference failed: {str(e)}",
285
+ error_type="server_error",
286
+ ).model_dump()
 
 
 
287
 
288
 
289
  # --- Build Gradio Interface ---
 
295
 
296
  OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).
297
 
298
+ **API Endpoints:**
299
+ - `/api/health` - Health check
300
+ - `/api/chat_completions` - Chat completions (OpenAI-compatible response format)
301
 
302
  ## Usage with opencode
303
 
 
309
  "zerogpu": {
310
  "npm": "@ai-sdk/openai-compatible",
311
  "options": {
312
+ "baseURL": "https://serenichron-opencode-zerogpu.hf.space/api",
313
  "headers": {
314
  "Authorization": "Bearer hf_YOUR_TOKEN"
315
  }
 
371
  title="",
372
  )
373
 
374
+ # Register API endpoints using Gradio's API system
375
+ # These will be available at /api/<name>
376
+ gr.api(api_health, api_name="health")
377
+ gr.api(api_chat_completions, api_name="chat_completions")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
379
 
380
  # --- Launch the application ---
381
  # On HuggingFace Spaces, the runtime handles the launch automatically
382
+ # We just expose the demo object
383
 
384
  if __name__ == "__main__":
385
+ demo.launch(server_name="0.0.0.0", server_port=7860)