Spaces:

oki692
/

endpointllm

Sleeping

App Files Files Community

oki692 committed on Mar 21

Commit

6ca3422

verified ·

1 Parent(s): 55c5fe9

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +12 -0
main.py +282 -0
requirements.txt +5 -0
system_prompts.py +15 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,12 @@

+# Slim Python base image for the gateway.
+FROM python:3.11-slim
+# Run as an unprivileged user (uid 1000) instead of root.
+RUN useradd --create-home --uid 1000 user
+USER user
+# pip installs console scripts (e.g. uvicorn) into the user's local bin directory.
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# Copy the dependency list first so the install layer is cached across code changes.
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY --chown=user . .
+EXPOSE 7860
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "2"]

main.py ADDED Viewed

	@@ -0,0 +1,282 @@

+"""
+Multi-model AI gateway endpoint — HF Spaces compatible.
+Authorization via 'connect' API key header.
+Streaming always enabled. Function calling supported.
+"""
+import asyncio
+import hmac
+import json
+import os
+from typing import AsyncGenerator, Optional
+from fastapi import FastAPI, HTTPException, Header, Request
+from fastapi.responses import StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from openai import OpenAI
+from system_prompts import get_system_prompt
+# ── Config ──────────────────────────────────────────────────────────────────
+CONNECT_KEY = "connect"
+NVIDIA_BASE_URL = "https://integrate.api.nvidia.com/v1"
+NVIDIA_API_KEY  = "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"
+# Model registry: display-name → real model id + optional extra body
+# Each entry maps the name clients send in the "model" field to:
+#   model_id   - the identifier passed to the upstream API as "model"
+#   extra_body - extra request body forwarded with the call; an empty dict
+#                means nothing extra is sent
+MODELS = {
+    "Bielik-11b": {
+        "model_id":   "speakleash/bielik-11b-v2.6-instruct",
+        # NOTE(review): same thinking switches as GLM-4.7 below; confirm this
+        # model's chat template actually reads them.
+        "extra_body": {
+            "chat_template_kwargs": {
+                "enable_thinking": False,
+                "clear_thinking":  True,
+            }
+        },
+    },
+    "GLM-4.7": {
+        "model_id":   "z-ai/glm4.7",
+        "extra_body": {
+            "chat_template_kwargs": {
+                "enable_thinking": False,
+                "clear_thinking":  True,
+            }
+        },
+    },
+    "Mistral-Small-4": {
+        "model_id":   "mistralai/mistral-small-4-119b-2603",
+        "extra_body": {},
+    },
+    "DeepSeek-V3.1": {
+        "model_id":   "deepseek-ai/deepseek-v3.1",
+        "extra_body": {},
+    },
+    "Kimi-K2": {
+        "model_id":   "moonshotai/kimi-k2-instruct",
+        "extra_body": {},
+    },
+}
+# ── FastAPI ──────────────────────────────────────────────────────────────────
+app = FastAPI(
+    title="Multi-Model AI Gateway",
+    version="1.0.0",
+    description="Streaming endpoint for Bielik-11b, GLM-4.7, Mistral-Small-4, DeepSeek-V3.1, Kimi-K2",
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+client = OpenAI(base_url=NVIDIA_BASE_URL, api_key=NVIDIA_API_KEY)
+# ── Auth ─────────────────────────────────────────────────────────────────────
+def verify_key(authorization: Optional[str]) -> None:
+    """Check Bearer token matches CONNECT_KEY."""
+    if not authorization:
+        raise HTTPException(status_code=401, detail="Missing Authorization header")
+    scheme, _, token = authorization.partition(" ")
+    if scheme.lower() != "bearer" or token != CONNECT_KEY:
+        raise HTTPException(status_code=403, detail="Invalid API key")
+# ── Schemas ───────────────────────────────────────────────────────────────────
+class Message(BaseModel):
+    role: str
+    content: str | list  # supports text or multipart
+class ToolFunction(BaseModel):
+    name: str
+    description: Optional[str] = None
+    parameters: Optional[dict] = None
+class Tool(BaseModel):
+    type: str = "function"
+    function: ToolFunction
+class ChatRequest(BaseModel):
+    model: str = Field(..., description="Model name: Bielik-11b | GLM-4.7 | Mistral-Small-4 | DeepSeek-V3.1 | Kimi-K2")
+    messages: list[Message]
+    tools: Optional[list[Tool]] = None
+    tool_choice: Optional[str | dict] = None
+    temperature: Optional[float] = None
+    max_tokens: Optional[int] = None
+    top_p: Optional[float] = None
+    presence_penalty: Optional[float] = None
+    frequency_penalty: Optional[float] = None
+    inject_system_prompt: bool = Field(
+        default=True,
+        description="Prepend the model-specific system prompt automatically"
+    )
+# ── Stream helper ─────────────────────────────────────────────────────────────
+async def stream_nvidia(
+    model_name: str,
+    messages: list[dict],
+    tools: Optional[list[dict]],
+    tool_choice,
+    kwargs: dict,
+    extra_body: dict,
+) -> AsyncGenerator[str, None]:
+    """Yield SSE chunks from NVIDIA NIM in a thread-safe way."""
+    params = {
+        "model":      MODELS[model_name]["model_id"],
+        "messages":   messages,
+        "stream":     True,          # always True
+        **kwargs,
+    }
+    if tools:
+        params["tools"] = tools
+    if tool_choice is not None:
+        params["tool_choice"] = tool_choice
+    if extra_body:
+        params["extra_body"] = extra_body
+    loop = asyncio.get_event_loop()
+    def _call():
+        return client.chat.completions.create(**params)
+    stream = await loop.run_in_executor(None, _call)
+    for chunk in stream:
+        data = chunk.model_dump()
+        yield f"data: {json.dumps(data)}\n\n"
+    yield "data: [DONE]\n\n"
+# ── Endpoints ─────────────────────────────────────────────────────────────────
+@app.get("/")
+async def root():
+    return {
+        "service": "Multi-Model AI Gateway",
+        "models":  list(MODELS.keys()),
+        "auth":    "Bearer <connect-key>",
+        "docs":    "/docs",
+    }
+@app.get("/models")
+async def list_models(authorization: Optional[str] = Header(default=None)):
+    verify_key(authorization)
+    return {
+        name: {
+            "model_id":    cfg["model_id"],
+            "has_thinking": bool(cfg["extra_body"]),
+        }
+        for name, cfg in MODELS.items()
+    }
+@app.post("/chat")
+async def chat(
+    request: ChatRequest,
+    authorization: Optional[str] = Header(default=None),
+):
+    verify_key(authorization)
+    if request.model not in MODELS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unknown model '{request.model}'. Available: {list(MODELS.keys())}",
+        )
+    cfg = MODELS[request.model]
+    # Build messages list
+    messages = [m.model_dump() for m in request.messages]
+    # Inject per-model system prompt at position 0 if not already present
+    if request.inject_system_prompt:
+        system_prompt = get_system_prompt(request.model)
+        if not messages or messages[0].get("role") != "system":
+            messages.insert(0, {"role": "system", "content": system_prompt})
+    # Optional params
+    kwargs = {}
+    for field in ("temperature", "max_tokens", "top_p", "presence_penalty", "frequency_penalty"):
+        val = getattr(request, field)
+        if val is not None:
+            kwargs[field] = val
+    tools = [t.model_dump() for t in request.tools] if request.tools else None
+    return StreamingResponse(
+        stream_nvidia(
+            model_name=request.model,
+            messages=messages,
+            tools=tools,
+            tool_choice=request.tool_choice,
+            kwargs=kwargs,
+            extra_body=cfg["extra_body"],
+        ),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control":     "no-cache",
+            "X-Accel-Buffering": "no",
+        },
+    )
+# ── Compatibility: OpenAI-style /v1/chat/completions ──────────────────────────
+@app.post("/v1/chat/completions")
+async def openai_compat(
+    raw: Request,
+    authorization: Optional[str] = Header(default=None),
+):
+    """
+    Drop-in OpenAI-compatible endpoint.
+    Pass model as one of the gateway model names (e.g. 'Kimi-K2').
+    """
+    verify_key(authorization)
+    body = await raw.json()
+    model_name = body.get("model", "")
+    if model_name not in MODELS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unknown model '{model_name}'. Available: {list(MODELS.keys())}",
+        )
+    cfg = MODELS[model_name]
+    messages = body.get("messages", [])
+    inject = body.get("inject_system_prompt", True)
+    if inject:
+        system_prompt = get_system_prompt(model_name)
+        if not messages or messages[0].get("role") != "system":
+            messages.insert(0, {"role": "system", "content": system_prompt})
+    kwargs = {}
+    for field in ("temperature", "max_tokens", "top_p", "presence_penalty", "frequency_penalty"):
+        if field in body:
+            kwargs[field] = body[field]
+    tools      = body.get("tools")
+    tool_choice = body.get("tool_choice")
+    return StreamingResponse(
+        stream_nvidia(
+            model_name=model_name,
+            messages=messages,
+            tools=tools,
+            tool_choice=tool_choice,
+            kwargs=kwargs,
+            extra_body=cfg["extra_body"],
+        ),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control":     "no-cache",
+            "X-Accel-Buffering": "no",
+        },
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+# Web framework imported by main.py, and the ASGI server the Dockerfile CMD starts.
+fastapi>=0.111.0
+uvicorn[standard]>=0.29.0
+# Client library main.py uses to call the upstream OpenAI-compatible API.
+openai>=1.35.0
+# Request schemas; main.py relies on the model_dump() API.
+pydantic>=2.7.0
+# NOTE(review): not imported directly by main.py; presumably listed for the openai client - confirm.
+httpx>=0.27.0

system_prompts.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# Default system prompt per model. Keys are the upstream model identifiers
+# (the "model_id" values of the MODELS registry in main.py), not the short
+# display names that gateway clients send.
+SYSTEM_PROMPTS = {
+    "speakleash/bielik-11b-v2.6-instruct": """Jesteś Bielikiem – polskim asystentem AI stworzonym z myślą o użytkownikach mówiących po polsku. Komunikujesz się przede wszystkim w języku polskim, chyba że użytkownik wyraźnie poprosi o inny język. Jesteś precyzyjny, kulturalny i pomocny. Posiadasz szeroką wiedzę o polskiej kulturze, historii i realiach. Odpowiadasz zwięźle, ale wyczerpująco. Nie udajesz innego modelu – jesteś Bielikiem.""",
+    "z-ai/glm4.7": """You are GLM-4.7, a high-capability multilingual assistant developed by Zhipu AI. You excel at reasoning, coding, mathematics, and structured analysis. You think step by step when solving complex problems. You are direct, efficient, and technically precise. You do not over-explain simple things. When writing code, you always include brief inline comments for clarity. You respond in the same language the user writes in.""",
+    "mistralai/mistral-small-4-119b-2603": """You are Mistral, a fast and efficient AI assistant built for practical, real-world tasks. You are concise by default – you give short, sharp answers unless the user asks for depth. You excel at summarization, classification, drafting, and function calling. When tools are available, you proactively use them rather than guessing. You avoid unnecessary filler phrases. You match the user's language and tone.""",
+    "deepseek-ai/deepseek-v3.1": """You are DeepSeek, an advanced AI assistant with strong reasoning and coding abilities. You approach problems methodically: you break down complex questions, consider multiple angles, and provide well-structured responses. For technical tasks – especially code, math, and system design – you go deep and thorough. You are honest about uncertainty. When you don't know something, you say so clearly rather than speculating. You respond in the language the user uses.""",
+    "moonshotai/kimi-k2-instruct": """You are Kimi, an AI assistant by Moonshot AI with an exceptionally long context window and strong document understanding capabilities. You are especially good at reading, analyzing, and synthesizing large amounts of information. You are thoughtful, curious, and thorough. When given documents or long inputs, you summarize key points before diving into details. You communicate in a warm but professional tone. You respond in the language the user writes in.""",
+}
+def get_system_prompt(model_id: str) -> str | None:
+    return SYSTEM_PROMPTS.get(model_id)