# app.py
import hmac
import json
import os
import re
import time
import uuid
from typing import Any, List, Literal, Optional

from fastapi import FastAPI, Header, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel
APP_TITLE = "HF OpenAI-Compatible API (OpenCode+n8n) fast SSE + plain stream"

# -----------------------
# GGUF model
# -----------------------
# Hugging Face repo/file for the quantized GGUF weights; override via env.
HF_REPO_ID = os.environ.get("HF_REPO_ID", "bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF")
HF_FILENAME = os.environ.get("HF_FILENAME", "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf")

# DeepSeek chat special tokens used when building prompts by hand.
BOS = "<|begin▁of▁sentence|>"
USR = "<|User|>"
AST = "<|Assistant|>"

# Model id reported by /v1/models and echoed back in every response body.
PRIMARY_MODEL_ID = os.environ.get("PRIMARY_MODEL_ID", "deepseek-r1-distill-qwen-1.5b-q4_k_m")

# -----------------------
# Auth
# -----------------------
API_KEY = os.environ.get("API_KEY", "").strip()
if not API_KEY:
    # Fail fast at import time: the service is unusable without a key.
    raise RuntimeError("Missing API_KEY secret (Space Settings -> Secrets).")
def require_auth(auth: Optional[str]) -> None:
    """Validate an ``Authorization: Bearer <key>`` header against API_KEY.

    Raises:
        HTTPException: 401 when the header is missing or malformed,
                       403 when the token does not match.
    """
    if not auth or not auth.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing Authorization: Bearer <key>")
    token = auth.removeprefix("Bearer ").strip()
    # Constant-time comparison — a plain `!=` on a secret leaks match-prefix
    # length through response timing.
    if not hmac.compare_digest(token, API_KEY):
        raise HTTPException(status_code=403, detail="Forbidden")
def oai_error(message: str, code: str = "internal_error", status: int = 500):
    """Build an OpenAI-style error body wrapped in a JSONResponse."""
    payload = {
        "error": {
            "message": message,
            "type": "server_error",
            "code": code,
        }
    }
    return JSONResponse(status_code=status, content=payload)
# -----------------------
# Performance tuning (HF free CPU defaults)
# -----------------------
# Recommended for OpenCode+n8n:
#   N_CTX=2048 (or 3072 if you can afford more latency)
#   N_BATCH=512 or 1024 (try 1024 first)
N_THREADS = int(os.environ.get("N_THREADS", "2"))
N_CTX = int(os.environ.get("N_CTX", "3072"))
N_BATCH = int(os.environ.get("N_BATCH", "1024"))
MAX_TOKENS_DEFAULT = int(os.environ.get("MAX_TOKENS_DEFAULT", "256"))
# Tokens held back so prompt + completion never exactly fill the context.
CTX_MARGIN = int(os.environ.get("CTX_MARGIN", "96"))

# SSE chunking knobs (make OpenCode feel fast)
SSE_FLUSH_CHARS = int(os.environ.get("SSE_FLUSH_CHARS", "48"))  # flush after ~48 chars buffered
SSE_FLUSH_SEC = float(os.environ.get("SSE_FLUSH_SEC", "0.12"))  # or flush after 120ms

# History trimming knobs (keeps prompts smaller => faster)
KEEP_LAST_TURNS = int(os.environ.get("KEEP_LAST_TURNS", "8"))  # keep last 8 non-system messages

# Keep the server default system short for speed; clients can send their own.
DEFAULT_SYSTEM = (
    "You are a helpful programming assistant. "
    "Answer directly and concisely. "
    "No <think>. No reasoning."
)
# Matches one complete <think>...</think> block, newlines included.
THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)

def strip_think(text: str) -> str:
    """Drop DeepSeek-R1 reasoning: every closed <think> block, plus anything
    after an unterminated <think> marker; whitespace-trim the result."""
    cleaned = THINK_BLOCK_RE.sub("", text)
    marker = cleaned.find("<think>")
    if marker != -1:
        cleaned = cleaned[:marker]
    return cleaned.strip()
# -----------------------
# App + Model
# -----------------------
app = FastAPI(title=APP_TITLE)

# Lazily-populated globals: the llama.cpp handle, the resolved local model
# path, and the last load failure (surfaced by /health).
llm: Optional[Llama] = None
MODEL_PATH: Optional[str] = None
LOAD_ERROR: Optional[str] = None
def ensure_model_loaded() -> None:
    """Download (if needed) and load the GGUF model into the global ``llm``.

    Idempotent: returns immediately once ``llm`` is set. A warm-up
    generation is run afterwards to reduce first-token latency.

    NOTE(review): no lock guards the load — two concurrent first requests
    could each trigger a download/load; confirm whether the server runs
    these sync handlers on multiple threads before relying on this.
    """
    global llm, MODEL_PATH, LOAD_ERROR
    if llm is not None:
        return
    LOAD_ERROR = None
    MODEL_PATH = hf_hub_download(
        repo_id=HF_REPO_ID,
        filename=HF_FILENAME,
        local_dir="/tmp/models",
    )
    t0 = time.time()
    llm = Llama(
        model_path=MODEL_PATH,
        n_threads=N_THREADS,
        n_ctx=N_CTX,
        n_batch=N_BATCH,
        use_mmap=True,  # map the weights file rather than copying it into RAM
        use_mlock=False,  # don't pin pages; free-tier RAM is limited
    )
    print(f"Model loaded in {time.time() - t0:.1f}s: {MODEL_PATH}")
    # Warm-up (reduces first token delay)
    try:
        _ = llm(f"{BOS}Warmup{USR}hi{AST}", max_tokens=16, temperature=0.0, top_p=1.0)
        print("Warm-up completed")
    except Exception as e:
        # Best-effort: a failed warm-up must not prevent serving.
        print(f"Warm-up failed (ignored): {e}")
@app.on_event("startup")
def startup_event():
    """Preload the model when the server starts so the first request is fast.

    Registered via ``@app.on_event("startup")`` — the original definition was
    never wired to FastAPI and so never ran. Load failures are recorded in
    LOAD_ERROR (visible via /health) instead of crashing the server; each
    request retries the load lazily via ensure_model_loaded().
    """
    global LOAD_ERROR
    try:
        ensure_model_loaded()
    except Exception as e:
        LOAD_ERROR = str(e)
        print(f"Startup preload failed: {e}")
# -----------------------
# Token counting + clamping (prevents context overflow crashes)
# -----------------------
def prompt_token_count(prompt: str) -> int:
    """Return the token count of ``prompt`` under the loaded model's tokenizer.

    Raises:
        RuntimeError: if the model has not been loaded yet — an explicit
        error beats an opaque AttributeError on ``None.tokenize``.
    """
    if llm is None:
        raise RuntimeError("Model not loaded; call ensure_model_loaded() first.")
    # NOTE(review): tokenize() may or may not add a BOS token depending on
    # llama-cpp defaults; CTX_MARGIN absorbs any off-by-a-few here.
    return len(llm.tokenize(prompt.encode("utf-8")))
def clamp_max_tokens(prompt: str, requested: int) -> int:
    """Cap ``requested`` so prompt + completion + CTX_MARGIN fit within N_CTX.

    Always returns at least 1, so a generation call is never made with a
    zero budget even when the prompt alone (nearly) fills the context.
    """
    used = prompt_token_count(prompt)
    room = N_CTX - used - CTX_MARGIN
    if room < 0:
        room = 0
    capped = min(int(requested), int(room))
    return capped if capped >= 1 else 1
# -----------------------
# Lenient schemas (accept extra fields OpenCode/LangChain send)
# -----------------------
class LenientModel(BaseModel):
    """Base schema that tolerates unknown fields instead of rejecting requests."""
    model_config = {"extra": "allow"}
class ChatMessage(LenientModel):
    """One chat turn in OpenAI message format."""
    role: Literal["system", "user", "assistant", "tool"]
    content: Optional[str] = None
    # Present only on role == "tool" messages.
    tool_call_id: Optional[str] = None
    name: Optional[str] = None
class ChatCompletionsReq(LenientModel):
    """Request body for /v1/chat/completions (OpenAI chat schema subset)."""
    model: Optional[str] = None
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.2
    top_p: Optional[float] = 0.9
    max_tokens: Optional[int] = None
    # Newer OpenAI field name; takes precedence over max_tokens when set.
    max_completion_tokens: Optional[int] = None
    stream: Optional[bool] = False
    stop: Optional[Any] = None
class ResponsesReq(LenientModel):
    """Request body for /v1/responses (OpenAI Responses API subset)."""
    model: Optional[str] = None
    # May be a plain string, a list of chat dicts, or Responses-API items.
    input: Any = None
    temperature: Optional[float] = 0.2
    top_p: Optional[float] = 0.9
    max_output_tokens: Optional[int] = None
    stream: Optional[bool] = False
class CompletionsReq(LenientModel):
    """Request body for the legacy /v1/completions endpoint."""
    model: Optional[str] = None
    # String or list of strings (lists are newline-joined, not batched).
    prompt: Any = None
    temperature: Optional[float] = 0.2
    top_p: Optional[float] = 0.9
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
class GenerateStreamReq(LenientModel):
    """Request body for /generate_stream (plain-text token streaming)."""
    prompt: str
    max_new_tokens: int = 200
    temperature: float = 0.2
    top_p: float = 0.9
# -----------------------
# Prompt builders
# -----------------------
def trim_messages_for_speed(messages: List[ChatMessage], keep_last_non_system: int = KEEP_LAST_TURNS) -> List[ChatMessage]:
    """Keep all system messages plus only the most recent non-system turns,
    preserving relative order within each group."""
    system_msgs: List[ChatMessage] = []
    rest: List[ChatMessage] = []
    for msg in messages:
        (system_msgs if msg.role == "system" else rest).append(msg)
    return system_msgs + rest[-keep_last_non_system:]
def messages_to_prompt(messages: List[ChatMessage]) -> str:
    """Render chat messages into the DeepSeek prompt template.

    Server rules (DEFAULT_SYSTEM) are always present: they stand in for an
    absent/blank client system prompt, and are prepended to a non-empty one.
    Tool results are rendered as user turns tagged "[Tool result]".
    """
    system_parts: List[str] = []
    convo_parts: List[str] = []
    for msg in messages:
        body = msg.content or ""
        if msg.role == "system":
            system_parts.append(body)
        elif msg.role == "user":
            convo_parts.append(f"{USR}{body}\n")
        elif msg.role == "assistant":
            convo_parts.append(f"{AST}{body}\n")
        elif msg.role == "tool":
            convo_parts.append(f"{USR}[Tool result]\n{body}\n")
    system_text = "".join(system_parts)
    if system_text.strip():
        # Prepend short server rules to keep behavior consistent.
        system_text = DEFAULT_SYSTEM + "\n" + system_text
    else:
        system_text = DEFAULT_SYSTEM
    return f"{BOS}{system_text}\n{''.join(convo_parts)}{AST}"
def input_to_messages(inp: Any) -> List[ChatMessage]:
    """Normalize the /v1/responses ``input`` field into chat messages.

    Accepts: None; a bare string; a list of Responses-API message items
    (``{"type": "message", "role": ..., "content": [blocks] | str}``);
    a list of OpenAI chat dicts (``{"role": ..., "content": ...}``);
    anything else is stringified into a single user message.
    """
    if inp is None:
        return [ChatMessage(role="user", content="")]
    if isinstance(inp, str):
        return [ChatMessage(role="user", content=inp)]
    if isinstance(inp, list) and inp and isinstance(inp[0], dict):
        if inp[0].get("type") == "message":
            msgs: List[ChatMessage] = []
            for item in inp:
                role = item.get("role", "user")
                blocks = item.get("content", [])
                if isinstance(blocks, str):
                    # The Responses API also allows plain-string content; the
                    # original dropped it, yielding an empty message.
                    text = blocks
                else:
                    parts = []
                    if isinstance(blocks, list):
                        for b in blocks:
                            if isinstance(b, dict) and b.get("type") == "text":
                                parts.append(b.get("text", ""))
                    text = "".join(parts)
                msgs.append(ChatMessage(role=role, content=text))
            return msgs
        if "role" in inp[0]:
            return [ChatMessage(role=m.get("role", "user"), content=m.get("content", "")) for m in inp]
    return [ChatMessage(role="user", content=str(inp))]
# -----------------------
# Endpoints
# -----------------------
# NOTE(review): no @app.get("/") decorator is visible on this handler —
# confirm the route decorators were not lost when this file was extracted.
def root():
    """Service banner listing the available endpoints."""
    endpoints = [
        "/v1/models",
        "/v1/chat/completions",
        "/v1/responses",
        "/v1/completions",
        "/generate_stream",
    ]
    return {"ok": True, "service": "openai-compatible", "endpoints": endpoints}
def health():
    """Liveness/config snapshot; also surfaces the last model-load error."""
    # NOTE(review): no @app.get("/health") decorator visible — confirm the
    # route is registered somewhere.
    status = {
        "ok": True,
        "model_loaded": llm is not None,
        "load_error": LOAD_ERROR,
        "model": PRIMARY_MODEL_ID,
        "threads": N_THREADS,
        "ctx": N_CTX,
        "batch": N_BATCH,
        "ctx_margin": CTX_MARGIN,
        "keep_last_turns": KEEP_LAST_TURNS,
        "sse_flush_chars": SSE_FLUSH_CHARS,
        "sse_flush_sec": SSE_FLUSH_SEC,
    }
    return status
@app.get("/v1/models")
def v1_models(authorization: Optional[str] = Header(default=None)):
    """List available model ids (OpenAI-compatible).

    Registered via ``@app.get`` — the original definition was never wired to
    FastAPI. The gpt-* and "auto" aliases are advertised so clients with
    hardcoded model pickers can select this server; every alias is served by
    the same local model.
    """
    require_auth(authorization)
    return {
        "object": "list",
        "data": [
            {"id": PRIMARY_MODEL_ID, "object": "model", "owned_by": "me"},
            {"id": "gpt-4", "object": "model", "owned_by": "me"},
            {"id": "gpt-3.5-turbo", "object": "model", "owned_by": "me"},
            {"id": "auto", "object": "model", "owned_by": "me"},
        ],
    }
# -----------------------
# /v1/chat/completions (OpenAI + FAST SSE)
# -----------------------
@app.post("/v1/chat/completions")
def v1_chat_completions(req: ChatCompletionsReq, authorization: Optional[str] = Header(default=None)):
    """OpenAI-compatible chat completions with optional buffered SSE streaming.

    Registered via ``@app.post`` — the original definition was never wired to
    FastAPI. Deltas are buffered and flushed every SSE_FLUSH_CHARS chars or
    SSE_FLUSH_SEC seconds to cut per-token SSE overhead; <think> content is
    stripped inline during streaming and via strip_think otherwise.
    """
    try:
        require_auth(authorization)
        ensure_model_loaded()
        # Trim long histories for speed/stability.
        msgs = trim_messages_for_speed(req.messages, KEEP_LAST_TURNS)
        prompt = messages_to_prompt(msgs)
        # max_completion_tokens (newer OpenAI name) wins over max_tokens.
        requested = (
            req.max_completion_tokens
            if req.max_completion_tokens is not None
            else (req.max_tokens if req.max_tokens is not None else MAX_TOKENS_DEFAULT)
        )
        requested = int(requested)
        max_toks = clamp_max_tokens(prompt, requested)
        temperature = float(req.temperature if req.temperature is not None else 0.2)
        top_p = float(req.top_p if req.top_p is not None else 0.9)
        if req.stream:
            stream_id = f"chatcmpl-{uuid.uuid4().hex}"
            created = int(time.time())

            def sse_gen():
                buf: List[str] = []
                last_flush = time.time()

                def flush():
                    # Emit one chunk event for the buffered text, or None if empty.
                    nonlocal buf, last_flush
                    if not buf:
                        return None
                    text = "".join(buf)
                    buf = []
                    last_flush = time.time()
                    event = {
                        "id": stream_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": PRIMARY_MODEL_ID,
                        "choices": [{"index": 0, "delta": {"content": text}, "finish_reason": None}],
                    }
                    return f"data: {json.dumps(event, ensure_ascii=False)}\n\n"

                for chunk in llm(
                    prompt,
                    max_tokens=max_toks,
                    temperature=temperature,
                    top_p=top_p,
                    stream=True,
                ):
                    token = chunk["choices"][0]["text"] or ""
                    if not token:
                        continue
                    # Strip thinking inline.
                    # NOTE(review): a <think> tag split across token boundaries
                    # slips past this per-token filter — confirm acceptable.
                    token = THINK_BLOCK_RE.sub("", token)
                    if "<think>" in token:
                        token = token.split("<think>", 1)[0]
                    if not token:
                        continue
                    buf.append(token)
                    # Flush less often to reduce SSE overhead (big speed win).
                    buf_len = sum(len(x) for x in buf)
                    if buf_len >= SSE_FLUSH_CHARS or (time.time() - last_flush) >= SSE_FLUSH_SEC:
                        out = flush()
                        if out:
                            yield out
                # Final flush.
                out = flush()
                if out:
                    yield out
                final = {
                    "id": stream_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": PRIMARY_MODEL_ID,
                    "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
                }
                yield f"data: {json.dumps(final, ensure_ascii=False)}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(sse_gen(), media_type="text/event-stream")
        # Non-stream path.
        out = llm(prompt, max_tokens=max_toks, temperature=temperature, top_p=top_p)
        text = strip_think(out["choices"][0]["text"])
        created = int(time.time())
        return {
            "id": f"chatcmpl-{uuid.uuid4().hex}",
            "object": "chat.completion",
            "created": created,
            "model": PRIMARY_MODEL_ID,
            "choices": [{"index": 0, "message": {"role": "assistant", "content": text}, "finish_reason": "stop"}],
        }
    except HTTPException:
        # Auth errors propagate with their own status codes.
        raise
    except Exception as e:
        return oai_error(str(e), code="internal_error", status=500)
# -----------------------
# /v1/responses (n8n/LangChain; minimal)
# -----------------------
@app.post("/v1/responses")
def v1_responses(req: ResponsesReq, authorization: Optional[str] = Header(default=None)):
    """Minimal OpenAI Responses-API endpoint (non-streaming only).

    Registered via ``@app.post`` — the original definition was never wired to
    FastAPI. The ``stream`` flag is accepted but ignored.
    """
    try:
        require_auth(authorization)
        ensure_model_loaded()
        messages = input_to_messages(req.input)
        messages = trim_messages_for_speed(messages, KEEP_LAST_TURNS)
        prompt = messages_to_prompt(messages)
        requested = int(req.max_output_tokens if req.max_output_tokens is not None else MAX_TOKENS_DEFAULT)
        max_toks = clamp_max_tokens(prompt, requested)
        temperature = float(req.temperature if req.temperature is not None else 0.2)
        top_p = float(req.top_p if req.top_p is not None else 0.9)
        out = llm(prompt, max_tokens=max_toks, temperature=temperature, top_p=top_p)
        text = strip_think(out["choices"][0]["text"])
        rid = f"resp_{uuid.uuid4().hex}"
        created = int(time.time())
        return {
            "id": rid,
            "object": "response",
            "created": created,
            "model": PRIMARY_MODEL_ID,
            # Fill both the convenience field and the structured output list.
            "output_text": text,
            "output": [{"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": text}]}],
        }
    except HTTPException:
        raise
    except Exception as e:
        return oai_error(str(e), code="internal_error", status=500)
# -----------------------
# /v1/completions (legacy)
# -----------------------
@app.post("/v1/completions")
def v1_completions(req: CompletionsReq, authorization: Optional[str] = Header(default=None)):
    """Legacy OpenAI text-completions endpoint with optional buffered SSE.

    Registered via ``@app.post`` — the original definition was never wired to
    FastAPI. List prompts are newline-joined rather than batched.
    """
    try:
        require_auth(authorization)
        ensure_model_loaded()
        prompt_in = req.prompt
        if isinstance(prompt_in, list):
            prompt_in = "\n".join(str(x) for x in prompt_in)
        if prompt_in is None:
            prompt_in = ""
        prompt = f"{BOS}{DEFAULT_SYSTEM}\n{USR}{prompt_in}\n{AST}"
        requested = int(req.max_tokens if req.max_tokens is not None else MAX_TOKENS_DEFAULT)
        max_toks = clamp_max_tokens(prompt, requested)
        temperature = float(req.temperature if req.temperature is not None else 0.2)
        top_p = float(req.top_p if req.top_p is not None else 0.9)
        if req.stream:
            comp_id = f"cmpl-{uuid.uuid4().hex}"
            created = int(time.time())

            def sse_gen():
                buf: List[str] = []
                last_flush = time.time()

                def flush():
                    # Emit one chunk event for the buffered text, or None if empty.
                    nonlocal buf, last_flush
                    if not buf:
                        return None
                    text = "".join(buf)
                    buf = []
                    last_flush = time.time()
                    event = {
                        "id": comp_id,
                        "object": "text_completion",
                        "created": created,
                        "model": PRIMARY_MODEL_ID,
                        "choices": [{"index": 0, "text": text, "finish_reason": None}],
                    }
                    return f"data: {json.dumps(event, ensure_ascii=False)}\n\n"

                for chunk in llm(prompt, max_tokens=max_toks, temperature=temperature, top_p=top_p, stream=True):
                    token = chunk["choices"][0]["text"] or ""
                    if not token:
                        continue
                    token = THINK_BLOCK_RE.sub("", token)
                    if "<think>" in token:
                        token = token.split("<think>", 1)[0]
                    if not token:
                        continue
                    buf.append(token)
                    buf_len = sum(len(x) for x in buf)
                    if buf_len >= SSE_FLUSH_CHARS or (time.time() - last_flush) >= SSE_FLUSH_SEC:
                        out = flush()
                        if out:
                            yield out
                out = flush()
                if out:
                    yield out
                # NOTE(review): unlike the chat endpoint, no final chunk with
                # finish_reason="stop" precedes [DONE] — confirm clients don't
                # require one.
                yield "data: [DONE]\n\n"

            return StreamingResponse(sse_gen(), media_type="text/event-stream")
        out = llm(prompt, max_tokens=max_toks, temperature=temperature, top_p=top_p)
        text = strip_think(out["choices"][0]["text"])
        created = int(time.time())
        return {
            "id": f"cmpl-{uuid.uuid4().hex}",
            "object": "text_completion",
            "created": created,
            "model": PRIMARY_MODEL_ID,
            "choices": [{"index": 0, "text": text, "finish_reason": "stop"}],
        }
    except HTTPException:
        raise
    except Exception as e:
        return oai_error(str(e), code="internal_error", status=500)
# -----------------------
# /generate_stream (plain text streaming; fastest)
# -----------------------
@app.post("/generate_stream")
def generate_stream(req: GenerateStreamReq, authorization: Optional[str] = Header(default=None)):
    """Stream raw completion text with no SSE framing (lowest overhead).

    Registered via ``@app.post`` — the original definition was never wired to
    FastAPI.
    """
    try:
        require_auth(authorization)
        ensure_model_loaded()
        prompt = f"{BOS}{DEFAULT_SYSTEM}\n{USR}{req.prompt}\n{AST}"
        requested = int(req.max_new_tokens if req.max_new_tokens is not None else 200)
        max_toks = clamp_max_tokens(prompt, requested)
        temperature = float(req.temperature if req.temperature is not None else 0.2)
        top_p = float(req.top_p if req.top_p is not None else 0.9)

        def token_gen():
            for chunk in llm(prompt, max_tokens=max_toks, temperature=temperature, top_p=top_p, stream=True):
                token = chunk["choices"][0]["text"] or ""
                if not token:
                    continue
                # Strip reasoning inline; a tag split across tokens can slip through.
                token = THINK_BLOCK_RE.sub("", token)
                if "<think>" in token:
                    token = token.split("<think>", 1)[0]
                if token:
                    yield token

        return StreamingResponse(token_gen(), media_type="text/plain")
    except HTTPException:
        raise
    except Exception as e:
        return oai_error(str(e), code="internal_error", status=500)
if __name__ == "__main__":
    # Local/dev entry point; HF Spaces injects PORT (defaults to 7860).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860")))