"""OpenAI-compatible API facade over a local Ollama server.

Exposes /v1/models and /v1/chat/completions (bearer-token protected) plus an
unauthenticated /health probe.  Requests are translated to Ollama's
/api/chat NDJSON protocol and re-emitted either as OpenAI-style SSE chunks
or, when the client sends ``"stream": false``, as a single aggregated
``chat.completion`` object.
"""

import json
import os
import secrets
import time
import uuid

import httpx
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer

app = FastAPI()
security = HTTPBearer()

# Sentinel default: both variables MUST be overridden via environment
# variables (e.g. API_KEY="my-secret-key", MODEL="deepseek-r1:14b" or
# "hf.co/unsloth/GLM-4.7-Flash-GGUF:UD-TQ1_0").
_PLACEHOLDER = "!TU MUSISZ EDYTOWAC!"
API_KEY = os.environ.get("API_KEY", _PLACEHOLDER)
MODEL = os.environ.get("MODEL", _PLACEHOLDER)
OLLAMA_BASE = "http://127.0.0.1:11434"

if _PLACEHOLDER in (API_KEY, MODEL):
    # Fail fast at import time so a misconfigured deployment never serves
    # traffic.  (Message kept verbatim — it targets the HF Space operator.)
    raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")


def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
    """Validate the bearer token against API_KEY.

    Uses a constant-time comparison so the check does not leak key length
    or matching-prefix timing information.

    Raises:
        HTTPException: 401 when the presented token does not match.
    """
    if not secrets.compare_digest(credentials.credentials, API_KEY):
        raise HTTPException(status_code=401, detail="Invalid API key")
    return credentials.credentials


@app.get("/v1/models")
async def list_models(key: str = Depends(verify_key)) -> dict:
    """Return the single configured model in OpenAI list-models format."""
    return {
        "object": "list",
        "data": [
            {
                "id": MODEL,
                "object": "model",
                "created": int(time.time()),
                "owned_by": "ollama",
            }
        ],
    }


async def _ollama_deltas(payload: dict):
    """Stream Ollama's /api/chat NDJSON response as (delta, done) pairs.

    ``delta`` is an OpenAI-style delta dict which may contain
    ``reasoning_content`` (Ollama "thinking" text) and/or ``content``;
    it is empty on the final chunk.  ``done`` is True exactly once, on the
    last yielded pair, after which the generator returns.
    """
    async with httpx.AsyncClient(timeout=300.0) as client:
        async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=payload) as resp:
            async for line in resp.aiter_lines():
                if not line:
                    continue
                try:
                    chunk = json.loads(line)
                except ValueError:
                    # Skip malformed NDJSON lines instead of aborting the stream.
                    continue
                done = bool(chunk.get("done", False))
                delta: dict = {}
                if not done:
                    msg = chunk.get("message", {})
                    if msg.get("thinking") is not None:
                        delta["reasoning_content"] = msg["thinking"]
                    if msg.get("content") is not None:
                        delta["content"] = msg["content"]
                yield delta, done
                if done:
                    return


@app.post("/v1/chat/completions")
async def chat_completions(request: Request, key: str = Depends(verify_key)):
    """Proxy an OpenAI chat-completions request to Ollama.

    Honors the OpenAI ``stream`` flag (default True, matching this
    endpoint's historical always-streaming behavior): streaming requests
    get SSE ``chat.completion.chunk`` events; non-streaming requests get a
    single aggregated ``chat.completion`` JSON object.
    """
    body = await request.json()
    messages = body.get("messages", [])
    temperature = body.get("temperature", 0.6)  # default sampling temperature (0.0-2.0)
    top_p = body.get("top_p", 0.95)  # default nucleus-sampling top_p (0.0-1.0)
    stream = bool(body.get("stream", True))

    options = {"temperature": temperature, "top_p": top_p}
    if "max_tokens" in body:
        # OpenAI "max_tokens" maps to Ollama's "num_predict".
        options["num_predict"] = body["max_tokens"]

    # Always stream from Ollama; aggregation for stream=False happens here.
    ollama_payload = {
        "model": MODEL,
        "messages": messages,
        "stream": True,
        "options": options,
    }
    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    if not stream:
        # Non-streaming path: collect every delta, then return one object.
        content_parts: list = []
        reasoning_parts: list = []
        async for delta, done in _ollama_deltas(ollama_payload):
            if delta.get("content"):
                content_parts.append(delta["content"])
            if delta.get("reasoning_content"):
                reasoning_parts.append(delta["reasoning_content"])
        message = {"role": "assistant", "content": "".join(content_parts)}
        if reasoning_parts:
            message["reasoning_content"] = "".join(reasoning_parts)
        return {
            "id": completion_id,
            "object": "chat.completion",
            "created": created,
            "model": MODEL,
            "choices": [
                {"index": 0, "message": message, "finish_reason": "stop"}
            ],
        }

    async def generate():
        # SSE framing: one "data: {json}\n\n" event per Ollama chunk,
        # finish_reason "stop" on the final (empty-delta) chunk, then the
        # OpenAI sentinel "[DONE]".
        async for delta, done in _ollama_deltas(ollama_payload):
            data = {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": MODEL,
                "choices": [
                    {
                        "index": 0,
                        "delta": delta,
                        "finish_reason": "stop" if done else None,
                    }
                ],
            }
            yield f"data: {json.dumps(data)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")


@app.get("/health")
async def health() -> dict:
    """Liveness probe: report whether the local Ollama daemon responds.

    Returns ``{"status": "ok"}`` once Ollama answers /api/version, or
    ``{"status": "starting"}`` while it is unreachable (best-effort — any
    connection error is treated as "not up yet", never raised).
    """
    async with httpx.AsyncClient(timeout=5.0) as client:
        try:
            r = await client.get(f"{OLLAMA_BASE}/api/version")
            ollama_ok = r.status_code == 200
        except Exception:
            ollama_ok = False
    return {"status": "ok" if ollama_ok else "starting", "model": MODEL}