File size: 3,441 Bytes
f2f6faa
 
 
 
 
 
 
 
 
 
 
 
 
5fde9c6
 
 
 
 
f2f6faa
 
 
 
 
 
 
 
 
 
 
5fde9c6
 
f2f6faa
 
 
5fde9c6
 
 
 
 
 
 
 
f2f6faa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fde9c6
 
 
 
 
f2f6faa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fde9c6
f2f6faa
 
 
 
 
 
 
 
 
 
 
 
5fde9c6
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import httpx

app = FastAPI()
security = HTTPBearer()

# Required configuration, read from environment variables.
# The Polish placeholder "!TU MUSISZ EDYTOWAC!" means "YOU MUST EDIT THIS!"
# and is used below to detect unconfigured deployments.
API_KEY = os.environ.get("API_KEY", "!TU MUSISZ EDYTOWAC!")
MODEL   = os.environ.get("MODEL",   "!TU MUSISZ EDYTOWAC!")
# Address of the local Ollama server this app proxies to.
OLLAMA_BASE = "http://127.0.0.1:11434"

# --- CPU optimization ---
# Cap the context window (num_ctx) so the model does not have to re-process
# a huge history on every request. 1024-2048 is a safe range for free
# (CPU-only) Hugging Face Spaces.
CONTEXT_LIMIT = 2048 

# Fail fast at startup if either required variable was left at its placeholder.
# (Error message is in Polish: "Set the API_KEY and MODEL variables in
# HF Space Settings -> Variables".)
if "!TU MUSISZ EDYTOWAC!" in (API_KEY, MODEL):
    raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")

def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the request's Bearer token against the configured API_KEY.

    Returns the token on success; raises HTTPException(401) on mismatch.
    """
    token = credentials.credentials
    if token == API_KEY:
        return token
    raise HTTPException(status_code=401, detail="Invalid API key")

@app.post("/api/chat")
async def ollama_chat(request: Request, key: str = Depends(verify_key)):
    """Proxy a chat request to the local Ollama server as streaming NDJSON.

    The client body is forwarded with three adjustments: streaming is forced
    on, the model is pinned to the configured MODEL, and a context-size cap
    is injected when the caller did not provide one.

    Raises HTTPException(401) via the verify_key dependency on a bad token.
    """
    body = await request.json()

    # Force streaming and pin the model regardless of what the client sent.
    body["stream"] = True
    body["model"] = MODEL

    # num_ctx=CONTEXT_LIMIT caps the model's "memory", which drastically
    # speeds up follow-up responses on CPU-only hosts. Chained setdefault
    # replaces the manual "if key not in dict" checks.
    body.setdefault("options", {}).setdefault("num_ctx", CONTEXT_LIMIT)

    async def generate():
        # Generous timeout: CPU inference on free-tier hardware is slow.
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=body) as resp:
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    # Re-append the newline stripped by aiter_lines so the
                    # client receives well-formed NDJSON.
                    yield line + "\n"

    return StreamingResponse(generate(), media_type="application/x-ndjson")

@app.post("/api/generate")
async def ollama_generate(request: Request, key: str = Depends(verify_key)):
    """Proxy a completion request to Ollama's /api/generate as streaming NDJSON.

    Mirrors ollama_chat: forces streaming, pins the configured MODEL, and
    injects the CPU-friendly num_ctx default when the caller omitted it.

    Raises HTTPException(401) via the verify_key dependency on a bad token.
    """
    body = await request.json()

    # Force streaming and pin the model regardless of what the client sent.
    body["stream"] = True
    body["model"] = MODEL

    # Chained setdefault replaces the manual "if key not in dict" checks;
    # same context cap as the chat endpoint for consistency.
    body.setdefault("options", {}).setdefault("num_ctx", CONTEXT_LIMIT)

    async def generate():
        # Generous timeout: CPU inference on free-tier hardware is slow.
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/generate", json=body) as resp:
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    # Re-append the newline stripped by aiter_lines.
                    yield line + "\n"

    return StreamingResponse(generate(), media_type="application/x-ndjson")

@app.get("/api/tags")
async def ollama_tags(key: str = Depends(verify_key)):
    """Emulate Ollama's /api/tags listing with the single configured model.

    Returns a static entry (fixed timestamp, zero size) so clients that
    enumerate models see exactly one: the pinned MODEL.
    """
    model_entry = {
        "name": MODEL,
        "model": MODEL,
        "modified_at": "2024-01-01T00:00:00Z",
        "size": 0,
        "details": {"format": "gguf", "family": "llama"},
    }
    return {"models": [model_entry]}

@app.get("/health")
async def health():
    """Report proxy health by probing the local Ollama /api/version endpoint.

    Never raises for backend failures: an unreachable Ollama yields
    status "starting" instead of an error response.
    """
    async with httpx.AsyncClient(timeout=5.0) as client:
        try:
            resp = await client.get(f"{OLLAMA_BASE}/api/version")
        except Exception:
            backend_up = False
        else:
            backend_up = resp.status_code == 200
    status = "ok" if backend_up else "starting"
    return {"status": status, "model": MODEL}

@app.get("/")
async def root():
    """Landing endpoint confirming the proxy is alive and which model it serves."""
    payload = {"message": "Ollama Proxy is running", "model": MODEL}
    return payload