Spaces:

truegleai
/

o87dev-llm-api

Paused

File size: 8,896 Bytes

78822f8

import hmac
import json
import logging
import os
import subprocess
import time

import requests
import uvicorn
from fastapi import FastAPI, Depends, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from huggingface_hub import HfApi

# Log at INFO level and create the logger for this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application plus the HTTP bearer scheme consumed by verify_token.
app = FastAPI(title="o87Dev Cloud LLM API")
security = HTTPBearer()

# CORS: accept requests from any origin, with any method and any header.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Base URL of the Ollama server that the endpoints below forward to.
OLLAMA_BASE = "http://localhost:11434"
# Model used when a request does not name one; overridable via DEFAULT_MODEL.
MODEL = os.environ.get("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
# Shared API secret. When unset, verify_token validates the bearer token
# against the Hugging Face API instead.
API_TOKEN = os.environ.get("API_TOKEN")  # Set as Space secret

# ── Auth ──────────────────────────────────────────────────────────────────────

def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the bearer token presented with a request.

    When the ``API_TOKEN`` secret is configured, the presented token must
    equal it. Otherwise the token is validated by calling
    ``HfApi().whoami`` with it.

    Args:
        creds: Bearer credentials extracted by the ``security`` scheme.

    Returns:
        The presented token string.

    Raises:
        HTTPException: 401 when the token is rejected.
    """
    token = creds.credentials
    # If API_TOKEN secret is set, validate against it directly (faster)
    if API_TOKEN:
        # Constant-time comparison so response timing does not reveal how
        # much of the secret matched. Compare bytes: compare_digest raises
        # TypeError on non-ASCII ``str`` arguments.
        if not hmac.compare_digest(
            token.encode("utf-8"), API_TOKEN.encode("utf-8")
        ):
            raise HTTPException(401, "Invalid token")
        return token
    # Fallback: validate as HF token
    try:
        HfApi().whoami(token=token)
    except Exception as exc:
        logger.warning("Hugging Face token validation failed: %s", exc)
        raise HTTPException(401, "Invalid Hugging Face token") from exc
    return token

# ── Health ────────────────────────────────────────────────────────────────────

@app.get("/health")
async def health():
    """Report whether the Ollama backend is reachable.

    Returns:
        ``{"status": "ok", "model": ..., "available_models": [...]}`` when
        Ollama's ``/api/tags`` answers, otherwise
        ``{"status": "starting", "error": ...}``. This endpoint never raises.
    """
    try:
        # requests is blocking; run it in the threadpool so a slow or absent
        # Ollama does not stall the event loop for every other request.
        r = await run_in_threadpool(
            requests.get, f"{OLLAMA_BASE}/api/tags", timeout=5
        )
        models = [m["name"] for m in r.json().get("models", [])]
        return {"status": "ok", "model": MODEL, "available_models": models}
    except Exception as e:
        # Deliberately best-effort: any failure is reported as "starting".
        return {"status": "starting", "error": str(e)}

# ── OpenAI-compatible /v1/chat/completions ────────────────────────────────────

@app.post("/v1/chat/completions")
async def chat_completions(request: Request, token: str = Depends(verify_token)):
    """Proxy an OpenAI-style chat completion request to Ollama.

    Reads ``model``, ``messages``, ``stream``, ``temperature`` and
    ``max_tokens`` from the JSON body and forwards them to Ollama's
    ``/v1/chat/completions`` endpoint.

    Args:
        request: Incoming request whose body is a JSON object.
        token: Bearer token accepted by ``verify_token`` (unused here).

    Returns:
        A ``StreamingResponse`` relaying Ollama's event stream when
        ``stream`` is truthy, otherwise Ollama's decoded JSON response.

    Raises:
        HTTPException: 400 for a body that is not a JSON object; 500 when
            the non-streaming upstream call fails.
    """
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(400, "Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    model = body.get("model", MODEL)
    stream = body.get("stream", False)

    # The OpenAI-compatible endpoint takes sampling parameters at the top
    # level; a nested native-API "options" object is not among its documented
    # fields, and max_tokens limits generated tokens, not the context window.
    ollama_payload = {
        "model": model,
        "messages": body.get("messages", []),
        "stream": stream,
        "temperature": body.get("temperature", 0.7),
    }
    if body.get("max_tokens") is not None:
        ollama_payload["max_tokens"] = body["max_tokens"]

    if stream:
        def generate():
            # Sync generator: StreamingResponse iterates it in a worker
            # thread, so the blocking reads do not stall the event loop.
            try:
                with requests.post(
                    f"{OLLAMA_BASE}/v1/chat/completions",
                    json=ollama_payload,
                    stream=True,
                    timeout=300
                ) as r:
                    for chunk in r.iter_content(chunk_size=None):
                        if chunk:
                            yield chunk
            except Exception as e:
                logger.exception("Streaming chat completion failed")
                # json.dumps escapes quotes/newlines in the message so the
                # frame stays valid JSON.
                yield f"data: {json.dumps({'error': str(e)})}\n\n"
        return StreamingResponse(generate(), media_type="text/event-stream")
    else:
        try:
            # Blocking HTTP call (up to 300 s): run it in the threadpool.
            r = await run_in_threadpool(
                requests.post,
                f"{OLLAMA_BASE}/v1/chat/completions",
                json=ollama_payload,
                timeout=300
            )
            return r.json()
        except Exception as e:
            raise HTTPException(500, str(e)) from e

# ── Anthropic-compatible /v1/messages ─────────────────────────────────────────

# Maps an OpenAI ``finish_reason`` to the Anthropic ``stop_reason`` value.
_ANTHROPIC_STOP_REASONS = {
    "stop": "end_turn",
    "length": "max_tokens",
    "tool_calls": "tool_use",
}


def _anthropic_system_text(system):
    """Flatten an Anthropic ``system`` value (string or text blocks) to text."""
    if isinstance(system, str):
        return system
    if isinstance(system, list):
        parts = [
            block.get("text", "")
            for block in system
            if isinstance(block, dict) and block.get("type") == "text"
        ]
        return "\n".join(part for part in parts if part)
    return ""


def _sse(event, payload):
    """Format one server-sent-event frame with a JSON-encoded data line."""
    return f"event: {event}\ndata: {json.dumps(payload)}\n\n"


@app.post("/v1/messages")
async def messages(request: Request, token: str = Depends(verify_token)):
    """Serve an Anthropic-style ``/v1/messages`` request through Ollama.

    The request is translated into an OpenAI-style chat completion call to
    Ollama's ``/v1/chat/completions`` endpoint, and the answer is translated
    back into the Anthropic message (or message event stream) shape.

    Args:
        request: Incoming request whose body is a JSON object.
        token: Bearer token accepted by ``verify_token`` (unused here).

    Returns:
        A ``StreamingResponse`` of Anthropic-style events when ``stream`` is
        truthy, otherwise an Anthropic-style message dict.

    Raises:
        HTTPException: 400 for a body that is not a JSON object; the
            upstream status when Ollama answers with an HTTP error; 500 when
            the non-streaming upstream call fails otherwise.
    """
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(400, "Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    model = body.get("model", MODEL)
    stream = body.get("stream", False)

    # Anthropic carries the system prompt in a top-level "system" field;
    # the OpenAI-style endpoint expects it as a leading system message.
    chat_messages = list(body.get("messages") or [])
    system_text = _anthropic_system_text(body.get("system"))
    if system_text:
        chat_messages.insert(0, {"role": "system", "content": system_text})

    # Sampling parameters go at the top level of an OpenAI-style request;
    # a nested native-API "options" object is not among the documented fields.
    ollama_payload = {
        "model": model,
        "messages": chat_messages,
        "stream": stream,
        "temperature": body.get("temperature", 0.7),
    }
    if body.get("max_tokens") is not None:
        ollama_payload["max_tokens"] = body["max_tokens"]

    if stream:
        # Ask for the final usage chunk so output_tokens can be reported.
        ollama_payload["stream_options"] = {"include_usage": True}

        def generate_anthropic():
            # Sync generator: StreamingResponse iterates it in a worker
            # thread, so the blocking reads do not stall the event loop.
            msg_id = f"msg_{int(time.time())}"
            yield _sse("message_start", {'type': 'message_start', 'message': {'id': msg_id, 'type': 'message', 'role': 'assistant', 'content': [], 'model': model, 'stop_reason': None, 'usage': {'input_tokens': 0, 'output_tokens': 0}}})
            yield _sse("content_block_start", {'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})
            yield f"event: ping\ndata: {{\"type\":\"ping\"}}\n\n"

            output_tokens = 0
            stop_reason = "end_turn"
            try:
                with requests.post(
                    f"{OLLAMA_BASE}/v1/chat/completions",
                    json=ollama_payload,
                    stream=True,
                    timeout=300
                ) as r:
                    # Surface upstream HTTP errors instead of an empty reply.
                    r.raise_for_status()
                    # Buffer raw bytes and decode only complete lines, so a
                    # multi-byte character split across two network chunks
                    # is not silently dropped.
                    pending = b""
                    done = False
                    for chunk in r.iter_content(chunk_size=None):
                        if not chunk:
                            continue
                        pending += chunk
                        *complete, pending = pending.split(b"\n")
                        for raw in complete:
                            line = raw.decode("utf-8", errors="ignore").strip()
                            if not line.startswith("data: "):
                                continue
                            js = line[6:]
                            if js == "[DONE]":
                                done = True
                                break
                            try:
                                data = json.loads(js)
                            except ValueError:
                                logger.warning("Skipping malformed stream line: %r", js[:200])
                                continue
                            if not isinstance(data, dict):
                                continue
                            usage = data.get("usage")
                            if usage:
                                output_tokens = usage.get("completion_tokens", output_tokens)
                            # The usage-only chunk may carry an empty
                            # "choices" list; treat it as no delta.
                            choice = (data.get("choices") or [{}])[0]
                            delta = choice.get("delta") or {}
                            text = delta.get("content") or delta.get("reasoning") or ""
                            if text:
                                yield _sse("content_block_delta", {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': text}})
                            finish = choice.get("finish_reason")
                            if finish:
                                # Keep reading: a usage chunk may still
                                # follow before [DONE].
                                stop_reason = _ANTHROPIC_STOP_REASONS.get(finish, "end_turn")
                        if done:
                            break
            except Exception as e:
                logger.exception("Streaming /v1/messages request failed")
                yield _sse("content_block_delta", {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': f'Error: {e}'}})

            yield f"event: content_block_stop\ndata: {{\"type\":\"content_block_stop\",\"index\":0}}\n\n"
            yield _sse("message_delta", {'type': 'message_delta', 'delta': {'stop_reason': stop_reason, 'stop_sequence': None}, 'usage': {'output_tokens': output_tokens}})
            yield f"event: message_stop\ndata: {{\"type\":\"message_stop\"}}\n\n"

        return StreamingResponse(generate_anthropic(), media_type="text/event-stream")
    else:
        try:
            # Blocking HTTP call (up to 300 s): run it in the threadpool.
            r = await run_in_threadpool(
                requests.post,
                f"{OLLAMA_BASE}/v1/chat/completions",
                json=ollama_payload,
                timeout=300
            )
            data = r.json()
            if r.status_code >= 400:
                # Propagate upstream failures instead of an empty message.
                raise HTTPException(r.status_code, detail=data)
            choice = (data.get("choices") or [{}])[0]
            # "content" may be null in the upstream reply; normalise to "".
            content = (choice.get("message") or {}).get("content") or ""
            usage = data.get("usage") or {}
            return {
                "id": data.get("id", f"msg_{int(time.time())}"),
                "type": "message",
                "role": "assistant",
                "content": [{"type": "text", "text": content}],
                "model": model,
                "stop_reason": _ANTHROPIC_STOP_REASONS.get(
                    choice.get("finish_reason"), "end_turn"
                ),
                "usage": {
                    "input_tokens": usage.get("prompt_tokens", 0),
                    "output_tokens": usage.get("completion_tokens", 0)
                }
            }
        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(500, str(e)) from e

# ── Models list ───────────────────────────────────────────────────────────────

@app.get("/v1/models")
async def list_models(token: str = Depends(verify_token)):
    """List the models available from Ollama in OpenAI list format.

    Args:
        token: Bearer token accepted by ``verify_token`` (unused here).

    Returns:
        ``{"object": "list", "data": [...]}`` built from Ollama's
        ``/api/tags``. If that lookup fails, the list contains only the
        default ``MODEL``.
    """
    try:
        # requests is blocking; run it in the threadpool so the event loop
        # keeps serving other requests while Ollama answers.
        r = await run_in_threadpool(
            requests.get, f"{OLLAMA_BASE}/api/tags", timeout=5
        )
        models = [{"id": m["name"], "object": "model"} for m in r.json().get("models", [])]
        return {"object": "list", "data": models}
    except Exception as exc:
        # Best-effort fallback, but leave a trace of why it was needed.
        logger.warning("Could not list Ollama models: %s", exc)
        return {"object": "list", "data": [{"id": MODEL, "object": "model"}]}

# When executed as a script, serve the app with uvicorn on all interfaces,
# port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)