"""OpenAI- and Anthropic-compatible HTTP facade in front of a local Ollama server.

Every chat request is forwarded to Ollama's OpenAI-compatible
``/v1/chat/completions`` endpoint. ``/v1/messages`` additionally translates
the request and the response to and from the Anthropic Messages wire format.
"""

import hmac
import json
import logging
import os
import subprocess
import uuid
from typing import Any, Dict, Iterator, List

import requests
import uvicorn
from fastapi import FastAPI, Depends, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from fastapi.responses import JSONResponse, StreamingResponse
from huggingface_hub import HfApi

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="o87Dev Cloud LLM API")
security = HTTPBearer()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

OLLAMA_BASE = "http://localhost:11434"
MODEL = os.environ.get("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
API_TOKEN = os.environ.get("API_TOKEN")  # Set as Space secret

CHAT_URL = f"{OLLAMA_BASE}/v1/chat/completions"
TAGS_URL = f"{OLLAMA_BASE}/api/tags"
UPSTREAM_TIMEOUT = 300  # seconds; generation can be slow
TAGS_TIMEOUT = 5  # seconds

# Sampling fields the OpenAI-style request may carry and that are forwarded
# unchanged when the client supplies them.
_SAMPLING_FIELDS = ("top_p", "stop", "seed", "frequency_penalty", "presence_penalty")


# ── Auth ──────────────────────────────────────────────────────────────────────
def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the bearer token and return it.

    When the ``API_TOKEN`` secret is set the token is compared against it
    directly; otherwise it is validated as a Hugging Face token via
    ``HfApi().whoami``.

    Raises:
        HTTPException: 401 when the token is rejected.
    """
    token = creds.credentials

    # If API_TOKEN secret is set, validate against it directly (faster).
    if API_TOKEN:
        # Constant-time comparison so response timing does not leak the secret.
        # Compared as bytes because compare_digest rejects non-ASCII str.
        if not hmac.compare_digest(token.encode("utf-8"), API_TOKEN.encode("utf-8")):
            raise HTTPException(401, "Invalid token")
        return token

    # Fallback: validate as HF token.
    try:
        HfApi().whoami(token=token)
    except Exception as exc:
        raise HTTPException(401, "Invalid Hugging Face token") from exc
    return token


# ── Helpers ───────────────────────────────────────────────────────────────────
def _fetch_model_names() -> List[str]:
    """Return the model names listed by Ollama's ``/api/tags`` (blocking)."""
    resp = requests.get(TAGS_URL, timeout=TAGS_TIMEOUT)
    resp.raise_for_status()
    return [m["name"] for m in resp.json().get("models", [])]


async def _read_json_object(request: Request) -> Dict[str, Any]:
    """Parse the request body as a JSON object, answering 400 when it is not one."""
    try:
        body = await request.json()
    except ValueError as exc:  # JSONDecodeError and UnicodeDecodeError both derive from it
        raise HTTPException(400, "Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object")
    return body


def _build_payload(body: Dict[str, Any], messages: List[Any], stream: bool) -> Dict[str, Any]:
    """Build the request body for the upstream OpenAI-compatible chat endpoint.

    ``temperature`` and ``max_tokens`` are sent as top-level fields because
    that is where the OpenAI-compatible endpoint reads them; nesting them
    under ``options`` (native ``/api/chat`` syntax) leaves them unused.
    NOTE(review): that endpoint has no context-size field, so the context
    window presumably has to be configured on the Ollama side - confirm.
    """
    payload = {
        "model": body.get("model") or MODEL,
        "messages": messages,
        "stream": stream,
        "temperature": body.get("temperature", 0.7),
    }
    if body.get("max_tokens") is not None:
        payload["max_tokens"] = body["max_tokens"]
    return payload


def _anthropic_to_openai_messages(body: Dict[str, Any]) -> List[Any]:
    """Return the request's messages with the Anthropic ``system`` prompt prepended.

    Anthropic carries the system prompt in a top-level ``system`` field
    (a string or a list of text blocks); the OpenAI format expects it as a
    leading ``system`` message. Message contents are passed through as-is.
    """
    messages = list(body.get("messages") or [])
    system = body.get("system")
    if isinstance(system, list):
        system = "\n".join(
            block.get("text", "")
            for block in system
            if isinstance(block, dict) and block.get("type") == "text"
        )
    if system:
        messages.insert(0, {"role": "system", "content": system})
    return messages


def _sse(event: str, data: Dict[str, Any]) -> str:
    """Format one named server-sent event with a JSON payload."""
    return f"event: {event}\ndata: {json.dumps(data)}\n\n"


def _iter_sse_data(response: requests.Response) -> Iterator[str]:
    """Yield the payload of every ``data: `` line of a streaming response.

    Bytes are buffered and split on newlines *before* decoding, so a
    multi-byte UTF-8 character split across two network chunks is decoded
    intact instead of being dropped.
    """
    buffer = b""
    for chunk in response.iter_content(chunk_size=None):
        if not chunk:
            continue
        buffer += chunk
        *lines, buffer = buffer.split(b"\n")  # the last piece may be incomplete
        for raw in lines:
            line = raw.decode("utf-8", errors="replace").strip()
            if line.startswith("data: "):
                yield line[6:]


def _stream_openai(payload: Dict[str, Any]) -> Iterator[Any]:
    """Relay the upstream SSE byte stream; failures are reported as an SSE ``error`` frame."""
    try:
        with requests.post(
            CHAT_URL, json=payload, stream=True, timeout=UPSTREAM_TIMEOUT
        ) as resp:
            if not resp.ok:
                # The upstream error body is plain JSON, not SSE; wrap it so
                # event-stream clients can still parse it.
                message = f"Upstream returned HTTP {resp.status_code}: {resp.text[:500]}"
                yield f"data: {json.dumps({'error': message})}\n\n"
                return
            for chunk in resp.iter_content(chunk_size=None):
                if chunk:
                    yield chunk
    except Exception as exc:  # boundary: the response has already started
        logger.exception("Streaming chat completion failed")
        # json.dumps escapes quotes/newlines that would otherwise break the frame.
        yield f"data: {json.dumps({'error': str(exc)})}\n\n"


def _stream_anthropic(payload: Dict[str, Any]) -> Iterator[str]:
    """Translate the upstream OpenAI-style stream into Anthropic Messages events.

    Emits ``message_start``, ``content_block_start`` and ``ping``, then one
    ``content_block_delta`` per text fragment, then ``content_block_stop``,
    ``message_delta`` and ``message_stop``. An upstream failure is surfaced
    as an ``Error: ...`` text delta so the event sequence always completes.
    """
    model = payload["model"]
    msg_id = f"msg_{uuid.uuid4().hex}"

    yield _sse("message_start", {
        "type": "message_start",
        "message": {
            "id": msg_id,
            "type": "message",
            "role": "assistant",
            "content": [],
            "model": model,
            "stop_reason": None,
            "usage": {"input_tokens": 0, "output_tokens": 0},
        },
    })
    yield _sse("content_block_start", {
        "type": "content_block_start",
        "index": 0,
        "content_block": {"type": "text", "text": ""},
    })
    yield _sse("ping", {"type": "ping"})

    output_tokens = 0
    stop_reason = "end_turn"
    try:
        with requests.post(
            CHAT_URL, json=payload, stream=True, timeout=UPSTREAM_TIMEOUT
        ) as resp:
            resp.raise_for_status()
            for js in _iter_sse_data(resp):
                if js == "[DONE]":
                    break
                try:
                    data = json.loads(js)
                except ValueError:
                    logger.warning("Skipping malformed upstream frame: %.200s", js)
                    continue
                if not isinstance(data, dict):
                    continue

                usage = data.get("usage")
                if isinstance(usage, dict):
                    output_tokens = usage.get("completion_tokens") or output_tokens

                # A usage-only frame carries an empty ``choices`` list.
                choice = (data.get("choices") or [{}])[0]
                delta = choice.get("delta") or {}
                text = delta.get("content") or delta.get("reasoning") or ""
                if text:
                    yield _sse("content_block_delta", {
                        "type": "content_block_delta",
                        "index": 0,
                        "delta": {"type": "text_delta", "text": text},
                    })
                # Keep reading after finish_reason: the usage frame, when
                # present, arrives after it and before [DONE].
                if choice.get("finish_reason") == "length":
                    stop_reason = "max_tokens"
    except Exception as exc:  # boundary: the response has already started
        logger.exception("Streaming Anthropic-style message failed")
        yield _sse("content_block_delta", {
            "type": "content_block_delta",
            "index": 0,
            "delta": {"type": "text_delta", "text": f"Error: {exc}"},
        })

    yield _sse("content_block_stop", {"type": "content_block_stop", "index": 0})
    yield _sse("message_delta", {
        "type": "message_delta",
        "delta": {"stop_reason": stop_reason, "stop_sequence": None},
        "usage": {"output_tokens": output_tokens},
    })
    yield _sse("message_stop", {"type": "message_stop"})


async def _post_chat(payload: Dict[str, Any]):
    """POST ``payload`` upstream off the event loop; return ``(response, parsed_json)``.

    Raises:
        HTTPException: 500 when the upstream is unreachable or answers with
            something that is not JSON.
    """
    try:
        # requests is blocking; run it in the threadpool so other requests
        # keep being served while the model generates.
        resp = await run_in_threadpool(
            requests.post, CHAT_URL, json=payload, timeout=UPSTREAM_TIMEOUT
        )
        return resp, resp.json()
    except (requests.RequestException, ValueError) as exc:
        raise HTTPException(500, str(exc)) from exc


# ── Health ────────────────────────────────────────────────────────────────────
@app.get("/health")
async def health():
    """Report whether Ollama is reachable and which models it lists.

    Returns ``status: "ok"`` with the model list, or ``status: "starting"``
    with the error text when the tags endpoint cannot be queried.
    """
    try:
        models = await run_in_threadpool(_fetch_model_names)
    except Exception as e:  # health probes must never raise
        return {"status": "starting", "error": str(e)}
    return {"status": "ok", "model": MODEL, "available_models": models}


# ── OpenAI-compatible /v1/chat/completions ────────────────────────────────────
@app.post("/v1/chat/completions")
async def chat_completions(request: Request, token: str = Depends(verify_token)):
    """Proxy an OpenAI-style chat completion request to Ollama.

    Streams the upstream SSE bytes when ``stream`` is true; otherwise returns
    the upstream JSON together with the upstream HTTP status code.

    Raises:
        HTTPException: 400 for an invalid body, 500 when the upstream call fails.
    """
    body = await _read_json_object(request)
    stream = bool(body.get("stream", False))

    payload = _build_payload(body, body.get("messages", []), stream)
    for field in _SAMPLING_FIELDS:
        if field in body:
            payload[field] = body[field]

    if stream:
        return StreamingResponse(_stream_openai(payload), media_type="text/event-stream")

    resp, data = await _post_chat(payload)
    # Propagate the upstream status so an upstream error is not reported as 200.
    return JSONResponse(content=data, status_code=resp.status_code)


# ── Anthropic-compatible /v1/messages ─────────────────────────────────────────
@app.post("/v1/messages")
async def messages(request: Request, token: str = Depends(verify_token)):
    """Serve an Anthropic Messages-style request from the OpenAI-compatible upstream.

    Raises:
        HTTPException: 400 for an invalid body, the upstream status when the
            upstream reports an error, 500 when the upstream call fails.
    """
    body = await _read_json_object(request)
    stream = bool(body.get("stream", False))

    payload = _build_payload(body, _anthropic_to_openai_messages(body), stream)
    if body.get("top_p") is not None:
        payload["top_p"] = body["top_p"]
    if body.get("stop_sequences"):
        payload["stop"] = body["stop_sequences"]

    if stream:
        # Ask for the trailing usage frame so output_tokens can be reported.
        payload["stream_options"] = {"include_usage": True}
        return StreamingResponse(_stream_anthropic(payload), media_type="text/event-stream")

    resp, data = await _post_chat(payload)
    if not isinstance(data, dict):
        raise HTTPException(500, "Unexpected upstream response")
    if not resp.ok:
        raise HTTPException(resp.status_code, data.get("error", data))

    choice = (data.get("choices") or [{}])[0]
    content = (choice.get("message") or {}).get("content") or ""
    usage = data.get("usage") or {}
    return {
        "id": data.get("id") or f"msg_{uuid.uuid4().hex}",
        "type": "message",
        "role": "assistant",
        "content": [{"type": "text", "text": content}],
        "model": payload["model"],
        "stop_reason": "max_tokens" if choice.get("finish_reason") == "length" else "end_turn",
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    }


# ── Models list ───────────────────────────────────────────────────────────────
@app.get("/v1/models")
async def list_models(token: str = Depends(verify_token)):
    """List the models Ollama reports, falling back to the default model on failure."""
    try:
        names = await run_in_threadpool(_fetch_model_names)
    except Exception:  # best-effort: always answer with at least the default model
        logger.warning("Could not list Ollama models; returning default", exc_info=True)
        return {"object": "list", "data": [{"id": MODEL, "object": "model"}]}
    return {"object": "list", "data": [{"id": name, "object": "model"} for name in names]}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)