# o87dev-llm-api / app.py
# Uploaded by truegleai with huggingface_hub (commit 78822f8, verified).
import hmac
import json
import logging
import os
import subprocess
import uuid

import requests
import uvicorn
from fastapi import FastAPI, Depends, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from huggingface_hub import HfApi
# Logging is configured first so every later module-level step can log.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Runtime configuration, read once at import time.
OLLAMA_BASE = "http://localhost:11434"
MODEL = os.getenv("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
API_TOKEN = os.getenv("API_TOKEN")  # Set as Space secret

# Bearer-token extractor used by the authentication dependency.
security = HTTPBearer()

# The ASGI application; CORS is left open to any origin, method and header.
app = FastAPI(title="o87Dev Cloud LLM API")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ── Auth ──────────────────────────────────────────────────────────────────────
def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
    """Authenticate the bearer token of the current request.

    When the ``API_TOKEN`` secret is configured, the presented token must
    match it exactly. Otherwise the token is validated by asking the Hugging
    Face Hub who it belongs to.

    Args:
        creds: Bearer credentials extracted from the ``Authorization`` header.

    Returns:
        The presented token, once accepted.

    Raises:
        HTTPException: 401 when the token is rejected by either check.
    """
    token = creds.credentials

    # If API_TOKEN secret is set, validate against it directly (faster).
    if API_TOKEN:
        # compare_digest takes time independent of where the inputs differ,
        # so response timing does not leak the secret. Comparing bytes keeps
        # non-ASCII input from raising TypeError.
        if not hmac.compare_digest(token.encode("utf-8"), API_TOKEN.encode("utf-8")):
            raise HTTPException(401, "Invalid token")
        return token

    # Fallback: validate as HF token.
    try:
        HfApi().whoami(token=token)
    except Exception as exc:
        # Log only the exception type so nothing token-related reaches the logs.
        logger.warning("Hugging Face token validation failed (%s)", type(exc).__name__)
        raise HTTPException(401, "Invalid Hugging Face token") from exc
    return token
# ── Health ────────────────────────────────────────────────────────────────────
@app.get("/health")
async def health():
    """Report whether the local Ollama server is reachable.

    Returns:
        ``{"status": "ok", ...}`` with the default model and the model names
        listed by Ollama's ``/api/tags``, or ``{"status": "starting", ...}``
        with the error text when Ollama cannot be queried successfully.
    """
    try:
        # requests is blocking; run it on a worker thread so the event loop
        # keeps serving other requests while Ollama is slow or still booting.
        r = await run_in_threadpool(requests.get, f"{OLLAMA_BASE}/api/tags", timeout=5)
        # An HTTP error from Ollama means it is not ready, even if it sent JSON.
        r.raise_for_status()
        models = [m["name"] for m in r.json().get("models", [])]
    except Exception as e:
        logger.warning("Health check could not query Ollama: %s", e)
        return {"status": "starting", "error": str(e)}
    return {"status": "ok", "model": MODEL, "available_models": models}
# ── OpenAI-compatible /v1/chat/completions ────────────────────────────────────
# Optional OpenAI request fields forwarded unchanged when the client sends them.
_OPENAI_PASSTHROUGH_FIELDS = (
    "max_tokens",
    "top_p",
    "stop",
    "seed",
    "frequency_penalty",
    "presence_penalty",
    "response_format",
    "tools",
    "stream_options",
)


@app.post("/v1/chat/completions")
async def chat_completions(request: Request, token: str = Depends(verify_token)):
    """Proxy an OpenAI-style chat completion request to Ollama.

    The request is forwarded to Ollama's OpenAI-compatible
    ``/v1/chat/completions`` endpoint. Sampling parameters are sent as
    top-level OpenAI fields; ``max_tokens`` is forwarded as the generation
    limit and is no longer translated into a context-window size.

    Args:
        request: Incoming request whose body is an OpenAI chat payload.
        token: Authenticated bearer token (unused beyond authentication).

    Returns:
        A ``text/event-stream`` response relaying Ollama's bytes when
        ``stream`` is truthy, otherwise Ollama's JSON body with Ollama's
        HTTP status code.

    Raises:
        HTTPException: 400 for a body that is not a JSON object; 500 when
            Ollama cannot be reached or returns a non-JSON body.
    """
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(400, "Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    stream = bool(body.get("stream", False))
    ollama_payload = {
        "model": body.get("model", MODEL),
        "messages": body.get("messages", []),
        "stream": stream,
        "temperature": body.get("temperature", 0.7),
    }
    for field in _OPENAI_PASSTHROUGH_FIELDS:
        if body.get(field) is not None:
            ollama_payload[field] = body[field]

    url = f"{OLLAMA_BASE}/v1/chat/completions"

    if stream:
        def generate():
            # A plain generator: StreamingResponse iterates it on a worker
            # thread, so the blocking reads do not stall the event loop.
            try:
                with requests.post(url, json=ollama_payload, stream=True, timeout=300) as r:
                    for chunk in r.iter_content(chunk_size=None):
                        if chunk:
                            yield chunk
            except Exception as e:
                logger.exception("Streaming request to Ollama failed")
                # json.dumps escapes quotes/newlines so the frame stays valid JSON.
                yield f"data: {json.dumps({'error': str(e)})}\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    try:
        # requests is blocking; keep the event loop free during generation.
        r = await run_in_threadpool(requests.post, url, json=ollama_payload, timeout=300)
        content = r.json()
    except (requests.RequestException, ValueError) as e:
        logger.exception("Request to Ollama failed")
        raise HTTPException(500, str(e)) from e
    # Preserve Ollama's status so clients can tell errors from completions.
    return JSONResponse(status_code=r.status_code, content=content)
# ── Anthropic-compatible /v1/messages ─────────────────────────────────────────
# OpenAI finish_reason -> Anthropic stop_reason; anything else is "end_turn".
_ANTHROPIC_STOP_REASONS = {"length": "max_tokens"}


def _sse_event(event, payload):
    """Format one server-sent event frame carrying a JSON payload."""
    return f"event: {event}\ndata: {json.dumps(payload)}\n\n"


def _anthropic_system_text(system):
    """Flatten an Anthropic ``system`` value (string or text blocks) to text."""
    if isinstance(system, str):
        return system
    if isinstance(system, list):
        return "\n".join(
            block.get("text", "")
            for block in system
            if isinstance(block, dict) and block.get("type") == "text"
        )
    return ""


def _iter_sse_data(response):
    """Yield the payload of every ``data: `` line in a streaming response.

    Bytes are buffered and split on newlines *before* decoding, so a
    multi-byte UTF-8 character split across two network chunks is not lost.
    """
    pending = b""
    for chunk in response.iter_content(chunk_size=None):
        if not chunk:
            continue
        *complete, pending = (pending + chunk).split(b"\n")
        for raw in complete:
            line = raw.decode("utf-8", errors="ignore").strip()
            if line.startswith("data: "):
                yield line[6:]
    # A final line without a trailing newline is still a complete payload.
    tail = pending.decode("utf-8", errors="ignore").strip()
    if tail.startswith("data: "):
        yield tail[6:]


@app.post("/v1/messages")
async def messages(request: Request, token: str = Depends(verify_token)):
    """Serve an Anthropic-style messages request from Ollama.

    The request is translated to an OpenAI-style chat payload (the top-level
    ``system`` value becomes a leading system message), sent to Ollama's
    ``/v1/chat/completions`` endpoint, and the reply is translated back into
    the Anthropic message / event-stream shape.

    Args:
        request: Incoming request whose body is an Anthropic messages payload.
        token: Authenticated bearer token (unused beyond authentication).

    Returns:
        A ``text/event-stream`` response of Anthropic events when ``stream``
        is truthy, otherwise an Anthropic message dict (or an Anthropic
        error body carrying Ollama's HTTP status when Ollama reports one).

    Raises:
        HTTPException: 400 for a malformed body; 500 when Ollama cannot be
            reached or returns an unusable body.
    """
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(400, "Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    chat_messages = body.get("messages", [])
    if not isinstance(chat_messages, list):
        raise HTTPException(400, "'messages' must be a list")

    model = body.get("model", MODEL)
    stream = bool(body.get("stream", False))

    # Anthropic carries the system prompt outside `messages`; the OpenAI
    # format expects it as the first message.
    system_text = _anthropic_system_text(body.get("system"))
    if system_text:
        chat_messages = [{"role": "system", "content": system_text}, *chat_messages]

    ollama_payload = {
        "model": model,
        "messages": chat_messages,
        "stream": stream,
        "temperature": body.get("temperature", 0.7),
    }
    # max_tokens limits generated tokens; it is not a context-window size.
    if body.get("max_tokens") is not None:
        ollama_payload["max_tokens"] = body["max_tokens"]
    if stream:
        # Ask for the trailing usage chunk so output_tokens can be reported.
        ollama_payload["stream_options"] = {"include_usage": True}

    url = f"{OLLAMA_BASE}/v1/chat/completions"

    if stream:
        def generate_anthropic():
            # uuid avoids id collisions between requests in the same second.
            msg_id = f"msg_{uuid.uuid4().hex}"
            yield _sse_event("message_start", {
                "type": "message_start",
                "message": {
                    "id": msg_id,
                    "type": "message",
                    "role": "assistant",
                    "content": [],
                    "model": model,
                    "stop_reason": None,
                    "usage": {"input_tokens": 0, "output_tokens": 0},
                },
            })
            yield _sse_event("content_block_start", {
                "type": "content_block_start",
                "index": 0,
                "content_block": {"type": "text", "text": ""},
            })
            yield _sse_event("ping", {"type": "ping"})

            output_tokens = 0
            stop_reason = "end_turn"
            try:
                with requests.post(url, json=ollama_payload, stream=True, timeout=300) as r:
                    # An HTTP error body is not SSE; surface it as an error.
                    r.raise_for_status()
                    for js in _iter_sse_data(r):
                        if js == "[DONE]":
                            break
                        try:
                            data = json.loads(js)
                        except ValueError:
                            logger.warning("Skipping malformed stream payload from Ollama")
                            continue
                        if not isinstance(data, dict):
                            continue
                        usage = data.get("usage")
                        if usage:
                            output_tokens = usage.get("completion_tokens", 0)
                        # The usage-only chunk has an empty `choices` list.
                        choice = (data.get("choices") or [{}])[0]
                        delta = choice.get("delta") or {}
                        text = delta.get("content") or delta.get("reasoning") or ""
                        if text:
                            yield _sse_event("content_block_delta", {
                                "type": "content_block_delta",
                                "index": 0,
                                "delta": {"type": "text_delta", "text": text},
                            })
                        # Keep reading after finish_reason: the usage chunk
                        # arrives after it and the loop ends on [DONE].
                        finish_reason = choice.get("finish_reason")
                        if finish_reason:
                            stop_reason = _ANTHROPIC_STOP_REASONS.get(finish_reason, "end_turn")
            except Exception as e:
                logger.exception("Streaming request to Ollama failed")
                yield _sse_event("content_block_delta", {
                    "type": "content_block_delta",
                    "index": 0,
                    "delta": {"type": "text_delta", "text": f"Error: {e}"},
                })

            yield _sse_event("content_block_stop", {"type": "content_block_stop", "index": 0})
            yield _sse_event("message_delta", {
                "type": "message_delta",
                "delta": {"stop_reason": stop_reason, "stop_sequence": None},
                "usage": {"output_tokens": output_tokens},
            })
            yield _sse_event("message_stop", {"type": "message_stop"})

        return StreamingResponse(generate_anthropic(), media_type="text/event-stream")

    try:
        # requests is blocking; keep the event loop free during generation.
        r = await run_in_threadpool(requests.post, url, json=ollama_payload, timeout=300)
        data = r.json()
    except (requests.RequestException, ValueError) as e:
        logger.exception("Request to Ollama failed")
        raise HTTPException(500, str(e)) from e
    if not isinstance(data, dict):
        raise HTTPException(500, "Unexpected response from Ollama")

    if r.status_code >= 400:
        # Report upstream failures in Anthropic's error shape rather than as
        # an empty but successful message.
        err = data.get("error")
        err_message = err.get("message") if isinstance(err, dict) else err
        return JSONResponse(
            status_code=r.status_code,
            content={
                "type": "error",
                "error": {
                    "type": "api_error",
                    "message": str(err_message or f"Upstream returned HTTP {r.status_code}"),
                },
            },
        )

    choice = (data.get("choices") or [{}])[0]
    # `content` can be null in the upstream reply; always return a string.
    content = (choice.get("message") or {}).get("content") or ""
    usage = data.get("usage") or {}
    return {
        "id": data.get("id", f"msg_{uuid.uuid4().hex}"),
        "type": "message",
        "role": "assistant",
        "content": [{"type": "text", "text": content}],
        "model": model,
        "stop_reason": _ANTHROPIC_STOP_REASONS.get(choice.get("finish_reason"), "end_turn"),
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    }
# ── Models list ───────────────────────────────────────────────────────────────
@app.get("/v1/models")
async def list_models(token: str = Depends(verify_token)):
    """List the models Ollama reports, in OpenAI ``/v1/models`` shape.

    Args:
        token: Authenticated bearer token (unused beyond authentication).

    Returns:
        ``{"object": "list", "data": [...]}`` with one entry per model named
        by Ollama's ``/api/tags``; when that lookup fails, a single entry for
        the configured default model.
    """
    try:
        # requests is blocking; run it on a worker thread so the event loop
        # stays responsive.
        r = await run_in_threadpool(requests.get, f"{OLLAMA_BASE}/api/tags", timeout=5)
        models = [{"id": m["name"], "object": "model"} for m in r.json().get("models", [])]
    except Exception:
        # Best-effort fallback is deliberate, but leave a trace of the cause.
        logger.warning("Could not list Ollama models; returning default only", exc_info=True)
        return {"object": "list", "data": [{"id": MODEL, "object": "model"}]}
    return {"object": "list", "data": models}
# Script entry point: serve the app with uvicorn on all interfaces, port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)