Spaces:

truegleai
/

o87dev-llm-api

Paused

App Files Files Community

o87dev-llm-api / app.py

truegleai

Upload app.py with huggingface_hub

78822f8 verified about 1 month ago

raw

history blame contribute delete

8.9 kB

	import os
	import subprocess
	import logging
	import json
	import requests
	import uvicorn
	from fastapi import FastAPI, Depends, HTTPException, Request
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
	from fastapi.responses import StreamingResponse
	from huggingface_hub import HfApi

	# Module-wide logging at INFO level, plus a logger named after this module.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Address of the Ollama server and settings read from the environment.
	OLLAMA_BASE = "http://localhost:11434"
	MODEL = os.getenv("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
	API_TOKEN = os.getenv("API_TOKEN")  # Set as Space secret

	# Bearer-credential extractor used by the auth dependency.
	security = HTTPBearer()

	# The application itself; CORS accepts every origin, method and header.
	app = FastAPI(title="o87Dev Cloud LLM API")
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# ── Auth ──────────────────────────────────────────────────────────────────────

	def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
	    """Validate the bearer token of the incoming request.

	    When the ``API_TOKEN`` module constant is set, the token must equal it.
	    Otherwise the token is checked by calling ``HfApi().whoami`` with it.

	    Args:
	        creds: Bearer credentials resolved by the ``security`` dependency.

	    Returns:
	        The accepted token string.

	    Raises:
	        HTTPException: 401 when the token does not match ``API_TOKEN``, or
	            when the ``whoami`` call raises.
	    """
	    import hmac

	    token = creds.credentials
	    # If API_TOKEN secret is set, validate against it directly (faster)
	    if API_TOKEN:
	        # Constant-time comparison, so response timing does not reveal how
	        # much of a guessed token matched. Compared as bytes because
	        # compare_digest rejects str arguments containing non-ASCII text.
	        supplied = token.encode("utf-8")
	        expected = API_TOKEN.encode("utf-8")
	        if not hmac.compare_digest(supplied, expected):
	            raise HTTPException(401, "Invalid token")
	        return token
	    # Fallback: validate as HF token
	    try:
	        HfApi().whoami(token=token)
	    except Exception:
	        # Any failure of the lookup is reported as an auth failure; the
	        # underlying error is deliberately not chained into the response.
	        raise HTTPException(401, "Invalid Hugging Face token") from None
	    return token

	# ── Health ────────────────────────────────────────────────────────────────────

	@app.get("/health")
	async def health():
	    """Report whether the Ollama backend can be reached.

	    Returns:
	        ``{"status": "ok", ...}`` with the default model and the model names
	        listed by ``/api/tags``, or ``{"status": "starting", ...}`` carrying
	        the error text when that request or its parsing fails.
	    """
	    from fastapi.concurrency import run_in_threadpool

	    try:
	        # requests is blocking: run it in the threadpool so a slow or
	        # unreachable backend cannot stall the event loop for up to 5 s.
	        r = await run_in_threadpool(
	            requests.get, f"{OLLAMA_BASE}/api/tags", timeout=5
	        )
	        models = [m["name"] for m in r.json().get("models", [])]
	        return {"status": "ok", "model": MODEL, "available_models": models}
	    except Exception as e:
	        # Deliberate best-effort: every failure is reported as "starting".
	        return {"status": "starting", "error": str(e)}

	# ── OpenAI-compatible /v1/chat/completions ────────────────────────────────────

	@app.post("/v1/chat/completions")
	async def chat_completions(request: Request, token: str = Depends(verify_token)):
	    """Forward an OpenAI-style chat completion request to Ollama.

	    Reads ``model``, ``stream``, ``messages``, ``max_tokens`` and
	    ``temperature`` from the JSON body and posts them to the backend's
	    ``/v1/chat/completions`` endpoint.

	    Args:
	        request: Incoming request whose body is the JSON payload.
	        token: Token accepted by ``verify_token``; unused beyond enforcing auth.

	    Returns:
	        A ``text/event-stream`` response relaying the backend's bytes when
	        ``stream`` is truthy, otherwise the backend's decoded JSON body.

	    Raises:
	        HTTPException: 500 when the non-streaming backend call or the
	            decoding of its response fails.
	    """
	    from fastapi.concurrency import run_in_threadpool

	    body = await request.json()
	    model = body.get("model", MODEL)
	    stream = body.get("stream", False)

	    # NOTE(review): the client's max_tokens is forwarded as options.num_ctx
	    # (context-window size), not as an output-token limit. Confirm this is
	    # intended and that the /v1 endpoint honours `options` at all.
	    ollama_payload = {
	        "model": model,
	        "messages": body.get("messages", []),
	        "stream": stream,
	        "options": {
	            "num_ctx": body.get("max_tokens", 32768),
	            "temperature": body.get("temperature", 0.7),
	        },
	    }
	    url = f"{OLLAMA_BASE}/v1/chat/completions"

	    if stream:
	        def generate():
	            try:
	                with requests.post(
	                    url, json=ollama_payload, stream=True, timeout=300
	                ) as r:
	                    for chunk in r.iter_content(chunk_size=None):
	                        if chunk:
	                            yield chunk
	            except Exception as e:
	                # json.dumps escapes quotes and newlines in the message, so
	                # the event stays valid JSON and valid SSE framing.
	                error_json = json.dumps({"error": str(e)})
	                yield f"data: {error_json}\n\n"

	        return StreamingResponse(generate(), media_type="text/event-stream")

	    try:
	        # requests is blocking and may take up to 300 s; keep it off the
	        # event loop so other requests are still served meanwhile.
	        r = await run_in_threadpool(
	            requests.post, url, json=ollama_payload, timeout=300
	        )
	        return r.json()
	    except Exception as e:
	        raise HTTPException(500, str(e)) from e

	# ── Anthropic-compatible /v1/messages ─────────────────────────────────────────

	def _anthropic_sse(event, payload):
	    """Format one server-sent event whose data is *payload* encoded as JSON."""
	    return f"event: {event}\ndata: {json.dumps(payload)}\n\n"


	def _iter_openai_stream(response):
	    """Yield each JSON object carried by the ``data: `` lines of *response*.

	    Iteration ends at the ``[DONE]`` sentinel or when the response ends.
	    Lines without the ``data: `` prefix, and payloads that are not a JSON
	    object, are skipped.
	    """
	    pending = b""
	    for chunk in response.iter_content(chunk_size=None):
	        if not chunk:
	            continue
	        # Buffer raw bytes and decode only complete lines, so a multi-byte
	        # UTF-8 character split across two network chunks is not dropped.
	        pending += chunk
	        *complete, pending = pending.split(b"\n")
	        for raw in complete:
	            line = raw.decode("utf-8", errors="ignore").strip()
	            if not line.startswith("data: "):
	                continue
	            js = line[6:]
	            if js == "[DONE]":
	                # Stop reading altogether, not just the current batch of lines.
	                return
	            try:
	                data = json.loads(js)
	            except ValueError:
	                continue  # Not JSON: skip this line and keep streaming.
	            if isinstance(data, dict):
	                yield data


	@app.post("/v1/messages")
	async def messages(request: Request, token: str = Depends(verify_token)):
	    """Serve an Anthropic-style ``/v1/messages`` request through Ollama.

	    Reads ``model``, ``stream``, ``messages``, ``max_tokens`` and
	    ``temperature`` from the JSON body, posts them to the backend's
	    ``/v1/chat/completions`` endpoint and reshapes the reply.

	    Args:
	        request: Incoming request whose body is the JSON payload.
	        token: Token accepted by ``verify_token``; unused beyond enforcing auth.

	    Returns:
	        When ``stream`` is truthy, a ``text/event-stream`` response emitting
	        ``message_start``, ``content_block_start``, ``ping``, one
	        ``content_block_delta`` per text fragment, then ``content_block_stop``,
	        ``message_delta`` and ``message_stop``. Otherwise a single message
	        dict with one text content block and token usage.

	    Raises:
	        HTTPException: 500 when the non-streaming backend call or the
	            handling of its response fails.
	    """
	    import time

	    from fastapi.concurrency import run_in_threadpool

	    body = await request.json()
	    model = body.get("model", MODEL)
	    stream = body.get("stream", False)

	    # NOTE(review): the client's max_tokens is forwarded as options.num_ctx
	    # (context-window size), not as an output-token limit. Confirm this is
	    # intended and that the /v1 endpoint honours `options` at all.
	    ollama_payload = {
	        "model": model,
	        "messages": body.get("messages", []),
	        "stream": stream,
	        "options": {
	            "num_ctx": body.get("max_tokens", 32768),
	            "temperature": body.get("temperature", 0.7),
	        },
	    }
	    url = f"{OLLAMA_BASE}/v1/chat/completions"

	    if stream:
	        def generate_anthropic():
	            msg_id = f"msg_{int(time.time())}"
	            yield _anthropic_sse("message_start", {
	                "type": "message_start",
	                "message": {
	                    "id": msg_id,
	                    "type": "message",
	                    "role": "assistant",
	                    "content": [],
	                    "model": model,
	                    "stop_reason": None,
	                    "usage": {"input_tokens": 0, "output_tokens": 0},
	                },
	            })
	            yield _anthropic_sse("content_block_start", {
	                "type": "content_block_start",
	                "index": 0,
	                "content_block": {"type": "text", "text": ""},
	            })
	            yield "event: ping\ndata: {\"type\":\"ping\"}\n\n"

	            output_tokens = 0
	            try:
	                with requests.post(
	                    url, json=ollama_payload, stream=True, timeout=300
	                ) as r:
	                    # Read until [DONE] or end of stream. A finish_reason does
	                    # not stop the loop, so a usage chunk sent after it is
	                    # still picked up.
	                    for data in _iter_openai_stream(r):
	                        usage = data.get("usage")
	                        if usage and isinstance(usage, dict):
	                            output_tokens = usage.get("completion_tokens", 0)
	                        # `choices` may be missing or an empty list.
	                        choices = data.get("choices")
	                        first = choices[0] if isinstance(choices, list) and choices else {}
	                        delta = first.get("delta") if isinstance(first, dict) else None
	                        if not isinstance(delta, dict):
	                            continue
	                        text = delta.get("content") or delta.get("reasoning") or ""
	                        if text:
	                            yield _anthropic_sse("content_block_delta", {
	                                "type": "content_block_delta",
	                                "index": 0,
	                                "delta": {"type": "text_delta", "text": text},
	                            })
	            except Exception as e:
	                # Surface the failure to the client as a final text fragment,
	                # then still close the message with the events below.
	                yield _anthropic_sse("content_block_delta", {
	                    "type": "content_block_delta",
	                    "index": 0,
	                    "delta": {"type": "text_delta", "text": f"Error: {e}"},
	                })

	            yield "event: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":0}\n\n"
	            yield _anthropic_sse("message_delta", {
	                "type": "message_delta",
	                "delta": {"stop_reason": "end_turn", "stop_sequence": None},
	                "usage": {"output_tokens": output_tokens},
	            })
	            yield "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n"

	        return StreamingResponse(generate_anthropic(), media_type="text/event-stream")

	    try:
	        # requests is blocking and may take up to 300 s; keep it off the
	        # event loop so other requests are still served meanwhile.
	        r = await run_in_threadpool(
	            requests.post, url, json=ollama_payload, timeout=300
	        )
	        data = r.json()
	        # Tolerate a missing or empty `choices` list and a null `content`.
	        first = (data.get("choices") or [{}])[0]
	        content = (first.get("message") or {}).get("content") or ""
	        usage = data.get("usage") or {}
	        return {
	            "id": data.get("id", f"msg_{int(time.time())}"),
	            "type": "message",
	            "role": "assistant",
	            "content": [{"type": "text", "text": content}],
	            "model": model,
	            "stop_reason": "end_turn",
	            "usage": {
	                "input_tokens": usage.get("prompt_tokens", 0),
	                "output_tokens": usage.get("completion_tokens", 0),
	            },
	        }
	    except Exception as e:
	        raise HTTPException(500, str(e)) from e

	# ── Models list ───────────────────────────────────────────────────────────────

	@app.get("/v1/models")
	async def list_models(token: str = Depends(verify_token)):
	    """List the models available on the Ollama backend.

	    Args:
	        token: Token accepted by ``verify_token``; unused beyond enforcing auth.

	    Returns:
	        ``{"object": "list", "data": [...]}`` with one entry per model name
	        reported by ``/api/tags``. If that request or its parsing fails,
	        the list holds only the default ``MODEL``.
	    """
	    from fastapi.concurrency import run_in_threadpool

	    try:
	        # requests is blocking: run it in the threadpool so a slow backend
	        # cannot stall the event loop for up to 5 s.
	        r = await run_in_threadpool(
	            requests.get, f"{OLLAMA_BASE}/api/tags", timeout=5
	        )
	        models = [
	            {"id": m["name"], "object": "model"}
	            for m in r.json().get("models", [])
	        ]
	        return {"object": "list", "data": models}
	    except Exception as e:
	        # Best-effort fallback, but leave a trace of why it was needed.
	        logger.warning("Listing models failed, returning default model: %s", e)
	        return {"object": "list", "data": [{"id": MODEL, "object": "model"}]}

	if __name__ == "__main__":
	    # Run as a script: serve the app on all interfaces, port 7860.
	    uvicorn.run(
	        app,
	        host="0.0.0.0",
	        port=7860,
	    )