Spaces:
Paused
Paused
| import os | |
| import subprocess | |
| import logging | |
| import json | |
| import requests | |
| import uvicorn | |
| from fastapi import FastAPI, Depends, HTTPException, Request | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials | |
| from fastapi.responses import StreamingResponse | |
| from huggingface_hub import HfApi | |
# Runtime configuration. DEFAULT_MODEL and API_TOKEN are read from the
# environment; the Ollama base URL is fixed to the local server.
OLLAMA_BASE = "http://localhost:11434"
MODEL = os.getenv("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
API_TOKEN = os.getenv("API_TOKEN")  # Set as Space secret

# Process-wide logging at INFO, plus this module's own logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Bearer-token extractor used by the auth dependency.
security = HTTPBearer()

# The ASGI application, with CORS open to every origin, method and header.
app = FastAPI(title="o87Dev Cloud LLM API")
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
# ── Auth ─────────────────────────────────────────────────────────────────────
def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the bearer token of an incoming request.

    When the ``API_TOKEN`` secret is configured, the presented token must
    equal it. Otherwise the token is checked by calling ``HfApi().whoami``
    with it.

    Args:
        creds: Bearer credentials extracted by the ``security`` dependency.

    Returns:
        The presented token string, once it has been accepted.

    Raises:
        HTTPException: 401 when the token does not match ``API_TOKEN`` or,
            in fallback mode, when the ``whoami`` call fails for any reason.
    """
    import hmac  # local import so the block does not rely on a file-level one

    token = creds.credentials

    # If API_TOKEN secret is set, validate against it directly (faster)
    if API_TOKEN:
        # Constant-time comparison so response timing does not leak how much
        # of the secret matched. Compare bytes: compare_digest rejects str
        # values containing non-ASCII characters.
        if not hmac.compare_digest(
            token.encode("utf-8"), API_TOKEN.encode("utf-8")
        ):
            raise HTTPException(401, "Invalid token")
        return token

    # Fallback: validate as HF token
    # NOTE(review): in this mode every token that whoami() accepts is let
    # through, and any whoami() failure is reported as 401 — confirm that
    # both are intended.
    try:
        HfApi().whoami(token=token)
    except Exception:
        raise HTTPException(401, "Invalid Hugging Face token") from None
    return token
# ── Health ───────────────────────────────────────────────────────────────────
async def health():
    """Report whether the local Ollama server is reachable.

    Queries ``{OLLAMA_BASE}/api/tags`` with a 5 second timeout.

    Returns:
        ``{"status": "ok", "model": MODEL, "available_models": [...]}`` when
        the query succeeds, otherwise ``{"status": "starting", "error": ...}``
        carrying the text of the exception that occurred.

    NOTE(review): no route decorator is visible in this view — confirm how
    this handler is registered on ``app``.
    """
    from fastapi.concurrency import run_in_threadpool

    try:
        # requests is blocking; run it in a worker thread so the event loop
        # keeps serving other requests while waiting for Ollama.
        r = await run_in_threadpool(
            requests.get, f"{OLLAMA_BASE}/api/tags", timeout=5
        )
        models = [m["name"] for m in r.json().get("models", [])]
        return {"status": "ok", "model": MODEL, "available_models": models}
    except Exception as e:
        # Deliberately broad: any failure is reported as "still starting".
        return {"status": "starting", "error": str(e)}
# ── OpenAI-compatible /v1/chat/completions ───────────────────────────────────
async def chat_completions(request: Request, token: str = Depends(verify_token)):
    """Proxy an OpenAI-style chat completion request to the local Ollama server.

    The JSON request body supplies ``model`` (default ``MODEL``), ``messages``,
    ``stream``, ``temperature`` (default 0.7) and optionally ``max_tokens``.
    The request is forwarded to ``{OLLAMA_BASE}/v1/chat/completions``.

    Args:
        request: Incoming request whose body is a JSON object.
        token: Accepted bearer token (only used to enforce authentication).

    Returns:
        A ``text/event-stream`` ``StreamingResponse`` relaying the upstream
        bytes when ``stream`` is truthy, otherwise the upstream JSON body.

    Raises:
        HTTPException: 400 when the body is not a JSON object; 500 when the
            non-streaming upstream call or its JSON decoding fails.

    NOTE(review): no route decorator is visible in this view — confirm how
    this handler is registered on ``app``. The upstream HTTP status is not
    propagated; an upstream error body is returned as-is.
    """
    from fastapi.concurrency import run_in_threadpool

    try:
        body = await request.json()
    except ValueError:
        # Malformed JSON is a client error, not a server failure.
        raise HTTPException(400, "Request body must be valid JSON") from None
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    model = body.get("model", MODEL)
    stream = body.get("stream", False)
    ollama_payload = {
        "model": model,
        "messages": body.get("messages", []),
        "stream": stream,
        # Ollama's OpenAI-compatible endpoint documents sampling parameters
        # as top-level fields, so send the temperature there.
        "temperature": body.get("temperature", 0.7),
        # Kept unchanged for backward compatibility.
        # NOTE(review): confirm whether this endpoint honours "options".
        "options": {
            "num_ctx": body.get("max_tokens", 32768),
            "temperature": body.get("temperature", 0.7),
        },
    }
    if body.get("max_tokens") is not None:
        # Forward the caller's output-token limit only when one was given.
        ollama_payload["max_tokens"] = body["max_tokens"]

    upstream_url = f"{OLLAMA_BASE}/v1/chat/completions"

    if stream:
        # A plain (sync) generator: the upstream chunks are relayed verbatim.
        def generate():
            try:
                with requests.post(
                    upstream_url,
                    json=ollama_payload,
                    stream=True,
                    timeout=300,
                ) as r:
                    for chunk in r.iter_content(chunk_size=None):
                        if chunk:
                            yield chunk
            except Exception as e:
                # json.dumps escapes quotes/newlines in the message so the
                # event stays valid JSON on a single data line.
                yield "data: " + json.dumps({"error": str(e)}) + "\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    try:
        # requests is blocking (up to 300 s here); run it in a worker thread
        # so the event loop is not stalled for other clients.
        r = await run_in_threadpool(
            requests.post,
            upstream_url,
            json=ollama_payload,
            timeout=300,
        )
        return r.json()
    except Exception as e:
        raise HTTPException(500, str(e)) from e
# ── Anthropic-compatible /v1/messages ────────────────────────────────────────
def _sse_event(event, payload):
    """Format one server-sent event whose data line is *payload* as JSON."""
    return f"event: {event}\ndata: {json.dumps(payload)}\n\n"


def _anthropic_stop_reason(finish_reason):
    """Translate an OpenAI-style finish_reason into an Anthropic stop_reason."""
    return "max_tokens" if finish_reason == "length" else "end_turn"


def _parse_openai_stream_chunk(js):
    """Pull the useful fields out of one OpenAI-style streaming chunk.

    Args:
        js: The text following ``data: `` on one event line.

    Returns:
        ``(text, completion_tokens, finish_reason)``. ``text`` is ``""`` when
        the chunk carries no content, ``completion_tokens`` is ``None`` when
        the chunk carries no usage, and ``finish_reason`` is ``None`` when it
        is absent. Unparseable or unexpectedly shaped chunks give
        ``("", None, None)``.
    """
    try:
        data = json.loads(js)
    except ValueError:
        return "", None, None
    if not isinstance(data, dict):
        return "", None, None

    usage = data.get("usage")
    tokens = None
    if isinstance(usage, dict) and usage:
        tokens = usage.get("completion_tokens", 0)

    # A chunk may carry an empty "choices" list; treat it as "no delta".
    choices = data.get("choices")
    first = choices[0] if isinstance(choices, list) and choices else {}
    if not isinstance(first, dict):
        first = {}
    delta = first.get("delta")
    if not isinstance(delta, dict):
        delta = {}

    text = delta.get("content") or delta.get("reasoning") or ""
    return text, tokens, first.get("finish_reason")


async def messages(request: Request, token: str = Depends(verify_token)):
    """Serve an Anthropic-style messages request from the local Ollama server.

    The request is forwarded to ``{OLLAMA_BASE}/v1/chat/completions`` and the
    OpenAI-style reply is re-shaped into Anthropic-style output.

    Args:
        request: Incoming request whose body is a JSON object with ``model``
            (default ``MODEL``), ``messages``, ``stream``, ``temperature``
            (default 0.7) and optionally ``max_tokens`` and ``system``.
        token: Accepted bearer token (only used to enforce authentication).

    Returns:
        When ``stream`` is truthy, a ``text/event-stream`` response emitting
        ``message_start``, ``content_block_start``, ``ping``, zero or more
        ``content_block_delta`` events, then ``content_block_stop``,
        ``message_delta`` and ``message_stop``. Otherwise a single message
        dict with one text content block and token usage.

    Raises:
        HTTPException: 400 when the body is not a JSON object; 500 when the
            non-streaming upstream call or its decoding fails.

    NOTE(review): no route decorator is visible in this view — confirm how
    this handler is registered on ``app``.
    """
    import time
    from fastapi.concurrency import run_in_threadpool

    try:
        body = await request.json()
    except ValueError:
        # Malformed JSON is a client error, not a server failure.
        raise HTTPException(400, "Request body must be valid JSON") from None
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    model = body.get("model", MODEL)
    stream = body.get("stream", False)

    # Anthropic-style requests carry the system prompt in a top-level
    # "system" field (a string, or a list of text blocks); forward it as a
    # leading system message so it is not silently dropped.
    # NOTE(review): confirm against the clients actually in use.
    chat_messages = body.get("messages", [])
    system = body.get("system")
    if isinstance(system, list):
        system = "\n".join(
            str(part.get("text", "")) for part in system if isinstance(part, dict)
        )
    if isinstance(system, str) and system and isinstance(chat_messages, list):
        chat_messages = [{"role": "system", "content": system}, *chat_messages]

    ollama_payload = {
        "model": model,
        "messages": chat_messages,
        "stream": stream,
        # Ollama's OpenAI-compatible endpoint documents sampling parameters
        # as top-level fields, so send the temperature there.
        "temperature": body.get("temperature", 0.7),
        # Kept unchanged for backward compatibility.
        # NOTE(review): confirm whether this endpoint honours "options".
        "options": {
            "num_ctx": body.get("max_tokens", 32768),
            "temperature": body.get("temperature", 0.7),
        },
    }
    if body.get("max_tokens") is not None:
        # Forward the caller's output-token limit only when one was given.
        ollama_payload["max_tokens"] = body["max_tokens"]

    upstream_url = f"{OLLAMA_BASE}/v1/chat/completions"

    if stream:
        def generate_anthropic():
            msg_id = f"msg_{int(time.time())}"
            yield _sse_event("message_start", {
                "type": "message_start",
                "message": {
                    "id": msg_id,
                    "type": "message",
                    "role": "assistant",
                    "content": [],
                    "model": model,
                    "stop_reason": None,
                    "usage": {"input_tokens": 0, "output_tokens": 0},
                },
            })
            yield _sse_event("content_block_start", {
                "type": "content_block_start",
                "index": 0,
                "content_block": {"type": "text", "text": ""},
            })
            yield 'event: ping\ndata: {"type":"ping"}\n\n'

            output_tokens = 0
            stop_reason = "end_turn"
            try:
                with requests.post(
                    upstream_url,
                    json=ollama_payload,
                    stream=True,
                    timeout=300,
                ) as r:
                    # Buffer raw bytes and decode only complete lines, so a
                    # multi-byte UTF-8 character split across two network
                    # chunks is not lost.
                    pending = b""
                    done = False
                    for chunk in r.iter_content(chunk_size=None):
                        if not chunk:
                            continue
                        pending += chunk
                        *complete, pending = pending.split(b"\n")
                        for raw in complete:
                            line = raw.decode("utf-8", errors="ignore").strip()
                            if not line.startswith("data: "):
                                continue
                            js = line[6:]
                            if js == "[DONE]":
                                done = True
                                break
                            text, tokens, finish = _parse_openai_stream_chunk(js)
                            if tokens is not None:
                                output_tokens = tokens
                            if finish:
                                # Keep reading: a usage chunk may still follow.
                                stop_reason = _anthropic_stop_reason(finish)
                            if text:
                                yield _sse_event("content_block_delta", {
                                    "type": "content_block_delta",
                                    "index": 0,
                                    "delta": {"type": "text_delta", "text": text},
                                })
                        if done:
                            break
            except Exception as e:
                # Surface the failure to the client as a final text delta.
                yield _sse_event("content_block_delta", {
                    "type": "content_block_delta",
                    "index": 0,
                    "delta": {"type": "text_delta", "text": f"Error: {e}"},
                })

            yield 'event: content_block_stop\ndata: {"type":"content_block_stop","index":0}\n\n'
            yield _sse_event("message_delta", {
                "type": "message_delta",
                "delta": {"stop_reason": stop_reason, "stop_sequence": None},
                "usage": {"output_tokens": output_tokens},
            })
            yield 'event: message_stop\ndata: {"type":"message_stop"}\n\n'

        return StreamingResponse(generate_anthropic(), media_type="text/event-stream")

    try:
        # requests is blocking (up to 300 s here); run it in a worker thread
        # so the event loop is not stalled for other clients.
        r = await run_in_threadpool(
            requests.post,
            upstream_url,
            json=ollama_payload,
            timeout=300,
        )
        data = r.json()
        choices = data.get("choices") or [{}]
        first = choices[0]
        content = (first.get("message") or {}).get("content") or ""
        usage = data.get("usage") or {}
        return {
            "id": data.get("id", f"msg_{int(time.time())}"),
            "type": "message",
            "role": "assistant",
            "content": [{"type": "text", "text": content}],
            "model": model,
            "stop_reason": _anthropic_stop_reason(first.get("finish_reason")),
            "usage": {
                "input_tokens": usage.get("prompt_tokens", 0),
                "output_tokens": usage.get("completion_tokens", 0),
            },
        }
    except Exception as e:
        raise HTTPException(500, str(e)) from e
# ── Models list ──────────────────────────────────────────────────────────────
async def list_models(token: str = Depends(verify_token)):
    """List the models known to the local Ollama server in OpenAI list form.

    Queries ``{OLLAMA_BASE}/api/tags`` with a 5 second timeout.

    Args:
        token: Accepted bearer token (only used to enforce authentication).

    Returns:
        ``{"object": "list", "data": [{"id": <name>, "object": "model"}, ...]}``.
        If the query fails for any reason, the list contains only ``MODEL``.

    NOTE(review): no route decorator is visible in this view — confirm how
    this handler is registered on ``app``.
    """
    from fastapi.concurrency import run_in_threadpool

    try:
        # requests is blocking; run it in a worker thread so the event loop
        # keeps serving other requests while waiting for Ollama.
        r = await run_in_threadpool(
            requests.get, f"{OLLAMA_BASE}/api/tags", timeout=5
        )
        models = [
            {"id": m["name"], "object": "model"}
            for m in r.json().get("models", [])
        ]
        return {"object": "list", "data": models}
    except Exception:
        # Best-effort fallback, but leave a trace of why it was needed.
        logger.warning(
            "Could not list Ollama models; returning the default model only",
            exc_info=True,
        )
        return {"object": "list", "data": [{"id": MODEL, "object": "model"}]}
if __name__ == "__main__":
    # Run directly as a script: serve `app` with uvicorn on every interface,
    # port 7860.
    uvicorn.run(app, port=7860, host="0.0.0.0")