import json
import os
import threading
from typing import Any, Dict, List, Optional, Union

from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel, Field

app = FastAPI(title="OpenFable API — SynastrIA Networks")

REPO_ID = "SynastriaNetworks/OpenFable-4B-GGUF"

# ── Available models: alias -> GGUF file name inside REPO_ID ─────────────────
AVAILABLE_MODELS: Dict[str, str] = {
    "openfable-4b": "openfable-4b-q4_k_m.gguf",  # default (best cost/benefit)
    # "openfable-4b-q8": "openfable-4b-q8_0.gguf",  # add once published
}

DEFAULT_MODEL = "openfable-4b"

# Lazy cache: at most one model is kept in RAM at a time
# (HF Spaces free tier has ~16 GB but is CPU-only).
_loaded: Dict[str, Llama] = {}

# The endpoints below are plain `def` functions, which FastAPI executes in a
# worker thread pool. This re-entrant lock serializes both the load/evict
# logic and every inference call, so the cache is never mutated concurrently
# and a model is never evicted while another request is still using it.
_model_lock = threading.RLock()


def get_model(name: str) -> Llama:
    """Return the loaded model for alias *name*, loading it on first use.

    Any other model currently held in the cache is evicted before the
    requested one is downloaded and loaded, so only one model is resident
    at a time.

    Args:
        name: Alias of the model; must be a key of ``AVAILABLE_MODELS``.

    Returns:
        The cached ``Llama`` instance for *name*.

    Raises:
        HTTPException: 404 when *name* is not a known alias.
    """
    if name not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=404,
            detail=f"Modelo '{name}' não encontrado. Disponíveis: {list(AVAILABLE_MODELS)}"
        )

    with _model_lock:
        if name not in _loaded:
            # Evict the previously loaded model to free RAM.
            for k in list(_loaded.keys()):
                print(f"[MODEL] Evictando {k}...")
                del _loaded[k]

            filename = AVAILABLE_MODELS[name]
            print(f"[MODEL] Baixando {filename}...")
            path = hf_hub_download(repo_id=REPO_ID, filename=filename)

            print(f"[MODEL] Carregando {name}...")
            _loaded[name] = Llama(
                model_path=path,
                n_ctx=2048,
                # NOTE(review): the original comment here said that more than
                # 1 thread triggers GGML_ASSERT on HF Spaces CPU, yet the
                # value is 2 — confirm which one is intended.
                n_threads=2,
                n_batch=512,
                verbose=False,
                chat_format="chatml",
            )
            print(f"[MODEL] {name} pronto!")

        return _loaded[name]


# Pre-load the default model at import time so the first request is not slow.
print(f"[STARTUP] Carregando modelo padrão: {DEFAULT_MODEL}")
get_model(DEFAULT_MODEL)
print("[STARTUP] Pronto!")


# ── Schemas ──────────────────────────────────────────────────────────────────

class GenerateRequest(BaseModel):
    """Legacy endpoint payload — backward compatibility with the Lucian backend."""

    prompt: str
    model: str = DEFAULT_MODEL
    max_tokens: int = 512
    temperature: float = 0.5


class ToolFunction(BaseModel):
    """Function description carried by a ``Tool`` entry."""

    name: str
    description: str
    parameters: Dict[str, Any] = Field(default_factory=dict)


class Tool(BaseModel):
    """A tool the caller offers to the model."""

    type: str = "function"
    function: ToolFunction


class ChatMessage(BaseModel):
    """One message of the chat history."""

    role: str  # system | user | assistant | tool
    content: Optional[str] = None
    tool_calls: Optional[List[Dict[str, Any]]] = None  # model reply carrying tool calls
    tool_call_id: Optional[str] = None  # set on the result of a tool call
    name: Optional[str] = None  # tool name (role=tool)


class ChatCompletionRequest(BaseModel):
    """Request body of ``/v1/chat/completions``."""

    model: str = DEFAULT_MODEL
    messages: List[ChatMessage]
    tools: Optional[List[Tool]] = None
    tool_choice: Union[str, Dict[str, Any]] = "auto"
    max_tokens: int = 512
    temperature: float = 0.5


# ── Endpoints ────────────────────────────────────────────────────────────────

@app.get("/")
def health():
    """Report service status, the default model alias and all known aliases."""
    return {
        "status": "healthy",
        "default_model": DEFAULT_MODEL,
        "available_models": list(AVAILABLE_MODELS.keys()),
    }


@app.get("/v1/models")
def list_models():
    """List the available models in the OpenAI list format."""
    return {
        "object": "list",
        "data": [
            {
                "id": name,
                "object": "model",
                "owned_by": "SynastriaNetworks",
                "file": AVAILABLE_MODELS[name],
            }
            for name in AVAILABLE_MODELS
        ],
    }


@app.post("/v1/generate")
def generate(req: GenerateRequest):
    """Legacy: raw-prompt completion, without chat history.

    Raises:
        HTTPException: 404 for an unknown model alias, 500 when the
            completion call fails.
    """
    # Hold the lock for the whole request so the model cannot be evicted or
    # called from another thread while this completion is running.
    with _model_lock:
        llm = get_model(req.model)
        try:
            out = llm(
                prompt=req.prompt,
                max_tokens=req.max_tokens,
                temperature=req.temperature,
            )
            return {"response": out["choices"][0]["text"], "model": req.model}
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e


@app.post("/v1/chat/completions")
def chat_completions(req: ChatCompletionRequest):
    """Main endpoint — compatible with the OpenAI chat completions API.

    Accepts:
    - Message history (system / user / assistant)
    - Tool calling (tools + tool_choice)
    - Model selection through the 'model' field

    NOTE(review): the model is loaded with chat_format="chatml"; confirm that
    this chat handler actually honors `tools` / `tool_choice`.

    Raises:
        HTTPException: 404 for an unknown model alias, 500 when the
            completion call fails.
    """
    # Convert the Pydantic schemas into the plain dicts passed to
    # llama-cpp-python. `content` is kept whenever it is not None (an empty
    # string is preserved); the optional tool fields are added only when set.
    messages: List[Dict[str, Any]] = []
    for m in req.messages:
        entry: Dict[str, Any] = {"role": m.role}
        if m.content is not None:
            entry["content"] = m.content
        if m.tool_calls:
            entry["tool_calls"] = m.tool_calls
        if m.tool_call_id:
            entry["tool_call_id"] = m.tool_call_id
        if m.name:
            entry["name"] = m.name
        messages.append(entry)

    kwargs: Dict[str, Any] = {
        "messages": messages,
        "max_tokens": req.max_tokens,
        "temperature": req.temperature,
    }

    # tool_choice is forwarded only when tools were actually supplied.
    if req.tools:
        kwargs["tools"] = [
            {
                "type": t.type,
                "function": {
                    "name": t.function.name,
                    "description": t.function.description,
                    "parameters": t.function.parameters,
                },
            }
            for t in req.tools
        ]
        kwargs["tool_choice"] = req.tool_choice

    # Hold the lock for the whole request so the model cannot be evicted or
    # called from another thread while this completion is running.
    with _model_lock:
        llm = get_model(req.model)
        try:
            # The result of create_chat_completion is returned to the client
            # unchanged.
            response = llm.create_chat_completion(**kwargs)
            return response
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e