openfable-api / app.py
gustajunq's picture
Update app.py
6a93a27 verified
Raw
History Blame Contribute Delete
6.18 kB
import os
import json
from typing import Any, Dict, List, Optional, Union
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# The title is user-facing (it is shown in the generated API docs); the dash
# was mis-encoded and is restored here as a real em dash.
app = FastAPI(title="OpenFable API — SynastrIA Networks")

# Hugging Face repository passed to hf_hub_download when fetching model files.
REPO_ID = "SynastriaNetworks/OpenFable-4B-GGUF"

# ── Available models: public alias -> GGUF filename inside the repo ──────────
AVAILABLE_MODELS: Dict[str, str] = {
    "openfable-4b": "openfable-4b-q4_k_m.gguf",  # default (best cost/benefit)
    # "openfable-4b-q8": "openfable-4b-q8_0.gguf",  # add once published
}
DEFAULT_MODEL = "openfable-4b"

# Lazy cache: one model at a time in RAM (free HF Spaces has ~16 GB but is
# CPU-only).
_loaded: Dict[str, Llama] = {}
def get_model(name: str) -> Llama:
    """Return the ``Llama`` instance registered under *name*, loading it on first use.

    At most one model is kept in ``_loaded``: when *name* is not cached, every
    cached entry is dropped before the new model is constructed.

    Args:
        name: Model alias; must be a key of ``AVAILABLE_MODELS``.

    Returns:
        The cached ``Llama`` instance for *name*.

    Raises:
        HTTPException: 404 when *name* is not a key of ``AVAILABLE_MODELS``.
    """
    if name not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=404,
            detail=f"Modelo '{name}' não encontrado. Disponíveis: {list(AVAILABLE_MODELS)}",
        )

    if name in _loaded:
        return _loaded[name]

    # Download first, evict second: the download does not depend on the cached
    # model, so a failed download no longer leaves the cache empty.
    filename = AVAILABLE_MODELS[name]
    print(f"[MODEL] Baixando {filename}...")
    path = hf_hub_download(repo_id=REPO_ID, filename=filename)

    # Drop the previously cached model(s) to free RAM before loading the new one.
    for stale in list(_loaded):
        print(f"[MODEL] Evictando {stale}...")
        del _loaded[stale]

    print(f"[MODEL] Carregando {name}...")
    _loaded[name] = Llama(
        model_path=path,
        n_ctx=2048,
        # NOTE(review): the original comment on this line said that more than
        # one thread causes GGML_ASSERT on HF Spaces CPU, yet the value is 2.
        # Confirm which of the two is intended.
        n_threads=2,
        n_batch=512,
        verbose=False,
        chat_format="chatml",
    )
    print(f"[MODEL] {name} pronto!")
    return _loaded[name]
# Preload the default model at boot (module import) so it is already cached
# when requests arrive.
print(f"[STARTUP] Carregando modelo padrão: {DEFAULT_MODEL}")
get_model(DEFAULT_MODEL)
print("[STARTUP] Pronto!")
# ── Schemas ───────────────────────────────────────────────────────────────────
class GenerateRequest(BaseModel):
    """Legacy endpoint payload, kept for backward compatibility with the Lucian backend."""

    prompt: str
    model: str = Field(default=DEFAULT_MODEL)
    max_tokens: int = Field(default=512)
    temperature: float = Field(default=0.5)
class ToolFunction(BaseModel):
    """Function definition carried by a ``Tool`` entry.

    ``description`` defaults to an empty string: OpenAI-style tool definitions
    may omit it, and requiring it made request validation reject such tools.
    """

    name: str
    description: str = ""
    # JSON-schema-like parameter spec; empty dict when the caller sends none.
    parameters: Dict[str, Any] = Field(default_factory=dict)
class Tool(BaseModel):
    """A tool entry: a ``type`` tag (``"function"`` by default) and its function definition."""

    type: str = Field(default="function")
    function: ToolFunction
class ChatMessage(BaseModel):
    """A single message of the chat history."""

    # One of: system | user | assistant | tool
    role: str
    content: Optional[str] = Field(default=None)
    # Model reply carrying tool calls
    tool_calls: Optional[List[Dict[str, Any]]] = Field(default=None)
    # Result of a tool call
    tool_call_id: Optional[str] = Field(default=None)
    # Tool name (role=tool)
    name: Optional[str] = Field(default=None)
class ChatCompletionRequest(BaseModel):
    """Request body of the chat-completions endpoint."""

    model: str = Field(default=DEFAULT_MODEL)
    messages: List[ChatMessage]
    tools: Optional[List[Tool]] = Field(default=None)
    tool_choice: Union[str, Dict[str, Any]] = Field(default="auto")
    max_tokens: int = Field(default=512)
    temperature: float = Field(default=0.5)
# ── Endpoints ─────────────────────────────────────────────────────────────────
@app.get("/")
def health():
    """Report service status, the default model alias and all available aliases."""
    payload = {"status": "healthy", "default_model": DEFAULT_MODEL}
    payload["available_models"] = [*AVAILABLE_MODELS]
    return payload
@app.get("/v1/models")
def list_models():
    """List the available models in the OpenAI format."""
    entries = [
        {
            "id": alias,
            "object": "model",
            "owned_by": "SynastriaNetworks",
            "file": gguf_file,
        }
        for alias, gguf_file in AVAILABLE_MODELS.items()
    ]
    return {"object": "list", "data": entries}
@app.post("/v1/generate")
def generate(req: GenerateRequest):
    """Legacy endpoint: raw-prompt completion, without chat history.

    Args:
        req: Prompt, model alias and sampling settings.

    Returns:
        A dict with the generated text under ``"response"`` and the requested
        model alias under ``"model"``.

    Raises:
        HTTPException: 404 (raised by ``get_model``) for an unknown model
            alias; 500 when the completion call or result extraction fails.
    """
    # Resolved outside the ``try`` so the 404 from get_model is not rewrapped
    # as a 500.
    llm = get_model(req.model)
    try:
        out = llm(
            prompt=req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
        return {"response": out["choices"][0]["text"], "model": req.model}
    except Exception as exc:
        # Endpoint boundary: report the failure as a 500 and chain the cause
        # so the original traceback is kept.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
def _message_to_dict(message: ChatMessage) -> Dict[str, Any]:
    """Convert one ``ChatMessage`` into a plain dict, leaving out unset fields."""
    entry: Dict[str, Any] = {"role": message.role}
    # ``content`` is kept whenever it is not None, so an empty string survives.
    if message.content is not None:
        entry["content"] = message.content
    # The remaining fields are dropped when falsy (None, empty string/list).
    optional_fields = (
        ("tool_calls", message.tool_calls),
        ("tool_call_id", message.tool_call_id),
        ("name", message.name),
    )
    for key, value in optional_fields:
        if value:
            entry[key] = value
    return entry


def _tool_to_dict(tool: Tool) -> Dict[str, Any]:
    """Convert one ``Tool`` into the plain nested dict form."""
    fn = tool.function
    return {
        "type": tool.type,
        "function": {
            "name": fn.name,
            "description": fn.description,
            "parameters": fn.parameters,
        },
    }


@app.post("/v1/chat/completions")
def chat_completions(req: ChatCompletionRequest):
    """Main endpoint, compatible with the OpenAI API.

    Supports:
    - Message history (system / user / assistant)
    - Tool calling (tools + tool_choice)
    - Model selection through the 'model' field

    Args:
        req: Chat history, optional tools and sampling settings.

    Returns:
        The value returned by ``create_chat_completion``, passed through
        unchanged.

    Raises:
        HTTPException: 404 (raised by ``get_model``) for an unknown model
            alias; 500 when the completion call fails.
    """
    # Resolved outside the ``try`` so the 404 from get_model is not rewrapped
    # as a 500.
    llm = get_model(req.model)

    # Convert the Pydantic schemas into the plain dicts handed to
    # llama-cpp-python.
    kwargs: Dict[str, Any] = {
        "messages": [_message_to_dict(m) for m in req.messages],
        "max_tokens": req.max_tokens,
        "temperature": req.temperature,
    }
    # tool_choice is only forwarded together with a non-empty tools list.
    if req.tools:
        kwargs["tools"] = [_tool_to_dict(t) for t in req.tools]
        kwargs["tool_choice"] = req.tool_choice

    try:
        # The result is forwarded as-is (the original comment describes it as
        # an OpenAI-format dict).
        return llm.create_chat_completion(**kwargs)
    except Exception as exc:
        # Endpoint boundary: report the failure as a 500 and chain the cause
        # so the original traceback is kept.
        raise HTTPException(status_code=500, detail=str(exc)) from exc