Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| from typing import Any, Dict, List, Optional, Union | |
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel, Field | |
| from huggingface_hub import hf_hub_download | |
| from llama_cpp import Llama | |
| # FastAPI application object for this service. | |
| app = FastAPI(title="OpenFable API β SynastrIA Networks") | |
| # Hugging Face repo id passed to hf_hub_download() when fetching model files. | |
| REPO_ID = "SynastriaNetworks/OpenFable-4B-GGUF" | |
| # -- Map of available models: alias -> GGUF file name in the repo -- | |
| AVAILABLE_MODELS: Dict[str, str] = { | |
| "openfable-4b": "openfable-4b-q4_k_m.gguf", # default (best cost/benefit trade-off) | |
| # "openfable-4b-q8": "openfable-4b-q8_0.gguf", # add once published | |
| } | |
| # Alias used when a request does not name a model; also preloaded at startup. | |
| DEFAULT_MODEL = "openfable-4b" | |
| # Lazy cache: one model in RAM at a time (HF Spaces free tier has ~16 GB but is CPU-only) | |
| _loaded: Dict[str, Llama] = {} | |
def get_model(name: str) -> Llama:
    """Return the ``Llama`` instance for *name*, loading it on first use.

    Only one model is kept in the module-level ``_loaded`` cache: when a
    model that is not cached is requested, every cached entry is dropped
    before the new GGUF file is downloaded and loaded.

    Args:
        name: Model alias; must be a key of ``AVAILABLE_MODELS``.

    Returns:
        The cached ``Llama`` instance registered under *name*.

    Raises:
        HTTPException: 404 when *name* is not a key of ``AVAILABLE_MODELS``.
    """
    if name not in AVAILABLE_MODELS:
        known_aliases = list(AVAILABLE_MODELS)
        raise HTTPException(
            status_code=404,
            detail=f"Modelo '{name}' nΓ£o encontrado. DisponΓveis: {known_aliases}",
        )

    # Fast path: the requested model is already resident.
    if name in _loaded:
        return _loaded[name]

    # Evict whatever is cached first to free RAM for the incoming model.
    for stale in tuple(_loaded):
        print(f"[MODEL] Evictando {stale}...")
        _loaded.pop(stale)

    gguf_file = AVAILABLE_MODELS[name]
    print(f"[MODEL] Baixando {gguf_file}...")
    local_path = hf_hub_download(repo_id=REPO_ID, filename=gguf_file)

    print(f"[MODEL] Carregando {name}...")
    llm = Llama(
        model_path=local_path,
        n_ctx=2048,
        # Original note: on HF Spaces CPU, more than 1 thread causes GGML_ASSERT.
        # NOTE(review): the value below is 2, which contradicts that note --
        # confirm which one is intended.
        n_threads=2,
        n_batch=512,
        verbose=False,
        chat_format="chatml",
    )
    _loaded[name] = llm
    print(f"[MODEL] {name} pronto!")
    return llm
| # Preload the default model at boot. This runs at import time; get_model() | |
| # caches the instance, so later calls for DEFAULT_MODEL reuse it. | |
| print(f"[STARTUP] Carregando modelo padrΓ£o: {DEFAULT_MODEL}") | |
| get_model(DEFAULT_MODEL) | |
| print("[STARTUP] Pronto!") | |
| # ── Schemas ────────────────────────────────────────────────────────────────── | |
class GenerateRequest(BaseModel):
    """Legacy endpoint payload, kept for backward compatibility with the Lucian backend."""

    # Raw prompt text (required).
    prompt: str = Field(...)
    # Model alias; falls back to the module-level default.
    model: str = Field(default=DEFAULT_MODEL)
    # Generation settings forwarded to the model call.
    max_tokens: int = Field(default=512)
    temperature: float = Field(default=0.5)
class ToolFunction(BaseModel):
    """Function descriptor carried by a tool: name, description and parameters mapping."""

    name: str = Field(...)
    description: str = Field(...)
    # Defaults to a fresh empty dict per instance.
    parameters: Dict[str, Any] = Field(default_factory=dict)
class Tool(BaseModel):
    """A tool entry: a ``type`` tag (default ``"function"``) plus its function descriptor."""

    type: str = Field(default="function")
    function: ToolFunction = Field(...)
class ChatMessage(BaseModel):
    """One message of a chat conversation."""

    # system | user | assistant | tool
    role: str = Field(...)
    content: Optional[str] = Field(default=None)
    # Model reply that carries tool calls.
    tool_calls: Optional[List[Dict[str, Any]]] = Field(default=None)
    # Set on a message that is the result of a tool call.
    tool_call_id: Optional[str] = Field(default=None)
    # Tool name (role=tool).
    name: Optional[str] = Field(default=None)
class ChatCompletionRequest(BaseModel):
    """Request body for the chat-completions endpoint."""

    # Model alias; falls back to the module-level default.
    model: str = Field(default=DEFAULT_MODEL)
    # Conversation history (required).
    messages: List[ChatMessage] = Field(...)
    # Optional tool definitions and the tool-selection directive.
    tools: Optional[List[Tool]] = Field(default=None)
    tool_choice: Union[str, Dict[str, Any]] = Field(default="auto")
    # Generation settings forwarded to the model call.
    max_tokens: int = Field(default=512)
    temperature: float = Field(default=0.5)
| # ── Endpoints ──────────────────────────────────────────────────────────────── | |
# NOTE(review): no route decorator is visible on this function in this view --
# confirm how/where it is registered on ``app``.
def health():
    """Report service status with the default and the available model aliases."""
    model_aliases = list(AVAILABLE_MODELS.keys())
    return dict(
        status="healthy",
        default_model=DEFAULT_MODEL,
        available_models=model_aliases,
    )
def list_models():
    """List the available models in the OpenAI list format."""
    entries = []
    # One descriptor per alias, in the mapping's own order.
    for alias, gguf_file in AVAILABLE_MODELS.items():
        descriptor = {
            "id": alias,
            "object": "model",
            "owned_by": "SynastriaNetworks",
            "file": gguf_file,
        }
        entries.append(descriptor)
    return {"object": "list", "data": entries}
def generate(req: GenerateRequest):
    """Legacy endpoint: raw-prompt completion with no chat history.

    Args:
        req: Prompt, model alias and generation settings.

    Returns:
        A dict with ``"response"`` (text of the first choice) and ``"model"``
        (the alias that was requested).

    Raises:
        HTTPException: 404 (from ``get_model``) for an unknown model alias;
            500 carrying the exception text when inference or reading the
            result fails.
    """
    # Resolved outside the try so a 404 from get_model is not rewrapped as 500.
    llm = get_model(req.model)
    try:
        out = llm(
            prompt=req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
        text = out["choices"][0]["text"]
    except Exception as e:
        # Top-level boundary: surface the failure as HTTP 500, chaining the
        # cause so the original traceback is preserved.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"response": text, "model": req.model}
def _message_to_dict(m: ChatMessage) -> Dict[str, Any]:
    """Convert one ChatMessage into the plain dict handed to llama-cpp-python."""
    # "content" is always emitted (possibly None), as in OpenAI-format messages.
    # NOTE(review): llama-cpp-python's role-mapping chat formatters appear to
    # index message["content"] directly, so omitting the key for content=None
    # (e.g. assistant tool-call turns) would raise KeyError -- confirm.
    entry: Dict[str, Any] = {"role": m.role, "content": m.content}
    optional_fields = (
        ("tool_calls", m.tool_calls),
        ("tool_call_id", m.tool_call_id),
        ("name", m.name),
    )
    # The remaining fields are copied only when set (truthy).
    entry.update({key: value for key, value in optional_fields if value})
    return entry


def chat_completions(req: ChatCompletionRequest):
    """Main endpoint, following the OpenAI chat-completions request shape.

    Supports:
    - Message history (system / user / assistant)
    - Tool calling (``tools`` + ``tool_choice``)
    - Model selection through the ``model`` field

    Args:
        req: Messages, optional tools and generation settings.

    Returns:
        Whatever ``Llama.create_chat_completion`` returns, passed through
        unchanged.

    Raises:
        HTTPException: 404 (from ``get_model``) for an unknown model alias;
            500 carrying the exception text when the completion call fails.
    """
    # Resolved outside the try so a 404 from get_model is not rewrapped as 500.
    llm = get_model(req.model)

    # Turn the Pydantic schemas into the plain dicts llama-cpp-python expects.
    kwargs: Dict[str, Any] = {
        "messages": [_message_to_dict(m) for m in req.messages],
        "max_tokens": req.max_tokens,
        "temperature": req.temperature,
    }

    # tools / tool_choice are forwarded only when at least one tool was sent.
    if req.tools:
        kwargs["tools"] = [
            {
                "type": t.type,
                "function": {
                    "name": t.function.name,
                    "description": t.function.description,
                    "parameters": t.function.parameters,
                },
            }
            for t in req.tools
        ]
        kwargs["tool_choice"] = req.tool_choice

    try:
        # The completion result is relayed to the caller as-is.
        return llm.create_chat_completion(**kwargs)
    except Exception as e:
        # Top-level boundary: surface the failure as HTTP 500, chaining the
        # cause so the original traceback is preserved.
        raise HTTPException(status_code=500, detail=str(e)) from e