import hashlib
import hmac
import os
import time
import traceback
import uuid
from datetime import datetime

from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
from huggingface_hub import InferenceClient, hf_hub_download
from llama_cpp import Llama
# ============ Configuration ============
# HF_TOKEN authenticates Hugging Face Hub downloads and Inference API calls.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Bearer token clients must present; falls back to HF_TOKEN when unset.
# NOTE(review): if neither env var is set this is None — confirm deployments set one.
API_KEY = os.environ.get("API_KEY", HF_TOKEN)
# ============ Local model - LFM2-8B-A1B (GGUF - CPU optimized) ============
print("🔄 Baixando e carregando LFM2-8B-A1B (GGUF)...")
# Download the GGUF weights (Q4_K_M balances quality vs. memory, ~5.5 GB).
REPO_ID = "bartowski/LiquidAI_LFM2-8B-A1B-GGUF"
FILENAME = "LiquidAI_LFM2-8B-A1B-Q4_K_M.gguf"
try:
    # hf_hub_download caches the file locally and returns its filesystem path.
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        token=HF_TOKEN
    )
    print(f"✅ Modelo baixado em: {model_path}")
    # Load the model through the llama.cpp bindings.
    chat_model = Llama(
        model_path=model_path,
        n_ctx=4096,      # context window, in tokens
        n_threads=8,     # CPU threads — TODO confirm this matches the host's core count
        n_batch=512,     # prompt-processing batch size
        verbose=False
    )
    print("✅ LFM2-8B-A1B carregado com sucesso na memória!")
except Exception as e:
    # Keep the API running even when the model fails to load; the chat
    # endpoint then answers with an error string (see generate_local_chat).
    print(f"❌ Erro ao carregar modelo: {e}")
    chat_model = None
# ============ Model clients (Inference API) ============
# Vision - image analysis
vision_client = InferenceClient(token=HF_TOKEN, model="google/gemma-3-27b-it")
# Embeddings - semantic vectors
embed_client = InferenceClient(token=HF_TOKEN, model="BAAI/bge-m3")
# Zero-shot classification (multilingual - PT/EN/ES...)
classify_client = InferenceClient(token=HF_TOKEN, model="joeddav/xlm-roberta-large-xnli")
# Summarization (multilingual - 45 languages including PT)
summarize_client = InferenceClient(token=HF_TOKEN, model="csebuetnlp/mT5_multilingual_XLSum")
# Sentiment analysis (multilingual - PT/EN/ES...)
sentiment_client = InferenceClient(token=HF_TOKEN, model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")
# ============ Local chat helper ============
def generate_local_chat(messages, max_tokens=1024, temperature=0.7):
    """Produce an assistant reply with the local LFM2-8B-A1B (GGUF) model.

    Returns the generated text, or an error string when the model was not
    loaded at startup (chat_model is None).
    """
    if not chat_model:
        return "Erro: Modelo não carregado."
    # llama-cpp-python's native chat-completion API applies the model's own
    # chat template, so OpenAI-format messages are passed through unchanged.
    completion = chat_model.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        stop=["<|im_end|>", "<|endoftext|>"],
    )
    return completion["choices"][0]["message"]["content"]
# ============ Cache ============
# In-memory TTL cache of API responses, keyed by a content hash.
response_cache = {}
CACHE_MAX_SIZE = 500      # max entries before oldest-entry eviction
CACHE_TTL_SECONDS = 3600  # entries expire after one hour
def get_cache_key(content, task):
    """Build a deterministic cache key from request content and a task name.

    Fixes two issues with the original ``md5(str(content) + task)``:
    - the concatenation was ambiguous — ("a", "b") and ("ab", "") produced
      the same key; a "|" separator between task and content removes the
      most common collision shape;
    - MD5 is replaced by SHA-256 (MD5 is deprecated/flagged by security
      linters; SHA-256 costs essentially the same here).
    """
    data = f"{task}|{content}"
    return hashlib.sha256(data.encode()).hexdigest()
def get_cached_response(key):
    """Return the cached response for *key*, or None if absent/expired.

    Expired entries are deleted on access (lazy eviction).
    """
    entry = response_cache.get(key)
    if entry is None:
        return None
    age = time.time() - entry["timestamp"]
    if age >= CACHE_TTL_SECONDS:
        # Stale: drop it so it is not considered again.
        del response_cache[key]
        return None
    return entry["response"]
def set_cached_response(key, response):
    """Store *response* under *key*, evicting the oldest entry when full."""
    if len(response_cache) >= CACHE_MAX_SIZE:
        stalest = min(response_cache, key=lambda k: response_cache[k]["timestamp"])
        response_cache.pop(stalest)
    response_cache[key] = {"response": response, "timestamp": time.time()}
def verify_api_key(request: Request) -> bool:
    """Check the Bearer token in the Authorization header against API_KEY.

    Fixes vs. the original ``auth[7:] == API_KEY``:
    - uses hmac.compare_digest for a constant-time comparison, avoiding a
      timing side channel on the secret;
    - rejects everything when API_KEY is unset or empty (previously an
      empty API_KEY meant a bare "Bearer " header authenticated).
    """
    if not API_KEY:
        # No key configured: refuse all requests instead of matching "".
        return False
    auth = request.headers.get("Authorization", "")
    if not auth.startswith("Bearer "):
        return False
    return hmac.compare_digest(auth[7:], API_KEY)
def has_image_content(messages):
    """Return True if any message carries an ``image_url`` content part.

    Only list-valued ``content`` fields are inspected; plain-string
    content never counts as an image.
    """
    for message in messages:
        parts = message.get("content", [])
        if not isinstance(parts, list):
            continue
        if any(isinstance(p, dict) and p.get("type") == "image_url" for p in parts):
            return True
    return False
# ============ FastAPI ============
app = FastAPI(
    title="DGGirl Multi-Modal API",
    description="API compatível com OpenAI para chat, visão, embeddings, classificação, sumarização e sentimento",
    version="4.1.0"
)
# CORS is wide open — NOTE(review): consider restricting origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ============ Landing page ============
@app.get("/", response_class=HTMLResponse)
async def home():
    """Serve the landing page listing the API's endpoints and active models.

    NOTE(review): the returned markup appears to contain only text content
    (no visible HTML tags) — confirm against the intended template. The
    literal strings below are runtime output and are kept verbatim.
    """
    endpoints_html = """
POST /v1/chat/completions💬 Chat inteligente (LFM2-8B GGUF) + Visão (Gemma 3)
POST /v1/embeddings🔢 Vetores semânticos para RAG (BGE-M3)
POST /v1/classify🏷️ Classificação zero-shot de textos
POST /v1/summarize📝 Resumir textos longos
POST /v1/sentiment😊 Análise de sentimento
"""
    return f"""
DGGirl API v4.1
🤖 DGGirl API v4.1 - CPU Optimized
Status: ● OPERACIONAL
{endpoints_html}
🧠 Modelos Ativos
LFM2-8B-A1B (GGUF Q4)
Gemma 3 27B Vision
BGE-M3 Embeddings
XLM-RoBERTa Classification
mT5 Summarization
DistilBERT Sentiment
{len(response_cache)}
Cache Items
📚 Documentação Swagger |
❤️ Health Check
"""
# ============ Chat Completions (text + vision) ============
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat completion endpoint.

    Routes to the hosted Gemma 3 vision model when the request contains an
    image part (or model == "vision"); otherwise generates locally with the
    LFM2-8B GGUF model. Text-only responses are served from and stored in
    the in-memory TTL cache; vision responses are never cached.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})
    try:
        body = await request.json()
        raw_messages = body.get("messages", [])
        model = body.get("model", "auto")
        # Decide whether the vision model is needed.
        has_vision = model == "vision" or has_image_content(raw_messages)
        model_used = "google/gemma-3-27b-it" if has_vision else "LiquidAI/LFM2-8B-A1B-GGUF"
        # Cache lookup (text path only).
        cache_key = get_cache_key(raw_messages, model_used)
        if not has_vision:
            cached = get_cached_response(cache_key)
            if cached:
                return cached
        # Generate the response.
        if has_vision:
            # Only the most recent user message is forwarded to the vision model.
            last_user_msg = next((msg for msg in reversed(raw_messages) if msg.get("role") == "user"), None)
            if not last_user_msg:
                return JSONResponse(status_code=400, content={"error": "No user message"})
            content = last_user_msg.get("content", [])
            vision_content = []
            text_parts = []
            if isinstance(content, list):
                # Split the multi-part content into text fragments and image URLs.
                for item in content:
                    if isinstance(item, dict):
                        if item.get("type") == "text":
                            text_parts.append(item.get("text", ""))
                        elif item.get("type") == "image_url":
                            url = item.get("image_url", {}).get("url", "")
                            if url:
                                vision_content.append({"type": "image_url", "image_url": {"url": url}})
                # Fall back to a default prompt when no text part was present.
                final_text = " ".join(text_parts) if text_parts else "Analise a imagem."
                vision_content.append({"type": "text", "text": final_text})
                messages = [{"role": "user", "content": vision_content}]
            else:
                # Plain-string content: forward the conversation untouched.
                messages = raw_messages
            response = vision_client.chat_completion(
                messages=messages,
                max_tokens=body.get("max_tokens", 1024),
                temperature=body.get("temperature", 0.7)
            )
            response_content = response.choices[0].message.content
        else:
            # Text-only: use the local (GGUF) model.
            try:
                response_content = generate_local_chat(
                    messages=raw_messages,
                    max_tokens=body.get("max_tokens", 1024),
                    temperature=body.get("temperature", 0.7)
                )
            except Exception as e:
                # Surface generation failures in the reply body instead of a 500.
                response_content = f"Error generating response: {str(e)}"
        # OpenAI-style envelope; token usage is not tracked (always zero).
        result = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": model_used,
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_content
                },
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0
            }
        }
        if not has_vision:
            set_cached_response(cache_key, result)
        return result
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e), "detail": traceback.format_exc()})
# ============ Embeddings ============
@app.post("/v1/embeddings")
async def create_embeddings(request: Request):
    """OpenAI-compatible embeddings endpoint backed by BGE-M3.

    Accepts a single string or a list of strings in ``input`` and returns
    one embedding per text. Token usage is approximated by whitespace
    word count.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})
    try:
        body = await request.json()
        raw_input = body.get("input", "")
        # Normalize to a list so single strings and batches share one path.
        texts = raw_input if isinstance(raw_input, list) else [raw_input]
        data = []
        for position, text in enumerate(texts):
            vector = embed_client.feature_extraction(text)
            # numpy arrays need converting; plain lists pass through.
            if hasattr(vector, 'tolist'):
                vector = vector.tolist()
            data.append({
                "object": "embedding",
                "index": position,
                "embedding": vector
            })
        token_count = sum(len(t.split()) for t in texts)
        return {
            "object": "list",
            "data": data,
            "model": "bge-m3",
            "usage": {"prompt_tokens": token_count, "total_tokens": token_count}
        }
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e), "detail": traceback.format_exc()})
# ============ Zero-shot classification ============
@app.post("/v1/classify")
async def classify_text(request: Request):
    """Zero-shot text classification with XLM-RoBERTa-XNLI.

    Body: ``text`` (required), ``labels`` (default positive/negative/neutral)
    and ``multi_label`` (default False).

    Fix: ``multi_label`` is now part of the cache key — previously the same
    text/labels pair returned a stale cached result computed under the
    other multi_label setting.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})
    try:
        body = await request.json()
        text = body.get("text", "")
        labels = body.get("labels", ["positive", "negative", "neutral"])
        multi_label = body.get("multi_label", False)
        if not text:
            return JSONResponse(status_code=400, content={"error": "Text is required"})
        # Cache: the key must reflect every parameter that changes the output.
        cache_key = get_cache_key(text + str(labels) + str(multi_label), "classify")
        cached = get_cached_response(cache_key)
        if cached:
            return cached
        result = classify_client.zero_shot_classification(
            text,
            labels,
            multi_label=multi_label
        )
        # The client may return an object with .labels/.scores or a bare value.
        response = {
            "object": "classification",
            "text": text,
            "labels": result.labels if hasattr(result, 'labels') else labels,
            "scores": result.scores if hasattr(result, 'scores') else [],
            "predicted_label": result.labels[0] if hasattr(result, 'labels') and result.labels else None,
            "model": "xlm-roberta-large-xnli"
        }
        set_cached_response(cache_key, response)
        return response
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e), "detail": traceback.format_exc()})
# ============ Summarization ============
@app.post("/v1/summarize")
async def summarize_text(request: Request):
    """Summarize a text with mT5 (multilingual).

    Body: ``text`` (required), ``max_length`` (default 150) and
    ``min_length`` (default 30) for the generated summary.

    Fix: the cache key now includes max_length/min_length — previously a
    request with different length limits could return a stale summary
    produced under other limits.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})
    try:
        body = await request.json()
        text = body.get("text", "")
        max_length = body.get("max_length", 150)
        min_length = body.get("min_length", 30)
        if not text:
            return JSONResponse(status_code=400, content={"error": "Text is required"})
        # Cache: the key must reflect every parameter that changes the output.
        cache_key = get_cache_key(f"{text}|{max_length}|{min_length}", "summarize")
        cached = get_cached_response(cache_key)
        if cached:
            return cached
        result = summarize_client.summarization(
            text,
            parameters={"max_length": max_length, "min_length": min_length}
        )
        # The client may return an object with .summary_text or a bare value.
        summary = result.summary_text if hasattr(result, 'summary_text') else str(result)
        response = {
            "object": "summarization",
            "original_length": len(text),
            "summary": summary,
            "summary_length": len(summary),
            # Safe: text is guaranteed non-empty above, so len(text) > 0.
            "compression_ratio": round(len(summary) / len(text) * 100, 2),
            "model": "mt5-multilingual"
        }
        set_cached_response(cache_key, response)
        return response
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e), "detail": traceback.format_exc()})
# ============ Sentiment analysis ============
@app.post("/v1/sentiment")
async def analyze_sentiment(request: Request):
    """Classify the sentiment of a text and map the label to Portuguese.

    Body: ``text`` (required). Results are cached by text.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})
    try:
        body = await request.json()
        text = body.get("text", "")
        if not text:
            return JSONResponse(status_code=400, content={"error": "Text is required"})
        # Cache lookup.
        cache_key = get_cache_key(text, "sentiment")
        cached = get_cached_response(cache_key)
        if cached:
            return cached
        result = sentiment_client.text_classification(text)
        # Translate the various label spellings the model may emit.
        label_map = {
            "positive": "positivo",
            "negative": "negativo",
            "neutral": "neutro",
            "POSITIVE": "positivo",
            "NEGATIVE": "negativo",
            "NEUTRAL": "neutro",
            "1 star": "negativo",
            "5 stars": "positivo"
        }
        # Defaults cover the non-list result shape; a non-empty list
        # overrides them with its top-ranked entry.
        raw_label, confidence = str(result), 1.0
        if isinstance(result, list) and len(result) > 0:
            best = result[0]
            raw_label = best.label if hasattr(best, 'label') else str(best)
            confidence = best.score if hasattr(best, 'score') else 0.0
        score_list = []
        if isinstance(result, list):
            score_list = [{"label": r.label, "score": round(r.score, 4)} for r in result]
        response = {
            "object": "sentiment",
            "text": text,
            "sentiment": label_map.get(raw_label, raw_label),
            "sentiment_raw": raw_label,
            "confidence": round(confidence, 4),
            "all_scores": score_list,
            "model": "distilbert-base-multilingual"
        }
        set_cached_response(cache_key, response)
        return response
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e), "detail": traceback.format_exc()})
# ============ Auxiliary endpoints ============
@app.get("/v1/models")
async def list_models():
    """Enumerate the model aliases this API exposes (OpenAI-style list)."""
    catalog = [
        ("lfm2-8b-gguf", "liquidai", "Chat rápido (GGUF Q4)"),
        ("gemma-3-vision", "google", "Análise de imagens"),
        ("bge-m3", "baai", "Embeddings multilíngue"),
        ("xlm-roberta-classify", "joeddav", "Classificação zero-shot multilíngue"),
        ("mt5-summarize", "csebuetnlp", "Sumarização multilíngue"),
        ("distilbert-sentiment", "lxyuan", "Análise de sentimento multilíngue"),
    ]
    return {
        "object": "list",
        "data": [
            {"id": model_id, "object": "model", "owned_by": owner, "description": description}
            for model_id, owner, description in catalog
        ]
    }
@app.get("/health")
async def health():
    """Liveness probe: report status, cache size and the active model roster."""
    model_roster = {
        "chat": "LiquidAI/LFM2-8B-A1B-GGUF (Q4)",
        "vision": "google/gemma-3-27b-it",
        "embeddings": "BAAI/bge-m3",
        "classify": "joeddav/xlm-roberta-large-xnli",
        "summarize": "csebuetnlp/mT5_multilingual_XLSum",
        "sentiment": "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
    }
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "cache_size": len(response_cache),
        "version": "4.1.0",
        "models": model_roster
    }
@app.delete("/v1/cache/clear")
async def clear_cache(request: Request):
    """Drop every cached response (requires a valid API key)."""
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})
    cleared_at = datetime.now().isoformat()
    # Rebind (rather than mutate) the module-level dict, matching the
    # original's global-reassignment semantics.
    global response_cache
    response_cache = {}
    return {"message": "Cache cleared", "timestamp": cleared_at}