import os
import uuid
import time
import hashlib
import traceback
from datetime import datetime
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import InferenceClient, hf_hub_download
from llama_cpp import Llama

# ============ Configuration ============
# The Hugging Face token is used both for the model download and for the
# Inference API clients; it is also the fallback API key when API_KEY is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
API_KEY = os.getenv("API_KEY", HF_TOKEN)

# ============ Local model - LFM2-8B-A1B (GGUF, CPU optimized) ============
print("🔄 Baixando e carregando LFM2-8B-A1B (GGUF)...")

# GGUF build to download (Q4_K_M: balance between quality and memory, ~5.5GB).
REPO_ID = "bartowski/LiquidAI_LFM2-8B-A1B-GGUF"
FILENAME = "LiquidAI_LFM2-8B-A1B-Q4_K_M.gguf"

# llama.cpp loading options, kept together so they are easy to tune.
_LLAMA_OPTIONS = {
    "n_ctx": 4096,     # context window
    "n_threads": 8,    # CPU threads
    "n_batch": 512,
    "verbose": False,
}

try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        token=HF_TOKEN,
    )
    print(f"✅ Modelo baixado em: {model_path}")

    # Load the downloaded file with llama.cpp.
    chat_model = Llama(model_path=model_path, **_LLAMA_OPTIONS)
    print("✅ LFM2-8B-A1B carregado com sucesso na memória!")
except Exception as load_error:
    # Best effort: the service still starts without the local chat model;
    # chat_model is left as None so callers can detect the failure.
    print(f"❌ Erro ao carregar modelo: {load_error}")
    chat_model = None

# ============ Model clients (Inference API) ============
# Vision - image analysis
vision_client = InferenceClient(token=HF_TOKEN, model="google/gemma-3-27b-it")

# Embeddings - semantic vectors
embed_client = InferenceClient(token=HF_TOKEN, model="BAAI/bge-m3")

# Zero-shot classification (multilingual - PT/EN/ES...)
classify_client = InferenceClient(token=HF_TOKEN, model="joeddav/xlm-roberta-large-xnli")

# Summarization (multilingual - 45 languages including PT)
summarize_client = InferenceClient(token=HF_TOKEN, model="csebuetnlp/mT5_multilingual_XLSum")

# Sentiment analysis (multilingual - PT/EN/ES...)
sentiment_client = InferenceClient(
    token=HF_TOKEN,
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
)


# ============ Local chat function ============

def generate_local_chat(messages, max_tokens=1024, temperature=0.7):
    """Generate a reply with the local LFM2-8B-A1B (GGUF) model.

    Args:
        messages: Chat messages, passed unchanged to
            ``chat_model.create_chat_completion``.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The assistant message content of the first choice, or the fixed
        string "Erro: Modelo não carregado." when the model is not loaded.
    """
    if not chat_model:
        return "Erro: Modelo não carregado."

    # Use llama-cpp-python's native chat completion (it already handles the
    # chat template).
    output = chat_model.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=["<|im_end|>", "<|endoftext|>"],
    )
    return output['choices'][0]['message']['content']


# ============ Cache ============

response_cache = {}
CACHE_MAX_SIZE = 500
CACHE_TTL_SECONDS = 3600


def get_cache_key(content, task):
    """Return the MD5 hex digest of ``str(content) + task``.

    The digest is only used as a dictionary key for the in-memory cache.
    """
    data = str(content) + task
    return hashlib.md5(data.encode()).hexdigest()


def get_cached_response(key):
    """Return the cached response for ``key``, or None if missing or expired.

    An entry older than CACHE_TTL_SECONDS is removed when it is looked up.
    """
    entry = response_cache.get(key)
    if entry is None:
        return None
    if time.time() - entry["timestamp"] < CACHE_TTL_SECONDS:
        return entry["response"]
    del response_cache[key]
    return None


def set_cached_response(key, response):
    """Store ``response`` under ``key``, evicting the oldest entry when full.

    Overwriting an existing key does not grow the cache, so no eviction is
    performed in that case.
    """
    if key not in response_cache and len(response_cache) >= CACHE_MAX_SIZE:
        oldest_key = min(response_cache, key=lambda k: response_cache[k]["timestamp"])
        del response_cache[oldest_key]
    response_cache[key] = {"response": response, "timestamp": time.time()}


def verify_api_key(request: Request) -> bool:
    """Return True when the request carries ``Authorization: Bearer <API_KEY>``.

    Always returns False when no API key is configured, so an empty bearer
    token can never authenticate. The comparison is constant-time.
    """
    import hmac  # local import: only this function needs it

    if not API_KEY:
        return False
    auth = request.headers.get("Authorization", "")
    if not auth.startswith("Bearer "):
        return False
    # Compare as bytes: compare_digest rejects non-ASCII str arguments.
    return hmac.compare_digest(auth[7:].encode("utf-8"), API_KEY.encode("utf-8"))


def has_image_content(messages):
    """Return True if any message has a list content with an ``image_url`` item.

    Messages or content items that are not dicts are ignored.
    """
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        content = msg.get("content", [])
        if not isinstance(content, list):
            continue
        for item in content:
            if isinstance(item, dict) and item.get("type") == "image_url":
                return True
    return False


def _read_field(item, name, default=None):
    """Read ``name`` from a result element that may be a dict or an object."""
    if isinstance(item, dict):
        return item.get(name, default)
    return getattr(item, name, default)


def _server_error(exc):
    """Build the 500 response shared by all endpoints.

    Must be called from inside an ``except`` block so that
    ``traceback.format_exc()`` sees the active exception.
    """
    # NOTE(review): the full traceback is returned to the client; kept for
    # backward compatibility, but consider logging it server-side instead.
    return JSONResponse(
        status_code=500,
        content={"error": str(exc), "detail": traceback.format_exc()},
    )


def _build_vision_messages(raw_messages):
    """Build the message list sent to the vision model.

    Only the last user message is used. When its content is a list, the
    image URLs are kept and all text parts are joined into one text item
    (defaulting to "Analise a imagem."). When its content is not a list,
    ``raw_messages`` is returned unchanged.

    Returns:
        The message list, or None when there is no user message.
    """
    last_user_msg = next(
        (
            msg for msg in reversed(raw_messages)
            if isinstance(msg, dict) and msg.get("role") == "user"
        ),
        None,
    )
    if not last_user_msg:
        return None

    content = last_user_msg.get("content", [])
    if not isinstance(content, list):
        return raw_messages

    vision_content = []
    text_parts = []
    for item in content:
        if not isinstance(item, dict):
            continue
        if item.get("type") == "text":
            text_parts.append(item.get("text", ""))
        elif item.get("type") == "image_url":
            url = item.get("image_url", {}).get("url", "")
            if url:
                vision_content.append({"type": "image_url", "image_url": {"url": url}})

    final_text = " ".join(text_parts) if text_parts else "Analise a imagem."
    vision_content.append({"type": "text", "text": final_text})
    return [{"role": "user", "content": vision_content}]


# ============ FastAPI ============

app = FastAPI(
    title="DGGirl Multi-Modal API",
    description="API compatível com OpenAI para chat, visão, embeddings, classificação, sumarização e sentimento",
    version="4.1.0"
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


# ============ Home page ============

@app.get("/", response_class=HTMLResponse)
async def home():
    """Serve the landing page listing the endpoints and active models."""
    # NOTE(review): both literals below contain no HTML tags although the
    # response class is HTMLResponse; the markup looks like it was lost.
    # The text is kept exactly as found.
    endpoints_html = """
POST /v1/chat/completions

💬 Chat inteligente (LFM2-8B GGUF) + Visão (Gemma 3)

POST /v1/embeddings

🔢 Vetores semânticos para RAG (BGE-M3)

POST /v1/classify

🏷️ Classificação zero-shot de textos

POST /v1/summarize

📝 Resumir textos longos

POST /v1/sentiment

😊 Análise de sentimento

"""
    return f""" DGGirl API v4.1

🤖 DGGirl API v4.1 - CPU Optimized

Status: ● OPERACIONAL

{endpoints_html}

🧠 Modelos Ativos

LFM2-8B-A1B (GGUF Q4) Gemma 3 27B Vision BGE-M3 Embeddings XLM-RoBERTa Classification mT5 Summarization DistilBERT Sentiment
{len(response_cache)}
Cache Items
6
Endpoints

📚 Documentação Swagger | ❤️ Health Check

"""


# ============ Chat completions (text + vision) ============

@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-style chat completion endpoint.

    Requests with ``model == "vision"`` or with image content go to the
    remote vision model; everything else goes to the local GGUF model.
    Successful text responses are cached, keyed on the messages, the model
    and the generation parameters.

    Returns 401 for a bad API key, 400 for invalid messages and 500 (with
    the error text) for unexpected failures.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})

    try:
        body = await request.json()
        raw_messages = body.get("messages", [])
        model = body.get("model", "auto")
        max_tokens = body.get("max_tokens", 1024)
        temperature = body.get("temperature", 0.7)

        if not isinstance(raw_messages, list):
            return JSONResponse(status_code=400, content={"error": "messages must be a list"})

        # Detect whether the request needs the vision model.
        has_vision = model == "vision" or has_image_content(raw_messages)
        model_used = "google/gemma-3-27b-it" if has_vision else "LiquidAI/LFM2-8B-A1B-GGUF"

        # Cache (text only). The generation parameters are part of the key
        # because they change the output for the same messages.
        cache_key = get_cache_key([raw_messages, max_tokens, temperature], model_used)
        if not has_vision:
            cached = get_cached_response(cache_key)
            if cached:
                return cached

        # NOTE(review): the model calls below are synchronous calls made
        # inside an async handler, so they appear to block the event loop
        # while they run — confirm whether that is acceptable.
        generation_failed = False
        if has_vision:
            messages = _build_vision_messages(raw_messages)
            if messages is None:
                return JSONResponse(status_code=400, content={"error": "No user message"})

            response = vision_client.chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )
            response_content = response.choices[0].message.content
        else:
            # Use the local (GGUF) model for text. Failures are still
            # reported as the assistant content, as before, but are flagged
            # so that they are never cached.
            generation_failed = not chat_model
            try:
                response_content = generate_local_chat(
                    messages=raw_messages,
                    max_tokens=max_tokens,
                    temperature=temperature
                )
            except Exception as e:
                response_content = f"Error generating response: {str(e)}"
                generation_failed = True

        result = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": model_used,
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_content
                },
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0
            }
        }

        if not has_vision and not generation_failed:
            set_cached_response(cache_key, result)

        return result

    except Exception as e:
        return _server_error(e)


# ============ Embeddings ============

@app.post("/v1/embeddings")
async def create_embeddings(request: Request):
    """Return one embedding per input text.

    ``input`` may be a single string or a list of strings; every text must
    be a non-empty string, otherwise 400 is returned before any remote call.
    The usage numbers are whitespace-separated word counts, not model tokens.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})

    try:
        body = await request.json()
        input_text = body.get("input", "")
        texts = input_text if isinstance(input_text, list) else [input_text]

        # Validate first: the word count below needs strings, and failing
        # after the remote calls would waste them.
        if not texts or not all(isinstance(t, str) and t for t in texts):
            return JSONResponse(
                status_code=400,
                content={"error": "Input must be a non-empty string or a list of non-empty strings"},
            )

        embeddings_data = []
        for idx, text in enumerate(texts):
            res = embed_client.feature_extraction(text)
            embedding = res.tolist() if hasattr(res, 'tolist') else res
            embeddings_data.append({
                "object": "embedding",
                "index": idx,
                "embedding": embedding
            })

        word_count = sum(len(t.split()) for t in texts)
        return {
            "object": "list",
            "data": embeddings_data,
            "model": "bge-m3",
            "usage": {"prompt_tokens": word_count, "total_tokens": word_count}
        }

    except Exception as e:
        return _server_error(e)


# ============ Zero-shot classification ============

@app.post("/v1/classify")
async def classify_text(request: Request):
    """Classify ``text`` against ``labels`` with the zero-shot model.

    Handles both result shapes: an object exposing ``labels``/``scores``,
    and a list of elements each carrying ``label`` and ``score``. For the
    list shape, labels and scores are returned sorted by descending score.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})

    try:
        body = await request.json()
        text = body.get("text", "")
        labels = body.get("labels", ["positive", "negative", "neutral"])
        multi_label = body.get("multi_label", False)

        if not text:
            return JSONResponse(status_code=400, content={"error": "Text is required"})

        # Cache. multi_label is part of the key because it changes the scores.
        cache_key = get_cache_key([text, labels, multi_label], "classify")
        cached = get_cached_response(cache_key)
        if cached:
            return cached

        result = classify_client.zero_shot_classification(
            text,
            labels,
            multi_label=multi_label
        )

        if hasattr(result, 'labels'):
            # Aggregate shape: parallel ``labels`` / ``scores`` sequences.
            out_labels = result.labels
            out_scores = result.scores if hasattr(result, 'scores') else []
            predicted = result.labels[0] if result.labels else None
        elif isinstance(result, list) and result:
            # List shape: one element per candidate label.
            ranked = sorted(
                ((_read_field(r, "label"), _read_field(r, "score", 0.0) or 0.0) for r in result),
                key=lambda pair: pair[1],
                reverse=True,
            )
            out_labels = [label for label, _ in ranked]
            out_scores = [score for _, score in ranked]
            predicted = out_labels[0]
        else:
            # Unknown or empty result: echo the requested labels, no scores.
            out_labels = labels
            out_scores = []
            predicted = None

        response = {
            "object": "classification",
            "text": text,
            "labels": out_labels,
            "scores": out_scores,
            "predicted_label": predicted,
            "model": "xlm-roberta-large-xnli"
        }

        set_cached_response(cache_key, response)
        return response

    except Exception as e:
        return _server_error(e)


# ============ Summarization ============

@app.post("/v1/summarize")
async def summarize_text(request: Request):
    """Summarize ``text`` and report character lengths and compression ratio.

    ``compression_ratio`` is the summary length as a percentage of the
    original length, rounded to two decimals.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})

    try:
        body = await request.json()
        text = body.get("text", "")
        max_length = body.get("max_length", 150)
        min_length = body.get("min_length", 30)

        if not text:
            return JSONResponse(status_code=400, content={"error": "Text is required"})

        # Cache. The length limits are part of the key because they change
        # the summary produced for the same text.
        cache_key = get_cache_key([text, max_length, min_length], "summarize")
        cached = get_cached_response(cache_key)
        if cached:
            return cached

        # NOTE(review): newer huggingface_hub releases may expect
        # ``generate_parameters`` instead of ``parameters`` — confirm against
        # the installed version.
        result = summarize_client.summarization(
            text,
            parameters={"max_length": max_length, "min_length": min_length}
        )

        summary = result.summary_text if hasattr(result, 'summary_text') else str(result)

        response = {
            "object": "summarization",
            "original_length": len(text),
            "summary": summary,
            "summary_length": len(summary),
            "compression_ratio": round(len(summary) / len(text) * 100, 2),
            "model": "mt5-multilingual"
        }

        set_cached_response(cache_key, response)
        return response

    except Exception as e:
        return _server_error(e)


# ============ Sentiment analysis ============

@app.post("/v1/sentiment")
async def analyze_sentiment(request: Request):
    """Classify the sentiment of ``text``.

    The reported sentiment is the highest-scoring label, mapped to
    Portuguese when a mapping exists; the raw label and all scores are
    returned as well.
    """
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})

    try:
        body = await request.json()
        text = body.get("text", "")

        if not text:
            return JSONResponse(status_code=400, content={"error": "Text is required"})

        # Cache
        cache_key = get_cache_key(text, "sentiment")
        cached = get_cached_response(cache_key)
        if cached:
            return cached

        result = sentiment_client.text_classification(text)

        # Map model labels to the Portuguese names exposed by the API.
        label_map = {
            "positive": "positivo",
            "negative": "negativo",
            "neutral": "neutro",
            "POSITIVE": "positivo",
            "NEGATIVE": "negativo",
            "NEUTRAL": "neutro",
            "1 star": "negativo",
            "5 stars": "positivo"
        }

        if isinstance(result, list) and len(result) > 0:
            # Pick the best-scoring element rather than relying on the
            # result order; ties keep the first element.
            top_result = max(result, key=lambda r: _read_field(r, "score", 0.0) or 0.0)
            label = _read_field(top_result, "label")
            if label is None:
                label = str(top_result)
            score = _read_field(top_result, "score", 0.0) or 0.0
            all_scores = [
                {
                    "label": _read_field(r, "label", str(r)),
                    "score": round(_read_field(r, "score", 0.0) or 0.0, 4),
                }
                for r in result
            ]
        else:
            label = str(result)
            score = 1.0
            all_scores = []

        response = {
            "object": "sentiment",
            "text": text,
            "sentiment": label_map.get(label, label),
            "sentiment_raw": label,
            "confidence": round(score, 4),
            "all_scores": all_scores,
            "model": "distilbert-base-multilingual"
        }

        set_cached_response(cache_key, response)
        return response

    except Exception as e:
        return _server_error(e)


# ============ Auxiliary endpoints ============

@app.get("/v1/models")
async def list_models():
    """List the models exposed by this API in an OpenAI-style envelope."""
    return {
        "object": "list",
        "data": [
            {"id": "lfm2-8b-gguf", "object": "model", "owned_by": "liquidai", "description": "Chat rápido (GGUF Q4)"},
            {"id": "gemma-3-vision", "object": "model", "owned_by": "google", "description": "Análise de imagens"},
            {"id": "bge-m3", "object": "model", "owned_by": "baai", "description": "Embeddings multilíngue"},
            {"id": "xlm-roberta-classify", "object": "model", "owned_by": "joeddav", "description": "Classificação zero-shot multilíngue"},
            {"id": "mt5-summarize", "object": "model", "owned_by": "csebuetnlp", "description": "Sumarização multilíngue"},
            {"id": "distilbert-sentiment", "object": "model", "owned_by": "lxyuan", "description": "Análise de sentimento multilíngue"}
        ]
    }


@app.get("/health")
async def health():
    """Report service status, cache size, version and configured models."""
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "cache_size": len(response_cache),
        "version": "4.1.0",
        "models": {
            "chat": "LiquidAI/LFM2-8B-A1B-GGUF (Q4)",
            "vision": "google/gemma-3-27b-it",
            "embeddings": "BAAI/bge-m3",
            "classify": "joeddav/xlm-roberta-large-xnli",
            "summarize": "csebuetnlp/mT5_multilingual_XLSum",
            "sentiment": "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
        }
    }


@app.delete("/v1/cache/clear")
async def clear_cache(request: Request):
    """Empty the in-memory response cache (requires a valid API key)."""
    if not verify_api_key(request):
        return JSONResponse(status_code=401, content={"error": "Invalid API key"})

    # Clear in place so every reference to the cache dict stays valid.
    response_cache.clear()
    return {"message": "Cache cleared", "timestamp": datetime.now().isoformat()}