#!/usr/bin/env python3 """ scripts/build_embeddings.py Constroi indice FAISS de embeddings para o sistema RAG do Agente CBMGO Usa sentence-transformers/all-mpnet-base-v2 para gerar embeddings """ import json import argparse from pathlib import Path def carregar_chunks(chunks_path): """Carrega chunks de arquivo JSONL""" chunks = [] with open(chunks_path, "r", encoding="utf-8") as f: for line in f: line = line.strip() if line: chunks.append(json.loads(line)) print(f"Carregados {len(chunks)} chunks de {chunks_path}") return chunks def construir_indice(chunks, model_name, output_dir): """Constroi indice FAISS com embeddings""" try: import numpy as np import faiss from sentence_transformers import SentenceTransformer except ImportError as e: print(f"Dependencia nao instalada: {e}") print("Instale com: pip install faiss-cpu sentence-transformers numpy") return False print(f"Carregando modelo: {model_name}") model = SentenceTransformer(model_name) textos = [c["text"] for c in chunks] metadados = [{"id": c.get("id",""), "section": c.get("section",""), "source": c.get("source","")} for c in chunks] print(f"Gerando embeddings para {len(textos)} chunks...") embeddings = model.encode(textos, batch_size=32, show_progress_bar=True) embeddings = np.array(embeddings).astype("float32") # Normalizar para busca por similaridade de cosseno faiss.normalize_L2(embeddings) # Criar indice FAISS dimension = embeddings.shape[1] index = faiss.IndexFlatIP(dimension) # Inner Product = cosine sim com normalizacao index.add(embeddings) output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) # Salvar indice index_file = output_path / "faiss_index.bin" faiss.write_index(index, str(index_file)) print(f"Indice FAISS salvo: {index_file}") # Salvar metadados meta_file = output_path / "chunks_meta.json" with open(meta_file, "w", encoding="utf-8") as f: json.dump({ "total_chunks": len(chunks), "modelo": model_name, "dimensao": dimension, "metadados": metadados }, f, ensure_ascii=False, indent=2) print(f"Metadados salvos: {meta_file}") return True def testar_busca(query, chunks_path, index_path, meta_path, model_name, top_k=3): """Testa uma busca no indice FAISS""" try: import numpy as np import faiss from sentence_transformers import SentenceTransformer except ImportError as e: print(f"Dependencia nao instalada: {e}") return # Carregar indice e metadados index = faiss.read_index(str(index_path)) with open(meta_path, "r", encoding="utf-8") as f: meta = json.load(f) chunks = carregar_chunks(chunks_path) model = SentenceTransformer(model_name) # Buscar query_embedding = model.encode([query]) query_embedding = query_embedding.astype("float32") faiss.normalize_L2(query_embedding) scores, indices = index.search(query_embedding, top_k) print(f"\nQuery: {query}") print(f"Top {top_k} resultados:") for i, (score, idx) in enumerate(zip(scores[0], indices[0])): chunk = chunks[idx] print(f"\n {i+1}. [{score:.4f}] {chunk.get('section','')}") print(f" {chunk['text'][:150]}...") def main(): parser = argparse.ArgumentParser(description="Construtor de embeddings FAISS para RAG CBMGO") parser.add_argument("--chunks", type=str, default="data/chunks.jsonl", help="Arquivo JSONL de chunks") parser.add_argument("--model", type=str, default="sentence-transformers/all-mpnet-base-v2", help="Modelo de embeddings") parser.add_argument("--output", type=str, default="data", help="Diretorio de saida") parser.add_argument("--test", action="store_true", help="Testar busca apos construcao") args = parser.parse_args() chunks_path = Path(args.chunks) if not chunks_path.exists(): print(f"Arquivo de chunks nao encontrado: {chunks_path}") print("Execute primeiro: python scripts/build_chunks.py --synthetic") return chunks = carregar_chunks(chunks_path) success = construir_indice(chunks, args.model, args.output) if success and args.test: output_path = Path(args.output) testar_busca( query="quantos extintores preciso para um escritorio de 300m2", chunks_path=chunks_path, index_path=output_path / "faiss_index.bin", meta_path=output_path / "chunks_meta.json", model_name=args.model, top_k=3 ) if success: print("\nConstrucao concluida!") print(f"Proximos passos:") print(f" 1. Configure FAISS_INDEX_PATH='{args.output}/faiss_index.bin' no Space HF") print(f" 2. Configure CHUNKS_META_PATH='{args.output}/chunks_meta.json' no Space HF") print(f" 3. Execute: python app.py") if __name__ == "__main__": main()