| |
| """ |
| scripts/build_embeddings.py |
| Constroi indice FAISS de embeddings para o sistema RAG do Agente CBMGO |
| Usa sentence-transformers/all-mpnet-base-v2 para gerar embeddings |
| """ |
|
|
| import json |
| import argparse |
| from pathlib import Path |
|
|
|
|
def carregar_chunks(chunks_path):
    """Load chunk records from a JSONL file.

    Every non-blank line is parsed as one JSON document; lines that are
    empty after stripping whitespace are skipped.

    Args:
        chunks_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
    """
    with open(chunks_path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        records = [json.loads(entry) for entry in stripped_lines if entry]
    print(f"Carregados {len(records)} chunks de {chunks_path}")
    return records
|
|
|
|
def construir_indice(chunks, model_name, output_dir):
    """Build a FAISS index of chunk embeddings and save it with its metadata.

    The "text" of every chunk is encoded with the given SentenceTransformer
    model. The vectors are L2-normalized and stored in an ``IndexFlatIP``,
    so the inner-product scores returned by a search are cosine similarities.

    Two files are written into ``output_dir`` (created if missing):
    ``faiss_index.bin`` with the index and ``chunks_meta.json`` with the
    chunk count, model name, embedding dimension and per-chunk
    id/section/source.

    Args:
        chunks: Sequence of dicts; each must have a "text" key and may have
            "id", "section" and "source".
        model_name: Name or path handed to ``SentenceTransformer``.
        output_dir: Directory that receives the two output files.

    Returns:
        True when both files were written; False when ``chunks`` is empty
        or a required dependency is not installed.

    Raises:
        KeyError: If a chunk has no "text" key.
    """
    # With nothing to embed there is no embedding dimension to read later
    # (``embeddings.shape[1]``), so stop before paying for the model load.
    if not chunks:
        print("Nenhum chunk para indexar; indice nao foi construido.")
        return False

    # Heavy third-party imports stay local so the script can still report
    # a helpful message when they are missing.
    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        print("Instale com: pip install faiss-cpu sentence-transformers numpy")
        return False

    print(f"Carregando modelo: {model_name}")
    model = SentenceTransformer(model_name)

    textos = [c["text"] for c in chunks]
    metadados = [
        {
            "id": c.get("id", ""),
            "section": c.get("section", ""),
            "source": c.get("source", ""),
        }
        for c in chunks
    ]

    print(f"Gerando embeddings para {len(textos)} chunks...")
    embeddings = model.encode(textos, batch_size=32, show_progress_bar=True)
    # FAISS operates on C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    # In-place normalization: inner product on unit vectors == cosine.
    faiss.normalize_L2(embeddings)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    index_file = output_path / "faiss_index.bin"
    faiss.write_index(index, str(index_file))
    print(f"Indice FAISS salvo: {index_file}")

    meta_file = output_path / "chunks_meta.json"
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "total_chunks": len(chunks),
                "modelo": model_name,
                "dimensao": dimension,
                "metadados": metadados,
            },
            f,
            ensure_ascii=False,
            indent=2,
        )
    print(f"Metadados salvos: {meta_file}")

    return True
|
|
|
|
def testar_busca(query, chunks_path, index_path, meta_path, model_name, top_k=3):
    """Run one sample query against a saved FAISS index and print the hits.

    Loads the index, its metadata file and the chunks file, encodes
    ``query`` with the given model, L2-normalizes it and prints the score,
    section and first 150 characters of each hit.

    Args:
        query: Text to search for.
        chunks_path: JSONL file with the chunks the index was built from.
        index_path: Path of the serialized FAISS index.
        meta_path: Path of the JSON metadata file saved next to the index.
        model_name: Name or path handed to ``SentenceTransformer``.
        top_k: Maximum number of hits to request from the index.

    Returns:
        None. Results and warnings are printed to stdout; the function
        returns early when a dependency is not installed.
    """
    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        return

    index = faiss.read_index(str(index_path))
    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    chunks = carregar_chunks(chunks_path)

    # Index rows are mapped back to chunks purely by position, so a size
    # mismatch means the printed texts may not belong to the matched rows.
    if index.ntotal != len(chunks):
        print(
            f"Aviso: indice tem {index.ntotal} vetores, "
            f"mas o arquivo de chunks tem {len(chunks)} registros"
        )

    # Scores are only meaningful when the query is embedded with the same
    # model that produced the indexed vectors.
    modelo_indice = meta.get("modelo")
    if modelo_indice and modelo_indice != model_name:
        print(
            f"Aviso: indice construido com '{modelo_indice}', "
            f"consulta usando '{model_name}'"
        )

    model = SentenceTransformer(model_name)

    # FAISS operates on C-contiguous float32 matrices.
    query_embedding = np.ascontiguousarray(model.encode([query]), dtype="float32")
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, top_k)

    print(f"\nQuery: {query}")
    print(f"Top {top_k} resultados:")
    posicao = 0
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads missing results with label -1; indexing chunks with it
        # would silently show the last chunk. Also skip labels past the end
        # of a chunks file that is out of sync with the index.
        if idx < 0 or idx >= len(chunks):
            continue
        posicao += 1
        chunk = chunks[idx]
        print(f"\n {posicao}. [{score:.4f}] {chunk.get('section','')}")
        print(f" {chunk['text'][:150]}...")
|
|
|
|
def main():
    """Parse CLI arguments, build the index and optionally run a test search.

    Command-line options: ``--chunks`` (JSONL input), ``--model`` (embedding
    model), ``--output`` (output directory) and ``--test`` (run one sample
    query after a successful build).

    Returns:
        Process exit status: 0 when the index was built, 1 when the chunks
        file does not exist or the build did not succeed.
    """
    parser = argparse.ArgumentParser(description="Construtor de embeddings FAISS para RAG CBMGO")
    parser.add_argument("--chunks", type=str, default="data/chunks.jsonl", help="Arquivo JSONL de chunks")
    parser.add_argument("--model", type=str, default="sentence-transformers/all-mpnet-base-v2",
                        help="Modelo de embeddings")
    parser.add_argument("--output", type=str, default="data", help="Diretorio de saida")
    parser.add_argument("--test", action="store_true", help="Testar busca apos construcao")
    args = parser.parse_args()

    chunks_path = Path(args.chunks)
    if not chunks_path.exists():
        print(f"Arquivo de chunks nao encontrado: {chunks_path}")
        print("Execute primeiro: python scripts/build_chunks.py --synthetic")
        return 1

    chunks = carregar_chunks(chunks_path)
    if not construir_indice(chunks, args.model, args.output):
        # construir_indice already printed the reason for the failure.
        return 1

    if args.test:
        output_path = Path(args.output)
        testar_busca(
            query="quantos extintores preciso para um escritorio de 300m2",
            chunks_path=chunks_path,
            index_path=output_path / "faiss_index.bin",
            meta_path=output_path / "chunks_meta.json",
            model_name=args.model,
            top_k=3,
        )

    print("\nConstrucao concluida!")
    print("Proximos passos:")
    print(f" 1. Configure FAISS_INDEX_PATH='{args.output}/faiss_index.bin' no Space HF")
    print(f" 2. Configure CHUNKS_META_PATH='{args.output}/chunks_meta.json' no Space HF")
    print(" 3. Execute: python app.py")
    return 0


if __name__ == "__main__":
    # Propagate main's status so shells and CI can detect a failed build.
    raise SystemExit(main())
|
|