CERCON / scripts /build_embeddings.py
carlosh10's picture
fix: Adiciona conteudo completo do build_embeddings.py com FAISS
e5bf384 verified
#!/usr/bin/env python3
"""
scripts/build_embeddings.py
Constroi indice FAISS de embeddings para o sistema RAG do Agente CBMGO
Usa sentence-transformers/all-mpnet-base-v2 para gerar embeddings
"""
import json
import argparse
from pathlib import Path
def carregar_chunks(chunks_path):
    """Load chunks from a JSONL file (one JSON document per line).

    Blank lines are skipped. The file is decoded as ``utf-8-sig`` so that a
    leading UTF-8 byte-order mark does not make the first record unparseable;
    files without a BOM are read exactly as with plain ``utf-8``.

    Args:
        chunks_path: Path (``str`` or ``os.PathLike``) of the JSONL file.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending record.
    """
    chunks = []
    with open(chunks_path, "r", encoding="utf-8-sig") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                chunks.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, enriched with file/line
                # context, so a bad record in a large file can be located.
                raise json.JSONDecodeError(
                    f"{exc.msg} ({chunks_path}, linha {line_number})",
                    exc.doc,
                    exc.pos,
                ) from exc
    print(f"Carregados {len(chunks)} chunks de {chunks_path}")
    return chunks
def construir_indice(chunks, model_name, output_dir):
    """Embed every chunk, build a FAISS index, and save index plus metadata.

    Two files are written inside ``output_dir`` (created if missing):
    ``faiss_index.bin`` with the index, and ``chunks_meta.json`` with the
    chunk count, model name, embedding dimension and the per-chunk ``id`` /
    ``section`` / ``source`` values, listed in the same order as ``chunks``.

    Args:
        chunks: Sequence of dicts. Each must have a ``"text"`` key; ``"id"``,
            ``"section"`` and ``"source"`` are optional and default to ``""``.
        model_name: Model identifier passed to ``SentenceTransformer``.
        output_dir: Directory that receives the two output files.

    Returns:
        ``True`` when both files were written; ``False`` when ``chunks`` is
        empty or a required package (numpy, faiss, sentence-transformers)
        cannot be imported.

    Raises:
        KeyError: If a chunk has no ``"text"`` key.
    """
    # Guard first: with no chunks there are no embeddings to derive the
    # index dimension from, so fail cleanly instead of crashing later.
    if not chunks:
        print("Nenhum chunk para indexar; indice nao foi construido.")
        return False

    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        print("Instale com: pip install faiss-cpu sentence-transformers numpy")
        return False

    # Extract texts/metadata before loading the model so a malformed chunk
    # (missing "text") is reported before the expensive model load.
    textos = [c["text"] for c in chunks]
    metadados = [
        {
            "id": c.get("id", ""),
            "section": c.get("section", ""),
            "source": c.get("source", ""),
        }
        for c in chunks
    ]

    print(f"Carregando modelo: {model_name}")
    model = SentenceTransformer(model_name)

    print(f"Gerando embeddings para {len(textos)} chunks...")
    embeddings = model.encode(textos, batch_size=32, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    # L2-normalize in place so that inner product equals cosine similarity.
    faiss.normalize_L2(embeddings)

    # Flat inner-product index over the normalized vectors.
    dimension = int(embeddings.shape[1])
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Save the index.
    index_file = output_path / "faiss_index.bin"
    faiss.write_index(index, str(index_file))
    print(f"Indice FAISS salvo: {index_file}")

    # Save the metadata next to the index.
    meta_file = output_path / "chunks_meta.json"
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "total_chunks": len(chunks),
                "modelo": model_name,
                "dimensao": dimension,
                "metadados": metadados,
            },
            f,
            ensure_ascii=False,
            indent=2,
        )
    print(f"Metadados salvos: {meta_file}")
    return True
def testar_busca(query, chunks_path, index_path, meta_path, model_name, top_k=3):
    """Run one query against a saved FAISS index and print the best matches.

    The query is embedded with ``model_name``, L2-normalized and searched in
    the index read from ``index_path``. Each hit is mapped back to its chunk
    by position in the list loaded from ``chunks_path``.

    Args:
        query: Text to search for.
        chunks_path: JSONL file with the chunks, loaded via ``carregar_chunks``.
        index_path: Path of the FAISS index file.
        meta_path: Path of the JSON metadata file saved with the index; its
            ``total_chunks`` value is used only for a consistency warning.
        model_name: Model identifier passed to ``SentenceTransformer``.
        top_k: Number of nearest neighbours requested from the index.

    Returns:
        ``None``. Results are printed; if a required package cannot be
        imported, a message is printed and the function returns early.
    """
    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        return

    # Load index, metadata and the chunk texts.
    index = faiss.read_index(str(index_path))
    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    chunks = carregar_chunks(chunks_path)

    # Result ids are positions into `chunks`, so warn when the index, the
    # metadata and the chunks file disagree about the number of entries.
    total_meta = meta.get("total_chunks") if isinstance(meta, dict) else None
    if index.ntotal != len(chunks) or (
        total_meta is not None and total_meta != len(chunks)
    ):
        print(
            f"Aviso: indice tem {index.ntotal} vetores, metadados registram "
            f"{total_meta} chunks e {chunks_path} tem {len(chunks)}; "
            "os resultados podem nao corresponder aos textos."
        )

    model = SentenceTransformer(model_name)

    # Embed the query as a contiguous float32 matrix and normalize it the
    # same way the indexed vectors were normalized.
    query_embedding = np.ascontiguousarray(model.encode([query]), dtype="float32")
    faiss.normalize_L2(query_embedding)
    scores, indices = index.search(query_embedding, top_k)

    print(f"\nQuery: {query}")
    print(f"Top {top_k} resultados:")
    for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
        idx = int(idx)
        # FAISS fills missing results with id -1 when the index holds fewer
        # than top_k vectors; a negative id would silently pick a chunk from
        # the end of the list. Ids past the end mean the index and the chunks
        # file are out of sync.
        if idx < 0 or idx >= len(chunks):
            continue
        chunk = chunks[idx]
        print(f"\n {i+1}. [{score:.4f}] {chunk.get('section','')}")
        print(f" {chunk['text'][:150]}...")
def main(argv=None):
    """Command-line entry point: build the index and optionally test a search.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Returns:
        ``0`` when the index was built; ``1`` when the chunks file is missing
        or ``construir_indice`` reported failure.
    """
    parser = argparse.ArgumentParser(
        description="Construtor de embeddings FAISS para RAG CBMGO"
    )
    parser.add_argument(
        "--chunks",
        type=str,
        default="data/chunks.jsonl",
        help="Arquivo JSONL de chunks",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="sentence-transformers/all-mpnet-base-v2",
        help="Modelo de embeddings",
    )
    parser.add_argument(
        "--output", type=str, default="data", help="Diretorio de saida"
    )
    parser.add_argument(
        "--test", action="store_true", help="Testar busca apos construcao"
    )
    args = parser.parse_args(argv)

    chunks_path = Path(args.chunks)
    # is_file() rather than exists(): a directory at this path cannot be read
    # as a chunks file either.
    if not chunks_path.is_file():
        print(f"Arquivo de chunks nao encontrado: {chunks_path}")
        print("Execute primeiro: python scripts/build_chunks.py --synthetic")
        return 1

    chunks = carregar_chunks(chunks_path)
    if not construir_indice(chunks, args.model, args.output):
        return 1

    # Same file names that construir_indice writes inside the output directory.
    output_path = Path(args.output)
    index_file = output_path / "faiss_index.bin"
    meta_file = output_path / "chunks_meta.json"

    if args.test:
        testar_busca(
            query="quantos extintores preciso para um escritorio de 300m2",
            chunks_path=chunks_path,
            index_path=index_file,
            meta_path=meta_file,
            model_name=args.model,
            top_k=3,
        )

    print("\nConstrucao concluida!")
    print("Proximos passos:")
    print(f" 1. Configure FAISS_INDEX_PATH='{index_file.as_posix()}' no Space HF")
    print(f" 2. Configure CHUNKS_META_PATH='{meta_file.as_posix()}' no Space HF")
    print(" 3. Execute: python app.py")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so a failed build yields a non-zero exit code.
    raise SystemExit(main())