#!/usr/bin/env python3
"""
scripts/build_embeddings.py
Constroi indice FAISS de embeddings para o sistema RAG do Agente CBMGO
Usa sentence-transformers/all-mpnet-base-v2 para gerar embeddings
"""
import json
import argparse
from pathlib import Path
def carregar_chunks(chunks_path):
    """Read a JSONL file and return its records as a list.

    Every line of ``chunks_path`` that is non-empty after stripping
    whitespace is parsed as one JSON document; blank lines are skipped.
    A one-line summary with the number of records is printed before the
    list is returned.
    """
    with open(chunks_path, "r", encoding="utf-8") as arquivo:
        # Strip first so whitespace-only lines are filtered out as well.
        linhas = (bruta.strip() for bruta in arquivo)
        registros = [json.loads(linha) for linha in linhas if linha]
    print(f"Carregados {len(registros)} chunks de {chunks_path}")
    return registros
def construir_indice(chunks, model_name, output_dir):
    """Build a FAISS index from chunk embeddings and save it with its metadata.

    Args:
        chunks: List of dicts. Each must have a ``"text"`` key; ``"id"``,
            ``"section"`` and ``"source"`` are optional and default to ``""``
            in the saved metadata.
        model_name: Name passed to ``SentenceTransformer`` to load the
            embedding model.
        output_dir: Directory (created if missing) that receives
            ``faiss_index.bin`` and ``chunks_meta.json``.

    Returns:
        True when both files were written; False when ``chunks`` is empty or
        when numpy / faiss / sentence-transformers cannot be imported.
    """
    # With no chunks there is nothing to embed, and the `embeddings.shape[1]`
    # lookup below would have no dimension to read. Bail out before paying
    # the cost of loading the model.
    if not chunks:
        print("Nenhum chunk para indexar; indice nao foi criado.")
        return False

    # Heavy third-party imports are kept local so the script can report a
    # friendly message instead of failing at import time.
    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        print("Instale com: pip install faiss-cpu sentence-transformers numpy")
        return False

    print(f"Carregando modelo: {model_name}")
    model = SentenceTransformer(model_name)

    # `textos` and `metadados` are built in the same order as `chunks`, so
    # position i in the index corresponds to metadados[i].
    textos = [c["text"] for c in chunks]
    metadados = [
        {
            "id": c.get("id", ""),
            "section": c.get("section", ""),
            "source": c.get("source", ""),
        }
        for c in chunks
    ]

    print(f"Gerando embeddings para {len(textos)} chunks...")
    embeddings = model.encode(textos, batch_size=32, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    # Normalize in place so that inner product equals cosine similarity.
    faiss.normalize_L2(embeddings)

    # Flat inner-product index over the normalized vectors.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Save the index.
    index_file = output_path / "faiss_index.bin"
    faiss.write_index(index, str(index_file))
    print(f"Indice FAISS salvo: {index_file}")

    # Save the metadata alongside the index.
    meta_file = output_path / "chunks_meta.json"
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "total_chunks": len(chunks),
                "modelo": model_name,
                "dimensao": dimension,
                "metadados": metadados,
            },
            f,
            ensure_ascii=False,
            indent=2,
        )
    print(f"Metadados salvos: {meta_file}")
    return True
def testar_busca(query, chunks_path, index_path, meta_path, model_name, top_k=3):
    """Run one query against a saved FAISS index and print the best matches.

    Args:
        query: Text to search for.
        chunks_path: JSONL file with the chunks, read via ``carregar_chunks``;
            result positions from the index are used to look up entries here.
        index_path: Path of the FAISS index file to read.
        meta_path: Path of the JSON metadata file written next to the index.
        model_name: Name passed to ``SentenceTransformer`` to embed the query.
        top_k: Number of neighbours requested from the index.

    Returns:
        None. Results are printed. If numpy / faiss / sentence-transformers
        cannot be imported, a message is printed and the function returns
        early.
    """
    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        return

    # Load index, metadata and the chunk texts.
    index = faiss.read_index(str(index_path))
    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    chunks = carregar_chunks(chunks_path)

    # Scores are only meaningful when the query is embedded with the same
    # model that produced the index, so warn on a mismatch.
    modelo_indice = meta.get("modelo") if isinstance(meta, dict) else None
    if modelo_indice and modelo_indice != model_name:
        print(f"Aviso: indice construido com '{modelo_indice}', mas a busca usa '{model_name}'")

    model = SentenceTransformer(model_name)

    # Embed and normalize the query the same way the indexed vectors were.
    query_embedding = np.ascontiguousarray(model.encode([query]), dtype="float32")
    faiss.normalize_L2(query_embedding)
    scores, indices = index.search(query_embedding, top_k)

    print(f"\nQuery: {query}")
    print(f"Top {top_k} resultados:")
    i = 0
    for score, idx in zip(scores[0], indices[0]):
        # FAISS fills missing result slots with -1 when the index holds fewer
        # than top_k vectors; chunks[-1] would wrongly show the last chunk.
        if idx < 0:
            continue
        chunk = chunks[idx]
        print(f"\n {i+1}. [{score:.4f}] {chunk.get('section','')}")
        print(f" {chunk['text'][:150]}...")
        i += 1
def main():
    """Command-line entry point.

    Parses ``--chunks``, ``--model``, ``--output`` and ``--test``, loads the
    chunks file, builds the index and, when ``--test`` is given and the build
    succeeded, runs one sample search. Prints follow-up instructions after a
    successful build. Returns early, with a hint, if the chunks file is
    missing.
    """
    cli = argparse.ArgumentParser(description="Construtor de embeddings FAISS para RAG CBMGO")
    cli.add_argument("--chunks", type=str, default="data/chunks.jsonl", help="Arquivo JSONL de chunks")
    cli.add_argument(
        "--model",
        type=str,
        default="sentence-transformers/all-mpnet-base-v2",
        help="Modelo de embeddings",
    )
    cli.add_argument("--output", type=str, default="data", help="Diretorio de saida")
    cli.add_argument("--test", action="store_true", help="Testar busca apos construcao")
    opts = cli.parse_args()

    arquivo_chunks = Path(opts.chunks)
    if not arquivo_chunks.exists():
        print(f"Arquivo de chunks nao encontrado: {arquivo_chunks}")
        print("Execute primeiro: python scripts/build_chunks.py --synthetic")
        return

    registros = carregar_chunks(arquivo_chunks)
    # Nothing further to do (no test, no instructions) when the build fails.
    if not construir_indice(registros, opts.model, opts.output):
        return

    if opts.test:
        destino = Path(opts.output)
        testar_busca(
            query="quantos extintores preciso para um escritorio de 300m2",
            chunks_path=arquivo_chunks,
            index_path=destino / "faiss_index.bin",
            meta_path=destino / "chunks_meta.json",
            model_name=opts.model,
            top_k=3,
        )

    print("\nConstrucao concluida!")
    print("Proximos passos:")
    print(f" 1. Configure FAISS_INDEX_PATH='{opts.output}/faiss_index.bin' no Space HF")
    print(f" 2. Configure CHUNKS_META_PATH='{opts.output}/chunks_meta.json' no Space HF")
    print(" 3. Execute: python app.py")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|