File size: 5,191 Bytes

e5bf384

#!/usr/bin/env python3
"""
scripts/build_embeddings.py
Builds the FAISS embedding index for the CBMGO Agent RAG system.
Uses sentence-transformers/all-mpnet-base-v2 by default to generate the embeddings.
"""

import json
import argparse
from pathlib import Path


def carregar_chunks(chunks_path):
    """Load chunks from a JSONL file.

    Every line that is non-empty after stripping whitespace is parsed as one
    JSON document; blank lines are ignored. A summary with the number of
    loaded chunks is printed to stdout.

    Args:
        chunks_path: Path of the JSONL file, read as UTF-8 text.

    Returns:
        A list with the parsed value of each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(chunks_path, "r", encoding="utf-8") as arquivo:
        linhas = (bruta.strip() for bruta in arquivo)
        # The comprehension is evaluated inside the ``with`` block, so the
        # lazy generator above is fully consumed before the file is closed.
        registros = [json.loads(linha) for linha in linhas if linha]
    print(f"Carregados {len(registros)} chunks de {chunks_path}")
    return registros


def construir_indice(chunks, model_name, output_dir):
    """Build a FAISS index from chunk embeddings and save it with its metadata.

    The ``"text"`` field of every chunk is encoded with the given
    sentence-transformers model, the vectors are L2-normalised and added to a
    flat inner-product index. Two files are written to ``output_dir``:
    ``faiss_index.bin`` (the index) and ``chunks_meta.json`` (chunk count,
    model name, vector dimension and the id/section/source of each chunk, in
    index order).

    Args:
        chunks: Sequence of dicts; each must have a ``"text"`` key and may
            have ``"id"``, ``"section"`` and ``"source"``.
        model_name: Name passed to ``SentenceTransformer``.
        output_dir: Directory for the output files; created if missing.

    Returns:
        True when both files were written. False when ``chunks`` is empty or
        when numpy, faiss or sentence-transformers cannot be imported.

    Raises:
        KeyError: If a chunk has no ``"text"`` key.
    """
    # An empty corpus would produce a 1-D embeddings array, and reading
    # ``shape[1]`` below would raise IndexError after the model was loaded.
    # Report it through the same False return used for other failures.
    if not chunks:
        print("Nenhum chunk para indexar; indice nao foi construido")
        return False

    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        print("Instale com: pip install faiss-cpu sentence-transformers numpy")
        return False

    # Extract the inputs before loading the model so that a malformed chunk
    # (missing "text") fails fast instead of after the model load.
    textos = [c["text"] for c in chunks]
    metadados = [
        {
            "id": c.get("id", ""),
            "section": c.get("section", ""),
            "source": c.get("source", ""),
        }
        for c in chunks
    ]

    print(f"Carregando modelo: {model_name}")
    model = SentenceTransformer(model_name)

    print(f"Gerando embeddings para {len(textos)} chunks...")
    embeddings = model.encode(textos, batch_size=32, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    # Normalise in place so that inner product equals cosine similarity.
    faiss.normalize_L2(embeddings)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # inner product on unit vectors
    index.add(embeddings)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Save the index.
    index_file = output_path / "faiss_index.bin"
    faiss.write_index(index, str(index_file))
    print(f"Indice FAISS salvo: {index_file}")

    # Save the metadata; entry i describes the vector stored at position i.
    meta_file = output_path / "chunks_meta.json"
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "total_chunks": len(chunks),
                "modelo": model_name,
                "dimensao": dimension,
                "metadados": metadados,
            },
            f,
            ensure_ascii=False,
            indent=2,
        )
    print(f"Metadados salvos: {meta_file}")

    return True


def testar_busca(query, chunks_path, index_path, meta_path, model_name, top_k=3):
    """Run one sample query against a saved FAISS index and print the hits.

    The index, its metadata file and the chunk file are loaded, the query is
    encoded with ``model_name`` and L2-normalised, and the ``top_k`` nearest
    chunks are printed with their score, section and the first 150 characters
    of their text.

    Args:
        query: Text to search for.
        chunks_path: JSONL chunk file; result labels index into this list.
        index_path: Path of the saved FAISS index.
        meta_path: Path of the metadata JSON saved next to the index.
        model_name: Name passed to ``SentenceTransformer``.
        top_k: Number of neighbours requested from the index.

    Returns:
        None. Results and warnings are printed to stdout. Returns early when
        numpy, faiss or sentence-transformers cannot be imported.
    """
    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        return

    # Load the index, its metadata and the chunk texts.
    index = faiss.read_index(str(index_path))
    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    chunks = carregar_chunks(chunks_path)

    # Sanity checks: a different model or a chunk file that does not match
    # the index makes the printed results meaningless, so warn about it.
    modelo_indice = meta.get("modelo") if isinstance(meta, dict) else None
    if modelo_indice and modelo_indice != model_name:
        print(f"Aviso: indice construido com '{modelo_indice}', busca usando '{model_name}'")
    if index.ntotal != len(chunks):
        print(f"Aviso: indice tem {index.ntotal} vetores, mas ha {len(chunks)} chunks")

    model = SentenceTransformer(model_name)

    # Encode and normalise the query the same way the indexed vectors were.
    query_embedding = np.array(model.encode([query])).astype("float32")
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, top_k)

    print(f"\nQuery: {query}")
    print(f"Top {top_k} resultados:")
    posicao = 0
    for score, idx in zip(scores[0], indices[0]):
        idx = int(idx)
        # FAISS pads missing neighbours with label -1; indexing the list with
        # it would silently show the last chunk. Also skip labels beyond the
        # chunk file, which would raise IndexError.
        if idx < 0 or idx >= len(chunks):
            continue
        posicao += 1
        chunk = chunks[idx]
        print(f"\n  {posicao}. [{score:.4f}] {chunk.get('section','')}")
        print(f"     {chunk['text'][:150]}...")
    if posicao == 0:
        print("\n  (nenhum resultado)")


def main():
    """Command-line entry point: build the index and optionally test a search.

    Options:
        --chunks: JSONL chunk file (default ``data/chunks.jsonl``).
        --model: Embedding model name.
        --output: Directory that receives the index and metadata files.
        --test: Run one sample search after a successful build.

    Prints a hint and returns when the chunk file does not exist; prints the
    next-step instructions only when the index was built successfully.
    """
    parser = argparse.ArgumentParser(description="Construtor de embeddings FAISS para RAG CBMGO")
    opcoes = [
        ("--chunks", dict(type=str, default="data/chunks.jsonl", help="Arquivo JSONL de chunks")),
        ("--model", dict(type=str, default="sentence-transformers/all-mpnet-base-v2",
                         help="Modelo de embeddings")),
        ("--output", dict(type=str, default="data", help="Diretorio de saida")),
        ("--test", dict(action="store_true", help="Testar busca apos construcao")),
    ]
    for flag, config in opcoes:
        parser.add_argument(flag, **config)
    args = parser.parse_args()

    chunks_path = Path(args.chunks)
    if not chunks_path.exists():
        print(f"Arquivo de chunks nao encontrado: {chunks_path}")
        print("Execute primeiro: python scripts/build_chunks.py --synthetic")
        return

    chunks = carregar_chunks(chunks_path)
    # Nothing else to do when the build failed; the builder already reported why.
    if not construir_indice(chunks, args.model, args.output):
        return

    if args.test:
        destino = Path(args.output)
        testar_busca(
            query="quantos extintores preciso para um escritorio de 300m2",
            chunks_path=chunks_path,
            index_path=destino / "faiss_index.bin",
            meta_path=destino / "chunks_meta.json",
            model_name=args.model,
            top_k=3,
        )

    print("\nConstrucao concluida!")
    print("Proximos passos:")
    print(f"  1. Configure FAISS_INDEX_PATH='{args.output}/faiss_index.bin' no Space HF")
    print(f"  2. Configure CHUNKS_META_PATH='{args.output}/chunks_meta.json' no Space HF")
    print("  3. Execute: python app.py")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()