File size: 5,191 Bytes
e5bf384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
"""
scripts/build_embeddings.py
Constroi indice FAISS de embeddings para o sistema RAG do Agente CBMGO
Usa sentence-transformers/all-mpnet-base-v2 para gerar embeddings
"""

import json
import argparse
from pathlib import Path


def carregar_chunks(chunks_path):
    """Read a JSONL file and return its records as a list.

    Every line is stripped of surrounding whitespace; lines that end up
    empty are ignored and the rest are parsed with ``json.loads``. A
    summary line with the number of records is printed before returning.

    Args:
        chunks_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        The parsed records, in file order.
    """
    with open(chunks_path, "r", encoding="utf-8") as arquivo:
        linhas_limpas = (bruta.strip() for bruta in arquivo)
        registros = [json.loads(linha) for linha in linhas_limpas if linha]
    print(f"Carregados {len(registros)} chunks de {chunks_path}")
    return registros


def construir_indice(chunks, model_name, output_dir):
    """Embed the chunk texts, build a FAISS index and save it to disk.

    The ``"text"`` of every chunk is encoded with the given
    sentence-transformers model, the vectors are L2-normalized and added to
    a flat inner-product index. Two files are written into ``output_dir``
    (created if it does not exist): ``faiss_index.bin`` with the index and
    ``chunks_meta.json`` with the chunk count, model name, vector dimension
    and per-chunk ``id``/``section``/``source`` metadata.

    Args:
        chunks: List of dicts. Each must contain ``"text"``; ``"id"``,
            ``"section"`` and ``"source"`` are optional and default to ``""``.
        model_name: Model identifier passed to ``SentenceTransformer``.
        output_dir: Directory that receives the two output files.

    Returns:
        True when both files were written; False when ``chunks`` is empty or
        a required dependency cannot be imported.

    Raises:
        KeyError: If a chunk has no ``"text"`` key.
    """
    # With no texts there is no embedding dimension to read further down, so
    # stop here instead of loading the model and failing afterwards.
    if not chunks:
        print("Nenhum chunk para indexar; indice nao foi construido.")
        return False

    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        print("Instale com: pip install faiss-cpu sentence-transformers numpy")
        return False

    print(f"Carregando modelo: {model_name}")
    model = SentenceTransformer(model_name)

    textos = [c["text"] for c in chunks]
    metadados = [
        {
            "id": c.get("id", ""),
            "section": c.get("section", ""),
            "source": c.get("source", ""),
        }
        for c in chunks
    ]

    print(f"Gerando embeddings para {len(textos)} chunks...")
    embeddings = model.encode(textos, batch_size=32, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    # Normalize in place so that inner product equals cosine similarity.
    faiss.normalize_L2(embeddings)

    # Flat (exhaustive) inner-product index sized to the embedding dimension.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Save the index.
    index_file = output_path / "faiss_index.bin"
    faiss.write_index(index, str(index_file))
    print(f"Indice FAISS salvo: {index_file}")

    # Save the metadata; entries are in the same order as the indexed vectors.
    meta_file = output_path / "chunks_meta.json"
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump({
            "total_chunks": len(chunks),
            "modelo": model_name,
            "dimensao": dimension,
            "metadados": metadados
        }, f, ensure_ascii=False, indent=2)
    print(f"Metadados salvos: {meta_file}")

    return True


def testar_busca(query, chunks_path, index_path, meta_path, model_name, top_k=3):
    """Run one query against a saved FAISS index and print the best matches.

    The index, the metadata JSON and the chunks JSONL are loaded from disk,
    the query is encoded with the given model and L2-normalized, and the
    ``top_k`` nearest entries are printed with their score, section and the
    first 150 characters of text. Result labels are treated as positions in
    the chunks file.

    Args:
        query: Text to search for.
        chunks_path: JSONL file with the chunks.
        index_path: Path of the saved FAISS index.
        meta_path: Path of the metadata JSON saved alongside the index.
        model_name: Model identifier passed to ``SentenceTransformer``.
        top_k: Number of results requested from the index.

    Returns:
        None. Returns early (after printing a message) if a dependency
        cannot be imported.
    """
    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        return

    # Load the index, its metadata and the chunk texts.
    index = faiss.read_index(str(index_path))
    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    chunks = carregar_chunks(chunks_path)

    # Vectors from different models are not comparable, so warn when the
    # model recorded in the metadata differs from the one used for the query.
    modelo_indice = meta.get("modelo") if isinstance(meta, dict) else None
    if modelo_indice and modelo_indice != model_name:
        print(f"Aviso: indice construido com '{modelo_indice}', mas a busca usa '{model_name}'")

    model = SentenceTransformer(model_name)

    # Encode and normalize the query the same way the indexed vectors were.
    query_embedding = model.encode([query])
    query_embedding = query_embedding.astype("float32")
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, top_k)

    print(f"\nQuery: {query}")
    print(f"Top {top_k} resultados:")
    for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
        # FAISS pads the labels with -1 when the index holds fewer than
        # top_k vectors; indexing with it would silently show the last chunk.
        if idx < 0:
            continue
        # The chunks file may be out of sync with the index.
        if idx >= len(chunks):
            print(f"\n  {i+1}. [{score:.4f}] (chunk {idx} ausente em {chunks_path})")
            continue
        chunk = chunks[idx]
        print(f"\n  {i+1}. [{score:.4f}] {chunk.get('section','')}")
        print(f"     {chunk['text'][:150]}...")


def main():
    """Command-line entry point: build the index and optionally test it.

    Parses ``--chunks``, ``--model``, ``--output`` and ``--test``. If the
    chunks file does not exist, a hint is printed and nothing is built.
    Otherwise the chunks are loaded and indexed; when that succeeds, a sample
    query is run if ``--test`` was given, followed by the next-step
    instructions.
    """
    cli = argparse.ArgumentParser(description="Construtor de embeddings FAISS para RAG CBMGO")
    cli.add_argument("--chunks", type=str, default="data/chunks.jsonl", help="Arquivo JSONL de chunks")
    cli.add_argument(
        "--model",
        type=str,
        default="sentence-transformers/all-mpnet-base-v2",
        help="Modelo de embeddings",
    )
    cli.add_argument("--output", type=str, default="data", help="Diretorio de saida")
    cli.add_argument("--test", action="store_true", help="Testar busca apos construcao")
    opts = cli.parse_args()

    origem = Path(opts.chunks)
    if not origem.exists():
        print(f"Arquivo de chunks nao encontrado: {origem}")
        print("Execute primeiro: python scripts/build_chunks.py --synthetic")
        return

    carregados = carregar_chunks(origem)
    if not construir_indice(carregados, opts.model, opts.output):
        # Nothing was built, so there is nothing to test or announce.
        return

    if opts.test:
        destino = Path(opts.output)
        testar_busca(
            query="quantos extintores preciso para um escritorio de 300m2",
            chunks_path=origem,
            index_path=destino / "faiss_index.bin",
            meta_path=destino / "chunks_meta.json",
            model_name=opts.model,
            top_k=3,
        )

    print("\nConstrucao concluida!")
    print(f"Proximos passos:")
    print(f"  1. Configure FAISS_INDEX_PATH='{opts.output}/faiss_index.bin' no Space HF")
    print(f"  2. Configure CHUNKS_META_PATH='{opts.output}/chunks_meta.json' no Space HF")
    print(f"  3. Execute: python app.py")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()