carlosh10 committed on
Commit
e5bf384
·
verified ·
1 Parent(s): 473972a

fix: Adiciona conteudo completo do build_embeddings.py com FAISS

Browse files
Files changed (1) hide show
  1. scripts/build_embeddings.py +147 -0
scripts/build_embeddings.py CHANGED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ scripts/build_embeddings.py
4
+ Constroi indice FAISS de embeddings para o sistema RAG do Agente CBMGO
5
+ Usa sentence-transformers/all-mpnet-base-v2 para gerar embeddings
6
+ """
7
+
8
+ import json
9
+ import argparse
10
+ from pathlib import Path
11
+
12
+
13
def carregar_chunks(chunks_path):
    """Load chunk records from a JSONL file.

    Each non-blank line is parsed as one JSON document; blank or
    whitespace-only lines are skipped.

    Args:
        chunks_path: Path (str or path-like) of the JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
        OSError: If the file cannot be opened.
    """
    chunks = []
    with open(chunks_path, "r", encoding="utf-8") as f:
        for numero, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                chunks.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Keep the exception type callers may already catch, but say
                # which line of which file is broken; the decoder itself only
                # knows about the single line it was handed.
                raise json.JSONDecodeError(
                    f"{chunks_path}, linha {numero}: {e.msg}", e.doc, e.pos
                ) from e
    print(f"Carregados {len(chunks)} chunks de {chunks_path}")
    return chunks
23
+
24
+
25
def construir_indice(chunks, model_name, output_dir):
    """Build a FAISS index from chunk embeddings and save it with its metadata.

    The text of every chunk is embedded with a SentenceTransformer model,
    L2-normalized and added to a flat inner-product index. Two files are
    written into ``output_dir``: ``faiss_index.bin`` (the index) and
    ``chunks_meta.json`` (chunk count, model name, embedding dimension and
    the ``id``/``section``/``source`` of each chunk).

    Args:
        chunks: List of dicts; each must contain a ``"text"`` key. ``"id"``,
            ``"section"`` and ``"source"`` are optional and default to "".
        model_name: Model identifier passed to ``SentenceTransformer``.
        output_dir: Directory for the output files; created if missing.

    Returns:
        True when the index and metadata were written. False when ``chunks``
        is empty, a chunk has no ``"text"`` key, or a dependency (numpy,
        faiss, sentence-transformers) is not installed.
    """
    # Validate before the heavy imports and model load: an empty list would
    # otherwise fail later on embeddings.shape[1], and a chunk without
    # "text" would raise a bare KeyError with no hint of which one.
    if not chunks:
        print("Nenhum chunk para indexar; indice nao foi construido.")
        return False
    sem_texto = [
        i for i, c in enumerate(chunks)
        if not isinstance(c, dict) or "text" not in c
    ]
    if sem_texto:
        print(f"Chunks sem campo 'text' (posicoes, max. 10): {sem_texto[:10]}")
        return False

    try:
        import numpy as np
        import faiss
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        print(f"Dependencia nao instalada: {e}")
        print("Instale com: pip install faiss-cpu sentence-transformers numpy")
        return False

    print(f"Carregando modelo: {model_name}")
    model = SentenceTransformer(model_name)

    textos = [c["text"] for c in chunks]
    metadados = [
        {
            "id": c.get("id", ""),
            "section": c.get("section", ""),
            "source": c.get("source", ""),
        }
        for c in chunks
    ]

    print(f"Gerando embeddings para {len(textos)} chunks...")
    embeddings = model.encode(textos, batch_size=32, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    # Normalize in place so that inner product equals cosine similarity
    faiss.normalize_L2(embeddings)

    # Flat (exact) inner-product index sized to the embedding dimension
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Save the index
    index_file = output_path / "faiss_index.bin"
    faiss.write_index(index, str(index_file))
    print(f"Indice FAISS salvo: {index_file}")

    # Save the metadata; entry order matches the vector order in the index
    meta_file = output_path / "chunks_meta.json"
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump({
            "total_chunks": len(chunks),
            "modelo": model_name,
            "dimensao": dimension,
            "metadados": metadados
        }, f, ensure_ascii=False, indent=2)
    print(f"Metadados salvos: {meta_file}")

    return True
74
+
75
+
76
+ def testar_busca(query, chunks_path, index_path, meta_path, model_name, top_k=3):
77
+ """Testa uma busca no indice FAISS"""
78
+ try:
79
+ import numpy as np
80
+ import faiss
81
+ from sentence_transformers import SentenceTransformer
82
+ except ImportError as e:
83
+ print(f"Dependencia nao instalada: {e}")
84
+ return
85
+
86
+ # Carregar indice e metadados
87
+ index = faiss.read_index(str(index_path))
88
+ with open(meta_path, "r", encoding="utf-8") as f:
89
+ meta = json.load(f)
90
+ chunks = carregar_chunks(chunks_path)
91
+
92
+ model = SentenceTransformer(model_name)
93
+
94
+ # Buscar
95
+ query_embedding = model.encode([query])
96
+ query_embedding = query_embedding.astype("float32")
97
+ faiss.normalize_L2(query_embedding)
98
+
99
+ scores, indices = index.search(query_embedding, top_k)
100
+
101
+ print(f"\nQuery: {query}")
102
+ print(f"Top {top_k} resultados:")
103
+ for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
104
+ chunk = chunks[idx]
105
+ print(f"\n {i+1}. [{score:.4f}] {chunk.get('section','')}")
106
+ print(f" {chunk['text'][:150]}...")
107
+
108
+
109
def main():
    """Command-line entry point: build the FAISS index from a chunk file.

    Options: ``--chunks`` (input JSONL, default ``data/chunks.jsonl``),
    ``--model`` (embedding model), ``--output`` (output directory, default
    ``data``) and ``--test`` (run a sample query after a successful build).

    Returns:
        0 when the index was built, 1 when the chunk file does not exist or
        the build failed, so the shell and CI can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Construtor de embeddings FAISS para RAG CBMGO")
    parser.add_argument("--chunks", type=str, default="data/chunks.jsonl", help="Arquivo JSONL de chunks")
    parser.add_argument("--model", type=str, default="sentence-transformers/all-mpnet-base-v2",
                        help="Modelo de embeddings")
    parser.add_argument("--output", type=str, default="data", help="Diretorio de saida")
    parser.add_argument("--test", action="store_true", help="Testar busca apos construcao")
    args = parser.parse_args()

    chunks_path = Path(args.chunks)
    if not chunks_path.exists():
        print(f"Arquivo de chunks nao encontrado: {chunks_path}")
        print("Execute primeiro: python scripts/build_chunks.py --synthetic")
        return 1

    chunks = carregar_chunks(chunks_path)
    success = construir_indice(chunks, args.model, args.output)
    if not success:
        # construir_indice has already printed the reason
        return 1

    if args.test:
        output_path = Path(args.output)
        testar_busca(
            query="quantos extintores preciso para um escritorio de 300m2",
            chunks_path=chunks_path,
            index_path=output_path / "faiss_index.bin",
            meta_path=output_path / "chunks_meta.json",
            model_name=args.model,
            top_k=3,
        )

    print("\nConstrucao concluida!")
    print("Proximos passos:")
    print(f" 1. Configure FAISS_INDEX_PATH='{args.output}/faiss_index.bin' no Space HF")
    print(f" 2. Configure CHUNKS_META_PATH='{args.output}/chunks_meta.json' no Space HF")
    print(" 3. Execute: python app.py")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code
    raise SystemExit(main())