"""Helpers for building and querying the persistent Chroma collection."""

from pathlib import Path

import chromadb

from src.labdaps.config import CHROMA_DIR, COLLECTION_NAME
from src.labdaps.ingestion.chunker import Chunk
from src.labdaps.ingestion.embedder import Embedder

# Number of chunks sent to the collection per upsert call.
_UPSERT_BATCH_SIZE = 500


def _get_client() -> chromadb.PersistentClient:
    """Return a persistent Chroma client rooted at ``CHROMA_DIR``.

    The directory (and any missing parents) is created first.
    """
    CHROMA_DIR.mkdir(parents=True, exist_ok=True)
    return chromadb.PersistentClient(path=str(CHROMA_DIR))


def build_index(chunks: list[Chunk], embedder: Embedder, rebuild: bool = False) -> None:
    """Embed *chunks* and upsert them into the ``COLLECTION_NAME`` collection.

    Args:
        chunks: Chunks to index. Each one contributes its ``text`` as the
            document, and ``source_file`` / ``page_number`` / ``chunk_index``
            as metadata. The record id is ``chunk_<chunk_index>``.
        embedder: Object whose ``embed_passages(texts)`` yields one embedding
            per text, in the same order.
        rebuild: When true, the existing collection is deleted (best effort)
            before indexing. When false and the collection already holds
            records, nothing is indexed and the function returns early.
    """
    client = _get_client()

    if rebuild:
        try:
            client.delete_collection(COLLECTION_NAME)
        except Exception as exc:
            # Best effort: typically the collection simply does not exist yet.
            # Report instead of swallowing so that a genuine failure (which
            # would leave stale records under the fresh upsert) is visible.
            print(f"[INFO] Coleção '{COLLECTION_NAME}' não removida: {exc}")
        else:
            print(f"[INFO] Coleção '{COLLECTION_NAME}' removida.")

    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"},
    )

    existing = collection.count()
    if existing > 0 and not rebuild:
        print(f"[INFO] Coleção já contém {existing} chunks. Use --rebuild para re-indexar.")
        return

    if not chunks:
        # Nothing to embed; avoid calling the embedder with an empty batch.
        print("[WARN] Nenhum chunk para indexar.")
        return

    texts = [c.text for c in chunks]
    embeddings = embedder.embed_passages(texts)
    # NOTE(review): ids are derived from chunk_index only, so this assumes
    # chunk_index is unique across all source files - confirm in the chunker.
    ids = [f"chunk_{c.chunk_index}" for c in chunks]
    metadatas = [
        {
            "source_file": c.source_file,
            "page_number": c.page_number,
            "chunk_index": c.chunk_index,
        }
        for c in chunks
    ]

    total = len(chunks)
    for start in range(0, total, _UPSERT_BATCH_SIZE):
        end = start + _UPSERT_BATCH_SIZE
        collection.upsert(
            ids=ids[start:end],
            embeddings=embeddings[start:end],
            documents=texts[start:end],
            metadatas=metadatas[start:end],
        )
        print(f"[INFO] Indexados chunks {start} a {min(end, total)}/{total}")

    print(f"[INFO] Indexação concluída. Total: {collection.count()} chunks")


def query_store(
    query_embedding: list[float], top_k: int
) -> tuple[list[str], list[dict], list[float]]:
    """Query the collection with a single embedding.

    Args:
        query_embedding: Embedding vector of the query.
        top_k: Number of results requested from the collection.

    Returns:
        A ``(documents, metadatas, distances)`` tuple holding the results for
        the single query that was submitted.

    Raises:
        Whatever ``client.get_collection`` raises when the collection does not
        exist; the error is not handled here.
    """
    client = _get_client()
    collection = client.get_collection(COLLECTION_NAME)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )
    # Only one query embedding is sent, so take the first (only) result row.
    return results["documents"][0], results["metadatas"][0], results["distances"][0]


def collection_count() -> int:
    """Return the number of records in the collection, or 0 on any failure.

    Deliberately best-effort: a missing collection or any client error is
    reported as an empty index rather than raised.
    """
    try:
        client = _get_client()
        collection = client.get_collection(COLLECTION_NAME)
        return collection.count()
    except Exception:
        return 0