Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| import chromadb | |
| from src.labdaps.config import CHROMA_DIR, COLLECTION_NAME | |
| from src.labdaps.ingestion.chunker import Chunk | |
| from src.labdaps.ingestion.embedder import Embedder | |
def _get_client() -> chromadb.PersistentClient:
    """Return a persistent Chroma client rooted at ``CHROMA_DIR``.

    The storage directory (including any missing parents) is created first,
    so the call also works when nothing has been persisted yet.
    """
    storage_dir = CHROMA_DIR
    storage_dir.mkdir(parents=True, exist_ok=True)
    # The client is given the directory as a plain string path.
    storage_path = str(storage_dir)
    return chromadb.PersistentClient(path=storage_path)
def build_index(
    chunks: list[Chunk],
    embedder: Embedder,
    rebuild: bool = False,
    batch_size: int = 500,
) -> None:
    """Embed ``chunks`` and store them in the ``COLLECTION_NAME`` collection.

    The collection is created on demand with ``hnsw:space`` set to
    ``"cosine"``. If it already holds entries and ``rebuild`` is false,
    nothing is indexed and the function returns after printing a hint.

    Args:
        chunks: Chunks to index. Each one contributes its ``text`` as the
            document, plus ``source_file``, ``page_number`` and
            ``chunk_index`` as metadata.
        embedder: Object whose ``embed_passages`` turns the list of chunk
            texts into embeddings (one per text, in the same order —
            assumed, confirm against the embedder implementation).
        rebuild: When true, the existing collection is dropped first and
            the chunks are indexed even if entries were already present.
        batch_size: Maximum number of chunks sent per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    client = _get_client()

    if rebuild:
        try:
            client.delete_collection(COLLECTION_NAME)
        except Exception as exc:
            # Still best effort (the collection may simply not exist yet),
            # but report the reason instead of hiding it, so a rebuild that
            # continues on top of old data is visible.
            print(f"[INFO] Coleção '{COLLECTION_NAME}' não removida: {exc}")
        else:
            print(f"[INFO] Coleção '{COLLECTION_NAME}' removida.")

    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"},
    )

    existing = collection.count()
    if existing > 0 and not rebuild:
        print(f"[INFO] Coleção já contém {existing} chunks. Use --rebuild para re-indexar.")
        return

    if not chunks:
        # Nothing to embed; avoid handing the embedder an empty batch.
        print("[INFO] Nenhum chunk para indexar.")
        return

    texts = [c.text for c in chunks]
    embeddings = embedder.embed_passages(texts)
    # NOTE(review): ids are derived from chunk_index alone. If chunk_index
    # restarts for every source file, later chunks would overwrite earlier
    # ones on upsert — confirm that the chunker numbers chunks globally.
    ids = [f"chunk_{c.chunk_index}" for c in chunks]
    metadatas = [
        {
            "source_file": c.source_file,
            "page_number": c.page_number,
            "chunk_index": c.chunk_index,
        }
        for c in chunks
    ]

    total = len(chunks)
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        collection.upsert(
            ids=ids[start:end],
            embeddings=embeddings[start:end],
            documents=texts[start:end],
            metadatas=metadatas[start:end],
        )
        print(f"[INFO] Indexados chunks {start} a {end}/{total}")

    print(f"[INFO] Indexação concluída. Total: {collection.count()} chunks")
def query_store(query_embedding: list[float], top_k: int):
    """Look up the ``top_k`` nearest entries for one query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        top_k: Number of results requested from the collection.

    Returns:
        A ``(documents, metadatas, distances)`` tuple holding the results
        for the single submitted query.

    Any error raised while fetching the collection (presumably when it has
    not been created yet — confirm against the Chroma client) propagates to
    the caller.
    """
    store = _get_client().get_collection(COLLECTION_NAME)
    hits = store.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )
    # Exactly one query vector was sent, so each field's first row is the
    # complete answer.
    documents, metadatas, distances = (
        hits[field][0] for field in ("documents", "metadatas", "distances")
    )
    return documents, metadatas, distances
def collection_count() -> int:
    """Return how many entries the ``COLLECTION_NAME`` collection holds.

    Any failure while opening the client, fetching the collection or
    counting is treated as "nothing indexed" and reported as ``0``.
    """
    try:
        return _get_client().get_collection(COLLECTION_NAME).count()
    except Exception:
        # Deliberate fallback: an unavailable collection counts as empty.
        return 0