Spaces:
Sleeping
Sleeping
File size: 2,411 Bytes
d0d2f42 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | from dataclasses import dataclass
import chromadb
from rag.embeddings import get_embeddings_batch
from rag.ingestion import Chunk
@dataclass
class SearchResult:
    """One hit produced by a similarity search over the collection.

    Instances are built by ``search`` from the parallel result lists that
    the collection query returns.
    """

    # Document text of the matched chunk.
    content: str
    # Metadata stored alongside the chunk.
    metadata: dict
    # Similarity score; ``search`` fills it with ``1 - distance``.
    score: float
    # Identifier of the chunk inside the collection.
    chunk_id: str
def create_vectorstore(
    collection_name: str, persist_dir: str = "./chroma_db"
) -> chromadb.Collection:
    """Open the named ChromaDB collection, creating it if it does not exist.

    Args:
        collection_name: Name of the collection to fetch or create.
        persist_dir: Directory handed to the persistent Chroma client.

    Returns:
        The collection, requested with the cosine HNSW space.
    """
    space_config = {"hnsw:space": "cosine"}
    persistent_client = chromadb.PersistentClient(path=persist_dir)
    return persistent_client.get_or_create_collection(
        name=collection_name,
        metadata=space_config,
    )
def index_chunks(
    collection: chromadb.Collection, chunks: list[Chunk], batch_size: int = 50
) -> int:
    """Embed chunks and upsert them into a ChromaDB collection in batches.

    Args:
        collection: Target collection; each batch is written with ``upsert``.
        chunks: Chunks to index. Each one supplies ``content``, ``chunk_id``
            and ``metadata``.
        batch_size: Maximum number of chunks embedded and upserted per call.

    Returns:
        The number of chunks indexed (0 when ``chunks`` is empty).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() rejects a zero step with an unrelated message and yields
        # nothing for a negative one, which would silently index no chunks.
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    total = len(chunks)
    indexed = 0
    for start in range(0, total, batch_size):
        batch = chunks[start : start + batch_size]
        texts = [chunk.content for chunk in batch]
        ids = [chunk.chunk_id for chunk in batch]
        metadatas = [chunk.metadata for chunk in batch]
        # Embeddings are computed last so a malformed chunk fails before
        # the embedding call is made.
        embeddings = get_embeddings_batch(texts)
        collection.upsert(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadatas,
        )
        indexed += len(batch)
        print(f" Indexados {indexed}/{total} chunks...")
    return indexed
def search(
    collection: chromadb.Collection,
    query: str,
    n_results: int = 5,
    where: dict | None = None,
) -> list[SearchResult]:
    """Return the chunks most similar to ``query``.

    The query text is embedded and sent to the collection; each returned
    distance is converted to a similarity score as ``1 - distance``. That
    conversion assumes a cosine-distance collection, which is what
    ``create_vectorstore`` requests.

    Args:
        collection: Collection to query.
        query: Text to embed and search for.
        n_results: Maximum number of hits to request.
        where: Optional metadata filter; it is only forwarded when truthy.

    Returns:
        One ``SearchResult`` per hit, in the order the collection returned
        them. ``metadata`` is always a dict, even for records that were
        stored without metadata.
    """
    # Kept as a function-scope import, as in the original code.
    from rag.embeddings import get_embedding

    kwargs: dict = {
        "query_embeddings": [get_embedding(query)],
        "n_results": n_results,
    }
    if where:
        kwargs["where"] = where
    results = collection.query(**kwargs)

    # Every field holds one list per query embedding; only one was sent.
    ids = results["ids"][0]
    documents = results["documents"][0]
    metadatas = results["metadatas"][0]
    distances = results["distances"][0]

    return [
        SearchResult(
            content=document,
            # Chroma reports None for records stored without metadata;
            # normalise it so the declared ``dict`` type always holds.
            metadata=metadata if metadata is not None else {},
            score=1 - distance,  # cosine distance -> similarity
            chunk_id=chunk_id,
        )
        for chunk_id, document, metadata, distance in zip(
            ids, documents, metadatas, distances, strict=True
        )
    ]
|