Spaces:

NoobNovel
/

AdaptiveRAG

Sleeping

App Files Files Community

AdaptiveRAG / versioning /index_manager.py

NoobNovel

Add Knowledge Base Versioning layer with full UI tab

66bd94a about 1 month ago

raw

history blame contribute delete

10 kB

	"""ChromaDB snapshot manager.

	Every call to add_documents():
	1. Detects which docs are new / changed / unchanged (SHA-256).
	2. Creates a new ChromaDB collection ``kb_v{N}`` for the new snapshot.
	3. Copies unchanged chunks from the previous snapshot (no re-embedding).
	4. Embeds and indexes only the changed/new chunks.
	5. Points ``latest`` at the new version in SQLite.

	Rollback is a single SQLite write — no data is ever deleted.
	"""
	from __future__ import annotations

	import os

	# Opt out of anonymized telemetry unless the environment already defines this
	# variable (setdefault never overrides an existing value). This runs ahead of
	# the chromadb import below — presumably so the library sees it on import.
	os.environ.setdefault("ANONYMIZED_TELEMETRY", "False")

	import chromadb
	from chromadb.config import Settings

	from config import PATHS
	from versioning.change_detector import ChangeReport, detect_changes
	from versioning.document_store import DocumentStore

	# Upper bound on the number of records passed to a single ChromaDB upsert call;
	# larger payloads are sliced into consecutive batches of this size.
	_CHROMA_BATCH = 256


	def _client() -> chromadb.PersistentClient:
	    """Open the persistent ChromaDB client rooted at ``PATHS["chroma_dir"]``.

	    The directory (and any missing parents) is created first, so the call
	    also works before anything has been written there.
	    """
	    chroma_dir = PATHS["chroma_dir"]
	    chroma_dir.mkdir(parents=True, exist_ok=True)

	    location = str(chroma_dir)
	    # Telemetry is disabled explicitly in the client settings.
	    client_settings = Settings(anonymized_telemetry=False)
	    return chromadb.PersistentClient(path=location, settings=client_settings)


	def _collection_name(version: int) -> str:
	    """Return the ChromaDB collection name that holds snapshot *version*."""
	    prefix = "kb_v"
	    return f"{prefix}{version}"


	class RAGVersionManager:
	    """Versioned knowledge-base layer on top of ChromaDB.

	    Every snapshot lives in its own collection named ``kb_v{N}``. Version
	    bookkeeping (history, per-document checksums, the "latest" pointer) is
	    delegated to ``self.store``.
	    """

	    def __init__(self) -> None:
	        self.store = DocumentStore()

	    # ── public API ───────────────────────────────────────────────────

	    def add_documents(
	        self,
	        chunks_by_doc: dict,
	        batch_name: str = "",
	        reason: str = "manual ingest",
	    ) -> dict:
	        """
	        Index a batch of documents, only re-embedding what changed.

	        Parameters
	        ----------
	        chunks_by_doc:
	            ``{doc_id: {"chunks": [Chunk, ...], "source_path": str, "title": str}}``
	        batch_name:
	            Human-readable label for this batch (e.g. ``"initial_14_papers"``).
	        reason:
	            Short description of why this version was created.

	        Returns
	        -------
	        dict with ``version``, ``new``, ``changed``, ``unchanged`` and
	        ``collection`` keys. When nothing changed, no snapshot is created and
	        ``version`` / ``collection`` describe the current latest snapshot
	        (``collection`` is ``None`` if there is none yet).

	        Notes
	        -----
	        Chunks of unchanged documents are copied from the previous snapshot.
	        If that copy fails or yields nothing, those documents are re-embedded
	        from ``chunks_by_doc`` so the new snapshot is never left without the
	        documents it registers as active.
	        """
	        client = _client()

	        # ── 1. change detection ──────────────────────────────────────
	        doc_meta = [
	            {"doc_id": did, "source_path": info["source_path"], "title": info["title"]}
	            for did, info in chunks_by_doc.items()
	        ]
	        report: ChangeReport = detect_changes(doc_meta, self.store)

	        if not report.has_changes:
	            latest = self.store.get_latest()
	            print(f" No changes detected ({report.summary()}) — index unchanged.")
	            return {
	                "version": latest,
	                "new": 0,
	                "changed": 0,
	                "unchanged": len(report.unchanged_docs),
	                "collection": _collection_name(latest) if latest else None,
	            }

	        # Imported lazily, and before any state is mutated, so an unavailable
	        # embedder cannot leave a bumped version with a half-built collection.
	        from ingestion.embedder import embed_texts

	        current_version = self.store.get_latest() or 0
	        new_version = self.store.bump_version()
	        new_coll_name = _collection_name(new_version)

	        print(f" Change summary: {report.summary()}")
	        print(f" Creating snapshot {new_coll_name}…")

	        new_coll = client.get_or_create_collection(
	            name=new_coll_name,
	            metadata={"hnsw:space": "cosine"},
	        )

	        unchanged_ids = {d["doc_id"] for d in report.unchanged_docs}
	        to_index_ids = {d["doc_id"] for d in report.new_docs + report.changed_docs}

	        # ── 2. copy unchanged chunks from previous snapshot ──────────
	        reused = False
	        if current_version > 0 and unchanged_ids:
	            prev_name = _collection_name(current_version)
	            try:
	                prev_coll = client.get_collection(prev_name)
	                copied = self._copy_chunks(prev_coll, new_coll, unchanged_ids)
	            except Exception as exc:
	                # Copying is only an optimisation; fall back to re-embedding.
	                print(f" Warning: could not copy from v{current_version}: {exc}")
	            else:
	                print(f" Copied {copied} chunks from {prev_name} (unchanged docs).")
	                reused = copied > 0

	        if unchanged_ids and not reused:
	            # The unchanged docs are recorded as active in the new version
	            # below, so their chunks must exist in the new collection. Upserts
	            # are keyed by chunk id, so a partial copy is simply overwritten.
	            print(
	                f" Re-embedding {len(unchanged_ids)} unchanged doc(s): "
	                "their chunks could not be reused from the previous snapshot."
	            )
	            to_index_ids |= unchanged_ids

	        # ── 3. embed + index new/changed chunks ──────────────────────
	        new_chunks = [
	            c
	            for did, info in chunks_by_doc.items()
	            if did in to_index_ids
	            for c in info["chunks"]
	        ]

	        if new_chunks:
	            ids = [c.chunk_id for c in new_chunks]
	            texts = [c.text for c in new_chunks]
	            metas = [
	                {
	                    "doc_id": c.doc_id,
	                    "source_path": c.source_path,
	                    "title": c.title,
	                    "page_start": c.page_start,
	                    "page_end": c.page_end,
	                }
	                for c in new_chunks
	            ]
	            print(f" Embedding {len(new_chunks)} chunks for {len(to_index_ids)} doc(s)…")
	            embeddings = embed_texts(texts)
	            for i in range(0, len(ids), _CHROMA_BATCH):
	                new_coll.upsert(
	                    ids=ids[i : i + _CHROMA_BATCH],
	                    documents=texts[i : i + _CHROMA_BATCH],
	                    metadatas=metas[i : i + _CHROMA_BATCH],
	                    embeddings=embeddings[i : i + _CHROMA_BATCH],
	                )

	        # ── 4. persist metadata ──────────────────────────────────────
	        # Every document of the batch is registered under the new version,
	        # whatever its change status.
	        for group in (report.new_docs, report.changed_docs, report.unchanged_docs):
	            for d in group:
	                self.store.add_doc(
	                    d["doc_id"], new_version, d["checksum"],
	                    "active", d["source_path"], d["title"],
	                )

	        self.store.log_version(
	            version=new_version,
	            batch_name=batch_name,
	            docs_added=len(report.new_docs),
	            docs_changed=len(report.changed_docs),
	            docs_unchanged=len(report.unchanged_docs),
	            reason=reason,
	            collection_name=new_coll_name,
	        )
	        # Publishing the snapshot is the last step: "latest" only moves once
	        # the collection and its metadata are fully written.
	        self.store.set_latest(new_version)

	        print(
	            f" ✓ Created {new_coll_name} — "
	            f"{len(report.new_docs)} new, {len(report.changed_docs)} changed, "
	            f"{len(report.unchanged_docs)} unchanged"
	        )
	        return {
	            "version": new_version,
	            "new": len(report.new_docs),
	            "changed": len(report.changed_docs),
	            "unchanged": len(report.unchanged_docs),
	            "collection": new_coll_name,
	        }

	    def rollback(self, to_version: int) -> None:
	        """Point 'latest' at a previous snapshot (metadata-only, instant).

	        Raises
	        ------
	        ValueError
	            If ``to_version`` does not appear in the store's version history.
	        """
	        known = {v["version"] for v in self.store.get_history()}
	        if to_version not in known:
	            raise ValueError(f"Version {to_version} not found. Known: {sorted(known)}")
	        self.store.set_latest(to_version)
	        print(f" Rolled back to v{to_version}")

	    def list_versions(self) -> list[dict]:
	        """Return the version history exactly as reported by the store."""
	        return self.store.get_history()

	    def get_collection(self, version: str | int = "latest") -> chromadb.Collection:
	        """Return the ChromaDB collection backing a snapshot.

	        Parameters
	        ----------
	        version:
	            ``"latest"`` (resolved through the store) or anything ``int()``
	            accepts as a version number.

	        Raises
	        ------
	        RuntimeError
	            If ``"latest"`` is requested but the store has no latest version.
	        """
	        v = self.store.get_latest() if version == "latest" else int(version)
	        if v is None:
	            raise RuntimeError("No versioned snapshots exist yet. Run ingest first.")
	        return _client().get_collection(_collection_name(v))

	    def collection_exists(self, version: str | int = "latest") -> bool:
	        """Return True if the snapshot's collection can be opened, else False."""
	        try:
	            self.get_collection(version)
	        except Exception:
	            # Deliberately broad: any failure to resolve or open the
	            # collection is reported as "does not exist".
	            return False
	        return True

	    def query(
	        self,
	        text: str,
	        version: str | int = "latest",
	        k: int = 12,
	    ) -> list:
	        """Dense search against a specific snapshot. Returns list[Hit].

	        Returns an empty list when the snapshot holds no chunks or ``k`` is
	        not positive, instead of asking ChromaDB for zero results.
	        """
	        coll = self.get_collection(version)
	        n_results = min(k, coll.count())
	        if n_results <= 0:
	            return []

	        # Loaded only once there is something to search.
	        from ingestion.embedder import embed_query
	        from retrieval.dense import Hit

	        res = coll.query(
	            query_embeddings=[embed_query(text)],
	            n_results=n_results,
	            include=["documents", "metadatas", "distances"],
	        )
	        # One query embedding was sent, so each result field has one row.
	        rows = zip(
	            res["ids"][0], res["documents"][0],
	            res["metadatas"][0], res["distances"][0],
	        )
	        return [
	            Hit(
	                chunk_id=cid,
	                text=doc,
	                metadata=dict(meta or {}),
	                # Distance → similarity, clamped so the score is never negative.
	                score=max(0.0, 1.0 - float(dist)),
	                rank=rank,
	            )
	            for rank, (cid, doc, meta, dist) in enumerate(rows)
	        ]

	    # ── internal helpers ─────────────────────────────────────────────

	    def _copy_chunks(
	        self,
	        src: chromadb.Collection,
	        dst: chromadb.Collection,
	        doc_ids: set[str],
	    ) -> int:
	        """Copy all chunks whose doc_id is in doc_ids from src → dst.

	        Stored embeddings are copied along with documents and metadata, so
	        nothing is re-embedded. Returns the number of chunks copied.
	        """
	        if not doc_ids:
	            return 0

	        # A single id uses a plain equality filter; several ids use ``$in``.
	        where = (
	            {"doc_id": {"$in": list(doc_ids)}}
	            if len(doc_ids) > 1
	            else {"doc_id": next(iter(doc_ids))}
	        )
	        res = src.get(where=where, include=["documents", "metadatas", "embeddings"])
	        if not res["ids"]:
	            return 0

	        ids, docs, metas, embs = (
	            res["ids"], res["documents"], res["metadatas"], res["embeddings"]
	        )
	        for i in range(0, len(ids), _CHROMA_BATCH):
	            dst.upsert(
	                ids=ids[i : i + _CHROMA_BATCH],
	                documents=docs[i : i + _CHROMA_BATCH],
	                metadatas=metas[i : i + _CHROMA_BATCH],
	                embeddings=embs[i : i + _CHROMA_BATCH],
	            )
	        return len(ids)