Spaces:

abiju
/

notebook_lm_clone

Running

notebook_lm_clone / src /notebooklm_clone /retrieval.py

Abhinav Biju

fast/thinking toggle

cc2dc62 3 months ago

18.6 kB

	"""Hybrid retrieval over notebook-scoped indexed chunks.

	Spec references:
	- `specs/04_interfaces.md`: implements `retrieve()`.
	- `specs/05_rag_and_citations.md`: hybrid BM25 plus vector retrieval with merged candidates.
	- `specs/07_security.md`: notebook access remains isolated per user and notebook.
	- `specs/10_test_plan.md`: deterministic retrieval logic suitable for testing.
	- `specs/11_observability.md`: retrieval emits structured logging fields.
	"""

	from __future__ import annotations

	import json
	import logging
	import math
	import os
	from functools import lru_cache
	from pathlib import Path
	from time import perf_counter
	from typing import Any, TypedDict

	from ingestion.embedder import EmbedderDependencyError, EmbedderError, embed_texts
	from notebooklm_clone.notebooks import get_notebook
	from notebooklm_clone.storage import notebook_root, safe_join


	# Module-level logger, named after this module; used below for retrieval,
	# reranker-loading, and query-expansion log records.
	LOGGER = logging.getLogger(__name__)


	class RetrievalResult(TypedDict):
	    """Public shape of one ranked chunk handed back by `retrieve()`."""

	    # Id of the indexed chunk (the id stored in the notebook collection).
	    chunk_id: str
	    # Source identifier and display name, read from the chunk metadata.
	    source_id: str
	    source_name: str
	    # Stored chunk text.
	    text: str
	    # Ranking score: the averaged hybrid score, or the reranker score
	    # when reranking ran.
	    score: float
	    # Location metadata (`location_hints`), JSON-decoded when possible.
	    loc: Any


	class RetrievalError(Exception):
	    """Common ancestor of every retrieval-specific exception in this module."""


	class RetrievalDependencyError(RetrievalError):
	    """Signals that something retrieval relies on (a package or the embedder) is unusable."""


	class RetrievalValidationError(RetrievalError):
	    """Signals invalid query inputs or malformed indexed payloads."""


	class RetrievalStorageError(RetrievalError):
	    """Signals that notebook-local retrieval data could not be opened or read."""


	class _Candidate(TypedDict):
	"""Internal merged candidate shape before final formatting."""

	chunk_id: str
	source_id: str
	source_name: str
	text: str
	loc: Any
	bm25_score: float
	vector_score: float


	def _log_retrieval(
	    username: str,
	    notebook_id: str,
	    status: str,
	    started_at: float,
	) -> None:
	    """Write one structured INFO record describing a retrieval call.

	    The record message is `retrieve`; the `extra` mapping carries `user`,
	    `notebook_id`, `action`, `duration_ms`, and `status`.

	    Args:
	        username: Value logged as `user`.
	        notebook_id: Value logged as `notebook_id`.
	        status: Outcome label logged as `status`.
	        started_at: `perf_counter()` reading taken when the call began.
	    """

	    elapsed_seconds = perf_counter() - started_at
	    fields = {
	        "user": username,
	        "notebook_id": notebook_id,
	        "action": "retrieve",
	        # Truncated (not rounded) to whole milliseconds.
	        "duration_ms": int(elapsed_seconds * 1000),
	        "status": status,
	    }
	    LOGGER.info("retrieve", extra=fields)


	def _tokenize(text: str) -> list[str]:
	"""Tokenize text deterministically into lowercase alphanumeric terms."""

	tokens: list[str] = []
	current: list[str] = []

	for character in text.lower():
	if character.isalnum():
	current.append(character)
	continue
	if current:
	tokens.append("".join(current))
	current = []

	if current:
	tokens.append("".join(current))

	return tokens


	def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
	"""Normalize positive scores to the `[0, 1]` interval deterministically."""

	positive_scores: list[float] = [score for score in scores.values() if score > 0.0]
	if not positive_scores:
	return {chunk_id: 0.0 for chunk_id in scores}

	max_score: float = max(positive_scores)
	if max_score <= 0.0:
	return {chunk_id: 0.0 for chunk_id in scores}

	return {
	chunk_id: (score / max_score) if score > 0.0 else 0.0
	for chunk_id, score in scores.items()
	}


	def _parse_loc(value: Any) -> Any:
	"""Parse stored location metadata when it was serialized as JSON."""

	if not isinstance(value, str):
	return value

	try:
	return json.loads(value)
	except json.JSONDecodeError:
	return value


	def _chroma_path(username: str, notebook_id: str) -> Path:
	    """Resolve the `chroma` directory under the notebook root, creating it if absent.

	    Raises:
	        RetrievalStorageError: If creating the directory fails with `OSError`.
	    """

	    # The path is built by the storage helpers `notebook_root` and `safe_join`.
	    target: Path = safe_join(notebook_root(username, notebook_id), "chroma")
	    try:
	        target.mkdir(parents=True, exist_ok=True)
	    except OSError as exc:
	        raise RetrievalStorageError(f"Failed to prepare Chroma path: {target}") from exc
	    return target


	def _get_collection(username: str, notebook_id: str) -> Any:
	    """Return the Chroma collection for one notebook, named after its id.

	    A persistent Chroma client is opened on the notebook's `chroma`
	    directory and the collection is fetched with `get_or_create_collection`.

	    Raises:
	        RetrievalDependencyError: If `chromadb` cannot be imported.
	        RetrievalStorageError: If the client or collection cannot be opened.
	    """

	    # Imported lazily so a missing package surfaces as a retrieval error.
	    try:
	        import chromadb
	    except ImportError as exc:
	        raise RetrievalDependencyError(
	            "Retrieval requires the 'chromadb' package to be installed."
	        ) from exc

	    persist_dir: str = str(_chroma_path(username, notebook_id))
	    try:
	        return chromadb.PersistentClient(path=persist_dir).get_or_create_collection(
	            name=notebook_id
	        )
	    except Exception as exc:
	        raise RetrievalStorageError(
	            f"Failed to open Chroma collection for notebook: {notebook_id}"
	        ) from exc


	def _load_collection_documents(collection: Any) -> tuple[list[str], list[str], list[dict[str, Any]]]:
	"""Load indexed notebook documents for BM25 scoring."""

	try:
	payload: dict[str, Any] = collection.get(include=["documents", "metadatas"])
	except Exception as exc:
	raise RetrievalStorageError("Failed to read notebook collection contents.") from exc

	ids: Any = payload.get("ids")
	documents: Any = payload.get("documents")
	metadatas: Any = payload.get("metadatas")

	if not isinstance(ids, list) or not isinstance(documents, list) or not isinstance(metadatas, list):
	raise RetrievalStorageError("Chroma collection returned invalid retrieval payloads.")
	if not (len(ids) == len(documents) == len(metadatas)):
	raise RetrievalStorageError("Chroma collection returned misaligned retrieval payloads.")

	validated_ids: list[str] = []
	validated_documents: list[str] = []
	validated_metadatas: list[dict[str, Any]] = []

	for index, item_id in enumerate(ids):
	if not isinstance(item_id, str):
	raise RetrievalStorageError(f"Indexed chunk id at position {index} is invalid.")
	if not isinstance(documents[index], str):
	raise RetrievalStorageError(f"Indexed document at position {index} is invalid.")
	if not isinstance(metadatas[index], dict):
	raise RetrievalStorageError(f"Indexed metadata at position {index} is invalid.")

	validated_ids.append(item_id)
	validated_documents.append(documents[index])
	validated_metadatas.append(metadatas[index])

	return validated_ids, validated_documents, validated_metadatas


	def _bm25_scores(documents: dict[str, str], query: str) -> dict[str, float]:
	    """Score every document against the query with BM25 (`k1=1.5`, `b=0.75`).

	    Args:
	        documents: Chunk text keyed by chunk id.
	        query: Free-text query; tokenized with `_tokenize`.

	    Returns:
	        A raw (unnormalized) score per chunk id, in the order of `documents`.
	        Every score is `0.0` when the query has no tokens or all documents
	        are empty; the result is `{}` when there are no documents.
	    """

	    terms: list[str] = _tokenize(query)
	    if not terms:
	        return dict.fromkeys(documents, 0.0)

	    tokenized: dict[str, list[str]] = {
	        chunk_id: _tokenize(body) for chunk_id, body in documents.items()
	    }
	    total_docs: int = len(tokenized)
	    if total_docs == 0:
	        return {}

	    mean_length: float = sum(len(tokens) for tokens in tokenized.values()) / total_docs
	    if mean_length == 0.0:
	        return dict.fromkeys(documents, 0.0)

	    # Per-document term counts, plus how many documents contain each term.
	    per_doc_counts: dict[str, dict[str, int]] = {}
	    docs_containing: dict[str, int] = {}
	    for chunk_id, tokens in tokenized.items():
	        tally: dict[str, int] = {}
	        for token in tokens:
	            tally[token] = tally.get(token, 0) + 1
	        per_doc_counts[chunk_id] = tally
	        for token in tally:
	            docs_containing[token] = docs_containing.get(token, 0) + 1

	    k1: float = 1.5
	    b: float = 0.75
	    results: dict[str, float] = {}

	    for chunk_id, tokens in tokenized.items():
	        term_counts = per_doc_counts[chunk_id]
	        # Length normalization depends only on the document, not the term.
	        length_penalty: float = k1 * (1.0 - b + b * (len(tokens) / mean_length))
	        total: float = 0.0

	        # Repeated query terms contribute once per occurrence in the query.
	        for term in terms:
	            tf: int = term_counts.get(term, 0)
	            if tf == 0:
	                continue

	            n_containing: int = docs_containing.get(term, 0)
	            idf: float = math.log(
	                1.0 + ((total_docs - n_containing + 0.5) / (n_containing + 0.5))
	            )
	            total += idf * ((tf * (k1 + 1.0)) / (tf + length_penalty))

	        results[chunk_id] = total

	    return results


	def _vector_scores(collection: Any, query: str, limit: int) -> dict[str, float]:
	"""Query vector similarity from the notebook-scoped Chroma collection."""

	if limit <= 0:
	return {}

	try:
	query_embedding: list[float] = embed_texts([query])[0]
	except (EmbedderDependencyError, EmbedderError) as exc:
	raise RetrievalDependencyError("Failed to generate retrieval query embedding.") from exc

	try:
	payload: dict[str, Any] = collection.query(
	query_embeddings=[query_embedding],
	n_results=limit,
	include=["distances"],
	)
	except Exception as exc:
	raise RetrievalStorageError("Failed to query notebook vector index.") from exc

	ids_nested: Any = payload.get("ids")
	distances_nested: Any = payload.get("distances")
	if not isinstance(ids_nested, list) or not ids_nested:
	return {}
	if not isinstance(distances_nested, list) or not distances_nested:
	raise RetrievalStorageError("Chroma query returned invalid distance payloads.")

	ids: Any = ids_nested[0]
	distances: Any = distances_nested[0]
	if not isinstance(ids, list) or not isinstance(distances, list):
	raise RetrievalStorageError("Chroma query returned invalid nested payloads.")
	if len(ids) != len(distances):
	raise RetrievalStorageError("Chroma query returned misaligned ids and distances.")

	scores: dict[str, float] = {}
	for index, chunk_id in enumerate(ids):
	distance: Any = distances[index]
	if not isinstance(chunk_id, str) or not isinstance(distance, (int, float)):
	raise RetrievalStorageError("Chroma query returned invalid vector results.")
	scores[chunk_id] = 1.0 / (1.0 + max(float(distance), 0.0))

	return scores


	@lru_cache(maxsize=1)
	def _load_reranker() -> Any:
	    """Build the cross-encoder reranker, cached so repeat calls reuse it.

	    The model name comes from `NOTEBOOKLM_RERANKER_MODEL` (whitespace
	    stripped), defaulting to `cross-encoder/ms-marco-MiniLM-L-6-v2`.

	    Raises:
	        RetrievalDependencyError: If `sentence_transformers` cannot be imported.
	    """

	    # Imported lazily so a missing package surfaces as a retrieval error.
	    try:
	        from sentence_transformers import CrossEncoder
	    except ImportError as exc:
	        raise RetrievalDependencyError(
	            "Reranking requires the 'sentence-transformers' package."
	        ) from exc

	    configured_name: str = os.getenv(
	        "NOTEBOOKLM_RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2"
	    )
	    model_name: str = configured_name.strip()

	    LOGGER.info("Loading reranker model: %s", model_name)
	    return CrossEncoder(model_name)


	def _rerank(
	query: str,
	candidates: list[RetrievalResult],
	k: int,
	) -> list[RetrievalResult]:
	"""Re-score candidates with a cross-encoder and return top-k."""

	if not candidates:
	return []

	reranker = _load_reranker()
	pairs: list[list[str]] = [[query, c["text"]] for c in candidates]
	scores = reranker.predict(pairs)

	for i, candidate in enumerate(candidates):
	candidate["score"] = float(scores[i])

	candidates.sort(key=lambda item: (-item["score"], item["chunk_id"]))
	return candidates[:k]


	def _expand_query(query: str) -> list[str]:
	"""Generate alternative query phrasings using the LLM.

	Returns a list containing the original query plus expansions.
	Falls back to just [query] if LLM is unavailable or expansion is disabled.
	"""

	if os.getenv("NOTEBOOKLM_QUERY_EXPANSION", "on").strip().lower() == "off":
	return [query]

	api_key: str = os.getenv("OPENAI_API_KEY", "").strip()
	model: str = os.getenv("NOTEBOOKLM_CHAT_MODEL", "gpt-4o-mini").strip()
	if not api_key or not model:
	return [query]

	try:
	from openai import OpenAI
	client = OpenAI(api_key=api_key)
	response = client.chat.completions.create(
	model=model,
	messages=[
	{
	"role": "system",
	"content": (
	"Generate 2 alternative phrasings of the user's search query. "
	"Return only the alternative queries, one per line, no numbering."
	),
	},
	{"role": "user", "content": query},
	],
	temperature=0.7,
	max_tokens=150,
	)
	content: str = (response.choices[0].message.content or "").strip()
	expansions: list[str] = [
	line.strip() for line in content.splitlines() if line.strip()
	]
	LOGGER.info("Query expanded: %s -> %s", query, expansions)
	return [query] + expansions[:2]
	except Exception as exc:
	LOGGER.warning("Query expansion failed, using original: %s", exc)
	return [query]


	def _multi_query_scores(
	chunk_documents: dict[str, str],
	collection: Any,
	queries: list[str],
	n_docs: int,
	) -> tuple[dict[str, float], dict[str, float]]:
	"""Run BM25 + vector for each query variant, merge with max-per-chunk."""

	merged_bm25: dict[str, float] = {}
	merged_vector: dict[str, float] = {}

	for q in queries:
	bm25_raw = _bm25_scores(chunk_documents, q)
	vector_raw = _vector_scores(collection, q, n_docs)
	for cid, score in bm25_raw.items():
	merged_bm25[cid] = max(merged_bm25.get(cid, 0.0), score)
	for cid, score in vector_raw.items():
	merged_vector[cid] = max(merged_vector.get(cid, 0.0), score)

	return merged_bm25, merged_vector


	def _rerank_pool_size(k: int) -> int:
	    """Return how many top hybrid candidates to hand to the reranker.

	    Reads `NOTEBOOKLM_RERANK_TOP_N` (default 10). Blank, non-numeric, zero,
	    or negative values fall back to the default. The pool is never smaller
	    than `k`, so reranking cannot return fewer results than requested.
	    """

	    default_pool: int = 10
	    raw: str = os.getenv("NOTEBOOKLM_RERANK_TOP_N", "").strip()
	    try:
	        configured: int = int(raw) if raw else default_pool
	    except ValueError:
	        LOGGER.warning("Ignoring invalid NOTEBOOKLM_RERANK_TOP_N value: %r", raw)
	        configured = default_pool
	    if configured <= 0:
	        configured = default_pool
	    return max(configured, k)


	def retrieve(
	    username: str,
	    notebook_id: str,
	    query: str,
	    k: int,
	    rag_mode: str = "Reasoning",
	) -> list[RetrievalResult]:
	    """Retrieve top notebook chunks with hybrid scoring, query expansion, and reranking.

	    Spec references:
	    - `specs/04_interfaces.md`: implements `retrieve()`.
	    - `specs/05_rag_and_citations.md`: BM25 retrieval, vector retrieval, merge, dedupe,
	      normalize, and return top-k sorted descending.
	    - `specs/07_security.md`: retrieval is scoped to one notebook owned by one user.
	    - `specs/11_observability.md`: logs `user`, `notebook_id`, `action`, `duration_ms`, and `status`.

	    Args:
	        username: Owner of the notebook.
	        notebook_id: Notebook whose collection is searched.
	        query: Non-empty search text.
	        k: Maximum number of results; must be positive.
	        rag_mode: `"Reasoning"` enables query expansion and reranking.
	            `"Fast"` skips both and returns the hybrid ranking directly.
	            Any other value skips expansion but still reranks.

	    Returns:
	        Up to `k` results sorted by score descending (ties broken by
	        `chunk_id`). Scores are the averaged normalized BM25/vector score in
	        `"Fast"` mode and the cross-encoder score otherwise. An empty
	        collection yields `[]`.

	    Raises:
	        ValueError: If `query` is empty or `k` is not positive.
	        RetrievalDependencyError: If retrieval dependencies are unavailable.
	        RetrievalStorageError: If notebook-local retrieval data cannot be opened.
	        RetrievalValidationError: If indexed metadata is malformed.
	    """

	    started_at: float = perf_counter()
	    try:
	        if not isinstance(query, str) or not query.strip():
	            raise ValueError("query must be a non-empty string.")
	        if k <= 0:
	            raise ValueError("k must be greater than 0.")

	        # Verifies notebook ownership and existence before any retrieval work.
	        get_notebook(username, notebook_id)
	        collection = _get_collection(username, notebook_id)
	        ids, documents, metadatas = _load_collection_documents(collection)

	        if not ids:
	            _log_retrieval(username, notebook_id, "success", started_at)
	            return []

	        chunk_documents: dict[str, str] = dict(zip(ids, documents))
	        chunk_metadata: dict[str, dict[str, Any]] = dict(zip(ids, metadatas))

	        # Query expansion: generate alt phrasings and merge scores.
	        queries: list[str] = _expand_query(query) if rag_mode == "Reasoning" else [query]
	        bm25_raw, vector_raw = _multi_query_scores(
	            chunk_documents, collection, queries, len(ids)
	        )
	        bm25_normalized: dict[str, float] = _normalize_scores(bm25_raw)
	        vector_normalized: dict[str, float] = _normalize_scores(vector_raw)

	        # Sorted union keeps candidate construction order deterministic.
	        merged_ids: list[str] = sorted(set(bm25_raw) | set(vector_raw))
	        candidates: list[_Candidate] = []

	        for chunk_id in merged_ids:
	            metadata: dict[str, Any] | None = chunk_metadata.get(chunk_id)
	            text: str | None = chunk_documents.get(chunk_id)

	            # The vector index returned an id that the document load did not.
	            if metadata is None or text is None:
	                raise RetrievalStorageError(f"Missing indexed content for chunk: {chunk_id}")

	            source_id: Any = metadata.get("source_id")
	            source_name: Any = metadata.get("source_name")
	            if not isinstance(source_id, str) or not source_id.strip():
	                raise RetrievalValidationError(
	                    f"Indexed metadata missing valid source_id for chunk: {chunk_id}"
	                )
	            if not isinstance(source_name, str) or not source_name.strip():
	                raise RetrievalValidationError(
	                    f"Indexed metadata missing valid source_name for chunk: {chunk_id}"
	                )

	            candidates.append(
	                {
	                    "chunk_id": chunk_id,
	                    "source_id": source_id.strip(),
	                    "source_name": source_name.strip(),
	                    "text": text,
	                    "loc": _parse_loc(metadata.get("location_hints")),
	                    "bm25_score": bm25_normalized.get(chunk_id, 0.0),
	                    "vector_score": vector_normalized.get(chunk_id, 0.0),
	                }
	            )

	        # Hybrid score: unweighted mean of the two normalized signals.
	        ranked_results: list[RetrievalResult] = [
	            {
	                "chunk_id": candidate["chunk_id"],
	                "source_id": candidate["source_id"],
	                "source_name": candidate["source_name"],
	                "text": candidate["text"],
	                "score": (candidate["bm25_score"] + candidate["vector_score"]) / 2.0,
	                "loc": candidate["loc"],
	            }
	            for candidate in candidates
	        ]
	        ranked_results.sort(key=lambda item: (-item["score"], item["chunk_id"]))

	        result: list[RetrievalResult]
	        if rag_mode == "Fast":
	            result = ranked_results[:k]
	        else:
	            # Rerank only the top candidates to control latency, but never
	            # fewer than k, so the caller still gets k results when available.
	            pool_size: int = _rerank_pool_size(k)
	            result = _rerank(query, ranked_results[:pool_size], k)

	        _log_retrieval(username, notebook_id, "success", started_at)
	        return result
	    except Exception:
	        _log_retrieval(username, notebook_id, "error", started_at)
	        raise