from typing import List, Any
# Using community version to avoid 'BaseBlobParser' version conflict in langchain-chroma/core
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from src.config import (
CHROMA_DB_DIR,
EMBEDDING_MODEL,
RERANK_CANDIDATES_MAX,
RRF_K,
SMALL_TO_BIG_OVER_RETRIEVE_FACTOR,
)
from src.utils import setup_logger
from src.data.stores.metadata_store import metadata_store
from src.data.stores.online_books_store import online_books_store
logger = setup_logger(__name__)
class VectorDB:
"""
Hybrid Vector Database combining ChromaDB (Dense) and SQLite FTS5 (Sparse).
    ENGINEERING IMPROVEMENT:
    Keyword search moved from an in-memory `rank_bm25` index to a disk-based
    SQLite FTS5 index. The keyword index now lives on disk instead of in
    process RAM, which eliminates the need for dataset pruning.
Features:
- Zero-RAM Keyword Indexing (via FTS5).
- Hybrid RRF scoring (ChromaDB + FTS5).
- Persistence on disk for 221k+ items.
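
    Example (illustrative; assumes the index was built via `python src/init_db.py`):
        >>> db = VectorDB()
        >>> docs = db.hybrid_search("dystopian classics", k=3)  # doctest: +SKIP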
"""
_instance = None
def __new__(cls):
if cls._instance is None:
            cls._instance = super().__new__(cls)
cls._instance.db = None
cls._instance.embeddings = None
return cls._instance
def __init__(self):
if self.db is None:
self._initialize_db()
def _initialize_db(self):
"""Initialize ChromaDB with local embeddings."""
try:
# Use local sentence-transformers model (no API calls)
logger.info(f"Loading embedding model: {EMBEDDING_MODEL}")
self.embeddings = HuggingFaceEmbeddings(
model_name=EMBEDDING_MODEL,
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": True},
)
logger.info("Embedding model loaded successfully")
if CHROMA_DB_DIR.exists() and any(CHROMA_DB_DIR.iterdir()):
logger.info(f"Loading existing vector database from {CHROMA_DB_DIR}")
self.db = Chroma(
persist_directory=str(CHROMA_DB_DIR),
embedding_function=self.embeddings,
)
logger.info(
"Loaded %s documents from vector database",
self.db._collection.count(),
)
else:
error_msg = (
f"Vector Database not found at {CHROMA_DB_DIR}.\n"
"Please run the initialization script first to build the index:\n"
" python src/init_db.py"
)
logger.warning(error_msg)
self.db = None
except Exception as e:
logger.error("VectorDB init failed [%s]: %s", type(e).__name__, e)
# Graceful degradation: search() will return []
self.db = None
# 2. Initialize FTS5 (Sparse Retrieval)
self._init_fts5()
# 3. Initialize Temporal Data logic (Zero-RAM mode)
logger.info("VectorDB: Temporal Scoring will use SQLite metadata.")
def _init_fts5(self):
"""
Initialize FTS5 for Sparse (Keyword) Retrieval via SQLite.
"""
try:
conn = metadata_store.connection
if conn:
logger.info("VectorDB: FTS5 keyword search enabled via SQLite.")
self.fts_enabled = True
else:
logger.warning(
"VectorDB: SQLite connection not available. FTS5 disabled."
)
self.fts_enabled = False
except Exception as e:
logger.error("Failed to initialize FTS5: %s", e)
self.fts_enabled = False
def _sparse_fts_search(self, query: str, k: int = 5) -> List[Any]:
"""
        Sparse retrieval over the main FTS5 index plus the online staging
        FTS5 index. Writes land only in the staging DB, so they never lock
        the main DB.
"""
if not self.fts_enabled:
logger.warning("FTS5 not enabled, cannot perform sparse search.")
return []
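        # Duck-typed stand-in matching the langchain `Document` interface
        # (page_content + metadata) expected by downstream consumers.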
class MockDoc:
def __init__(self, content, metadata):
self.page_content = content
self.metadata = metadata
def mk_doc(row: dict) -> MockDoc:
title = row.get("title", "") or ""
desc = row.get("description", "") or ""
return MockDoc(
f"{title} {desc}",
{
"isbn": row.get("isbn13", ""),
"title": title,
"authors": row.get("authors", ""),
"categories": row.get("simple_categories", ""),
},
)
results: List[Any] = []
try:
# 1. Main store (read-only, no contention)
conn = metadata_store.connection
if conn:
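                # Escape embedded double quotes and wrap the query in quotes
                # so FTS5 treats it as a single phrase rather than parsing
                # MATCH syntax (guards against operators like AND/OR/NEAR).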
clean_query = query.strip().replace('"', '""')
if clean_query:
fts_query = f'"{clean_query}"'
cursor = conn.cursor()
cursor.execute(
"""
SELECT isbn13, title, description, authors, simple_categories
FROM books_fts WHERE books_fts MATCH ? ORDER BY rank LIMIT ?
""",
(fts_query, k),
)
for row in cursor.fetchall():
results.append(mk_doc(dict(row)))
# 2. Online staging store (separate DB)
for row in online_books_store.fts_search(query, k=k):
results.append(mk_doc(row))
logger.info(
"VectorDB: FTS5 keyword search found %d results.", len(results)
)
return results
except Exception as e:
logger.error("VectorDB: FTS5 keyword search failed: %s", e)
return []
def search(self, query: str, k: int = 5) -> List[Any]:
"""
Legacy semantic search.
"""
if not self.db:
return []
return self.db.similarity_search(query, k=k)
def get_book_details(self, isbn: str):
"""Get book metadata by ISBN from the centralized MetadataStore."""
return metadata_store.get_book_metadata(str(isbn))
def hybrid_search(
self,
query: str,
k: int = 5,
alpha: float = 0.5,
rerank: bool = False,
temporal: bool = False,
) -> List[Any]:
"""
Hybrid Search = Dense (Vector) + Sparse (FTS5) with Reciprocal Rank Fusion (RRF).
Optional: Cross-Encoder Reranking for high precision.
"""
if not self.db or not self.fts_enabled:
logger.warning("FTS5 or DB missing, falling back to simple search.")
return self.search(query, k)
# 1. Sparse Retrieval (FTS5)
# Get top K*2 candidates
sparse_results = self._sparse_fts_search(query, k=k * 2)
# Optimization: If alpha=1.0, return Sparse results directly (Skip Dense)
if alpha == 1.0:
return sparse_results[:k]
# 2. Dense Retrieval (Chroma)
dense_results = self.search(query, k=k * 2)
        # 3. Reciprocal Rank Fusion: score += weight / (RRF_K + rank)
fusion_scores = {}
# Helper to get ID (using ISBN as unique ID)
def get_id(doc):
# Try metadata, fallback to content parsing if needed
if "isbn" in doc.metadata and doc.metadata["isbn"]:
return str(doc.metadata["isbn"])
if "isbn13" in doc.metadata and doc.metadata["isbn13"]:
return str(doc.metadata["isbn13"])
# Fallback parsing
if "ISBN:" in doc.page_content:
return doc.page_content.split("ISBN:")[1].strip().split()[0]
# Fallback 2: Check first word (legacy format)
return doc.page_content.strip().split()[0]
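        # Worked RRF example (assuming RRF_K = 60, a common default): with
        # equal weights, a doc ranked 1st dense (rank 0) and 3rd sparse
        # (rank 2) scores 1/60 + 1/62 ≈ 0.0328, beating a doc that appears
        # 1st in only one list (1/60 ≈ 0.0167).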
        # Fusion: Dense (weighted by 1 - alpha)
        for rank, doc in enumerate(dense_results):
            doc_id = get_id(doc)
            if doc_id not in fusion_scores:
                fusion_scores[doc_id] = {"score": 0.0, "doc": doc}
            fusion_scores[doc_id]["score"] += (1 - alpha) / (rank + RRF_K)
        # Fusion: Sparse (weighted by alpha)
        for rank, doc in enumerate(sparse_results):
            doc_id = get_id(doc)
            if doc_id not in fusion_scores:
                fusion_scores[doc_id] = {"score": 0.0, "doc": doc}
            fusion_scores[doc_id]["score"] += alpha / (rank + RRF_K)
# Sort by RRF score
sorted_docs = sorted(
fusion_scores.values(), key=lambda x: x["score"], reverse=True
)
# Keep all unique candidates for reranking
top_candidates = [item["doc"] for item in sorted_docs]
# 4. Reranking (Cross-Encoder)
final_results = top_candidates[:k]
if rerank:
from src.core.reranker import reranker
rerank_candidates = top_candidates[
: min(len(top_candidates), RERANK_CANDIDATES_MAX)
]
logger.info("Reranking top %d candidates...", len(rerank_candidates))
final_results = reranker.rerank(query, rerank_candidates, top_k=k)
# 5. Temporal Dynamics (Optional)
        # Apply decay boost to 'final_results' (already reranked when rerank=True)
if temporal:
from src.core.temporal import temporal_ranker
logger.info("Applying Temporal Decay...")
# Populate local year map for candidates to avoid repeated queries
candidate_years = {}
for doc in final_results:
isbn = get_id(doc)
            if isbn:
                rec = metadata_store.get_book_metadata(isbn)
                if rec:  # guard: metadata may be missing for this ISBN
                    year = temporal_ranker.parse_year(rec.get("publishedDate"))
                    if year > 0:
                        candidate_years[isbn] = year
final_results = temporal_ranker.apply_decay(final_results, candidate_years)
return final_results
def small_to_big_search(self, query: str, k: int = 5) -> List[Any]:
"""
Small-to-Big Retrieval (Parent-Child / Chunk-to-Parent Pattern).
ALGORITHM:
1. Search FINE-GRAINED chunks (e.g. review snippets) for high-precision
semantic match. User queries like "twist ending" match chunk content.
2. Map matched chunks back to PARENT books via parent_isbn metadata.
3. Return full book documents (title, description) for LLM context.
WHY: Book-level vectors often miss query-specific details (e.g. "unreliable
narrator"). Chunk-level search finds those, then we lift to book-level.
Ref: LlamaIndex Recursive Retrieval, RAPTOR (Sarthi et al., 2024).
"""
chunk_persist_dir = "data/chroma_chunks"
# Load chunk index (lazy)
if not hasattr(self, "chunk_db"):
try:
self.chunk_db = Chroma(
persist_directory=chunk_persist_dir,
embedding_function=self.embeddings,
collection_name="review_chunks",
)
logger.info("Loaded chunk index from %s", chunk_persist_dir)
except Exception as e:
logger.warning(
"Chunk index not available: %s. Falling back to hybrid search.", e
)
return self.hybrid_search(query, k=k, rerank=True)
# Step 1: Search chunks (Fine-grained). Over-retrieve to improve recall.
chunk_results = self.chunk_db.similarity_search(
query, k=k * SMALL_TO_BIG_OVER_RETRIEVE_FACTOR
)
logger.info("Small-to-Big: Found %d chunk matches", len(chunk_results))
# Step 2: Extract unique parent ISBNs
parent_isbns = []
seen = set()
for chunk in chunk_results:
isbn = chunk.metadata.get("parent_isbn")
if isbn and isbn not in seen:
parent_isbns.append(isbn)
seen.add(isbn)
logger.info("Small-to-Big: Mapped to %d unique books", len(parent_isbns))
# Step 3: Fetch full book context from parent index
from langchain_core.documents import Document
parent_docs = []
for isbn in parent_isbns[:k]:
rec = metadata_store.get_book_metadata(isbn)
if rec:
doc = Document(
page_content=(
f"Title: {rec.get('title', 'Unknown')}\n"
f"ISBN: {rec.get('isbn13', isbn)}\n"
f"Description: {rec.get('description', '')}"
),
metadata={
"isbn": rec.get("isbn13", isbn),
"title": rec.get("title"),
},
)
parent_docs.append(doc)
        # Fallback: if the metadata lookup returned nothing, try a
        # similarity search using the ISBN as the query
if not parent_docs and self.db:
for isbn in parent_isbns[:k]:
results = self.db.similarity_search(isbn, k=1)
if results:
parent_docs.append(results[0])
# Final fallback: Return chunks with enriched context
if not parent_docs:
logger.warning(
"Parent lookup failed, returning chunks with ISBN context"
)
# Enrich chunks with their ISBN context
for chunk in chunk_results[:k]:
chunk.metadata["note"] = (
f"From review of ISBN: {chunk.metadata.get('parent_isbn')}"
)
return chunk_results[:k]
return parent_docs
def add_book(self, book_data: dict):
"""
Dynamically add a new book to the ChromaDB vector index.
Note: FTS5 incremental updates are handled separately via
metadata_store.insert_book_with_fts() called from RecommendationOrchestrator.add_new_book().
This method only handles the dense vector index.
Args:
book_data: Dict with isbn13, title, authors, description, etc.
"""
from langchain_core.documents import Document
isbn = str(book_data.get("isbn13"))
title = book_data.get("title", "")
author = book_data.get("authors", "")
description = book_data.get("description", "")
# Add to ChromaDB (dense vector index)
content = (
f"Title: {title}\nAuthor: {author}\nDescription: {description}\nISBN: {isbn}"
)
doc = Document(
page_content=content,
metadata={
"isbn": isbn,
"isbn13": isbn,
"title": title,
"authors": author,
"description": description,
},
)
if self.db:
self.db.add_documents([doc])
logger.info("Added book %s to ChromaDB", isbn)
__all__ = ["VectorDB"]
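

if __name__ == "__main__":
    # Minimal smoke test (illustrative): requires a built Chroma index under
    # CHROMA_DB_DIR and the SQLite metadata store; the query is arbitrary.
    vdb = VectorDB()
    for doc in vdb.hybrid_search("space opera with political intrigue", k=3):
        print(doc.metadata.get("title"), doc.metadata.get("isbn"))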