# book-rec-with-LLMs / src/init_db.py
# Provenance (Hugging Face Spaces): commit 3f281f1 by ymlin105 —
# "chore: remove legacy files and scripts no longer part of the main architecture"
import os
import shutil
import sys
from pathlib import Path
# Add project root to Python path
sys.path.append(str(Path(__file__).parent.parent))
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from src.config import REVIEW_HIGHLIGHTS_TXT, CHROMA_DB_DIR, EMBEDDING_MODEL
from tqdm import tqdm
def init_db():
    """Build the Chroma vector database from the review-highlights text file.

    Reads ``REVIEW_HIGHLIGHTS_TXT`` line by line (one document per non-empty
    line), embeds each line with ``EMBEDDING_MODEL`` on CPU, and persists the
    result to ``CHROMA_DB_DIR``. Any existing database directory is removed
    first to avoid duplicates/corruption.

    Environment:
        MAX_DOCS: cap on the number of lines indexed (default ``20000`` for
            demo runs); set to ``0`` to index the full file.

    Returns:
        None. Exits early (after printing an error) if the source file is
        missing.
    """
    print("="*50)
    print("📚 Book Recommender: Vector Database Builder")
    print("="*50)

    # Disable tokenizers parallelism to prevent deadlocks on macOS.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Force CPU for data ingestion to avoid MPS (Metal) async hangs during
    # long processing. Reliability is key for building the DB; GPU
    # acceleration is only needed for inference.
    device = "cpu"
    print("🐢 Forcing CPU for stable database ingestion (prevents macOS Freezes).")

    # 1. Clear existing DB if any (to avoid duplicates/corruption).
    if CHROMA_DB_DIR.exists():
        print(f"🗑️ Cleaning existing database at {CHROMA_DB_DIR}...")
        shutil.rmtree(CHROMA_DB_DIR)

    # 2. Initialize embeddings.
    print(f"🔌 Loading Embedding Model: {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': device},
        # Large encode batch keeps throughput up; harmless on CPU.
        encode_kwargs={'normalize_embeddings': True, 'batch_size': 512}
    )

    # 3. Create the persistent DB client.
    print(f"💾 Initializing ChromaDB persistence at {CHROMA_DB_DIR}...")
    db = Chroma(
        persist_directory=str(CHROMA_DB_DIR),
        embedding_function=embeddings
    )

    # 4. Stream and index.
    if not REVIEW_HIGHLIGHTS_TXT.exists():
        print(f"❌ Error: Review Highlights file not found at {REVIEW_HIGHLIGHTS_TXT}")
        return

    # Count lines first for the progress bar.
    # FIX: use a context manager — the original `open()` here was never
    # closed (it relied on CPython GC to release the file handle).
    print("📊 Counting documents...")
    with open(REVIEW_HIGHLIGHTS_TXT, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    print(f" Found {total_lines} documents to index.")

    batch_size = 2000  # insert in batches to bound memory and DB round-trips
    documents = []

    # MAX_DOCS=0 for full index; default 20000 for demo.
    # `int(...) or None` maps 0 -> None, which islice treats as "no limit".
    max_docs = int(os.getenv("MAX_DOCS", "20000")) or None
    print(f"🚀 Starting Ingestion (Source: Review Highlights, Limit: {max_docs or 'all'})...")

    with open(REVIEW_HIGHLIGHTS_TXT, 'r', encoding='utf-8') as f:
        # Use islice for efficient subsetting without materializing the file.
        from itertools import islice
        total = min(total_lines, max_docs) if max_docs else total_lines
        for line in tqdm(islice(f, max_docs), total=total, unit="doc", desc="Indexing Reviews"):
            line = line.strip()
            if not line:
                continue
            # Each non-empty line becomes one Document. Presumably the line is
            # the "ISBN + description" format from the upstream ETL; it is
            # treated as an opaque text blob here.
            documents.append(Document(page_content=line))
            # Batch insert once the buffer is full.
            if len(documents) >= batch_size:
                db.add_documents(documents)
                documents = []

    # Flush the final partial batch.
    if documents:
        db.add_documents(documents)

    print("\n✅ Verification:")
    # NOTE: `_collection` is a private Chroma attribute; used only for a
    # quick count printout.
    print(f" Total Documents in DB: {db._collection.count()}")
    print("🎉 Vector Database Built Successfully!")
# Script entry point: build the vector database when run directly.
if __name__ == "__main__":
    init_db()