import os
import shutil
import sys
from pathlib import Path
# Add project root to Python path
sys.path.append(str(Path(__file__).parent.parent))
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from src.config import REVIEW_HIGHLIGHTS_TXT, CHROMA_DB_DIR, EMBEDDING_MODEL
from tqdm import tqdm
def init_db():
    """Build the Chroma vector database from the review-highlights text file.

    Steps: wipe any existing DB directory, load the embedding model (forced
    to CPU for stability), stream the source file line-by-line, and insert
    documents into Chroma in batches. The ``MAX_DOCS`` env var limits how
    many lines are indexed (0 = index everything; default 20000 for demos).
    """
    print("=" * 50)
    print("π Book Recommender: Vector Database Builder")
    print("=" * 50)

    # FIX: Disable Tokenizers Parallelism to prevent deadlocks on macOS
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Force CPU for data ingestion to avoid MPS (Metal) async hangs during
    # long processing. Reliability is key for building the DB; GPU
    # acceleration is only needed for inference.
    device = "cpu"
    print("π’ Forcing CPU for stable database ingestion (prevents macOS Freezes).")

    # 1. Clear existing DB if any (to avoid duplicates/corruption)
    if CHROMA_DB_DIR.exists():
        print(f"ποΈ Cleaning existing database at {CHROMA_DB_DIR}...")
        shutil.rmtree(CHROMA_DB_DIR)

    # 2. Initialize Embeddings
    print(f"π Loading Embedding Model: {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': device},
        # Large encode batch keeps embedding throughput up even on CPU.
        encode_kwargs={'normalize_embeddings': True, 'batch_size': 512}
    )

    # 3. Create DB Client
    print(f"πΎ Initializing ChromaDB persistence at {CHROMA_DB_DIR}...")
    db = Chroma(
        persist_directory=str(CHROMA_DB_DIR),
        embedding_function=embeddings
    )

    # 4. Stream and Index
    if not REVIEW_HIGHLIGHTS_TXT.exists():
        print(f"β Error: Review Highlights file not found at {REVIEW_HIGHLIGHTS_TXT}")
        return

    # Count lines first for the progress bar.
    # FIX: use a context manager so the handle is closed deterministically
    # (the original used a bare open() inside the genexp, leaking the handle).
    print("π Counting documents...")
    with open(REVIEW_HIGHLIGHTS_TXT, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    print(f"   Found {total_lines} documents to index.")

    batch_size = 2000  # Documents per add_documents() call.
    documents = []

    # MAX_DOCS=0 for full index; default 20000 for demo
    max_docs = int(os.getenv("MAX_DOCS", "20000")) or None

    print(f"π Starting Ingestion (Source: Review Highlights, Limit: {max_docs or 'all'})...")
    with open(REVIEW_HIGHLIGHTS_TXT, 'r', encoding='utf-8') as f:
        # islice takes a prefix lazily without slurping the whole file.
        from itertools import islice

        total = min(total_lines, max_docs) if max_docs else total_lines
        for line in tqdm(islice(f, max_docs), total=total, unit="doc", desc="Indexing Reviews"):
            line = line.strip()
            if not line:
                continue

            # NOTE(review): assumes each line carries the "ISBN + description"
            # format produced by the upstream ETL — confirm against producer.
            documents.append(Document(page_content=line))

            # Batch insert once the buffer is full.
            if len(documents) >= batch_size:
                db.add_documents(documents)
                documents = []

    # Final partial batch.
    if documents:
        db.add_documents(documents)

    print("\nβ Verification:")
    # NOTE(review): _collection is a private attribute of the langchain
    # Chroma wrapper; fine for a one-off sanity print, but it may break
    # across langchain-chroma versions.
    print(f"   Total Documents in DB: {db._collection.count()}")
    print("π Vector Database Built Successfully!")
# Script entry point: build the vector database when run directly.
if __name__ == "__main__":
    init_db()