# book-rec-with-LLMs / scripts/data/init_dual_index.py
# Author: ymlin105 — feat(v2.5): ItemCF direction weight, Swing recall, LGBMRanker (commit fe617ac)
#!/usr/bin/env python3
"""
Dual Index Initialization Script
Creates a separate ChromaDB collection for review chunks (Small-to-Big architecture).
SOTA Reference: LlamaIndex Parent-Child Retrieval
"""
import json
from pathlib import Path
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from tqdm import tqdm
# Source JSONL: one review chunk per line with "text" and "parent_isbn" keys.
CHUNK_PATH = "data/review_chunks.jsonl"
# On-disk location of the chunk-level ChromaDB collection.
PERSIST_DIR = "data/chroma_chunks"
# Sentence-transformer used to embed chunks (384-dim MiniLM).
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Documents inserted per Chroma call to bound peak memory during indexing.
BATCH_SIZE = 5000
def load_chunks(path: str, limit: int | None = None):
    """Load review chunks from a JSONL file into LangChain Documents.

    Args:
        path: JSONL file where each line carries ``text`` and
            ``parent_isbn`` keys.
        limit: Optional cap on the number of lines consumed. Checked with
            ``is not None`` — the original truthiness test (``if limit``)
            silently ignored ``limit=0``.

    Returns:
        list[Document]: One Document per chunk, with ``parent_isbn``
        metadata linking the chunk back to its parent book
        (Small-to-Big retrieval).
    """
    chunks = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if limit is not None and i >= limit:
                break
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of crashing json.loads.
                continue
            data = json.loads(line)
            chunks.append(Document(
                page_content=data["text"],
                metadata={"parent_isbn": data["parent_isbn"]},
            ))
    return chunks
def init_chunk_index():
    """Build the chunk-level ChromaDB index for Small-to-Big retrieval.

    Embeds every review chunk with the MiniLM sentence transformer and
    persists them into a dedicated "review_chunks" Chroma collection,
    inserting in batches of BATCH_SIZE to bound memory use.
    """
    print(f"Loading embedding model: {EMBEDDING_MODEL}")
    embedder = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={"device": "mps"},  # Use Metal on Mac
        encode_kwargs={"normalize_embeddings": True},
    )

    print(f"Loading chunks from {CHUNK_PATH}...")
    docs = load_chunks(CHUNK_PATH)
    print(f"Loaded {len(docs)} chunks")

    print(f"Creating ChromaDB index at {PERSIST_DIR}...")
    # The first batch bootstraps the collection; subsequent batches append.
    store = Chroma.from_documents(
        documents=docs[:BATCH_SIZE],
        embedding=embedder,
        persist_directory=PERSIST_DIR,
        collection_name="review_chunks",
    )
    for start in tqdm(range(BATCH_SIZE, len(docs), BATCH_SIZE), desc="Indexing"):
        store.add_documents(docs[start:start + BATCH_SIZE])

    print(f"Index created with {len(docs)} chunks.")
    print(f"Persisted to {PERSIST_DIR}")


if __name__ == "__main__":
    init_chunk_index()