Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Dual Index Initialization Script | |
| Creates a separate ChromaDB collection for review chunks (Small-to-Big architecture). | |
| SOTA Reference: LlamaIndex Parent-Child Retrieval | |
| """ | |
| import json | |
| from pathlib import Path | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_core.documents import Document | |
| from tqdm import tqdm | |
# Input corpus: one JSON object per line with "text" and "parent_isbn" keys.
CHUNK_PATH = "data/review_chunks.jsonl"
# On-disk directory where the chunk-level Chroma collection is persisted.
PERSIST_DIR = "data/chroma_chunks"
# Sentence-transformer used to embed each review chunk.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Number of documents embedded and inserted per Chroma batch (bounds memory).
BATCH_SIZE = 5000
def load_chunks(path: str, limit: int | None = None) -> list:
    """Load review chunks from a JSONL file as LangChain ``Document``s.

    Args:
        path: Path to the JSONL file; each line must be a JSON object
            containing ``"text"`` and ``"parent_isbn"`` keys.
        limit: Optional maximum number of chunks to load. ``None``
            (the default) loads the entire file.

    Returns:
        List of ``Document`` objects whose metadata carries the parent
        book's ISBN (used later for Small-to-Big parent lookup).

    Raises:
        KeyError: If a line is missing ``"text"`` or ``"parent_isbn"``.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    chunks = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Check `is not None` rather than truthiness so that an
            # explicit limit=0 loads nothing instead of everything.
            if limit is not None and i >= limit:
                break
            data = json.loads(line)
            chunks.append(Document(
                page_content=data["text"],
                metadata={"parent_isbn": data["parent_isbn"]},
            ))
    return chunks
def init_chunk_index():
    """Initialize the chunk-level ChromaDB index (Small-to-Big child index).

    Embeds every review chunk from ``CHUNK_PATH`` with the configured
    sentence-transformer model and writes them to a dedicated
    ``review_chunks`` Chroma collection under ``PERSIST_DIR``, inserting
    in batches of ``BATCH_SIZE`` to bound peak memory.
    """
    print(f"Loading embedding model: {EMBEDDING_MODEL}")
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        # NOTE(review): "mps" (Apple Metal) is Mac-only; this will fail on
        # other hosts — switch to "cuda" or "cpu" there. Consider making
        # the device configurable.
        model_kwargs={"device": "mps"},
        # Normalized embeddings make cosine and dot-product ranking agree.
        encode_kwargs={"normalize_embeddings": True},
    )

    print(f"Loading chunks from {CHUNK_PATH}...")
    chunks = load_chunks(CHUNK_PATH)
    print(f"Loaded {len(chunks)} chunks")

    # Guard: Chroma.from_documents errors out on an empty document list;
    # fail soft with a clear message instead.
    if not chunks:
        print("No chunks loaded; nothing to index.")
        return

    print(f"Creating ChromaDB index at {PERSIST_DIR}...")
    # The first batch creates (and persists) the collection.
    db = Chroma.from_documents(
        documents=chunks[:BATCH_SIZE],
        embedding=embeddings,
        persist_directory=PERSIST_DIR,
        collection_name="review_chunks"
    )
    # Remaining chunks are appended to the existing collection in batches.
    for start in tqdm(range(BATCH_SIZE, len(chunks), BATCH_SIZE), desc="Indexing"):
        db.add_documents(chunks[start:start + BATCH_SIZE])

    print(f"Index created with {len(chunks)} chunks.")
    print(f"Persisted to {PERSIST_DIR}")
if __name__ == "__main__":
    # Build (or rebuild) the review-chunk index when run as a script.
    init_chunk_index()