"""One-time indexing: load the constitution PDF -> chunk -> embed -> persist.

Reads every file under ``./data`` (recursively), splits the text into
overlapping chunks, embeds each chunk with a local HuggingFace model and
persists the resulting vector index under ``./chroma_db``.

NOTE(review): despite the directory name, this script never creates a
ChromaDB client or collection. The index is written through
``index.storage_context.persist`` only, i.e. presumably with LlamaIndex's
default file-based stores. Wire in a Chroma vector store explicitly if an
actual ChromaDB database is expected by the query side.

NOTE(review): no table- or image-specific handling is implemented here; PDF
text extraction is whatever ``SimpleDirectoryReader`` does by default.
"""
import os
from pathlib import Path
from typing import Union

from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SimpleNodeParser

# Defaults used when build_index() is called without arguments.
DEFAULT_DATA_DIR = "./data"
DEFAULT_PERSIST_DIR = "./chroma_db"

# Chunking parameters handed to the node parser.
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 200

# Configured at import time so that every index built or loaded in this
# process embeds with the same local CPU model.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="all-MiniLM-L6-v2",
    device="cpu",
)


def build_index(
    data_dir: Union[str, Path] = DEFAULT_DATA_DIR,
    persist_dir: Union[str, Path] = DEFAULT_PERSIST_DIR,
) -> VectorStoreIndex:
    """Build the vector index from ``data_dir`` and persist it to ``persist_dir``.

    Args:
        data_dir: Directory scanned recursively for source documents.
        persist_dir: Directory the index's storage context is written to.

    Returns:
        The freshly built index, so callers can query it without reloading.
        Existing callers that ignore the return value are unaffected.

    Raises:
        Whatever ``SimpleDirectoryReader`` raises for a missing or empty
        input directory; errors are intentionally not swallowed.
    """
    print("Building index...")

    # No ``file_extractor`` override is passed. The previous mapping
    # ``{"pdf": "<dotted path string>"}`` was inert: extractors are looked up
    # by file suffix including the leading dot (".pdf"), and values must be
    # reader instances rather than strings. PDFs were therefore already being
    # parsed by the reader's built-in default, which is what happens here too.
    reader = SimpleDirectoryReader(input_dir=data_dir, recursive=True)
    documents = reader.load_data()

    node_parser = SimpleNodeParser(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    index = VectorStoreIndex.from_documents(
        documents,
        show_progress=True,
        transformations=[node_parser],
    )

    index.storage_context.persist(persist_dir=persist_dir)
    print("Index built successfully!")
    return index


if __name__ == "__main__":
    build_index()