| """ One time indexing: Load constitution PDF -> chunks -> embed -> store in ChromaDB | |
| Robust: Handles tables/images """ | |
| import os | |
| from pathlib import Path | |
| from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.core.node_parser import SimpleNodeParser | |
# Configure the process-wide embedding model once at import time:
# MiniLM sentence embeddings, forced onto the CPU.
_embedder = HuggingFaceEmbedding(
    model_name="all-MiniLM-L6-v2",
    device="cpu",
)
Settings.embed_model = _embedder
def build_index():
    """Build a vector index from the documents under ./data and persist it.

    Recursively loads every file in ./data (PDFs via PDFReader), splits the
    documents into 1024-token chunks with 200-token overlap, embeds them with
    the globally configured ``Settings.embed_model``, and writes the index
    to ./chroma_db.

    Returns:
        None. Side effect: creates/overwrites the persisted index directory.
    """
    # PDFReader lives in a subpackage the module header does not import;
    # keep the import local so the fix does not touch the file-level imports.
    from llama_index.readers.file import PDFReader

    print("Building index...")
    # BUG FIX: file_extractor keys must include the leading dot (".pdf")
    # and map to reader *instances* — the original passed the extension
    # without a dot and a dotted-path string, so the custom PDF reader was
    # never actually used.
    reader = SimpleDirectoryReader(
        input_dir="./data",
        recursive=True,
        file_extractor={".pdf": PDFReader()},
    )
    documents = reader.load_data()

    # Chunking strategy: 1024-token chunks, 200-token overlap, applied as a
    # transformation during index construction.
    node_parser = SimpleNodeParser(chunk_size=1024, chunk_overlap=200)
    index = VectorStoreIndex.from_documents(
        documents,
        show_progress=True,
        transformations=[node_parser],
    )

    # NOTE(review): despite the directory name, this persists the *default*
    # vector store to ./chroma_db — no ChromaDB client/ChromaVectorStore is
    # configured anywhere in this file. Confirm whether ChromaDB was intended.
    index.storage_context.persist(persist_dir="./chroma_db")
    print("Index built successfully!")
# Script entry point: rebuild the persisted index when run directly
# (no-op when this module is imported).
if __name__ == "__main__":
    build_index()