YourHonor / index_builder.py
Abhijeet Mendhe
Upload folder using huggingface_hub
54d04d4 verified
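"""One-time indexing: load the constitution PDF -> chunk -> embed -> persist the index.

Note: the index is persisted through the index's own storage context into a
directory named ./chroma_db; this script does not create a ChromaDB client.
Robust: handles tables/images.
"""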
import os
from pathlib import Path
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SimpleNodeParser
# Register the embedding model on LlamaIndex's global Settings object at
# import time; the model is loaded on the CPU.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="all-MiniLM-L6-v2",
    device="cpu",
)
def build_index(data_dir: str = "./data", persist_dir: str = "./chroma_db") -> None:
    """Build a vector index from the files under *data_dir* and persist it.

    The steps are: read every file found (recursively) under ``data_dir``,
    split the loaded documents with a ``SimpleNodeParser``
    (``chunk_size=1024``, ``chunk_overlap=200``), build a
    ``VectorStoreIndex`` from them, and write the index's storage context
    to ``persist_dir``.

    Args:
        data_dir: Directory containing the source documents. Defaults to
            ``./data``, resolved against the current working directory.
        persist_dir: Directory the storage context is persisted to.
            Defaults to ``./chroma_db``.

    Any exception raised by the reader, the index construction, or the
    persist call propagates to the caller unchanged.
    """
    print("Building index...")

    # No ``file_extractor`` override is passed. The previous mapping
    # ``{"pdf": "llama_index.readers.file.PDFReader"}`` was never consulted:
    # keys are matched against the dotted file suffix (".pdf") and values
    # must be reader instances, not dotted-path strings. The reader's
    # built-in handling for ``.pdf`` files is what actually ran, and still does.
    reader = SimpleDirectoryReader(input_dir=data_dir, recursive=True)
    documents = reader.load_data()

    node_parser = SimpleNodeParser(chunk_size=1024, chunk_overlap=200)
    index = VectorStoreIndex.from_documents(
        documents,
        show_progress=True,
        transformations=[node_parser],
    )

    index.storage_context.persist(persist_dir=persist_dir)
    print("Index built successfully!")
# Script entry point: build the index only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    build_index()