| """ One time indexing: Load constitution PDF -> chunks -> embed -> store in ChromaDB | |
| Robust: Handles tables/images """ | |
| import os | |
| from pathlib import Path | |
| from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.core.node_parser import SimpleNodeParser | |
# Configure the process-wide embedding model once at import time:
# MiniLM sentence embeddings, forced onto the CPU.
_embedder = HuggingFaceEmbedding(
    model_name="all-MiniLM-L6-v2",
    device="cpu",
)
Settings.embed_model = _embedder
def build_index():
    """Build a vector index from the documents under ./data and persist it.

    Recursively loads every file in ./data (PDFs via PDFReader), splits the
    documents into 1024-token chunks with 200-token overlap, embeds them with
    the globally configured ``Settings.embed_model``, and writes the index
    to ./chroma_db.

    Returns:
        None. Side effect: creates/overwrites the persisted index directory.
    """
    # PDFReader lives in a subpackage the module header does not import;
    # keep the import local so the fix does not touch the file-level imports.
    from llama_index.readers.file import PDFReader

    print("Building index...")
    # BUG FIX: file_extractor keys must include the leading dot (".pdf")
    # and map to reader *instances* — the original passed the extension
    # without a dot and a dotted-path string, so the custom PDF reader was
    # never actually used.
    reader = SimpleDirectoryReader(
        input_dir="./data",
        recursive=True,
        file_extractor={".pdf": PDFReader()},
    )
    documents = reader.load_data()

    # Chunking strategy: 1024-token chunks, 200-token overlap, applied as a
    # transformation during index construction.
    node_parser = SimpleNodeParser(chunk_size=1024, chunk_overlap=200)
    index = VectorStoreIndex.from_documents(
        documents,
        show_progress=True,
        transformations=[node_parser],
    )

    # NOTE(review): despite the directory name, this persists the *default*
    # vector store to ./chroma_db — no ChromaDB client/ChromaVectorStore is
    # configured anywhere in this file. Confirm whether ChromaDB was intended.
    index.storage_context.persist(persist_dir="./chroma_db")
    print("Index built successfully!")
# Script entry point: rebuild the persisted index when run directly
# (no-op when this module is imported).
if __name__ == "__main__":
    build_index()