File size: 1,020 Bytes
54d04d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
""" One time indexing: Load constitution PDF -> chunks -> embed -> store in ChromaDB

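Robust: intended to handle PDFs that contain tables/images.
NOTE(review): no table- or image-specific handling is visible in this script,
and no ChromaDB client is configured here -- the index is written with
LlamaIndex's storage_context.persist() into ./chroma_db. Confirm intent. """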

import os
from pathlib import Path
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SimpleNodeParser

# Register the embedding model on the global LlamaIndex Settings object so that
# any index built after this module is imported uses it. The model is pinned to
# the CPU via device="cpu".
_embed_model = HuggingFaceEmbedding(
    model_name="all-MiniLM-L6-v2",
    device="cpu",
)
Settings.embed_model = _embed_model

def build_index(
    data_dir: str = "./data",
    persist_dir: str = "./chroma_db",
    chunk_size: int = 1024,
    chunk_overlap: int = 200,
) -> VectorStoreIndex:
    """Load documents, chunk and embed them, and persist the resulting index.

    Files under ``data_dir`` are read recursively, split into nodes of
    ``chunk_size`` with ``chunk_overlap`` overlap, embedded with the model
    registered on ``Settings.embed_model``, and written to ``persist_dir``.

    NOTE(review): despite the default directory name, no Chroma vector store
    is configured in this function; the index is persisted through
    ``index.storage_context.persist``. Confirm whether a real ChromaDB
    backend was intended.

    Args:
        data_dir: Directory to scan (recursively) for input files.
        persist_dir: Directory the index storage is written to.
        chunk_size: Chunk size passed to the node parser.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The freshly built ``VectorStoreIndex``.
    """
    # Imported locally so the file's top-level import block stays untouched.
    from llama_index.readers.file import PDFReader

    print("Building index...")
    reader = SimpleDirectoryReader(
        input_dir=data_dir,
        recursive=True,
        # file_extractor is keyed by file suffix *including the dot* and maps
        # to reader instances. The previous {"pdf": "<dotted path string>"}
        # entry never matched a ".pdf" suffix and was not a reader object.
        file_extractor={".pdf": PDFReader()},
    )
    documents = reader.load_data()

    node_parser = SimpleNodeParser(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    index = VectorStoreIndex.from_documents(
        documents,
        show_progress=True,
        transformations=[node_parser],
    )
    index.storage_context.persist(persist_dir=persist_dir)
    print("Index built successfully!")
    return index

# Script entry point: build and persist the index only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    build_index()