"""Build a Chroma vector store from the PDF files found under ``DATA_PATH``.

Pipeline: load every PDF in the data directory, split the pages into
overlapping text chunks, embed the chunks with OpenAI embeddings and write
them to a Chroma collection persisted at ``CHROMA_PATH``.
"""

import os
import shutil

from dotenv import load_dotenv
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain_openai import OpenAIEmbeddings

# Pull variables from a local .env file into the process environment before
# the API key is read below.
load_dotenv()

OPEN_AI_KEY = os.getenv('OPEN_AI_KEY')
CHROMA_PATH = "chroma"
DATA_PATH = "data/"
# Single-PDF path kept for ad-hoc debugging with PyPDFLoader; the pipeline
# itself loads the whole DATA_PATH directory.
TEST_PATH = "data/theory_of_computation.pdf"

# Index of the chunk printed by split_text() as a quick sanity-check sample.
_PREVIEW_CHUNK_INDEX = 10

embed = OpenAIEmbeddings(
    api_key=OPEN_AI_KEY,
    model="text-embedding-3-large",
)


def main() -> None:
    """Script entry point: rebuild the vector store from scratch."""
    generate_data_store()


def generate_data_store() -> None:
    """Run the full load -> split -> persist pipeline."""
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)


def load_documents() -> list[Document]:
    """Load every PDF under ``DATA_PATH``.

    Returns:
        The documents produced by ``PyPDFDirectoryLoader``; an empty list
        when the directory yields nothing.
    """
    loader = PyPDFDirectoryLoader(DATA_PATH)
    docs = loader.load()
    if docs:
        # Show the first document's metadata as a quick sanity check.
        print(docs[0].metadata)
    else:
        # Previously docs[0] raised IndexError on an empty directory.
        print(f"No documents found in {DATA_PATH}.")
    return docs


def split_text(documents: list[Document]) -> list[Document]:
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        documents: Documents to split.

    Returns:
        The chunks produced by the splitter (possibly empty).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1100,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one sample chunk for inspection, but only when it exists: small
    # inputs can produce fewer chunks than the preview index.
    if len(chunks) > _PREVIEW_CHUNK_INDEX:
        sample = chunks[_PREVIEW_CHUNK_INDEX]
        print(sample.page_content)
        print(sample.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]) -> None:
    """Replace the Chroma store at ``CHROMA_PATH`` with the given chunks.

    Any existing store directory is deleted first so the collection only
    contains the chunks passed in. When ``chunks`` is empty nothing is
    deleted or written, so a previously built store is left intact.

    Args:
        chunks: Chunks whose text and metadata are added to the collection.
    """
    if not chunks:
        print(f"No chunks to save; leaving CHROMA PATH {CHROMA_PATH} untouched.")
        return

    if os.path.exists(CHROMA_PATH):
        # Clear out the DB first so stale chunks never mix with new ones.
        shutil.rmtree(CHROMA_PATH)

    db = Chroma(
        collection_name="linux_funds",
        embedding_function=embed,
        persist_directory=CHROMA_PATH,
    )

    # Split each chunk into the parallel text / metadata lists add_texts takes.
    texts = [chunk.page_content for chunk in chunks]
    metadatas = [chunk.metadata for chunk in chunks]
    db.add_texts(texts=texts, metadatas=metadatas)
    print(f"Saved {len(chunks)} chunks to CHROMA PATH {CHROMA_PATH}.")


if __name__ == "__main__":
    main()