# Spaces: Runtime error — status banner scraped from the Hugging Face Spaces
# page, not part of the program; kept here as a comment so the file parses.
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
import shutil

# Load environment variables (including the OpenAI key) from a local .env file.
load_dotenv()
# NOTE(review): None when the 'OPEN_AI_KEY' env var is unset — the embeddings
# call will then fail later at request time; consider failing fast here.
OPEN_AI_KEY = os.getenv('OPEN_AI_KEY')

CHROMA_PATH = "chroma"  # on-disk persistence directory for the Chroma store
DATA_PATH = "data/"     # directory scanned for PDFs by load_documents()
TEST_PATH = "data/theory_of_computation.pdf"  # single-file path (used only by commented-out loader)

# Shared embedding model used when writing chunks into the Chroma store.
embed = OpenAIEmbeddings(
    api_key=OPEN_AI_KEY,
    model="text-embedding-3-large"
)
def main():
    """Entry point: build the Chroma vector store from the PDFs in DATA_PATH."""
    generate_data_store()
def generate_data_store():
    """Run the full pipeline: load PDFs, chunk them, persist to Chroma."""
    save_to_chroma(split_text(load_documents()))
def load_documents():
    """Load every PDF found under DATA_PATH.

    Returns:
        list[Document]: one Document per loaded page (PyPDFDirectoryLoader
        splits PDFs page-wise); empty list when no PDFs are present.
    """
    loader = PyPDFDirectoryLoader(DATA_PATH)
    docs = loader.load()
    if docs:
        # Quick sanity peek at the first page's metadata.
        print(docs[0].metadata)
    else:
        # Fix: the original indexed docs[0] unconditionally, which raised
        # IndexError when the data directory contained no PDFs.
        print(f"No PDF documents found in {DATA_PATH!r}.")
    return docs
| def split_text(documents: list[Document]): | |
| # chunk_size = 1000, | |
| # chunk_overlap = 200, | |
| # length_function = len, | |
| # add_start_index = True, | |
| text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=1100, | |
| chunk_overlap=100, | |
| length_function=len, | |
| ) | |
| chunks = text_splitter.split_documents(documents) | |
| print(f"Split {len(documents)} documents into {len(chunks)} chunks.") | |
| document = chunks[10] | |
| print(document.page_content) | |
| print(document.metadata) | |
| return chunks | |
def save_to_chroma(chunks: list[Document]):
    """Persist chunk texts and metadata into a fresh Chroma collection.

    Deletes any existing store at CHROMA_PATH first, so each run rebuilds
    the index from scratch.
    """
    # Start from a clean slate: drop any previous on-disk index.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    store = Chroma(
        collection_name="linux_funds",
        embedding_function=embed,
        persist_directory=CHROMA_PATH,
    )

    # Separate each chunk into its text and its metadata for the vector store.
    texts = []
    metadatas = []
    for chunk in chunks:
        texts.append(chunk.page_content)
        metadatas.append(chunk.metadata)

    store.add_texts(texts=texts, metadatas=metadatas)
    print(f"Saved {len(chunks)} chunks to CHROMA PATH {CHROMA_PATH}.")
# Run the ingestion pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()