import os
import re

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pypdf import PdfReader


def extract_text_from_pdf(file_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *file_path*.

    ``page.extract_text()`` can return ``None`` for image-only pages,
    so each page falls back to the empty string.
    """
    reader = PdfReader(file_path)
    # join() avoids the quadratic cost of repeated `text += ...`.
    return "".join(page.extract_text() or "" for page in reader.pages)


def _reset_collection(database_name: str, collection_name: str,
                      embeddings: OpenAIEmbeddings) -> None:
    """Drop an existing Chroma collection so re-ingestion starts clean.

    No-op when the persist directory does not exist yet.  Shared by
    ``pdf_to_documents`` and ``store_data`` (previously duplicated).
    """
    if os.path.exists(database_name):
        Chroma(
            persist_directory=database_name,
            embedding_function=embeddings,
            collection_name=collection_name,
        ).delete_collection()


def pdf_to_documents(file_path: str, database_name: str, collection_name: str,
                     embeddings: OpenAIEmbeddings, chunk_size: int = 1000,
                     chunk_overlap: int = 200, metadata: dict = None):
    """Extract, clean, chunk, and index a PDF into a Chroma collection.

    Args:
        file_path: path to the PDF to ingest.
        database_name: Chroma persist directory; an existing collection
            with the same name is deleted first.
        collection_name: name of the target Chroma collection.
        embeddings: embedding model used to vectorize the chunks.
        chunk_size / chunk_overlap: splitter parameters (characters).
        metadata: optional base metadata copied into every chunk's
            metadata (a ``"chunk"`` index key is added per chunk).

    Returns:
        ``(docs, vectorstore)`` — the chunked ``Document`` list and the
        populated ``Chroma`` store; ``([], None)`` when the PDF yields
        no usable text.
    """
    text = extract_text_from_pdf(file_path)
    # Keep only ASCII letters/digits/basic punctuation — strips extraction
    # artifacts.  NOTE: this also drops all non-English characters.
    text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\s]", "", text)
    if not text.strip():
        # BUGFIX: the original returned a bare [] here while the normal
        # path returns a 2-tuple, crashing `docs, vs = ...` callers on
        # empty PDFs.  Keep the return shape consistent.
        return [], None

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = []
    for i, chunk in enumerate(splitter.split_text(text)):
        # Copy so the caller's dict is never mutated.
        meta = dict(metadata) if metadata else {}
        meta["chunk"] = i
        docs.append(Document(page_content=chunk, metadata=meta))

    _reset_collection(database_name, collection_name, embeddings)
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=database_name,
        collection_name=collection_name,
    )
    return docs, vectorstore


def store_data(text: str, database_name: str, collection_name: str,
               embeddings: OpenAIEmbeddings):
    """Chunk raw *text* and index it into a fresh Chroma collection.

    Any existing collection with the same name in *database_name* is
    deleted first.  Returns the populated ``Chroma`` vector store.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[" ", ",", "\n"],
    )
    docs = [Document(page_content=t) for t in text_splitter.split_text(text)]

    _reset_collection(database_name, collection_name, embeddings)
    return Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=database_name,
        collection_name=collection_name,
    )