import os
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)


def load_docs(file_path: str, with_pageinfo: bool = True) -> List[Document]:
    """Load a PDF or plain-text file and return its content as documents.

    The loader is chosen from the file extension (case-insensitive). Every
    newline in the loaded text is padded with a space on both sides, and the
    documents are returned unsplit.

    Args:
        file_path (str): Path to the file as a string. String methods are
            called on it, so pass the path of a temporary file (its
            ``name``), not the file object itself.
        with_pageinfo (bool, optional): If True, a page marker
            ``(Quelle Seite: N)`` (German for "source page") is inserted in
            place of the space of every ``" ."`` sequence, for documents whose
            metadata carries a ``page`` entry. Documents without a page entry
            are left unmarked. Defaults to True.

    Raises:
        ValueError: If the file type is not supported.

    Returns:
        List[Document]: List of documents.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered_path.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        # splitext only inspects the final path component, so dots in
        # directory names (or a missing extension) cannot break the message.
        file_type = os.path.splitext(file_path)[1].lstrip(".") or "unknown"
        raise ValueError(
            f"File type ({file_type}) not supported. Please upload a pdf or txt file."
        )

    docs = loader.load()

    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " \n ")

        if not with_pageinfo:
            continue

        # Only some documents carry a page entry; skip the marker when it is
        # absent instead of failing on ``None + 1``.
        metadata = getattr(doc, "metadata", None) or {}
        page = metadata.get("page")
        if page is None:
            continue

        # NOTE(review): the +1 presumably converts a zero-based page index to
        # a human-readable page number -- confirm against the PDF loader.
        marker = f"(Quelle Seite: {page + 1}) ."
        doc.page_content = doc.page_content.replace(" .", marker)

    return docs