| from langchain.document_loaders import PyPDFLoader, TextLoader |
| from langchain.docstore.document import Document |
| from typing import List |
| from langchain.text_splitter import ( |
| RecursiveCharacterTextSplitter, |
| ) |
|
|
|
|
def load_docs(file_path: str, with_pageinfo: bool = True) -> List[Document]:
    """Load a PDF or plain-text file and return its documents.

    After loading, every newline in each document's text is surrounded by
    single spaces. When ``with_pageinfo`` is enabled and a document carries a
    ``page`` entry in its metadata, every occurrence of ``" ."`` in the text
    is replaced by ``"(Quelle Seite: N) ."`` where ``N`` is ``page + 1``.
    Documents without a ``page`` entry are left unmarked.

    Args:
        file_path (str): Path to a ``.pdf`` or ``.txt`` file (the extension is
            matched case-insensitively). An object that is neither a string
            nor path-like is accepted if it exposes the path as ``.name``
            (e.g. a named temporary file wrapper).
        with_pageinfo (bool, optional): If True the page information is added
            to the document text. Defaults to True.

    Raises:
        ValueError: If the file type is not supported.

    Returns:
        List[Document]: List of documents, modified in place as described.
    """
    import os  # function-scope import keeps this block self-contained

    # File wrappers have no ``endswith``; fall back to the path stored in
    # ``.name``. Real path objects are converted with ``os.fspath`` instead,
    # because ``Path.name`` would drop the directory part.
    if not isinstance(file_path, (str, os.PathLike)):
        file_path = file_path.name
    path = os.fspath(file_path)

    lowered = path.lower()
    if lowered.endswith(".pdf"):
        docs = PyPDFLoader(path).load()
    elif lowered.endswith(".txt"):
        docs = TextLoader(path).load()
    else:
        # ``splitext`` looks only at the final path component, so dots in
        # directory names or a missing extension cannot break the message.
        extension = os.path.splitext(path)[1].lstrip(".") or "unknown"
        raise ValueError(
            f"File type ({extension}) not supported. Please upload a pdf or txt file."
        )

    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " \n ")

        if not with_pageinfo:
            continue

        # Not every loader provides a page number; without this guard a
        # missing ``page`` would raise TypeError on ``None + 1``.
        page = (getattr(doc, "metadata", None) or {}).get("page")
        if page is None:
            continue

        doc.page_content = doc.page_content.replace(
            " .", f"(Quelle Seite: {page + 1}) ."
        )

    return docs
|
|