File size: 1,645 Bytes
2f12302
 
 
229f176
 
 
2f12302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229f176
2f12302
 
229f176
 
 
 
 
2f12302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.docstore.document import Document
from typing import List
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)


def load_docs(file_path: str, with_pageinfo: bool = True) -> List[Document]:
    """Load a pdf or txt file and return its content as documents.

    The documents are returned as produced by the loader; they are not split
    into smaller chunks here.

    Args:
        file_path (str): Path to the file as a string. The extension
            (matched case-insensitively) selects the loader: ``.pdf`` or
            ``.txt``. For a temporary file, pass its path string rather
            than the file object.
        with_pageinfo (bool, optional): If True, a page marker is inserted
            into the text of every document that carries a ``page`` entry
            in its metadata. Defaults to True.

    Raises:
        ValueError: If the file type is not supported.

    Returns:
        List[Document]: List of documents.
    """
    import os

    # Compare on a lower-cased copy so ".PDF" / ".TXT" are accepted as well.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered_path.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        # splitext only looks at the last path component, so dots in
        # directory names (e.g. "./data/file") and missing extensions do not
        # break the error message.
        extension = os.path.splitext(file_path)[1].lstrip(".") or "unknown"
        raise ValueError(
            f"File type ({extension}) not supported. Please upload a pdf or txt file."
        )
    docs = loader.load()

    for doc in docs:
        # Pad newlines with spaces so they stay separated from adjacent words.
        doc.page_content = doc.page_content.replace("\n", " \n ")
        if not with_pageinfo:
            continue
        # Documents without a "page" entry (presumably the ones coming from
        # TextLoader - confirm) are left unmarked instead of failing on
        # ``None + 1``.
        metadata = getattr(doc, "metadata", None) or {}
        page = metadata.get("page")
        if page is None:
            continue
        # Insert the marker (stored page value plus one) in front of every
        # " ." occurrence in the text.
        marker = f"(Quelle Seite: {page + 1}) ."
        doc.page_content = marker.join(doc.page_content.split(" ."))

    return docs