# prototype/src/doc_loading.py
# fvde: Upload folder using huggingface_hub (229f176)
import os
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)
def load_docs(file_path: str, with_pageinfo: bool = True) -> List[Document]:
    """Load a PDF or plain-text file into a list of documents.

    After loading, every newline in each document's text is padded with one
    space on each side. When ``with_pageinfo`` is true and a document has a
    ``page`` entry in its metadata, the marker ``(Quelle Seite: <page + 1>)``
    (German for "source page") is inserted in front of every ``" ."``
    sequence of that document's text.

    Args:
        file_path (str): Path to a ``.pdf`` or ``.txt`` file (the extension
            is matched case-insensitively). ``os.PathLike`` objects and
            file-like objects exposing their path as ``.name`` (for example
            the wrapper returned by ``tempfile.NamedTemporaryFile``) are
            accepted as well.
        with_pageinfo (bool, optional): If True, the page marker described
            above is added to documents that carry page metadata. Defaults
            to True.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.txt``.
        TypeError: If ``file_path`` cannot be resolved to a filesystem path.

    Returns:
        List[Document]: The loaded documents, modified in place as described.
    """
    # Temp-file wrappers are not path-like themselves but expose the on-disk
    # path as ``.name``. Path objects are excluded here on purpose: their
    # ``.name`` is only the final path component.
    if not isinstance(file_path, (str, os.PathLike)):
        file_path = getattr(file_path, "name", file_path)
    path = os.fspath(file_path)
    lowered = path.lower()

    # Pages are returned whole; no text splitting is applied here.
    if lowered.endswith(".pdf"):
        docs = PyPDFLoader(path).load()
    elif lowered.endswith(".txt"):
        docs = TextLoader(path).load()
    else:
        # splitext looks only at the last suffix of the final path component,
        # so multi-dot names and extension-less names are reported correctly.
        extension = os.path.splitext(path)[1].lstrip(".") or "unknown"
        raise ValueError(
            f"File type ({extension}) not supported. Please upload a pdf or txt file."
        )

    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " \n ")
        if not with_pageinfo:
            continue
        # Documents without a page number in their metadata get no marker;
        # adding 1 to a missing page would raise a TypeError.
        page = (getattr(doc, "metadata", None) or {}).get("page")
        if page is None:
            continue
        # page + 1: presumably the loader numbers pages from 0 - TODO confirm.
        doc.page_content = doc.page_content.replace(
            " .", f"(Quelle Seite: {page + 1}) ."
        )
    return docs