Spaces:

lenox-ai
/

prototype

Runtime error

prototype / src /doc_loading.py

Upload folder using huggingface_hub

229f176 over 2 years ago

1.65 kB

	from langchain.document_loaders import PyPDFLoader, TextLoader
	from langchain.docstore.document import Document
	from typing import List
	from langchain.text_splitter import (
	RecursiveCharacterTextSplitter,
	)


	def load_docs(file_path: str, with_pageinfo: bool = True) -> List[Document]:
	    """Load a PDF or plain-text file and return its content as documents.

	    Every newline in the loaded text is padded with spaces (``" \\n "``).
	    When page information is requested and a document carries a ``page``
	    entry in its metadata, each ``" ."`` in the text is replaced by
	    ``"(Quelle Seite: <page + 1>) ."`` so that a page reference is attached
	    to the text.

	    Args:
	        file_path (str): Path to a ``.pdf`` or ``.txt`` file (extension is
	            matched case-insensitively). Path-like objects and file-like
	            objects exposing the path through a ``name`` attribute (such as
	            the wrappers returned by ``tempfile.NamedTemporaryFile``) are
	            accepted as well.
	        with_pageinfo (bool, optional): If True, the page reference is
	            inserted into the text of every document that has a ``page``
	            metadata entry. Defaults to True.

	    Raises:
	        ValueError: If the file type is not supported.

	    Returns:
	        List[Document]: List of documents, unsplit, as produced by the loader.
	    """
	    import os  # local import keeps this function self-contained

	    # Resolve file-like wrappers to the path they point at; plain strings
	    # and path-like objects are passed through os.fspath unchanged.
	    if not isinstance(file_path, (str, os.PathLike)):
	        file_path = getattr(file_path, "name", file_path)
	    path = os.fspath(file_path)

	    lowered = path.lower()
	    if lowered.endswith(".pdf"):
	        loader = PyPDFLoader(path)
	    elif lowered.endswith(".txt"):
	        loader = TextLoader(path)
	    else:
	        # splitext looks only at the final path component, so directory
	        # names containing dots and extension-less files are reported
	        # correctly instead of raising IndexError.
	        extension = os.path.splitext(path)[1].lstrip(".") or "unknown"
	        raise ValueError(
	            f"File type ({extension}) not supported. Please upload a pdf or txt file."
	        )

	    # NOTE: documents are returned as loaded; no text splitting is applied here.
	    docs = loader.load()

	    for doc in docs:
	        doc.page_content = doc.page_content.replace("\n", " \n ")
	        if not with_pageinfo:
	            continue
	        # Not every document has a page number (plain-text files are not
	        # paginated); skip those instead of failing on ``None + 1``.
	        metadata = getattr(doc, "metadata", None) or {}
	        page = metadata.get("page")
	        if page is None:
	            continue
	        # The stored page index is shifted by one for display.
	        doc.page_content = doc.page_content.replace(
	            " .", f"(Quelle Seite: {page + 1}) ."
	        )

	    return docs