Spaces:
Sleeping
Sleeping
File size: 1,453 Bytes
6e357ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import os
import tempfile

from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain_core.documents import Document
def load_document_from_file(file) -> list[Document]:
    """Load an uploaded PDF or plain-text file into LangChain documents.

    The upload is spooled to a temporary file because the loaders are
    constructed from a filesystem path. The temporary file is removed once
    loading finishes, whether it succeeds or fails.

    Args:
        file: Upload object exposing ``filename`` and ``file`` (a readable
            binary stream). Presumably a FastAPI ``UploadFile`` -- confirm
            against the caller.

    Returns:
        The documents produced by the loader that matches the extension.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
    """
    # Compare case-insensitively so "REPORT.PDF" is accepted as well; a
    # missing filename falls through to the "unsupported" branch.
    suffix = "." + (file.filename or "").split(".")[-1].lower()

    if suffix == ".pdf":
        loader_cls = PyPDFLoader
    elif suffix == ".txt":
        loader_cls = TextLoader
    else:
        # Reject before touching the disk so nothing is left behind.
        raise ValueError("Unsupported File Type")

    # delete=False lets the loader reopen the file by name after we close
    # it; cleanup is therefore done explicitly in the ``finally`` below.
    temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with temp:
            temp.write(file.file.read())
        return loader_cls(temp.name).load()
    finally:
        os.remove(temp.name)
def load_document_from_url(url: str) -> list[Document]:
    """Load the content found at *url* into LangChain documents.

    Args:
        url: Address handed to ``WebBaseLoader``.

    Returns:
        The list returned by ``WebBaseLoader(url).load()``.
    """
    return WebBaseLoader(url).load()
# NOTE: Disabled combined loader, kept for reference. It accepts either a URL
# string or an uploaded file. Re-enabling it would also require `import re`.
# def load_document_file_or_url(file_or_url) -> list[Document]:
#     # If input is a URL string
#     if isinstance(file_or_url, str) and re.match(r'^https?://', file_or_url):
#         loader = WebBaseLoader(file_or_url)
#         return loader.load()
#     # Otherwise treat it as a file-like object (e.g. from file upload)
#     suffix = "." + file_or_url.filename.split(".")[-1]
#     temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
#     temp.write(file_or_url.file.read())
#     temp.close()
#     if suffix == ".pdf":
#         loader = PyPDFLoader(temp.name)
#     elif suffix == ".txt":
#         loader = TextLoader(temp.name)
#     else:
#         raise ValueError("Unsupported File Type")
#     return loader.load()