import os
import tempfile

from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    WebBaseLoader,
)
from langchain_core.documents import Document


def load_document_from_file(file) -> list[Document]:
    """Load an uploaded ``.pdf`` or ``.txt`` file into LangChain documents.

    The upload is copied to a temporary file on disk because the loaders
    are given a filesystem path. The temporary file is removed before this
    function returns, whether loading succeeds or fails.

    Args:
        file: Upload-like object exposing ``filename`` (str) and ``file``
            (a readable binary stream).

    Returns:
        The documents produced by the matching loader's ``load()``.

    Raises:
        ValueError: If the filename extension is neither ``.pdf`` nor
            ``.txt`` (compared case-insensitively), or if the filename is
            missing.
    """
    filename = file.filename or ""
    # Lower-case so that "REPORT.PDF" is handled the same as "report.pdf".
    suffix = "." + filename.split(".")[-1].lower()

    # Pick the loader before touching the disk so that unsupported uploads
    # never create a temporary file.
    if suffix == ".pdf":
        loader_cls = PyPDFLoader
    elif suffix == ".txt":
        loader_cls = TextLoader
    else:
        raise ValueError("Unsupported File Type")

    temp_path = None
    try:
        # delete=False: the loader reopens the file by path, so the handle
        # is closed first and the file is removed manually in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp:
            temp_path = temp.name
            temp.write(file.file.read())
        return loader_cls(temp_path).load()
    finally:
        if temp_path is not None:
            os.unlink(temp_path)


def load_document_from_url(url: str) -> list[Document]:
    """Load the page at ``url`` into LangChain documents.

    Args:
        url: Address passed unchanged to ``WebBaseLoader``.

    Returns:
        The documents produced by ``WebBaseLoader(url).load()``.
    """
    loader = WebBaseLoader(url)
    return loader.load()


# Disabled draft of a combined file-or-URL entry point, kept for reference.
# NOTE: it uses `re`, which this module does not import; add `import re`
# before re-enabling it.
#
# def load_document_file_or_url(file_or_url) -> list[Document]:
#     # If input is a URL string
#     if isinstance(file_or_url, str) and re.match(r'^https?://', file_or_url):
#         loader = WebBaseLoader(file_or_url)
#         return loader.load()
#
#     # Otherwise treat it as a file-like object (e.g. from file upload)
#     suffix = "." + file_or_url.filename.split(".")[-1]
#     temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
#     temp.write(file_or_url.file.read())
#     temp.close()
#
#     if suffix == ".pdf":
#         loader = PyPDFLoader(temp.name)
#     elif suffix == ".txt":
#         loader = TextLoader(temp.name)
#     else:
#         raise ValueError("Unsupported File Type")
#
#     return loader.load()