Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader | |
| from langchain_core.documents import Document | |
| import tempfile | |
def load_document_from_file(file) -> list[Document]:
    """Load an uploaded PDF or plain-text file into LangChain documents.

    The upload is copied to a temporary file because both loaders used here
    are constructed from a filesystem path. The temporary file is removed
    again once loading has finished (or failed).

    Args:
        file: Upload object exposing ``filename`` and ``file`` (a readable
            binary stream). Presumably a FastAPI/Starlette ``UploadFile``;
            confirm against the caller.

    Returns:
        The documents returned by the selected loader's ``load()``.

    Raises:
        ValueError: If the filename extension is not ``.pdf`` or ``.txt``
            (compared case-insensitively), including a missing filename.
    """
    import os  # local import: only needed for temp-file cleanup

    # Treat a missing filename as "no extension" instead of crashing.
    filename = file.filename or ""
    suffix = "." + filename.rsplit(".", 1)[-1].lower()

    # Reject unsupported types before reading the upload or touching disk,
    # so a bad request never leaves a stray temporary file behind.
    if suffix not in (".pdf", ".txt"):
        raise ValueError("Unsupported File Type")

    # delete=False because the loader re-opens the file by name after we
    # close it; cleanup is handled explicitly in the ``finally`` below.
    temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with temp:
            temp.write(file.file.read())

        if suffix == ".pdf":
            loader = PyPDFLoader(temp.name)
        else:
            loader = TextLoader(temp.name)
        return loader.load()
    finally:
        os.unlink(temp.name)
def load_document_from_url(url: str) -> list[Document]:
    """Load the content at ``url`` into LangChain documents.

    Args:
        url: Address handed unchanged to ``WebBaseLoader``.

    Returns:
        Whatever ``WebBaseLoader(url).load()`` returns.
    """
    return WebBaseLoader(url).load()
| # def load_document_file_or_url(file_or_url) -> list[Document]: | |
| # # If input is a URL string | |
| # if isinstance(file_or_url, str) and re.match(r'^https?://', file_or_url): | |
| # loader = WebBaseLoader(file_or_url) | |
| # return loader.load() | |
| # # Otherwise treat it as a file-like object (e.g. from file upload) | |
| # suffix = "." + file_or_url.filename.split(".")[-1] | |
| # temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix) | |
| # temp.write(file_or_url.file.read()) | |
| # temp.close() | |
| # if suffix == ".pdf": | |
| # loader = PyPDFLoader(temp.name) | |
| # elif suffix == ".txt": | |
| # loader = TextLoader(temp.name) | |
| # else: | |
| # raise ValueError("Unsupported File Type") | |
| # return loader.load() |