File size: 1,453 Bytes
6e357ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain_core.documents import Document
import tempfile

def load_document_from_file(file) -> list[Document]:
    """Load an uploaded PDF or plain-text file into LangChain documents.

    The upload is copied to a temporary file on disk because the loaders
    used here take a filesystem path. The temporary file is removed before
    this function returns, whether loading succeeded or failed.

    Args:
        file: Upload object exposing ``filename`` (str) and ``file`` (a
            readable binary stream). NOTE(review): this looks like a
            FastAPI/Starlette ``UploadFile`` -- confirm against the caller.

    Returns:
        The documents produced by ``PyPDFLoader`` (``.pdf``) or
        ``TextLoader`` (``.txt``).

    Raises:
        ValueError: If the filename is missing or its extension is not
            ``.pdf`` or ``.txt`` (compared case-insensitively).
    """
    import os  # local import: `os` is not among this module's top-level imports

    filename = file.filename or ""
    suffix = "." + filename.split(".")[-1].lower()

    # Validate before touching the disk so a rejected upload leaves nothing behind.
    if suffix not in (".pdf", ".txt"):
        raise ValueError("Unsupported File Type")

    # delete=False so the loader can reopen the path by name on every platform;
    # the `finally` below is then responsible for removing it.
    temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with temp:
            temp.write(file.file.read())

        if suffix == ".pdf":
            loader = PyPDFLoader(temp.name)
        else:
            loader = TextLoader(temp.name)
        return loader.load()
    finally:
        os.unlink(temp.name)

def load_document_from_url(url: str) -> list[Document]:
    """Load the web page at ``url`` into LangChain documents.

    Args:
        url: Address handed unchanged to ``WebBaseLoader``.

    Returns:
        Whatever ``WebBaseLoader(url).load()`` produces.
    """
    return WebBaseLoader(url).load()

# def load_document_file_or_url(file_or_url) -> list[Document]:
#     # If input is a URL string
#     if isinstance(file_or_url, str) and re.match(r'^https?://', file_or_url):
#         loader = WebBaseLoader(file_or_url)
#         return loader.load()

#     # Otherwise treat it as a file-like object (e.g. from file upload)
#     suffix = "." + file_or_url.filename.split(".")[-1]
#     temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
#     temp.write(file_or_url.file.read())
#     temp.close()

#     if suffix == ".pdf":
#         loader = PyPDFLoader(temp.name)
#     elif suffix == ".txt":
#         loader = TextLoader(temp.name)
#     else:
#         raise ValueError("Unsupported File Type")
    
#     return loader.load()