Spaces:
Paused
Paused
File size: 1,680 Bytes
from config import get_settings
import os
def get_file_extension(file_id: str):
    """Return the extension of *file_id*, including the leading dot.

    The result is exactly what ``os.path.splitext`` yields as its second
    element: the original casing is preserved and an empty string is
    returned when the name has no extension.
    """
    _stem, suffix = os.path.splitext(file_id)
    return suffix
def load_file(file_path: str):
    """Load a document from *file_path* with the loader matching its extension.

    The ``CustomLoaders`` setting selects the loader family:

    * truthy  -- the project's own loaders under ``ingestion.loaders``;
    * falsy   -- the loaders from ``langchain_community.document_loaders``.

    Supported extensions are ``.pdf``, ``.docx``, ``.md`` and ``.txt``.
    Matching is case-insensitive in both modes.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected loader returns (a list of Document objects,
        per the loaders' contract). With custom loaders enabled, an
        unsupported extension yields an empty list.

    Raises:
        ValueError: If custom loaders are disabled and the extension is not
            supported.
    """
    raw_extension = os.path.splitext(file_path)[1]
    # Normalise once so "REPORT.PDF" is handled the same way in both modes.
    ext = raw_extension.lower()

    # Read the setting a single time and branch on truthiness, so a value
    # that is neither True nor False cannot fall through and return None.
    if get_settings().CustomLoaders:
        # Imported lazily so the custom loader modules are only required
        # when this mode is actually enabled.
        from ingestion.loaders.pdf_loader import load_pdf
        from ingestion.loaders.txt_loader import load_txt
        from ingestion.loaders.md_loader import load_md
        from ingestion.loaders.docx_loader import load_docx

        custom_loaders = {
            ".pdf": load_pdf,
            ".docx": load_docx,
            ".md": load_md,
            ".txt": load_txt,
        }
        loader = custom_loaders.get(ext)
        if loader is None:
            # Best-effort behaviour in this mode: report and return nothing.
            print(f"Unsupported file type: {ext}")
            return []
        # Return the loader's documents as-is.
        return loader(file_path)

    # Imported lazily so langchain_community is only required in this mode.
    from langchain_community.document_loaders import (
        TextLoader,
        Docx2txtLoader,
        UnstructuredMarkdownLoader,
        PyMuPDFLoader,
    )

    if ext == ".txt":
        return TextLoader(file_path, encoding="utf8").load()
    if ext == ".docx":
        return Docx2txtLoader(file_path).load()
    if ext == ".md":
        return UnstructuredMarkdownLoader(file_path).load()
    if ext == ".pdf":
        return PyMuPDFLoader(file_path).load()
    raise ValueError(f"Unsupported file extension: {raw_extension}")
|