EXAM_RAG_API / ingestion /loaders /File_loader.py
MinaNasser's picture
1st
1bc3f18
from config import get_settings
import os
def get_file_extension(file_id: str):
    """Return the extension of *file_id*, including its leading dot.

    The result is exactly what ``os.path.splitext`` yields as the second
    element: the original letter case is preserved, and an empty string is
    returned when the name has no extension (including dot-files such as
    ``.bashrc``).
    """
    _stem, suffix = os.path.splitext(file_id)
    return suffix
def load_file(file_path: str):
    """Load the document at *file_path* with the loader matching its extension.

    The extension is matched case-insensitively against ``.pdf``, ``.docx``,
    ``.md`` and ``.txt``. The ``CustomLoaders`` setting returned by
    ``get_settings()`` selects the loader family:

    * truthy -> the project's own loaders under ``ingestion.loaders``;
    * falsy  -> the LangChain community document loaders.

    Args:
        file_path: Path of the file to load.

    Returns:
        The value produced by the selected loader (for the LangChain
        loaders, the result of ``.load()``). With custom loaders enabled,
        an unsupported extension yields an empty list.

    Raises:
        ValueError: If custom loaders are disabled and the extension is
            not one of the supported types.
    """
    # Normalise once so both loader families treat "FILE.PDF" and "file.pdf"
    # the same way.
    ext = get_file_extension(file_path).lower()

    # Read the setting a single time and branch on truthiness, so a non-bool
    # value can no longer fall through both branches and return None.
    use_custom_loaders = get_settings().CustomLoaders

    if use_custom_loaders:
        # Imported lazily so only the selected loader family is loaded.
        from ingestion.loaders.pdf_loader import load_pdf
        from ingestion.loaders.txt_loader import load_txt
        from ingestion.loaders.md_loader import load_md
        from ingestion.loaders.docx_loader import load_docx

        # Dispatcher: extension -> custom loader function.
        custom_loaders = {
            ".pdf": load_pdf,
            ".docx": load_docx,
            ".md": load_md,
            ".txt": load_txt,
        }
        loader = custom_loaders.get(ext)
        if loader is None:
            # Best-effort behaviour of this mode: report and return nothing
            # instead of raising.
            print(f"Unsupported file type: {ext}")
            return []
        # Return the loader's result as-is.
        return loader(file_path)

    # Imported lazily so LangChain is only required when it is actually used.
    from langchain_community.document_loaders import (
        TextLoader,
        Docx2txtLoader,
        UnstructuredMarkdownLoader,
        PyMuPDFLoader,
    )

    if ext == ".txt":
        return TextLoader(file_path, encoding="utf8").load()
    if ext == ".docx":
        return Docx2txtLoader(file_path).load()
    if ext == ".md":
        return UnstructuredMarkdownLoader(file_path).load()
    if ext == ".pdf":
        return PyMuPDFLoader(file_path).load()
    raise ValueError(f"Unsupported file extension: {ext}")