# ARC/app/rag/loader.py
# github-actions[bot]
# Deploy from GitHub Actions: 37b5d8a4f4600eb83d09a6e8bb178d0f7e6bc890
# c1b316f
from langchain_community.document_loaders import (
UnstructuredCSVLoader,
UnstructuredWordDocumentLoader,
JSONLoader,
TextLoader,
UnstructuredExcelLoader,
UnstructuredMarkdownLoader,
UnstructuredPowerPointLoader,
)
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from langchain_core.documents import Document
# PDF
# https://docs.langchain.com/oss/python/integrations/document_loaders/pymupdf4llm
def read_pdf(path: str) -> list[Document]:
    """Load a PDF file into documents.

    Args:
        path: Location of the PDF file, handed directly to
            ``PyMuPDF4LLMLoader``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return PyMuPDF4LLMLoader(path).load()
# TXT
# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.text.TextLoader
def read_txt(path: str) -> list[Document]:
    """Load a plain-text file into documents.

    Args:
        path: Location of the text file; it is opened through ``TextLoader``
            with ``encoding="utf-8"``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return TextLoader(path, encoding="utf-8").load()
# CSV
# https://python.langchain.com/docs/integrations/document_loaders/csv
def read_csv(path: str) -> list[Document]:
    """Load a CSV file into documents.

    Args:
        path: Location of the CSV file; passed as ``file_path`` to
            ``UnstructuredCSVLoader`` together with ``mode="elements"`` and
            ``encoding="utf-8"``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return UnstructuredCSVLoader(
        file_path=path,
        mode="elements",
        encoding="utf-8",
    ).load()
# MD
# https://python.langchain.com/docs/integrations/document_loaders/unstructured_file/
def read_md(path: str) -> list[Document]:
    """Load a Markdown file into documents.

    Args:
        path: Location of the Markdown file; handed to
            ``UnstructuredMarkdownLoader`` with ``encoding="utf-8"``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return UnstructuredMarkdownLoader(path, encoding="utf-8").load()
# JSON
# https://python.langchain.com/docs/integrations/document_loaders/json
def read_json(path: str) -> list[Document]:
    """Load a JSON file into documents.

    Args:
        path: Location of the JSON file; passed as ``file_path`` to
            ``JSONLoader`` with ``jq_schema="."`` and ``text_content=False``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return JSONLoader(
        file_path=path,
        jq_schema=".",
        text_content=False,
    ).load()
# DOCX
# https://python.langchain.com/docs/integrations/document_loaders/microsoft_word
def read_docx(path: str) -> list[Document]:
    """Load a Word document into documents.

    Args:
        path: Location of the DOCX file; handed to
            ``UnstructuredWordDocumentLoader`` with ``mode="elements"``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return UnstructuredWordDocumentLoader(path, mode="elements").load()
# XLSX
# https://python.langchain.com/docs/integrations/document_loaders/microsoft_excel
def read_xlsx(path: str) -> list[Document]:
    """Load an Excel workbook into documents.

    Args:
        path: Location of the XLSX file; handed to
            ``UnstructuredExcelLoader`` with ``mode="elements"``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return UnstructuredExcelLoader(path, mode="elements").load()
# PPTX
# https://python.langchain.com/docs/integrations/document_loaders/microsoft_powerpoint
def read_pptx(path: str) -> list[Document]:
    """Load a PowerPoint presentation into documents.

    Args:
        path: Location of the PPTX file; handed to
            ``UnstructuredPowerPointLoader`` with ``mode="elements"``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return UnstructuredPowerPointLoader(path, mode="elements").load()
# TEX
# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.text.TextLoader
def read_tex(path: str) -> list[Document]:
    """Load a TeX source file into documents.

    The file is read with ``TextLoader`` and ``encoding="utf-8"``.

    Args:
        path: Location of the TeX file.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return TextLoader(path, encoding="utf-8").load()