File size: 1,220 Bytes
"""Load raw documents from disk into LangChain ``Document`` objects.

Supports PDF (PyMuPDF), plain text, and Markdown. Each document gets ``source`` and
``page`` metadata for downstream chunking and citations.
"""
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
def load_documents(paths: str | Path | list[str | Path]) -> list[Document]:
    """Load one or more files into a flat list of ``Document`` objects.

    ``.pdf`` files are read with ``PyMuPDFLoader``; ``.txt`` and ``.md`` files
    are read with ``TextLoader`` as UTF-8. The extension check is
    case-insensitive.

    :param paths: A single path (``str`` or ``Path``) or a collection of them.
        An empty collection yields an empty list.
    :returns: The documents from every file, in input order. Each document has
        ``source`` and ``page`` metadata keys; values already set by the loader
        are kept, otherwise they default to the file name and ``0``.
    :raises ValueError: If any path has an unsupported extension. All paths are
        checked before anything is loaded, so no file is read when one of them
        is rejected.
    """
    text_suffixes = {".txt", ".md"}

    # A lone str/Path is a single file; anything else is treated as a
    # collection and materialised so it can be walked twice.
    raw_paths = [paths] if isinstance(paths, (str, Path)) else list(paths)

    # Pass 1: validate every extension up front so an unsupported file at the
    # end of the list does not waste the work of loading the ones before it.
    resolved: list[tuple[str, Path, str]] = []
    for raw in raw_paths:
        path = Path(raw)
        suffix = path.suffix.lower()
        if suffix != ".pdf" and suffix not in text_suffixes:
            raise ValueError(f"Unsupported file type: {suffix or 'unknown'}")
        # Keep the caller's original spelling of the path for the loader.
        resolved.append((str(raw), path, suffix))

    # Pass 2: load each file and fill in any metadata the loader left out.
    all_docs: list[Document] = []
    for location, path, suffix in resolved:
        if suffix == ".pdf":
            loader = PyMuPDFLoader(location)
        else:
            loader = TextLoader(location, encoding="utf-8")
        documents = loader.load()
        for doc in documents:
            doc.metadata.setdefault("source", path.name)
            doc.metadata.setdefault("page", 0)
        all_docs.extend(documents)
    return all_docs
|