File size: 1,220 Bytes
d44b33d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""Load raw documents from disk into LangChain ``Document`` objects.

Supports PDF (PyMuPDF), plain text, and Markdown. Each document gets ``source`` and
``page`` metadata for downstream chunking and citations.
"""

from pathlib import Path

from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader


def load_documents(paths: str | Path | list[str | Path]) -> list[Document]:
    """Load one or more files into a flat list of ``Document`` objects.

    Args:
        paths: A single file path (``str`` or ``Path``) or a list of them.
            The loader is chosen from the file extension, compared
            case-insensitively: ``.pdf`` uses ``PyMuPDFLoader``; ``.txt`` and
            ``.md`` use ``TextLoader`` with UTF-8 decoding.

    Returns:
        Every document produced by the loaders, in input order. Each document
        carries ``source`` and ``page`` metadata keys: values already set by
        the loader are kept, otherwise they default to the file name and ``0``.

    Raises:
        ValueError: If a path has an unsupported or missing extension. Paths
            are processed in order, so this is raised when the offending path
            is reached.
    """
    # A lone ``Path`` must be wrapped just like a lone ``str``: it is not
    # iterable, so the loop below would otherwise fail with ``TypeError``.
    normalized_paths = [paths] if isinstance(paths, (str, Path)) else paths

    all_docs: list[Document] = []
    for path_str in normalized_paths:
        path = Path(path_str)
        suffix = path.suffix.lower()

        if suffix == ".pdf":
            loader = PyMuPDFLoader(str(path_str))
        elif suffix in {".txt", ".md"}:
            loader = TextLoader(str(path_str), encoding="utf-8")
        else:
            raise ValueError(f"Unsupported file type: {suffix or 'unknown'}")

        documents = loader.load()
        for doc in documents:
            # ``setdefault`` only fills gaps; metadata the loader already
            # provided is left untouched.
            doc.metadata.setdefault("source", path.name)
            doc.metadata.setdefault("page", 0)
        all_docs.extend(documents)

    return all_docs