from pathlib import Path

from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_core.documents import Document


def load_documents(paths: str | Path | list[str | Path]) -> list[Document]:
    """Load PDF, text, and Markdown files into one flat list of documents.

    The loader is chosen from the file suffix (case-insensitive): ``.pdf``
    uses ``PyMuPDFLoader``; ``.txt`` and ``.md`` use ``TextLoader`` with
    UTF-8 decoding.

    Args:
        paths: A single file path (``str`` or ``pathlib.Path``) or a list of
            such paths.

    Returns:
        The documents produced for every path, concatenated in input order.
        Each document's metadata has a ``"source"`` key (falling back to the
        file name) and a ``"page"`` key (falling back to ``0``); values
        already set by the loader are left untouched.

    Raises:
        ValueError: If a path has a suffix other than ``.pdf``, ``.txt`` or
            ``.md``. The check happens when the loop reaches that path, so
            earlier paths have already been loaded by then.

    Exceptions raised by the underlying loaders are not caught here.
    """
    # A lone str or Path is one file, not an iterable of files; wrap it so
    # the loop below serves both call shapes.
    normalized_paths = [paths] if isinstance(paths, (str, Path)) else paths

    all_docs: list[Document] = []
    for path_str in normalized_paths:
        path = Path(path_str)
        suffix = path.suffix.lower()

        if suffix == ".pdf":
            loader = PyMuPDFLoader(str(path_str))
        elif suffix in {".txt", ".md"}:
            loader = TextLoader(str(path_str), encoding="utf-8")
        else:
            raise ValueError(f"Unsupported file type: {suffix or 'unknown'}")

        documents = loader.load()
        for doc in documents:
            # setdefault only fills gaps: metadata the loader already
            # provided wins over these fallbacks.
            doc.metadata.setdefault("source", path.name)
            doc.metadata.setdefault("page", 0)
        all_docs.extend(documents)

    return all_docs