""" document_loader.py ------------------ Loads and parses documents from a file path or directory. Supports PDF, TXT, DOCX, HTML, and Markdown (.md). Returns a list of LangChain Document objects with metadata (source filename, page number where available). Bug Fix (Step 1): - Changed setdefault to direct assignment so source metadata is always normalized to just the filename, not the full absolute path. Enhancement (Step 2): - Added .md (Markdown) support via TextLoader. """ import logging from pathlib import Path from typing import List from langchain.schema import Document from langchain_community.document_loaders import ( PyMuPDFLoader, TextLoader, Docx2txtLoader, UnstructuredHTMLLoader, ) logger = logging.getLogger(__name__) # Map file extensions → loader class LOADER_MAP = { ".pdf": PyMuPDFLoader, ".txt": TextLoader, ".md": TextLoader, # Markdown support added ".docx": Docx2txtLoader, ".html": UnstructuredHTMLLoader, ".htm": UnstructuredHTMLLoader, } def load_document(file_path: str | Path) -> List[Document]: """ Load a single document and return a list of LangChain Document objects. Args: file_path: Absolute or relative path to the document. Returns: List of Document objects with `.page_content` and `.metadata`. Raises: ValueError: If the file type is not supported. FileNotFoundError: If the file does not exist. """ file_path = Path(file_path) if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") ext = file_path.suffix.lower() loader_cls = LOADER_MAP.get(ext) if loader_cls is None: supported = ", ".join(LOADER_MAP.keys()) raise ValueError( f"Unsupported file type '{ext}'. Supported types: {supported}" ) logger.info("Loading %s with %s", file_path.name, loader_cls.__name__) loader = loader_cls(str(file_path)) docs = loader.load() # BUG FIX: always overwrite source with just the filename. # LangChain loaders set source to the full absolute path by default. # setdefault() would leave the full path intact since the key already exists. # Direct assignment always normalises to just the filename. for doc in docs: doc.metadata["source"] = file_path.name logger.info("Loaded %d page(s) from %s", len(docs), file_path.name) return docs def load_documents_from_directory(directory: str | Path) -> List[Document]: """ Recursively load all supported documents from a directory. Args: directory: Path to the folder containing source documents. Returns: Flat list of Document objects from all files found. """ directory = Path(directory) if not directory.is_dir(): raise NotADirectoryError(f"Not a directory: {directory}") all_docs: List[Document] = [] supported_exts = set(LOADER_MAP.keys()) for file_path in sorted(directory.rglob("*")): if file_path.suffix.lower() in supported_exts: try: docs = load_document(file_path) all_docs.extend(docs) except Exception as exc: logger.warning("Skipping %s — %s", file_path.name, exc) logger.info( "Loaded %d document chunk(s) from directory '%s'", len(all_docs), directory, ) return all_docs