Spaces:
Running
Running
| """ | |
| document_loader.py | |
| ------------------ | |
| Loads and parses documents from a file path or directory. | |
| Supports PDF, TXT, DOCX, HTML, and Markdown (.md). | |
| Returns a list of LangChain Document objects with metadata | |
| (source filename, page number where available). | |
| Bug Fix (Step 1): | |
| - Changed setdefault to direct assignment so source metadata is always | |
| normalized to just the filename, not the full absolute path. | |
| Enhancement (Step 2): | |
| - Added .md (Markdown) support via TextLoader. | |
| """ | |
| import logging | |
| from pathlib import Path | |
| from typing import List | |
| from langchain.schema import Document | |
| from langchain_community.document_loaders import ( | |
| PyMuPDFLoader, | |
| TextLoader, | |
| Docx2txtLoader, | |
| UnstructuredHTMLLoader, | |
| ) | |
logger = logging.getLogger(__name__)

# Dispatch table: lower-cased file suffix (including the leading dot) -> the
# loader class instantiated for files with that suffix. Insertion order is
# kept as-is because the keys are joined, in order, into the
# "Supported types" error message raised for unknown suffixes.
LOADER_MAP = {
    ".pdf": PyMuPDFLoader,
    ".txt": TextLoader,
    ".md": TextLoader,  # Markdown is handled by the same loader as .txt
    ".docx": Docx2txtLoader,
    # Both common HTML suffixes share one loader.
    ".html": UnstructuredHTMLLoader,
    ".htm": UnstructuredHTMLLoader,
}
def load_document(file_path: str | Path) -> List[Document]:
    """
    Parse one file on disk into a list of LangChain Document objects.

    The loader class is looked up in LOADER_MAP by the file's lower-cased
    suffix. After loading, every document's ``source`` metadata entry is
    replaced with the bare filename (no directory components).

    Args:
        file_path: Absolute or relative path to the document.

    Returns:
        The Document objects produced by the loader's ``load()`` call, each
        with ``metadata["source"]`` set to the filename.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        ValueError: If the file's suffix has no entry in LOADER_MAP.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    suffix = path.suffix.lower()
    loader_cls = LOADER_MAP.get(suffix)
    if loader_cls is None:
        known_types = ", ".join(LOADER_MAP)
        raise ValueError(
            f"Unsupported file type '{suffix}'. Supported types: {known_types}"
        )

    filename = path.name
    logger.info("Loading %s with %s", filename, loader_cls.__name__)
    loaded = loader_cls(str(path)).load()

    # Assign rather than setdefault(): the loaders already fill in "source"
    # with a full path, so setdefault() would leave that path in place.
    # Unconditional assignment normalises it to just the filename.
    for item in loaded:
        item.metadata["source"] = filename

    logger.info("Loaded %d page(s) from %s", len(loaded), filename)
    return loaded
def load_documents_from_directory(directory: str | Path) -> List[Document]:
    """
    Recursively load all supported documents from a directory.

    Every entry under ``directory`` whose lower-cased suffix is a key of
    LOADER_MAP is passed to ``load_document``. Entries are visited in sorted
    path order. Loading is best-effort: a file that fails to load is logged
    as a warning and skipped, and the remaining files are still processed.

    Args:
        directory: Path to the folder containing source documents.

    Returns:
        Flat list of Document objects from all files that loaded
        successfully (empty if none did).

    Raises:
        NotADirectoryError: If ``directory`` is not an existing directory.
    """
    directory = Path(directory)
    if not directory.is_dir():
        raise NotADirectoryError(f"Not a directory: {directory}")

    all_docs: List[Document] = []
    supported_exts = set(LOADER_MAP)

    for file_path in sorted(directory.rglob("*")):
        if file_path.suffix.lower() not in supported_exts:
            continue
        # rglob("*") yields directories as well as files. A directory whose
        # name happens to end in a supported suffix (e.g. "archive.pdf/")
        # must not be handed to a loader; its contents are still visited
        # because rglob descends into it on its own.
        if file_path.is_dir():
            continue
        try:
            all_docs.extend(load_document(file_path))
        except Exception as exc:
            # Deliberately broad: one unreadable or corrupt file should not
            # abort the whole directory scan.
            logger.warning("Skipping %s — %s", file_path.name, exc)

    logger.info(
        "Loaded %d document chunk(s) from directory '%s'",
        len(all_docs),
        directory,
    )
    return all_docs