"""Document loader for markdown files - loads RAW markdown to preserve headers.""" from pathlib import Path from typing import List from haystack import Document import logging logger = logging.getLogger(__name__) class MarkdownDocumentLoader: """Loads markdown documents from a directory, preserving markdown structure.""" def __init__(self, documents_path: str): """ Initialize the document loader. Args: documents_path: Path to directory containing markdown files """ self.documents_path = Path(documents_path) def load_documents(self) -> List[Document]: """ Load all markdown documents from the configured directory. Loads RAW markdown content to preserve headers for semantic chunking. Returns: List of Haystack Document objects with raw markdown content """ if not self.documents_path.exists(): raise FileNotFoundError(f"Documents path does not exist: {self.documents_path}") documents = [] markdown_files = list(self.documents_path.glob("*.md")) if not markdown_files: logger.warning(f"No markdown files found in {self.documents_path}") return documents logger.info(f"Loading {len(markdown_files)} markdown files from {self.documents_path}") for md_file in markdown_files: try: # Load RAW markdown content (preserving ## headers) content = md_file.read_text(encoding='utf-8') # Create Haystack Document with metadata doc = Document( content=content, meta={ "source_file": md_file.name, "file_name": md_file.stem, "file_path": str(md_file), } ) documents.append(doc) logger.info(f"Loaded document: {md_file.name}") except Exception as e: logger.error(f"Error loading {md_file.name}: {e}") continue logger.info(f"Successfully loaded {len(documents)} documents") return documents