awellis's picture
Refactor document ingestion and processing; update configurations for chunking and retrieval, enhance error logging, and implement markdown-aware chunking
78a356b
"""Document loader for markdown files - loads RAW markdown to preserve headers."""
from pathlib import Path
from typing import List
from haystack import Document
import logging
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class MarkdownDocumentLoader:
    """Loads markdown documents from a directory, preserving markdown structure.

    File contents are read as-is and stored unmodified as the document
    content, so markdown headers remain in the text.
    """

    def __init__(self, documents_path: str):
        """
        Initialize the document loader.

        Args:
            documents_path: Path to directory containing markdown files
        """
        self.documents_path = Path(documents_path)

    def load_documents(self) -> List[Document]:
        """
        Load all markdown documents from the configured directory.

        Only files matching ``*.md`` directly inside the directory are
        considered (the glob is not recursive). Files are processed in sorted
        path order so the returned list is deterministic regardless of the
        order in which the filesystem enumerates entries. A file that cannot
        be loaded is logged and skipped; it does not abort the whole load.

        Returns:
            List of Haystack Document objects with raw markdown content. Each
            document's ``meta`` holds ``source_file`` (file name with
            extension), ``file_name`` (file name without extension) and
            ``file_path`` (the path as a string). Empty if no markdown files
            are found.

        Raises:
            FileNotFoundError: If the configured path does not exist.
        """
        if not self.documents_path.exists():
            raise FileNotFoundError(f"Documents path does not exist: {self.documents_path}")

        # Path.glob yields entries in an arbitrary, filesystem-dependent
        # order; sorting makes ingestion order reproducible.
        markdown_files = sorted(self.documents_path.glob("*.md"))

        if not markdown_files:
            logger.warning("No markdown files found in %s", self.documents_path)
            return []

        logger.info(
            "Loading %d markdown files from %s",
            len(markdown_files),
            self.documents_path,
        )

        documents: List[Document] = []
        for md_file in markdown_files:
            try:
                # Load RAW markdown content (preserving ## headers)
                content = md_file.read_text(encoding="utf-8")

                # Create Haystack Document with metadata
                doc = Document(
                    content=content,
                    meta={
                        "source_file": md_file.name,
                        "file_name": md_file.stem,
                        "file_path": str(md_file),
                    },
                )
            except Exception as e:
                # Deliberate best-effort: one unreadable or undecodable file
                # must not prevent the remaining files from loading. The
                # traceback is logged so the failure can still be diagnosed.
                logger.exception("Error loading %s: %s", md_file.name, e)
                continue

            documents.append(doc)
            logger.info("Loaded document: %s", md_file.name)

        logger.info("Successfully loaded %d documents", len(documents))
        return documents