Spaces:

awellis
/

bfh-studadmin-assist

Running on CPU Upgrade

bfh-studadmin-assist / src / document_processing / loader.py

Refactor document ingestion and processing; update configurations for chunking and retrieval, enhance error logging, and implement markdown-aware chunking

78a356b 5 months ago

raw

history blame contribute delete

2.22 kB

	"""Document loader for markdown files - loads RAW markdown to preserve headers."""

	from pathlib import Path
	from typing import List
	from haystack import Document
	import logging

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	class MarkdownDocumentLoader:
	    """Load the markdown files of one directory as raw-text Haystack Documents.

	    File content is passed through unmodified (apart from dropping a leading
	    UTF-8 byte-order mark), so markdown headers stay in the text for
	    downstream header-aware chunking.
	    """

	    def __init__(self, documents_path: str) -> None:
	        """
	        Initialize the document loader.

	        Args:
	            documents_path: Path to directory containing markdown files
	        """
	        self.documents_path = Path(documents_path)

	    def load_documents(self) -> List[Document]:
	        """
	        Load every ``*.md`` file directly inside the configured directory.

	        Only the top level of the directory is searched (non-recursive
	        ``*.md`` glob). Files are processed in sorted path order so the
	        returned list is the same on every run and every filesystem.

	        A file that cannot be read or converted is logged and skipped; it
	        does not abort the whole load.

	        Returns:
	            List of Haystack Document objects with raw markdown content.
	            Each document's ``meta`` carries ``source_file`` (file name),
	            ``file_name`` (name without suffix) and ``file_path`` (path as
	            a string). The list is empty when no markdown file is found.

	        Raises:
	            FileNotFoundError: If the configured path does not exist.
	        """
	        if not self.documents_path.exists():
	            raise FileNotFoundError(f"Documents path does not exist: {self.documents_path}")

	        documents: List[Document] = []
	        # glob() yields entries in filesystem order, which is not stable;
	        # sort so ingestion order is reproducible.
	        markdown_files = sorted(self.documents_path.glob("*.md"))

	        if not markdown_files:
	            logger.warning("No markdown files found in %s", self.documents_path)
	            return documents

	        logger.info(
	            "Loading %d markdown files from %s",
	            len(markdown_files),
	            self.documents_path,
	        )

	        for md_file in markdown_files:
	            try:
	                # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but
	                # drops a leading BOM, which would otherwise sit in front of
	                # a first-line header and hide it from header detection.
	                content = md_file.read_text(encoding="utf-8-sig")

	                # Create Haystack Document with metadata
	                doc = Document(
	                    content=content,
	                    meta={
	                        "source_file": md_file.name,
	                        "file_name": md_file.stem,
	                        "file_path": str(md_file),
	                    },
	                )
	            except Exception as e:
	                # Deliberate best-effort: one bad file must not stop the
	                # remaining files from loading.
	                logger.error("Error loading %s: %s", md_file.name, e)
	            else:
	                documents.append(doc)
	                logger.info("Loaded document: %s", md_file.name)

	        logger.info("Successfully loaded %d documents", len(documents))
	        return documents