"""Document chunking with markdown-aware semantic splitting."""

import logging
import re
from typing import List

from haystack import Document
from haystack.components.converters import MarkdownToDocument
from haystack.components.preprocessors import DocumentSplitter

logger = logging.getLogger(__name__)

# Patterns are compiled once at import time; every document reuses them.
_H2_PATTERN = re.compile(r'^## (.+)$', re.MULTILINE)
_HEADER_PATTERN = re.compile(r'^#{1,6}\s+(.+)$', re.MULTILINE)
_BULLET_PATTERN = re.compile(r'^[ \t]*[*+-][ \t]+', re.MULTILINE)
_BOLD_PATTERN = re.compile(r'\*\*(.+?)\*\*')
_ITALIC_PATTERN = re.compile(r'\*(.+?)\*')
_LINK_PATTERN = re.compile(r'!?\[(.+?)\]\(.+?\)')

# Preambles shorter than this many words are dropped instead of becoming
# their own "Introduction" chunk.
_MIN_PREAMBLE_WORDS = 10


class SemanticChunker:
    """Chunks documents using markdown-aware semantic splitting.

    Each document is cut at its H2 (``## ``) headers and every resulting
    section is converted from markdown to plain text.
    """

    def __init__(
        self,
        chunk_size: int = 300,
        chunk_overlap: int = 50,
        min_chunk_size: int = 100,
    ):
        """
        Initialize the chunker.

        The three size arguments are stored on the instance and forwarded to
        the sentence-based fallback ``DocumentSplitter``; the header-based
        splitting done by ``chunk_documents`` does not use them.

        Args:
            chunk_size: Forwarded to the fallback splitter as ``split_length``.
            chunk_overlap: Forwarded to the fallback splitter as
                ``split_overlap``.
            min_chunk_size: Forwarded to the fallback splitter as
                ``split_threshold``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size

        # Fallback splitter for non-markdown documents. It is configured here
        # but ``chunk_documents`` does not invoke it.
        self.splitter = DocumentSplitter(
            split_by="sentence",
            split_length=chunk_size,
            split_overlap=chunk_overlap,
            split_threshold=min_chunk_size,
        )

        # Markdown converter component; kept as an attribute, the regex-based
        # ``_markdown_to_plain`` below is what the chunker actually uses.
        self.md_converter = MarkdownToDocument()

    def _markdown_to_plain(self, markdown_text: str) -> str:
        """Convert markdown to plain text, removing formatting.

        Strips ATX header markers (``#`` to ``######``), list bullets
        (``*``, ``-``, ``+``, optionally indented), bold/italic asterisks,
        and link/image syntax, keeping the visible text in every case.

        Args:
            markdown_text: Markdown source to clean.

        Returns:
            The text with the markup removed and surrounding whitespace
            stripped.
        """
        text = _HEADER_PATTERN.sub(r'\1', markdown_text)
        # Bullets must go before emphasis: otherwise the leading "*" of a
        # bullet pairs up with the first "*" of an emphasised word on the
        # same line and the italic pattern mangles the item.
        text = _BULLET_PATTERN.sub('', text)
        # Bold before italic so "**x**" is not consumed as two italics.
        text = _BOLD_PATTERN.sub(r'\1', text)
        text = _ITALIC_PATTERN.sub(r'\1', text)
        # Links and images: keep the label / alt text, drop the target.
        text = _LINK_PATTERN.sub(r'\1', text)
        return text.strip()

    def _split_by_markdown_headers(self, doc: Document) -> List[Document]:
        """Split a document by markdown H2 headers (##), then convert to plain text.

        Args:
            doc: Document whose ``content`` holds markdown text.

        Returns:
            One plain-text document per H2 section, tagged with a
            ``"section"`` meta entry, preceded by an ``"Introduction"`` chunk
            when the text before the first header has at least
            ``_MIN_PREAMBLE_WORDS`` words. If there are no H2 headers, a
            single document holding the whole converted text is returned.
        """
        # Content may be None (e.g. a non-text document); treat it as empty.
        content = doc.content or ""
        # Copy the metadata so annotating chunks later never writes into the
        # caller's original document.
        base_meta = dict(doc.meta or {})

        matches = list(_H2_PATTERN.finditer(content))
        if not matches:
            # No headers found, convert whole document to plain text
            plain_text = self._markdown_to_plain(content)
            return [Document(content=plain_text, meta=base_meta)]

        chunks = []
        doc_title = base_meta.get("file_name", "Unknown")

        # Preamble: everything before the first header.
        preamble_md = content[:matches[0].start()].strip()
        if preamble_md:
            preamble_plain = self._markdown_to_plain(preamble_md)
            if len(preamble_plain.split()) >= _MIN_PREAMBLE_WORDS:
                chunk_meta = {**base_meta, "section": "Introduction"}
                chunks.append(Document(content=preamble_plain, meta=chunk_meta))

        # Each section runs from its own header to the next header, or to the
        # end of the text for the last one.
        section_ends = [m.start() for m in matches[1:]] + [len(content)]
        for match, end in zip(matches, section_ends):
            header = match.group(1).strip()
            # The slice always contains the header line, so it is never empty.
            section_md = content[match.start():end].strip()
            section_plain = self._markdown_to_plain(section_md)
            logger.debug(
                "Section '%s': %d words", header, len(section_plain.split())
            )
            chunk_meta = {**base_meta, "section": header}
            chunks.append(Document(content=section_plain, meta=chunk_meta))

        logger.info(
            "Split '%s' into %d sections by markdown headers",
            doc_title,
            len(chunks),
        )
        return chunks

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """
        Chunk documents into smaller pieces using markdown-aware splitting.

        Every chunk receives a ``chunk_id`` (its position in the returned
        list) and a ``chunk_size`` (whitespace-separated word count) in its
        metadata. The input documents are not modified.

        Args:
            documents: List of documents to chunk

        Returns:
            List of chunked documents with metadata
        """
        if not documents:
            logger.warning("No documents to chunk")
            return []

        logger.info(
            "Chunking %d documents with markdown-aware splitting", len(documents)
        )

        # First, split by markdown headers
        all_chunks: List[Document] = []
        for doc in documents:
            all_chunks.extend(self._split_by_markdown_headers(doc))

        # Add chunk metadata
        chunk_sizes = []
        for idx, chunk in enumerate(all_chunks):
            word_count = len(chunk.content.split())
            chunk.meta["chunk_id"] = idx
            chunk.meta["chunk_size"] = word_count
            chunk_sizes.append(word_count)

        logger.info(
            "Created %d chunks from %d documents", len(all_chunks), len(documents)
        )

        # Log statistics
        if chunk_sizes:
            avg_size = sum(chunk_sizes) / len(chunk_sizes)
            logger.info(
                "Chunk statistics - Avg: %.1f words, Min: %d, Max: %d",
                avg_size,
                min(chunk_sizes),
                max(chunk_sizes),
            )

        return all_chunks