"""Document chunking with markdown-aware semantic splitting."""

import logging
import re
from typing import List

from haystack import Document
from haystack.components.converters import MarkdownToDocument
from haystack.components.preprocessors import DocumentSplitter

logger = logging.getLogger(__name__)
class SemanticChunker:
    """Chunks documents using markdown-aware semantic splitting.

    Documents are split at H2 (``##``) markdown headers into one chunk per
    section, and each section is converted to plain text (headers, bold,
    italics, links, and bullets stripped). A sentence-based
    ``DocumentSplitter`` is also constructed as a fallback for non-markdown
    documents.

    NOTE(review): the fallback ``self.splitter`` is never invoked by the
    methods in this class as written — ``chunk_documents`` always uses the
    header-based path. Kept for interface compatibility; confirm whether any
    external caller uses it before removing.
    """

    # Patterns compiled once at class-definition time instead of on every
    # call to _markdown_to_plain / _split_by_markdown_headers.
    _H2_SECTION_RE = re.compile(r'^## (.+)$', re.MULTILINE)     # section boundaries
    _H2_STRIP_RE = re.compile(r'^##\s+(.+)$', re.MULTILINE)     # drop '##', keep text
    _BOLD_RE = re.compile(r'\*\*(.+?)\*\*')
    _ITALIC_RE = re.compile(r'\*(.+?)\*')
    _LINK_RE = re.compile(r'\[(.+?)\]\(.+?\)')                  # [text](url) -> text
    _BULLET_RE = re.compile(r'^\*\s+', re.MULTILINE)            # leading '* ' bullets

    def __init__(
        self,
        chunk_size: int = 300,
        chunk_overlap: int = 50,
        min_chunk_size: int = 100,
        min_preamble_words: int = 10,
    ):
        """
        Initialize the chunker.

        Args:
            chunk_size: Number of sentence units per chunk for the fallback
                splitter (not used for markdown header splitting).
            chunk_overlap: Number of units to overlap between chunks
                (fallback splitter only).
            min_chunk_size: Minimum number of units per chunk
                (fallback splitter only).
            min_preamble_words: Minimum word count for the pre-header
                preamble to be emitted as its own "Introduction" chunk.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.min_preamble_words = min_preamble_words
        # Fallback splitter for non-markdown documents (see class NOTE:
        # currently unused by the methods below).
        self.splitter = DocumentSplitter(
            split_by="sentence",
            split_length=chunk_size,
            split_overlap=chunk_overlap,
            split_threshold=min_chunk_size,
        )
        # Markdown to plain text converter (currently unused by the methods
        # below; kept for interface compatibility).
        self.md_converter = MarkdownToDocument()

    def _markdown_to_plain(self, markdown_text: str) -> str:
        """Convert markdown to plain text, removing formatting.

        Strips '##' headers (keeping the header text), bold/italic markers,
        link syntax (keeping the link text), and leading bullet markers.
        """
        text = markdown_text
        # Remove ## headers but keep the text.
        text = self._H2_STRIP_RE.sub(r'\1', text)
        # Remove bold before italic so '**x**' is not half-consumed by the
        # single-asterisk pattern.
        text = self._BOLD_RE.sub(r'\1', text)
        text = self._ITALIC_RE.sub(r'\1', text)
        # Remove links but keep the anchor text.
        text = self._LINK_RE.sub(r'\1', text)
        # Remove bullet points.
        text = self._BULLET_RE.sub('', text)
        return text.strip()

    def _split_by_markdown_headers(self, doc: Document) -> List[Document]:
        """Split a document by markdown H2 headers (##), then convert to plain text.

        Returns one Document per section, each carrying the parent metadata
        plus a "section" key (header text, or "Introduction" for a
        sufficiently long pre-header preamble).
        """
        # Haystack Document.content may be None; treat it as empty.
        content = doc.content or ""
        matches = list(self._H2_SECTION_RE.finditer(content))
        if not matches:
            # No headers found: convert the whole document to plain text.
            plain_text = self._markdown_to_plain(content)
            return [Document(content=plain_text, meta=doc.meta)]

        chunks = []
        doc_title = doc.meta.get("file_name", "Unknown")

        # Extract preamble (text before the first header), but only keep it
        # if it is long enough to be a meaningful chunk.
        if matches[0].start() > 0:
            preamble_md = content[:matches[0].start()].strip()
            if preamble_md:
                preamble_plain = self._markdown_to_plain(preamble_md)
                if len(preamble_plain.split()) >= self.min_preamble_words:
                    chunk_meta = {**doc.meta, "section": "Introduction"}
                    chunks.append(Document(content=preamble_plain, meta=chunk_meta))

        # Extract each section: from one header up to the next (or EOF).
        for i, match in enumerate(matches):
            header = match.group(1).strip()
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            section_md = content[start:end].strip()
            if section_md:
                section_plain = self._markdown_to_plain(section_md)
                logger.debug(
                    "Section '%s': %d words", header, len(section_plain.split())
                )
                chunk_meta = {**doc.meta, "section": header}
                chunks.append(Document(content=section_plain, meta=chunk_meta))

        logger.info(
            "Split '%s' into %d sections by markdown headers", doc_title, len(chunks)
        )
        return chunks

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """
        Chunk documents into smaller pieces using markdown-aware splitting.

        Args:
            documents: List of documents to chunk.

        Returns:
            List of chunked documents; each carries "chunk_id" (sequential
            index across all chunks) and "chunk_size" (word count) metadata.
        """
        if not documents:
            logger.warning("No documents to chunk")
            return []

        logger.info(
            "Chunking %d documents with markdown-aware splitting", len(documents)
        )

        # Split each document by its markdown headers.
        all_chunks = []
        for doc in documents:
            all_chunks.extend(self._split_by_markdown_headers(doc))

        # Attach chunk metadata (sequential id and word count).
        for idx, chunk in enumerate(all_chunks):
            if chunk.meta is None:
                chunk.meta = {}
            chunk.meta["chunk_id"] = idx
            chunk.meta["chunk_size"] = len(chunk.content.split())

        logger.info(
            "Created %d chunks from %d documents", len(all_chunks), len(documents)
        )

        # Log size statistics to help tune chunking parameters.
        chunk_sizes = [chunk.meta.get("chunk_size", 0) for chunk in all_chunks]
        if chunk_sizes:
            avg_size = sum(chunk_sizes) / len(chunk_sizes)
            logger.info(
                "Chunk statistics - Avg: %.1f words, Min: %d, Max: %d",
                avg_size, min(chunk_sizes), max(chunk_sizes),
            )
        return all_chunks