"""Document chunking with markdown-aware semantic splitting."""
from typing import List
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.converters import MarkdownToDocument
import logging
import re
logger = logging.getLogger(__name__)
class SemanticChunker:
    """Chunks documents using markdown-aware semantic splitting.

    Documents are split on H2 (``##``) headers when present; each section is
    converted to plain text and tagged with a ``section`` metadata key.
    Documents without H2 headers become a single plain-text chunk.
    """

    # Pre-compiled patterns for the lightweight markdown -> plain-text pass.
    _HEADER_RE = re.compile(r'^#{1,6}\s+(.+)$', re.MULTILINE)  # any ATX header level
    _BOLD_RE = re.compile(r'\*\*(.+?)\*\*')
    _ITALIC_RE = re.compile(r'\*(.+?)\*')
    _LINK_RE = re.compile(r'\[(.+?)\]\(.+?\)')
    _BULLET_RE = re.compile(r'^[*+-]\s+', re.MULTILINE)  # *, - and + bullets
    # H2 headers delimit the semantic sections.
    _H2_RE = re.compile(r'^## (.+)$', re.MULTILINE)

    def __init__(
        self,
        chunk_size: int = 300,
        chunk_overlap: int = 50,
        min_chunk_size: int = 100,
        min_preamble_words: int = 10,
    ):
        """
        Initialize the chunker.

        Args:
            chunk_size: Split length for the fallback splitter. NOTE: with
                split_by="sentence" this counts sentences, not words; it is
                not used by the markdown header path.
            chunk_overlap: Overlap between chunks for the fallback splitter.
            min_chunk_size: Minimum split threshold for the fallback splitter.
            min_preamble_words: Minimum word count for pre-header text to be
                kept as an "Introduction" chunk.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.min_preamble_words = min_preamble_words
        # Fallback splitter for non-markdown documents; retained for
        # compatibility but not invoked by chunk_documents below.
        self.splitter = DocumentSplitter(
            split_by="sentence",
            split_length=chunk_size,
            split_overlap=chunk_overlap,
            split_threshold=min_chunk_size,
        )
        # Markdown converter; retained for compatibility — the current
        # pipeline uses the regex conversion in _markdown_to_plain instead.
        self.md_converter = MarkdownToDocument()

    def _markdown_to_plain(self, markdown_text: str) -> str:
        """Convert markdown to plain text, removing common formatting.

        Strips ATX headers of any level (# through ######), bold/italic
        emphasis, inline links (keeping the link text), and bullet markers.
        """
        text = markdown_text
        # Drop header markers of any level but keep the heading text.
        text = self._HEADER_RE.sub(r'\1', text)
        # Bold before italic so ** is not half-consumed by the single-* rule.
        text = self._BOLD_RE.sub(r'\1', text)
        text = self._ITALIC_RE.sub(r'\1', text)
        # Replace [text](url) links with just the link text.
        text = self._LINK_RE.sub(r'\1', text)
        # Drop leading bullet markers (*, -, +).
        text = self._BULLET_RE.sub('', text)
        return text.strip()

    def _split_by_markdown_headers(self, doc: Document) -> List[Document]:
        """Split a document by markdown H2 headers (##), then convert to plain text.

        Returns one Document per section — plus an "Introduction" chunk for a
        sufficiently long preamble — each carrying the source metadata with an
        added "section" key. Falls back to a single whole-document chunk when
        no H2 headers are found.
        """
        content = doc.content or ""  # guard against documents with no content
        matches = list(self._H2_RE.finditer(content))
        if not matches:
            # No headers found: convert the whole document to plain text.
            plain_text = self._markdown_to_plain(content)
            return [Document(content=plain_text, meta=doc.meta)]

        chunks: List[Document] = []
        source_meta = doc.meta or {}  # guard against missing metadata
        doc_title = source_meta.get("file_name", "Unknown")

        # Text before the first header becomes an "Introduction" chunk if it
        # is long enough to be meaningful.
        preamble_md = content[:matches[0].start()].strip()
        if preamble_md:
            preamble_plain = self._markdown_to_plain(preamble_md)
            if len(preamble_plain.split()) >= self.min_preamble_words:
                chunk_meta = {**source_meta, "section": "Introduction"}
                chunks.append(Document(content=preamble_plain, meta=chunk_meta))

        # Each section runs from its header to the next header (or EOF).
        for i, match in enumerate(matches):
            header = match.group(1).strip()
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            section_md = content[start:end].strip()
            if section_md:
                section_plain = self._markdown_to_plain(section_md)
                logger.debug("Section '%s': %d words", header, len(section_plain.split()))
                chunk_meta = {**source_meta, "section": header}
                chunks.append(Document(content=section_plain, meta=chunk_meta))

        logger.info("Split '%s' into %d sections by markdown headers", doc_title, len(chunks))
        return chunks

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """
        Chunk documents into smaller pieces using markdown-aware splitting.

        Args:
            documents: List of documents to chunk

        Returns:
            List of chunked documents; each chunk's meta gains "chunk_id"
            (sequential index across all chunks) and "chunk_size" (word count).
        """
        if not documents:
            logger.warning("No documents to chunk")
            return []

        logger.info("Chunking %d documents with markdown-aware splitting", len(documents))

        # First, split every document by its markdown headers.
        all_chunks: List[Document] = []
        for doc in documents:
            all_chunks.extend(self._split_by_markdown_headers(doc))

        # Annotate each chunk with its position and size.
        for idx, chunk in enumerate(all_chunks):
            if chunk.meta is None:
                chunk.meta = {}
            chunk.meta["chunk_id"] = idx
            chunk.meta["chunk_size"] = len(chunk.content.split())

        logger.info("Created %d chunks from %d documents", len(all_chunks), len(documents))

        # Log size statistics for observability.
        chunk_sizes = [chunk.meta.get("chunk_size", 0) for chunk in all_chunks]
        if chunk_sizes:
            avg_size = sum(chunk_sizes) / len(chunk_sizes)
            logger.info(
                "Chunk statistics - Avg: %.1f words, Min: %d, Max: %d",
                avg_size,
                min(chunk_sizes),
                max(chunk_sizes),
            )
        return all_chunks