Spaces:

makdadTaleb
/

rag-lecture-saver-api

Sleeping

File size: 3,494 Bytes

4e7e4c0

from typing import List, Dict
import re

from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

# --------------------------------------
# Settings
# --------------------------------------
# Markdown heading markers (levels 1-3), each paired with the label passed
# to the header splitter for that level.
HEADERS_TO_SPLIT_ON = [("#", "h1"), ("##", "h2"), ("###", "h3")]

# Separators handed to the recursive splitter: blank line, newline, space,
# then the empty string.
SEPARATORS = ["\n\n", "\n", " ", ""]

# Size-split pieces shorter than this many characters are discarded.
MIN_CHUNK_LENGTH = 40


# --------------------------------------
# Main API 
# --------------------------------------
def chunk_document(
    document: Dict,
    chunk_size: int = 800,
    overlap: int = 100,
) -> List[str]:
    """
    Split a Markdown document into structure-aware text chunks.

    The text is first cut at Markdown headers (the levels listed in
    ``HEADERS_TO_SPLIT_ON``, header lines kept in the content). Each
    resulting section is then handled as follows:

    * sections flagged by ``_is_noise`` are dropped;
    * sections that look like a Markdown table are kept whole, without
      size-based splitting;
    * sections consisting of a single header line are dropped;
    * every other section is split by size, and pieces shorter than
      ``MIN_CHUNK_LENGTH`` characters are discarded.

    Args:
        document: Mapping holding the Markdown under the ``"text"`` key,
            e.g. ``{"text": "...", "metadata": {"source": "...",
            "page": 1, "format": "markdown"}}``. Only ``"text"`` is read
            here. A plain Markdown string is also accepted so existing
            callers that pass the text directly keep working.
        chunk_size: ``chunk_size`` given to the recursive splitter.
        overlap: ``chunk_overlap`` given to the recursive splitter.

    Returns:
        The chunk texts, in the order the splitters produce them.

    Raises:
        KeyError: If ``document`` is a mapping without a ``"text"`` key.
    """
    # The documented input is a dict, but the splitter needs the Markdown
    # string itself; passing the dict through would fail inside split_text.
    text = document if isinstance(document, str) else document["text"]
    if not text:
        return []

    # 1) Header-based splitting (headers stay inside the section text)
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=HEADERS_TO_SPLIT_ON,
        strip_headers=False,
    )
    header_sections = header_splitter.split_text(text)

    # 2) Recursive splitter (size-based)
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=SEPARATORS,
    )

    final_chunks: List[str] = []

    for section in header_sections:
        section_text = section.page_content.strip()

        # Noise filtering: very short or symbol-only sections carry no content.
        if _is_noise(section_text):
            continue

        # Table handling: keep tables in one piece so rows are never
        # separated from their header row.
        if _looks_like_markdown_table(section_text):
            final_chunks.append(section_text)
            continue

        # Header-only sections are skipped (not merged): a lone heading
        # line has no body text worth indexing.
        if _is_header_only(section_text):
            continue

        # Size-based splitting; fragments below the minimum length are dropped.
        pieces = (
            piece.strip()
            for piece in recursive_splitter.split_text(section_text)
        )
        final_chunks.extend(
            piece for piece in pieces if len(piece) >= MIN_CHUNK_LENGTH
        )

    return final_chunks


# --------------------------------------
# Helpers
# --------------------------------------
def _looks_like_markdown_table(text: str) -> bool:
    lines = text.splitlines()
    if len(lines) < 2:
        return False

    has_pipes = any("|" in line for line in lines)
    has_separator = any(
        re.match(r"^\s*\|?[\s:-]+\|", line) for line in lines
    )

    return has_pipes and has_separator


def _is_header_only(text: str) -> bool:
    """

    Detect chunks that are only headers (e.g. '## العنوان')

    """
    lines = text.splitlines()
    if len(lines) != 1:
        return False

    return lines[0].lstrip().startswith("#")


def _is_noise(text: str) -> bool:
    """

    Remove garbage chunks: symbols, single letters, etc.

    """
    stripped = text.strip()

    if len(stripped) < 10:
        return True

    if re.fullmatch(r"[■S\s]+", stripped):
        return True

    return False