File size: 3,494 Bytes
4e7e4c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from typing import List, Dict
import re

from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

# --------------------------------------
# Settings
# --------------------------------------
# (marker, label) pairs passed as `headers_to_split_on` to the
# MarkdownHeaderTextSplitter built in chunk_document.
HEADERS_TO_SPLIT_ON = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
]

# Separator list passed as `separators` to the RecursiveCharacterTextSplitter
# built in chunk_document.
SEPARATORS = ["\n\n", "\n", " ", ""]

# chunk_document drops size-split sub-chunks whose stripped length is below
# this many characters.
MIN_CHUNK_LENGTH = 40


# --------------------------------------
# Main API 
# --------------------------------------
def chunk_document(
    document: Dict,
    chunk_size: int = 800,
    overlap: int = 100,
) -> List[str]:
    """
    Split a Markdown document (Docling output) into structure-aware chunks.

    The text is first split on the Markdown headers listed in
    HEADERS_TO_SPLIT_ON; each resulting section is then filtered and, unless
    it looks like a table, split again by size.

    Args:
        document: The document to chunk. Expected shape:

            {
                "text": "... markdown ...",
                "metadata": {"source": "...", "page": int, "format": "markdown"},
            }

            Only the "text" entry is read here. A plain Markdown string is
            also accepted so callers that pass raw text keep working.
        chunk_size: Passed as `chunk_size` to RecursiveCharacterTextSplitter.
        overlap: Passed as `chunk_overlap` to RecursiveCharacterTextSplitter.

    Returns:
        The chunk strings, in the order the splitters produce them. Returns
        an empty list when the document has no text.
    """
    # The splitter needs the Markdown string itself, not the wrapping dict.
    if isinstance(document, str):
        text = document
    else:
        text = document.get("text") or ""

    if not text.strip():
        return []

    # 1) Header-based splitting (headers are kept inside each section's text)
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=HEADERS_TO_SPLIT_ON,
        strip_headers=False,
    )
    header_sections = header_splitter.split_text(text)

    # 2) Recursive splitter (size-based)
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=SEPARATORS,
    )

    final_chunks: List[str] = []

    for section in header_sections:
        section_text = section.page_content.strip()

        # Noise filtering: drop very short or symbol-only sections.
        if _is_noise(section_text):
            continue

        # Table handling: a section that looks like a table is kept whole,
        # bypassing both the size split and the minimum-length filter.
        if _looks_like_markdown_table(section_text):
            final_chunks.append(section_text)
            continue

        # Header-only sections carry no body text; they are skipped
        # (not merged into a neighbouring chunk).
        if _is_header_only(section_text):
            continue

        # Size-based splitting; fragments below MIN_CHUNK_LENGTH are dropped.
        for sub in recursive_splitter.split_text(section_text):
            sub = sub.strip()
            if len(sub) >= MIN_CHUNK_LENGTH:
                final_chunks.append(sub)

    return final_chunks


# --------------------------------------
# Helpers
# --------------------------------------
def _looks_like_markdown_table(text: str) -> bool:
    lines = text.splitlines()
    if len(lines) < 2:
        return False

    has_pipes = any("|" in line for line in lines)
    has_separator = any(
        re.match(r"^\s*\|?[\s:-]+\|", line) for line in lines
    )

    return has_pipes and has_separator


def _is_header_only(text: str) -> bool:
    """

    Detect chunks that are only headers (e.g. '## العنوان')

    """
    lines = text.splitlines()
    if len(lines) != 1:
        return False

    return lines[0].lstrip().startswith("#")


def _is_noise(text: str) -> bool:
    """

    Remove garbage chunks: symbols, single letters, etc.

    """
    stripped = text.strip()

    if len(stripped) < 10:
        return True

    if re.fullmatch(r"[■S\s]+", stripped):
        return True

    return False