Spaces:
Sleeping
Sleeping
File size: 3,494 Bytes
from typing import List, Dict
import re
from langchain_text_splitters import (
MarkdownHeaderTextSplitter,
RecursiveCharacterTextSplitter,
)
# --------------------------------------
# Settings
# --------------------------------------
# Markdown header markers paired with the metadata key each level is stored under.
HEADERS_TO_SPLIT_ON = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
]

# Separators handed to the recursive splitter, listed from coarsest
# (paragraph break) to finest (empty string).
SEPARATORS = [
    "\n\n",
    "\n",
    " ",
    "",
]

# Sub-chunks shorter than this many characters are discarded.
MIN_CHUNK_LENGTH = 40
# --------------------------------------
# Main API
# --------------------------------------
def chunk_document(
    document: Dict,
    chunk_size: int = 800,
    overlap: int = 100,
) -> List[str]:
    """
    Structure-aware chunking for Markdown (Docling output).

    The Markdown is first split on h1-h3 headers (header lines are kept in
    the section text), then every section is filtered and, unless it looks
    like a table, split further by size.

    Args:
        document: Either a mapping of the form::

                {
                    "text": "... markdown ...",
                    "metadata": {
                        "source": "...",
                        "page": int,
                        "format": "markdown"
                    }
                }

            of which only ``"text"`` is read here, or the raw Markdown
            string itself (accepted for backward compatibility with callers
            that pass the text directly).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The chunk texts in document order. Sections that look like Markdown
        tables are returned whole, regardless of ``chunk_size``.

    Raises:
        KeyError: If ``document`` is a dict without a ``"text"`` key.
        TypeError: If the resolved text is not a string.
    """
    # Resolve the Markdown text. Passing the whole dict on to the splitter
    # (as the previous `text = document` did) cannot work for the documented
    # dict input, so unwrap it here.
    if isinstance(document, dict):
        text = document["text"]
    else:
        text = document
    if not isinstance(text, str):
        raise TypeError(
            f"document text must be a str, got {type(text).__name__}"
        )
    if not text.strip():
        # Nothing to chunk.
        return []

    # 1) Header-based splitting
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=HEADERS_TO_SPLIT_ON,
        strip_headers=False,
    )
    header_sections = header_splitter.split_text(text)

    # 2) Recursive splitter (size-based)
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=SEPARATORS,
    )

    final_chunks: List[str] = []
    for section in header_sections:
        section_text = section.page_content.strip()

        # -----------------------------
        # Noise filtering
        # -----------------------------
        if _is_noise(section_text):
            continue

        # -----------------------------
        # Table handling: keep the section in one piece so table rows are
        # never separated from their header/delimiter rows.
        # -----------------------------
        if _looks_like_markdown_table(section_text):
            final_chunks.append(section_text)
            continue

        # -----------------------------
        # Skip header-only sections (a heading with no body text)
        # -----------------------------
        if _is_header_only(section_text):
            continue

        # -----------------------------
        # Size-based splitting
        # -----------------------------
        for sub in recursive_splitter.split_text(section_text):
            sub = sub.strip()
            # Drop fragments too short to be useful on their own.
            if len(sub) < MIN_CHUNK_LENGTH:
                continue
            final_chunks.append(sub)

    return final_chunks
# --------------------------------------
# Helpers
# --------------------------------------
def _looks_like_markdown_table(text: str) -> bool:
lines = text.splitlines()
if len(lines) < 2:
return False
has_pipes = any("|" in line for line in lines)
has_separator = any(
re.match(r"^\s*\|?[\s:-]+\|", line) for line in lines
)
return has_pipes and has_separator
def _is_header_only(text: str) -> bool:
"""
Detect chunks that are only headers (e.g. '## العنوان')
"""
lines = text.splitlines()
if len(lines) != 1:
return False
return lines[0].lstrip().startswith("#")
def _is_noise(text: str) -> bool:
"""
Remove garbage chunks: symbols, single letters, etc.
"""
stripped = text.strip()
if len(stripped) < 10:
return True
if re.fullmatch(r"[■S\s]+", stripped):
return True
return False
|