# rag-lecture-saver-api/src/preprocessing/chunker_markdown.py
# Uploaded by makdadTaleb using huggingface_hub (commit 4e7e4c0, verified).
from typing import List, Dict
import re
from langchain_text_splitters import (
MarkdownHeaderTextSplitter,
RecursiveCharacterTextSplitter,
)
# --------------------------------------
# Settings
# --------------------------------------
# (heading marker, label) pairs handed to MarkdownHeaderTextSplitter.
HEADERS_TO_SPLIT_ON = [("#", "h1"), ("##", "h2"), ("###", "h3")]

# Separator list handed to RecursiveCharacterTextSplitter, in this order.
SEPARATORS = ["\n\n", "\n", " ", ""]

# Size-split pieces shorter than this many characters are discarded.
MIN_CHUNK_LENGTH = 40
# --------------------------------------
# Main API
# --------------------------------------
def chunk_document(
    document: Dict,
    chunk_size: int = 800,
    overlap: int = 100,
) -> List[str]:
    """
    Split one Markdown document (Docling output) into structure-aware chunks.

    The text is first cut at the headings listed in ``HEADERS_TO_SPLIT_ON``
    (headings are kept in the section text). Each section is then handled
    on its own:

    * noise sections (see ``_is_noise``) are dropped;
    * sections that look like a Markdown table are kept whole, without
      size-based splitting;
    * sections consisting of a single heading line are dropped;
    * everything else is split by size, and pieces shorter than
      ``MIN_CHUNK_LENGTH`` characters are discarded.

    Input:
        document = {
            "text": "... markdown ...",
            "metadata": {
                "source": "...",
                "page": int,
                "format": "markdown"
            }
        }
        A plain Markdown string is accepted as well.
        chunk_size: ``chunk_size`` given to the recursive splitter.
        overlap: ``chunk_overlap`` given to the recursive splitter.

    Output:
        List[str] (empty when the document has no text)
    """
    # The Markdown lives under the "text" key; the mapping itself cannot be
    # handed to the splitter.
    if isinstance(document, str):
        text = document
    else:
        text = document.get("text") or ""

    # Nothing to chunk: skip building the splitters altogether.
    if not text.strip():
        return []

    # 1) Header-based splitting
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=HEADERS_TO_SPLIT_ON,
        strip_headers=False,
    )
    header_sections = header_splitter.split_text(text)

    # 2) Recursive splitter (size-based)
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=SEPARATORS,
    )

    final_chunks: List[str] = []
    for section in header_sections:
        section_text = section.page_content.strip()

        # Noise filtering
        if _is_noise(section_text):
            continue

        # Table handling: a table is emitted as one chunk, never size-split.
        if _looks_like_markdown_table(section_text):
            final_chunks.append(section_text)
            continue

        # Header-only sections carry no body text, so they are skipped.
        if _is_header_only(section_text):
            continue

        # Size-based splitting; pieces that are too short are discarded.
        for piece in recursive_splitter.split_text(section_text):
            piece = piece.strip()
            if len(piece) >= MIN_CHUNK_LENGTH:
                final_chunks.append(piece)

    return final_chunks
# --------------------------------------
# Helpers
# --------------------------------------
def _looks_like_markdown_table(text: str) -> bool:
lines = text.splitlines()
if len(lines) < 2:
return False
has_pipes = any("|" in line for line in lines)
has_separator = any(
re.match(r"^\s*\|?[\s:-]+\|", line) for line in lines
)
return has_pipes and has_separator
def _is_header_only(text: str) -> bool:
"""
Detect chunks that are only headers (e.g. '## العنوان')
"""
lines = text.splitlines()
if len(lines) != 1:
return False
return lines[0].lstrip().startswith("#")
def _is_noise(text: str) -> bool:
"""
Remove garbage chunks: symbols, single letters, etc.
"""
stripped = text.strip()
if len(stripped) < 10:
return True
if re.fullmatch(r"[■S\s]+", stripped):
return True
return False