Spaces:

makdadTaleb
/

rag-lecture-saver-api

Sleeping

App Files Files Community

rag-lecture-saver-api / src /preprocessing /chunker_markdown.py

makdadTaleb

Upload folder using huggingface_hub

4e7e4c0 verified about 2 months ago

raw

history blame contribute delete

3.49 kB

	from typing import List, Dict
	import re

	from langchain_text_splitters import (
	MarkdownHeaderTextSplitter,
	RecursiveCharacterTextSplitter,
	)

	# --------------------------------------
	# Settings
	# --------------------------------------
	# Heading markers handed to MarkdownHeaderTextSplitter, each paired with a
	# short label (presumably the metadata key the splitter stores the heading
	# under -- confirm against the langchain documentation).
	HEADERS_TO_SPLIT_ON = [("#", "h1"), ("##", "h2"), ("###", "h3")]

	# Separators handed to RecursiveCharacterTextSplitter, listed from the
	# coarsest boundary (blank line) down to the finest (empty string).
	SEPARATORS = [
	    "\n\n",
	    "\n",
	    " ",
	    "",
	]

	# chunk_document discards size-split pieces shorter than this many
	# characters (measured after stripping surrounding whitespace).
	MIN_CHUNK_LENGTH = 40


	# --------------------------------------
	# Main API
	# --------------------------------------
	def chunk_document(
	    document: Dict,
	    chunk_size: int = 800,
	    overlap: int = 100,
	) -> List[str]:
	    """
	    Split a Markdown document into structure-aware text chunks.

	    The Markdown is first split on level 1-3 headings, then every section
	    that is too long is split again by size.

	    Input:
	        document = {
	            "text": "... markdown ...",
	            "metadata": {
	                "source": "...",
	                "page": int,
	                "format": "markdown"
	            }
	        }
	        Only the "text" entry is read here; "metadata" is not used.
	        A plain Markdown string is accepted as well.

	        chunk_size / overlap are forwarded to the size-based splitter as
	        its chunk size and chunk overlap.

	    Output:
	        List[str] of chunks in document order. Sections that look like
	        Markdown tables are returned whole and may therefore exceed
	        chunk_size. An empty or missing text yields an empty list.

	    Raises:
	        KeyError if a dict is passed without a "text" entry.
	    """
	    # The documented input is a dict carrying the Markdown under "text".
	    # Handing the dict itself to the splitter (as the code used to do) is
	    # not what the docstring promises; a bare string is still accepted so
	    # callers that already pass raw Markdown keep working.
	    text = document["text"] if isinstance(document, dict) else document
	    if not text:
	        return []

	    # 1) Header-based splitting (headings stay inside the section text)
	    header_splitter = MarkdownHeaderTextSplitter(
	        headers_to_split_on=HEADERS_TO_SPLIT_ON,
	        strip_headers=False,
	    )
	    header_sections = header_splitter.split_text(text)

	    # 2) Recursive splitter (size-based)
	    recursive_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=overlap,
	        separators=SEPARATORS,
	    )

	    final_chunks: List[str] = []

	    for section in header_sections:
	        section_text = section.page_content.strip()

	        # Noise filtering: drop very short or symbol-only sections.
	        if _is_noise(section_text):
	            continue

	        # Table handling: keep the section in one piece so table rows are
	        # never separated from their header row.
	        if _looks_like_markdown_table(section_text):
	            final_chunks.append(section_text)
	            continue

	        # Header-only sections carry no body text; they are skipped
	        # (they are NOT merged into the following section).
	        if _is_header_only(section_text):
	            continue

	        # Size-based splitting; fragments below the minimum length are
	        # discarded.
	        for sub in recursive_splitter.split_text(section_text):
	            sub = sub.strip()
	            if len(sub) >= MIN_CHUNK_LENGTH:
	                final_chunks.append(sub)

	    return final_chunks


	# --------------------------------------
	# Helpers
	# --------------------------------------
	def _looks_like_markdown_table(text: str) -> bool:
	lines = text.splitlines()
	if len(lines) < 2:
	return False

	has_pipes = any("\|" in line for line in lines)
	has_separator = any(
	re.match(r"^\s*\\|?[\s:-]+\\|", line) for line in lines
	)

	return has_pipes and has_separator


	def _is_header_only(text: str) -> bool:
	"""
	Detect chunks that are only headers (e.g. '## العنوان')
	"""
	lines = text.splitlines()
	if len(lines) != 1:
	return False

	return lines[0].lstrip().startswith("#")


	def _is_noise(text: str) -> bool:
	"""
	Remove garbage chunks: symbols, single letters, etc.
	"""
	stripped = text.strip()

	if len(stripped) < 10:
	return True

	if re.fullmatch(r"[■S\s]+", stripped):
	return True

	return False