Spaces:

bilalRHCH
/

arabic-tts-api

Sleeping

arabic-tts-api / text_preprocessor.py

AI Assistant

Async updates for long text book generation feature

5b28f25 about 1 month ago

2.29 kB

	import re

	def chunk_text(text: str, max_chunk_length: int = 300) -> list[str]:
	    """Split *text* into chunks of at most *max_chunk_length* characters.

	    The text is first divided into paragraphs on newlines. A paragraph that
	    fits within the limit becomes one chunk. An oversized paragraph is broken
	    at progressively finer boundaries: sentence terminators (``.``, ``!``,
	    ``?``, ``؟``), then commas (``,``, ``،``), then plain whitespace. At each
	    level, adjacent pieces are re-joined with a single space for as long as
	    the joined result stays within the limit.

	    Words are never cut, so a single word longer than *max_chunk_length* is
	    returned as one oversized chunk.

	    Args:
	        text: The text to split. A falsy value yields an empty list.
	        max_chunk_length: Maximum number of characters per chunk.

	    Returns:
	        The non-empty chunks in their original order, each without leading
	        or trailing whitespace.
	    """
	    if not text:
	        return []

	    # Ordered from coarsest to finest boundary. The look-behinds keep the
	    # punctuation mark attached to the piece that precedes the split.
	    delimiter_tiers = (
	        r'(?<=[.!?؟])\s+',  # sentence terminators
	        r'(?<=[,،])\s+',    # commas
	        r'\s+',             # word boundaries
	    )
	    last_tier = len(delimiter_tiers) - 1

	    def _pack(pieces):
	        """Greedily join consecutive pieces with a space, within the limit."""
	        packed = []
	        current = ""
	        for piece in pieces:
	            piece = piece.strip()
	            if not piece:
	                continue
	            # The "+ 1" accounts for the joining space.
	            if current and len(current) + 1 + len(piece) <= max_chunk_length:
	                current = f"{current} {piece}"
	            else:
	                if current:
	                    packed.append(current)
	                current = piece
	        if current:
	            packed.append(current)
	        return packed

	    def _split(segment, tier):
	        """Split *segment* at *tier*, descending a tier for oversized pieces."""
	        result = []
	        for piece in _pack(re.split(delimiter_tiers[tier], segment)):
	            if len(piece) <= max_chunk_length or tier == last_tier:
	                # Either it fits, or it is a single over-long word that
	                # must stay intact.
	                result.append(piece)
	            else:
	                result.extend(_split(piece, tier + 1))
	        return result

	    chunks = []
	    for raw_paragraph in re.split(r'\n+', text):
	        # Strip before measuring so stray spaces or a "\r" left over from
	        # CRLF line endings never end up inside a chunk.
	        paragraph = raw_paragraph.strip()
	        if not paragraph:
	            continue
	        if len(paragraph) <= max_chunk_length:
	            chunks.append(paragraph)
	        else:
	            chunks.extend(_split(paragraph, 0))

	    return chunks

	# Manual smoke test: executing this module directly prints the chunks
	# produced for a repeated Arabic sample sentence.
	if __name__ == "__main__":
	    sample_sentence = "مرحباً بكم. هذا هو النص الأول! وهذا هو النص الثاني، الذي سنقوم بتقسيمه. "
	    test_text = sample_sentence * 10
	    print(f"Original text length: {len(test_text)}")
	    # Chunk numbers are shown 1-based.
	    for i, c in enumerate(chunk_text(test_text, max_chunk_length=100)):
	        print(f"Chunk {i+1} (len={len(c)}): {c}")