import re

# Delimiter patterns. The look-behinds keep the punctuation mark attached to
# the piece that precedes it; both Latin and Arabic marks are recognised.
_PARAGRAPH_BREAK = re.compile(r'\n+')
_SENTENCE_BREAK = re.compile(r'(?<=[.!?؟])\s+')
_CLAUSE_BREAK = re.compile(r'(?<=[,،])\s+')
_WORD_BREAK = re.compile(r'\s+')

# Split levels tried in order, from coarsest to finest, on any piece that is
# still longer than the limit.
_SPLIT_LEVELS = (_SENTENCE_BREAK, _CLAUSE_BREAK, _WORD_BREAK)


def _pack_pieces(
    text_part: str,
    pattern: "re.Pattern[str]",
    max_length: int,
    sep: str = " ",
) -> list[str]:
    """Split *text_part* on *pattern* and greedily re-join neighbouring pieces.

    Pieces are stripped, empty pieces are dropped, and consecutive pieces are
    joined with *sep* for as long as the joined string stays within
    *max_length*. A single piece that is itself longer than *max_length* is
    returned unchanged as its own entry.
    """
    packed = []
    current = ""
    for piece in pattern.split(text_part):
        piece = piece.strip()
        if not piece:
            continue
        if current and len(current) + len(sep) + len(piece) <= max_length:
            current = f"{current}{sep}{piece}"
        else:
            # Either the buffer is empty or the piece does not fit: flush
            # whatever is buffered and start a new chunk with this piece.
            if current:
                packed.append(current)
            current = piece
    if current:
        packed.append(current)
    return packed


def _split_oversized(segment: str, max_length: int, level: int = 0) -> list[str]:
    """Break *segment* down using successively finer delimiters.

    A segment that fits, or for which no finer split level remains, is
    returned as-is; the latter is what keeps an over-long single word intact.
    """
    if len(segment) <= max_length or level >= len(_SPLIT_LEVELS):
        return [segment]
    pieces = []
    for part in _pack_pieces(segment, _SPLIT_LEVELS[level], max_length):
        pieces.extend(_split_oversized(part, max_length, level + 1))
    return pieces


def chunk_text(text: str, max_chunk_length: int = 300) -> list[str]:
    """Split *text* into chunks of at most *max_chunk_length* characters.

    Break points are chosen in order of preference: newlines (paragraphs),
    sentence terminators (``. ! ?`` and the Arabic question mark), commas
    (Latin and Arabic), and finally whitespace between words. Words are never
    cut, so a single word longer than *max_chunk_length* is returned whole
    and that chunk exceeds the limit.

    Args:
        text: The text to split. Empty or whitespace-only text yields ``[]``.
        max_chunk_length: Target upper bound on the length of each chunk.

    Returns:
        The chunks in reading order, each without leading or trailing
        whitespace.
    """
    if not text:
        return []

    chunks = []
    for raw_paragraph in _PARAGRAPH_BREAK.split(text):
        # Strip before measuring so that short paragraphs do not leak
        # padding (or a "\r" left over from CRLF line endings) into the
        # output, matching what longer paragraphs already get.
        paragraph = raw_paragraph.strip()
        if not paragraph:
            continue
        chunks.extend(_split_oversized(paragraph, max_chunk_length))
    return chunks


def _demo() -> None:
    """Chunk a repeated Arabic sample and print every chunk with its length."""
    test_text = "مرحباً بكم. هذا هو النص الأول! وهذا هو النص الثاني، الذي سنقوم بتقسيمه. " * 10
    print(f"Original text length: {len(test_text)}")
    res = chunk_text(test_text, max_chunk_length=100)
    for number, chunk in enumerate(res, start=1):
        print(f"Chunk {number} (len={len(chunk)}): {chunk}")


# Quick test if run directly
if __name__ == "__main__":
    _demo()