Spaces:
Sleeping
Sleeping
| import re | |
def chunk_text(text: str, max_chunk_length: int = 300) -> list[str]:
    """Split *text* into chunks of at most *max_chunk_length* characters.

    The text is first cut at line breaks. A paragraph that already fits is
    returned (stripped) as one chunk. A longer paragraph is broken at
    progressively finer boundaries: sentence terminators, then commas, then
    plain whitespace. At each level neighbouring pieces are greedily
    re-joined with a single space while the result still fits.

    Both Latin and Arabic punctuation are recognised (``.``, ``!``, ``?``,
    the Arabic question mark U+061F, ``,`` and the Arabic comma U+060C).

    Words are never cut: a single word longer than *max_chunk_length* is
    emitted intact, so that chunk exceeds the limit.

    Args:
        text: The text to split.
        max_chunk_length: Target maximum length of each chunk, in characters.

    Returns:
        The chunks in their original order; empty and whitespace-only
        pieces are dropped. An empty *text* yields an empty list.
    """
    if not text:
        return []

    # Split points ordered from coarsest to finest. The Arabic marks are
    # written as \u escapes (resolved by the regex engine) so the patterns
    # do not depend on how this source file happens to be encoded.
    split_patterns = (
        r'(?<=[.!?\u061F])\s+',  # after a sentence terminator
        r'(?<=[,\u060C])\s+',    # after a comma
        r'\s+',                  # between words
    )
    finest_level = len(split_patterns) - 1

    def _pack(pieces, sep=" "):
        """Greedily join consecutive pieces while the result stays in budget."""
        packed = []
        current = ""
        for piece in pieces:
            piece = piece.strip()
            if not piece:
                continue
            if not current:
                # An empty buffer always takes the piece, even when it is
                # oversized; the caller decides whether to split it further.
                current = piece
            elif len(current) + len(sep) + len(piece) <= max_chunk_length:
                current = f"{current}{sep}{piece}"
            else:
                packed.append(current)
                current = piece
        if current:
            packed.append(current)
        return packed

    def _split(segment, level):
        """Chunk *segment* at *level*, descending to finer levels on demand."""
        packed = _pack(re.split(split_patterns[level], segment))
        if level == finest_level:
            # Nothing finer than words: oversized words are kept whole.
            return packed
        result = []
        for piece in packed:
            if len(piece) <= max_chunk_length:
                result.append(piece)
            else:
                result.extend(_split(piece, level + 1))
        return result

    chunks = []
    for paragraph in re.split(r'\n+', text):
        # Strip before measuring so surrounding padding (including a
        # trailing "\r" from CRLF input) neither counts against the budget
        # nor ends up inside the chunk.
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        if len(paragraph) <= max_chunk_length:
            chunks.append(paragraph)
        else:
            chunks.extend(_split(paragraph, 0))
    return chunks
# Quick manual check when the module is run directly as a script.
if __name__ == "__main__":
    # Arabic sample sentence, repeated so the text is far longer than the
    # chunk limit used below.
    test_text = "مرحباً بكم. هذا هو النص الأول! وهذا هو النص الثاني، الذي سنقوم بتقسيمه. " * 10
    print(f"Original text length: {len(test_text)}")
    res = chunk_text(test_text, max_chunk_length=100)
    # Print each chunk with its 1-based position and length for eyeballing.
    for number, chunk in enumerate(res, start=1):
        print(f"Chunk {number} (len={len(chunk)}): {chunk}")