Spaces:
Sleeping
Sleeping
File size: 2,293 Bytes
5b28f25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | import re
def chunk_text(text: str, max_chunk_length: int = 300) -> list[str]:
    """Split *text* into chunks of at most *max_chunk_length* characters.

    Break points are tried from coarsest to finest: line breaks
    (paragraphs), sentence terminators (``. ! ?`` and the Arabic question
    mark), commas (Latin and Arabic), and finally plain whitespace.
    Adjacent pieces at one level are greedily re-joined with a single
    space while the result still fits.

    A word is never cut in the middle, so a single whitespace-free token
    longer than *max_chunk_length* is returned as one oversized chunk.

    Args:
        text: The text to split. Empty or whitespace-only text yields ``[]``.
        max_chunk_length: Target upper bound on the length of each chunk.

    Returns:
        The chunks in their original order, each stripped of leading and
        trailing whitespace.
    """
    if not text:
        return []

    # Arabic punctuation is written as regex escapes rather than literal
    # glyphs so the patterns survive the file being re-encoded:
    # U+061F is the Arabic question mark, U+060C the Arabic comma.
    sentence_break = r'(?<=[.!?\u061F])\s+'
    comma_break = r'(?<=[,\u060C])\s+'
    word_break = r'\s+'

    # Helper: split on a delimiter, then greedily merge neighbours that
    # still fit within max_chunk_length.
    def _split_respecting_length(text_part, delimiter_pattern, sep=" "):
        res = []
        current = ""
        for p in re.split(delimiter_pattern, text_part):
            p = p.strip()
            if not p:
                continue
            if not current:
                # First piece of a new chunk is always accepted, even if it
                # is oversized; the caller splits it further if needed.
                current = p
            elif len(current) + len(sep) + len(p) <= max_chunk_length:
                current = f"{current}{sep}{p}"
            else:
                res.append(current)
                current = p
        if current:
            res.append(current)
        return res

    chunks = []
    # 1. Paragraphs
    for para in re.split(r'\n+', text):
        # Strip first so short paragraphs do not keep stray spaces or a
        # trailing "\r" from CRLF input.
        para = para.strip()
        if not para:
            continue
        if len(para) <= max_chunk_length:
            chunks.append(para)
            continue
        # 2. Sentences
        for sentence in _split_respecting_length(para, sentence_break):
            if len(sentence) <= max_chunk_length:
                chunks.append(sentence)
                continue
            # 3. Commas
            for clause in _split_respecting_length(sentence, comma_break):
                if len(clause) <= max_chunk_length:
                    chunks.append(clause)
                else:
                    # 4. Words
                    chunks.extend(_split_respecting_length(clause, word_break))
    return chunks
# Quick test if run directly
if __name__ == "__main__":
    # Arabic sample mixing ".", "!" and the Arabic comma. The literal was
    # reconstructed from a mis-encoded copy that had been broken across
    # several lines -- NOTE(review): confirm the wording against the
    # original file.
    test_text = "مرحباً بكم. هذا هو النص الأول! وهذا هو النص الثاني، الذي سنقوم بتقسيمه. " * 10
    print(f"Original text length: {len(test_text)}")
    res = chunk_text(test_text, max_chunk_length=100)
    for i, c in enumerate(res):
        print(f"Chunk {i+1} (len={len(c)}): {c}")
|