File size: 2,293 Bytes
5b28f25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re

def chunk_text(text: str, max_chunk_length: int = 300) -> list[str]:
    """Split *text* into chunks of at most *max_chunk_length* characters.

    Boundaries are tried from coarsest to finest: paragraph breaks (newlines),
    sentence terminators (``.``, ``!``, ``?`` and the Arabic question mark),
    commas (ASCII and Arabic), and finally plain whitespace. Adjacent pieces
    produced at one level are greedily re-joined with a single space while the
    result still fits. Words are never cut, so a chunk can exceed the limit
    only when it is a single whitespace-free word longer than the limit.

    Args:
        text: The text to split. Empty or whitespace-only text yields ``[]``.
        max_chunk_length: Target maximum number of characters per chunk.

    Returns:
        The chunks in their original order, each stripped of surrounding
        whitespace.
    """
    if not text:
        return []

    # Delimiter cascade, coarsest first. Each pattern splits on the whitespace
    # that FOLLOWS the punctuation, so the punctuation stays with its piece.
    # The Arabic marks are written as \u escapes so the patterns survive any
    # re-encoding of this source file.
    delimiter_levels = (
        r'(?<=[.!?\u061F])\s+',  # sentence end; U+061F = Arabic question mark
        r'(?<=[,\u060C])\s+',    # clause break; U+060C = Arabic comma
        r'\s+',                  # word break
    )

    def _split_and_pack(segment: str, level: int) -> list[str]:
        """Split *segment* at the given level, then greedily pack the pieces."""
        packed: list[str] = []
        current = ""
        for piece in re.split(delimiter_levels[level], segment):
            piece = piece.strip()
            if not piece:
                continue
            # "+ 1" accounts for the joining space.
            if current and len(current) + 1 + len(piece) <= max_chunk_length:
                current = f"{current} {piece}"
            else:
                if current:
                    packed.append(current)
                current = piece
        if current:
            packed.append(current)

        # At the finest level nothing more can be done without cutting a word.
        if level + 1 == len(delimiter_levels):
            return packed

        # Only a lone, unmerged piece can still be too long; refine just those.
        refined: list[str] = []
        for chunk in packed:
            if len(chunk) <= max_chunk_length:
                refined.append(chunk)
            else:
                refined.extend(_split_and_pack(chunk, level + 1))
        return refined

    chunks: list[str] = []
    for raw_paragraph in re.split(r'\n+', text):
        # Strip so short paragraphs do not keep stray spaces or a trailing
        # "\r" from Windows line endings.
        paragraph = raw_paragraph.strip()
        if not paragraph:
            continue
        if len(paragraph) <= max_chunk_length:
            chunks.append(paragraph)
        else:
            chunks.extend(_split_and_pack(paragraph, 0))
    return chunks

# Quick demo when the module is run directly
if __name__ == "__main__":
    # Arabic sample containing '.', '!' and the Arabic comma (U+060C),
    # repeated so that it is long enough to need splitting.
    test_text = "مرحباً بكم. هذا هو النص الأول! وهذا هو النص الثاني، الذي سنقوم بتقسيمه. " * 10
    print(f"Original text length: {len(test_text)}")
    res = chunk_text(test_text, max_chunk_length=100)
    for i, c in enumerate(res):
        print(f"Chunk {i+1} (len={len(c)}): {c}")