File size: 928 Bytes
beba6d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def _bounded_pieces(words, limit):
    """Yield *words* in consecutive slices of at most *limit* items.

    A non-positive *limit* cannot be honoured, so the list is yielded
    whole in that case.
    """
    if limit < 1 or len(words) <= limit:
        yield words
        return
    for start in range(0, len(words), limit):
        yield words[start:start + limit]


def chunk_text(text, max_tokens=300, max_chunks=10):
    """Split *text* into chunks of whole sentences, bounded by word count.

    Sentences are detected by whitespace that follows ``.``, ``!`` or ``?``
    and are packed greedily: a sentence joins the current chunk while the
    chunk stays within ``max_tokens`` whitespace-separated words, otherwise
    a new chunk is started. Words inside a chunk are joined by single
    spaces, so original line breaks and repeated spaces are not preserved.

    A single sentence longer than ``max_tokens`` words is cut into
    consecutive runs of at most ``max_tokens`` words, so that no returned
    chunk exceeds the limit.

    Args:
        text: The string to split. ``None`` or an empty string gives ``[]``.
        max_tokens: Maximum number of words per chunk. "Tokens" here are
            whitespace-separated words, not model tokens. If it is less
            than 1 the limit cannot be honoured and every sentence becomes
            its own chunk.
        max_chunks: Maximum number of chunks to return; any remaining text
            is dropped. A value below 1 gives ``[]``.

    Returns:
        A list of at most ``max_chunks`` non-empty strings.
    """
    import re

    if not text or max_chunks < 1:
        return []

    # The lookbehind keeps the terminating punctuation attached to its
    # sentence; only the whitespace after it is consumed by the split.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Flatten the text into packable units: whole sentences, or slices of
    # a sentence that on its own would overflow a chunk.
    pieces = []
    for sentence in sentences:
        words = sentence.split()
        if words:
            pieces.extend(_bounded_pieces(words, max_tokens))

    chunks = []
    current = []
    for piece in pieces:
        if len(current) + len(piece) <= max_tokens:
            current.extend(piece)
            continue
        if current:
            chunks.append(" ".join(current))
            if len(chunks) >= max_chunks:
                # Chunk budget exhausted: the rest of the text is dropped.
                return chunks
        current = piece

    # Flush the trailing, partially filled chunk.
    if current:
        chunks.append(" ".join(current))

    return chunks