Spaces:
Running
Running
File size: 928 Bytes
def chunk_text(text, max_tokens=300, max_chunks=10):
    """Split *text* into chunks of at most ``max_tokens`` words each.

    The text is first cut into sentences (after ``.``, ``!`` or ``?`` followed
    by whitespace); consecutive sentences are then packed greedily into a
    chunk until adding the next one would exceed ``max_tokens``. "Tokens" here
    are whitespace-separated words, not model tokenizer tokens.

    A single sentence longer than ``max_tokens`` words is cut into
    ``max_tokens``-sized word slices so that no chunk exceeds the limit.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_tokens: Maximum number of words per chunk. If it is not positive,
            no packing or slicing is done and each sentence becomes its own
            chunk.
        max_chunks: Maximum number of chunks to return; any remaining text is
            dropped. A non-positive value yields ``[]``.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
    """
    import re

    if not text or max_chunks <= 0:
        return []

    chunks = []
    current = []  # words accumulated for the chunk being built

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        words = sentence.split()
        if not words:
            continue

        # An oversized sentence can never fit in one chunk, so slice it up;
        # otherwise it would be emitted whole and break the size limit.
        if 0 < max_tokens < len(words):
            pieces = [
                words[start:start + max_tokens]
                for start in range(0, len(words), max_tokens)
            ]
        else:
            pieces = [words]

        for piece in pieces:
            if len(current) + len(piece) <= max_tokens:
                current.extend(piece)
                continue
            # The piece does not fit: flush what we have and start afresh.
            if current:
                chunks.append(" ".join(current))
                if len(chunks) >= max_chunks:
                    return chunks
            current = list(piece)

    # Reaching this point means the cap was not hit, so the tail still fits.
    if current:
        chunks.append(" ".join(current))
    return chunks
|