pdf / chunker.py
Dpshkh's picture
Upload 8 files
beba6d9 verified
raw
history blame contribute delete
928 Bytes
def chunk_text(text, max_tokens=300, max_chunks=10):
    """Split *text* into chunks of at most *max_tokens* words each.

    The text is first split into sentences at whitespace that follows
    ``.``, ``!`` or ``?``.  Whole sentences are packed greedily into the
    current chunk; when the next sentence would not fit, the current chunk
    is emitted and a new one is started.  A "token" here is simply a
    whitespace-separated word.

    A single sentence longer than *max_tokens* is split into consecutive
    word windows so that no returned chunk ever exceeds the limit.

    Args:
        text: The input string.  ``None`` or an empty string yields ``[]``.
        max_tokens: Maximum number of words per chunk; must be >= 1.
        max_chunks: Maximum number of chunks to return.  Any text beyond
            that is dropped.  A value <= 0 yields ``[]``.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.

    Raises:
        ValueError: If *max_tokens* is less than 1.
    """
    import re

    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")
    if not text or max_chunks < 1:
        return []

    chunks = []
    current_words = []

    # Split after sentence-ending punctuation, consuming the whitespace.
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        words = sentence.split()
        if not words:
            continue

        # The sentence still fits: keep packing the current chunk.
        if len(current_words) + len(words) <= max_tokens:
            current_words.extend(words)
            continue

        # It does not fit: emit what has been accumulated so far.
        if current_words:
            chunks.append(" ".join(current_words))
            if len(chunks) >= max_chunks:
                return chunks

        # Cut the sentence into windows of at most max_tokens words.  A
        # sentence within the limit produces exactly one window, which
        # simply becomes the new current chunk.
        windows = [
            words[start:start + max_tokens]
            for start in range(0, len(words), max_tokens)
        ]
        for window in windows[:-1]:
            chunks.append(" ".join(window))
            if len(chunks) >= max_chunks:
                return chunks
        current_words = windows[-1]

    # Every append above returns as soon as the cap is reached, so there is
    # always room left for the trailing chunk here.
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks