Spaces:
Running
Running
| def chunk_text(text, max_tokens=300, max_chunks=10): | |
| import re | |
| sentences = re.split(r'(?<=[.!?])\s+', text) # Better sentence splitting | |
| chunks = [] | |
| current_chunk = [] | |
| current_len = 0 | |
| for sentence in sentences: | |
| words = sentence.split() | |
| if not words: | |
| continue | |
| if current_len + len(words) <= max_tokens: | |
| current_chunk.extend(words) | |
| current_len += len(words) | |
| else: | |
| chunk = " ".join(current_chunk).strip() | |
| if chunk: | |
| chunks.append(chunk) | |
| if len(chunks) >= max_chunks: | |
| break | |
| current_chunk = words | |
| current_len = len(words) | |
| # Add the last chunk | |
| if current_chunk and len(chunks) < max_chunks: | |
| chunk = " ".join(current_chunk).strip() | |
| if chunk: | |
| chunks.append(chunk) | |
| return chunks | |