# Faraday / processing / chunker.py
# Saurab Mishra
# Initial open source release
# 34dcea4
"""
processing.chunker — Paragraph-aware text chunker with overlap.
Splits text into ~300-word chunks, preferring paragraph boundaries.
Falls back to word-level splitting for very long paragraphs.
Maintains configurable word overlap between adjacent chunks
to preserve context continuity for embeddings.
"""
import re
from typing import List
# A paragraph break is a line ending followed by one or more blank (or
# whitespace-only) lines. Both "\n" and "\r\n" line endings are accepted so
# that text with Windows line endings is still split on paragraph boundaries.
_PARAGRAPH_BREAK = re.compile(r"\r?\n(?:[ \t]*\r?\n)+")


def chunk_text(
    text: str,
    max_words: int = 300,
    overlap: int = 50,
    min_chunk_words: int = 15,
) -> List[str]:
    """
    Split text into chunks of approximately `max_words`, aligned
    to paragraph boundaries where possible.

    Paragraphs are accumulated until adding the next one would exceed
    `max_words`; the accumulated words are then emitted as a chunk and its
    last `overlap` words seed the next chunk. Because of that seed, a chunk
    built from paragraphs may hold up to `max_words + overlap` words.
    A single paragraph longer than `max_words` is split with a sliding
    window of `max_words` words that advances `max_words - overlap` words
    at a time; no overlap is carried from such a paragraph into the next.

    Args:
        text: Input text to chunk.
        max_words: Target maximum words per chunk. Must be >= 1.
        overlap: Words of overlap between consecutive chunks. Values
            outside the range [0, max_words - 1] are clamped into it.
        min_chunk_words: Discard the final leftover chunk if it has fewer
            words than this (so a text shorter than this yields no chunks).

    Returns:
        List of text chunks; words within a chunk are joined by single
        spaces. Empty or whitespace-only input returns an empty list.

    Raises:
        ValueError: If `max_words` is less than 1 and `text` is non-empty.
    """
    if not text or not text.strip():
        return []
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    # Clamp the overlap: a negative value would make the sliding window skip
    # words, and a value >= max_words would re-emit whole previous chunks.
    overlap = max(0, min(overlap, max_words - 1))
    step = max_words - overlap  # always >= 1 after clamping

    chunks: List[str] = []
    current_words: List[str] = []

    for para in _PARAGRAPH_BREAK.split(text):
        words = para.split()
        if not words:
            continue

        # Paragraph fits in the chunk being built: just accumulate it.
        if len(current_words) + len(words) <= max_words:
            current_words.extend(words)
            continue

        # Otherwise emit what has been accumulated and keep its tail as the
        # overlap seed for the next chunk.
        if current_words:
            chunks.append(" ".join(current_words))
            current_words = current_words[-overlap:] if overlap else []

        if len(words) <= max_words:
            current_words.extend(words)
            continue

        # Oversized paragraph: strict sliding-window split. Stop as soon as a
        # window reaches the end of the paragraph so that no trailing window
        # consisting only of already-emitted words is produced.
        start = 0
        while True:
            chunks.append(" ".join(words[start : start + max_words]))
            if start + max_words >= len(words):
                break
            start += step
        # The overlap seed is dropped after an oversized paragraph.
        current_words = []

    # Flush remaining words if they form a meaningful chunk.
    if current_words and len(current_words) >= min_chunk_words:
        chunks.append(" ".join(current_words))
    return chunks