Spaces:

SauRabM
/

Faraday

Running

Saurab Mishra

Initial open source release

34dcea4 about 2 months ago

2.66 kB

	"""
	processing.chunker — Paragraph-aware text chunker with overlap.

	Splits text into ~300-word chunks, preferring paragraph boundaries.
	Falls back to word-level splitting for very long paragraphs.
	Maintains configurable word overlap between adjacent chunks
	to preserve context continuity for embeddings.
	"""

	import re
	from typing import List


	def chunk_text(
	    text: str,
	    max_words: int = 300,
	    overlap: int = 50,
	    min_chunk_words: int = 15,
	) -> List[str]:
	    """
	    Split text into chunks of approximately `max_words` words, aligned
	    to paragraph boundaries where possible.

	    Paragraphs are the pieces of `text` separated by two or more
	    consecutive newline characters. Consecutive paragraphs are packed
	    into one chunk while they fit within `max_words`. When the next
	    paragraph does not fit, the current chunk is emitted and its last
	    `overlap` words are carried over as the start of the next chunk, so a
	    chunk that begins with carried-over words may exceed `max_words` by
	    up to `overlap` words. A single paragraph longer than `max_words` is
	    cut into sliding windows of `max_words` words that advance by
	    `max_words - overlap` words; no overlap is carried out of such a
	    paragraph into the following one.

	    All whitespace inside a chunk is normalised to single spaces.

	    Args:
	        text: Input text to chunk.
	        max_words: Target maximum words per chunk. Must be at least 1.
	        overlap: Words of overlap between consecutive chunks. Values
	            outside ``0 .. max_words - 1`` are clamped into that range.
	        min_chunk_words: Discard the trailing chunk if it is smaller
	            than this. Windows cut from an oversized paragraph are
	            never discarded.

	    Returns:
	        List of text chunks, in document order. Empty or
	        whitespace-only input yields an empty list.

	    Raises:
	        ValueError: If `max_words` is smaller than 1 (for non-empty text).
	    """
	    if not text or not text.strip():
	        return []

	    if max_words < 1:
	        raise ValueError("max_words must be at least 1")

	    # A negative overlap would make the window step larger than the
	    # window itself and silently skip words; an overlap >= max_words
	    # would leave no room for new words. Clamp to a usable range.
	    overlap = min(max(overlap, 0), max_words - 1)
	    step = max_words - overlap  # always >= 1 after clamping

	    chunks: List[str] = []
	    current_words: List[str] = []

	    # Paragraphs are separated by one or more blank lines.
	    for para in re.split(r"\n\n+", text):
	        words = para.split()
	        if not words:
	            continue

	        # The paragraph fits into the chunk being built: just append it.
	        if len(current_words) + len(words) <= max_words:
	            current_words.extend(words)
	            continue

	        # It does not fit: emit what we have and keep the tail as overlap.
	        if current_words:
	            chunks.append(" ".join(current_words))
	            current_words = current_words[-overlap:] if overlap > 0 else []

	        if len(words) > max_words:
	            # Oversized paragraph: cut it into overlapping windows.
	            start = 0
	            while True:
	                chunks.append(" ".join(words[start:start + max_words]))
	                # Stop as soon as a window reaches the end; stepping
	                # further would only re-emit words that are already
	                # contained in the window just produced.
	                if start + max_words >= len(words):
	                    break
	                start += step
	            # Start fresh after an oversized paragraph.
	            current_words = []
	        else:
	            current_words.extend(words)

	    # Flush remaining words if they form a meaningful chunk.
	    if current_words and len(current_words) >= min_chunk_words:
	        chunks.append(" ".join(current_words))

	    return chunks