# utils/chunking.py
import re

import nltk

# Fetch the Punkt sentence-tokenizer model at import time (no-op when cached).
nltk.download("punkt", quiet=True)

from nltk.tokenize import sent_tokenize


def smart_chunk_text(text, chunk_size=300, overlap=50, *, min_chunk_words=30):
    """Split *text* into word-count-bounded chunks along sentence boundaries.

    Paragraphs (separated by blank lines) are tokenized into sentences;
    sentences are accumulated until ``chunk_size`` words are collected, then
    emitted as one chunk, carrying the last ``overlap`` words forward into
    the next chunk. A single sentence longer than ``chunk_size`` words is
    hard-split on word boundaries with the same overlap. Fragments of
    ``min_chunk_words`` words or fewer are dropped.

    Args:
        text: A string, or a list of strings (joined with newlines first).
        chunk_size: Maximum number of words per chunk; must exceed ``overlap``.
        overlap: Number of trailing words repeated at the start of the next
            chunk to preserve context continuity.
        min_chunk_words: Fragments at or below this word count are discarded.

    Returns:
        list[str]: Chunk strings in document order.

    Raises:
        ValueError: If ``overlap >= chunk_size`` — the sliding-window step
            would be zero or negative.
    """
    # BUG FIX: the original passed `chunk_size - overlap` straight to range()
    # with no guard, so overlap >= chunk_size crashed with
    # "range() arg 3 must not be zero" (or a negative step). Fail fast.
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    if isinstance(text, list):
        text = "\n".join(text)

    step = chunk_size - overlap  # hoisted loop-invariant window stride
    chunks = []

    # A paragraph is a run of text delimited by one or more blank lines.
    for para in re.split(r"\n\s*\n", text):
        words = []  # words accumulated toward the current chunk
        for sent in sent_tokenize(para):
            sent_words = sent.split()
            if len(sent_words) > chunk_size:
                # Oversized sentence: hard-split it on word boundaries,
                # independent of the running accumulator.
                for i in range(0, len(sent_words), step):
                    part = " ".join(sent_words[i:i + chunk_size])
                    if len(part.split()) > min_chunk_words:
                        chunks.append(part)
            else:
                words.extend(sent_words)
            # Emit a chunk as soon as the word budget is reached, keeping
            # the trailing `overlap` words for the next chunk.
            if len(words) >= chunk_size:
                chunks.append(" ".join(words[:chunk_size]))
                words = words[step:]
        # Paragraph leftovers: keep only if longer than the minimum
        # (`len(words) > n` already implies `words` is non-empty).
        if len(words) > min_chunk_words:
            chunks.append(" ".join(words))

    return chunks