import re


def extract_text(text: str, max_words: int = 300) -> list[str]:
    """Split raw text into chunks of at most *max_words* words each.

    Suitable for document ingestion before embeddings: whitespace runs
    (spaces, tabs, newlines) are collapsed to single spaces first, so
    chunk boundaries are always between words.

    Args:
        text: Raw text input.
        max_words: Maximum words per chunk (default 300). Must be >= 1.

    Returns:
        List of chunked text segments. Every chunk except possibly the
        last contains exactly ``max_words`` words; blank or empty input
        yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        # The original loop silently emitted one-word chunks for
        # max_words <= 0; fail loudly instead.
        raise ValueError("max_words must be >= 1")

    # Normalize whitespace so split(" ") sees exactly one space per gap.
    clean = re.sub(r'\s+', ' ', text).strip()
    if not clean:
        return []

    words = clean.split(" ")
    # Stride slicing replaces the manual current/count accumulator loop:
    # each window of max_words words becomes one chunk.
    return [
        " ".join(words[i:i + max_words])
        for i in range(0, len(words), max_words)
    ]