Spaces:
Sleeping
Sleeping
| import re | |
def extract_text(text: str, max_words: int = 300) -> list:
    """Split raw text into chunks of at most ``max_words`` words.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a
    single space before splitting, so the original line and paragraph
    layout is not preserved in the returned chunks.

    Args:
        text: Raw text input.
        max_words: Maximum number of words per chunk (default 300).
            Must be at least 1.

    Returns:
        A list of strings, each holding up to ``max_words`` words joined
        by single spaces. Only the final chunk may be shorter. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # A non-positive limit used to degrade silently into one-word chunks;
    # reject it so a misconfigured caller finds out immediately.
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words!r}")

    # Normalize whitespace so splitting on a single space never yields
    # empty "words".
    normalized = re.sub(r"\s+", " ", text).strip()
    if not normalized:
        return []

    words = normalized.split(" ")
    # Step through the word list in strides of max_words; the last slice
    # naturally picks up the shorter remainder.
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]