# IntegraChat/backend/api/utils/text_extractor.py
# Commit c16e1c9 (nothingworry): "Add RAG MCP Server with Supabase vector search"
import re
def extract_text(text: str, max_words: int = 300) -> list:
    """
    Split raw text into chunks of at most ``max_words`` words.

    All runs of whitespace (spaces, tabs, newlines) are treated as a single
    word separator, so each chunk is a single-space-joined string.
    Suitable for document ingestion before embeddings.

    Args:
        text (str): Raw text input. ``None`` or an empty/whitespace-only
            string yields an empty list.
        max_words (int): Max words per chunk (default 300). Must be >= 1.

    Returns:
        List[str]: Chunked text segments in original order. Every chunk
        holds exactly ``max_words`` words except possibly the last one.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # A non-positive limit used to degrade silently into one-word chunks;
    # reject it so a misconfigured caller finds out immediately.
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words!r}")

    if not text:
        return []

    # str.split() with no separator collapses every whitespace run and
    # ignores leading/trailing whitespace, so no empty "words" are produced.
    words = text.split()

    # Step through the word list in strides of max_words; the final slice
    # is simply shorter when the word count is not an exact multiple.
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]