Spaces:

nothingworry
/

IntegraChat

Sleeping

File size: 831 Bytes

c16e1c9

import re

def extract_text(text: str, max_words: int = 300):
    """
    Split raw text into consecutive chunks of at most ``max_words`` words.

    Whitespace is normalized first: any run of whitespace (spaces, tabs,
    newlines, ...) counts as a single separator, and the words inside each
    chunk are re-joined with single spaces. Suitable for document
    ingestion before embeddings.

    Args:
        text (str): Raw text input
        max_words (int): Max words per chunk (default 300). Values below 1
            are treated as 1, i.e. every word becomes its own chunk.

    Returns:
        List[str]: Chunked text segments in their original order. Every
        chunk holds exactly ``max_words`` words except possibly the last,
        which holds the remainder. Empty or whitespace-only input yields
        an empty list.
    """

    # With no separator, str.split() splits on runs of whitespace and never
    # yields empty strings, so no separate normalize/strip pass is needed.
    words = text.split()

    # Clamp the step: a non-positive limit degenerates to one word per
    # chunk, and range() rejects a step of zero.
    step = max(max_words, 1)

    return [
        " ".join(words[start:start + step])
        for start in range(0, len(words), step)
    ]