File size: 831 Bytes
c16e1c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import re

def extract_text(text: str, max_words: int = 300) -> list[str]:
    """
    Split raw text into chunks of at most ``max_words`` words.

    Whitespace is normalized first: any run of whitespace (spaces, tabs,
    newlines) separates words, and words within a chunk are re-joined with
    a single space. Every chunk except possibly the last contains exactly
    ``max_words`` words; the last one holds the remainder.

    Args:
        text: Raw text input.
        max_words: Maximum number of words per chunk (default 300).
            Must be at least 1.

    Returns:
        The chunks in their original order. Empty or whitespace-only
        input yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # A non-positive size has no meaningful chunking; fail loudly rather
    # than silently emitting one-word chunks.
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words}")

    # str.split() with no separator splits on runs of whitespace and drops
    # leading/trailing empties, so no separate normalization pass is needed.
    words = text.split()

    # Step through the word list in strides of max_words; the final slice
    # is automatically shorter when the word count is not a multiple.
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]