File size: 1,425 Bytes
e1c0b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""
core/chunker.py — Split course text into thematic chunks.

Responsibility:
    Take the raw text produced by core/parser.py and segment it into
    semantically coherent chunks suitable for question generation.

Strategy:
    1. Split on double newlines (paragraph boundaries).
    2. Merge short paragraphs with the previous chunk so every chunk
       meets a minimum word threshold.
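    3. Start a new chunk once adding a paragraph would push the current
       one past max_words, so the LLM context window isn't overwhelmed.
       This is a soft cap: paragraphs are never split, so a single long
       paragraph or a merged trailing fragment can exceed it.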

Public API:
    chunk_text(text: str, min_words: int = 60, max_words: int = 300) -> list[str]
"""


def chunk_text(text: str, min_words: int = 60, max_words: int = 300) -> list[str]:
    """Split *text* into thematic chunks and return them as a list.

    Paragraphs (runs of text separated by a blank line) are accumulated
    into a chunk until adding the next paragraph would exceed *max_words*;
    at that point the chunk is emitted, provided it already holds at least
    *min_words* words. Whitespace inside a chunk is normalised to single
    spaces.

    Args:
        text: Raw text to segment.
        min_words: Minimum number of words a chunk must hold before it may
            be closed.
        max_words: Soft upper bound on chunk size. Paragraphs are never
            split, so a single long paragraph, a chunk still below
            *min_words*, or a merged trailing fragment can exceed it.

    Returns:
        The chunks in document order. Empty or whitespace-only input
        yields an empty list. Text shorter than *min_words* overall is
        returned as a single chunk rather than being dropped.
    """
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks: list[str] = []
    current_words: list[str] = []

    for para in paragraphs:
        words = para.split()
        would_overflow = len(current_words) + len(words) > max_words
        # Only close a chunk that actually has content; without the
        # emptiness check, min_words <= 0 would emit a spurious "" chunk
        # when the very first paragraph is longer than max_words.
        if current_words and would_overflow and len(current_words) >= min_words:
            chunks.append(" ".join(current_words))
            current_words = words
        else:
            current_words.extend(words)

    if not current_words:
        # Nothing left over (empty or whitespace-only input).
        return chunks

    if len(current_words) >= min_words or not chunks:
        # Either a full-sized final chunk, or the whole text is shorter
        # than min_words: keep it as its own chunk instead of losing it.
        chunks.append(" ".join(current_words))
    else:
        # Merge a trailing fragment into the last chunk rather than discarding it.
        chunks[-1] = " ".join((chunks[-1], *current_words))

    return chunks