""" core/chunker.py — Split course text into thematic chunks. Responsibility: Take the raw text produced by core/parser.py and segment it into semantically coherent chunks suitable for question generation. Strategy: 1. Split on double newlines (paragraph boundaries). 2. Merge short paragraphs with the previous chunk so every chunk meets a minimum word threshold. 3. Cap chunk size at MAX_WORDS so the LLM context window isn't overwhelmed. Public API: chunk_text(text: str, min_words: int = 60, max_words: int = 300) -> list[str] """ def chunk_text(text: str, min_words: int = 60, max_words: int = 300) -> list[str]: """Split *text* into thematic chunks and return them as a list.""" paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()] chunks: list[str] = [] current_words: list[str] = [] for para in paragraphs: words = para.split() if len(current_words) + len(words) > max_words and len(current_words) >= min_words: chunks.append(" ".join(current_words)) current_words = words else: current_words.extend(words) if len(current_words) >= min_words: chunks.append(" ".join(current_words)) elif chunks: # Merge a trailing fragment into the last chunk rather than discarding it. chunks[-1] = chunks[-1] + " " + " ".join(current_words) return chunks