# PaperProf/core/chunker.py
# Commit e1c0b77 — feat: initial project structure with all core modules
"""
core/chunker.py — Split course text into thematic chunks.

Responsibility:
    Take the raw text produced by core/parser.py and segment it into
    semantically coherent chunks suitable for question generation.

Strategy:
    1. Split on double newlines (paragraph boundaries).
    2. Merge short paragraphs with the previous chunk so every chunk
       meets a minimum word threshold.
    3. Cap chunk size at ``max_words`` so the LLM context window isn't
       overwhelmed.

Public API:
    chunk_text(text: str, min_words: int = 60, max_words: int = 300) -> list[str]
"""
def chunk_text(text: str, min_words: int = 60, max_words: int = 300) -> list[str]:
    """Split *text* into paragraph-aligned chunks and return them as a list.

    Paragraphs (runs of text separated by one or more blank lines) are
    accumulated into a buffer.  The buffer is emitted as a chunk when it
    already holds at least *min_words* words and adding the next paragraph
    would push it past *max_words*.  A buffer that grows beyond *max_words*
    (for example, a single very long paragraph) is hard-split into pieces of
    exactly *max_words* words.

    Args:
        text: Raw text to segment.
        min_words: Preferred minimum number of words per chunk.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        The chunks in document order, each with its words joined by single
        spaces.  Empty or whitespace-only *text* yields an empty list.  Text
        shorter than *min_words* is returned as a single chunk rather than
        dropped.  A trailing fragment shorter than *min_words* is appended to
        the previous chunk, so only the final chunk may exceed *max_words*
        (by fewer than *min_words* words).

    Raises:
        ValueError: If *max_words* is less than 1.
    """
    import re

    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    # A blank line may contain stray whitespace or a carriage return (CRLF
    # input), so split on "newline, optional whitespace, newline" rather
    # than on the literal "\n\n".
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]

    chunks: list[str] = []
    current_words: list[str] = []

    for para in paragraphs:
        words = para.split()
        buffer_is_big_enough = bool(current_words) and len(current_words) >= min_words
        if buffer_is_big_enough and len(current_words) + len(words) > max_words:
            # The buffer is a valid chunk on its own; start a new one.
            chunks.append(" ".join(current_words))
            current_words = words
        else:
            current_words.extend(words)

        # Enforce the cap: an over-long paragraph (or a short buffer followed
        # by a long paragraph) must not produce an unbounded chunk.
        while len(current_words) > max_words:
            chunks.append(" ".join(current_words[:max_words]))
            current_words = current_words[max_words:]

    if not current_words:
        return chunks

    if len(current_words) >= min_words or not chunks:
        # Either a full-sized chunk, or the only content there is: keep it
        # instead of silently discarding short documents.
        chunks.append(" ".join(current_words))
    else:
        # Merge a trailing fragment into the last chunk rather than
        # discarding it.
        chunks[-1] = chunks[-1] + " " + " ".join(current_words)
    return chunks