Spaces:

build-small-hackathon
/

PaperProf

Sleeping

PaperProf / core /chunker.py

feat: initial project structure with all core modules

e1c0b77 21 days ago

1.43 kB

	"""
	core/chunker.py — Split course text into thematic chunks.

	Responsibility:
	Take the raw text produced by core/parser.py and segment it into
	semantically coherent chunks suitable for question generation.

	Strategy:
	1. Split on double newlines (paragraph boundaries).
	2. Merge short paragraphs with the previous chunk so every chunk
	meets a minimum word threshold.
	3. Cap chunk size at `max_words` so the LLM context window isn't
	overwhelmed.

	Public API:
	chunk_text(text: str, min_words: int = 60, max_words: int = 300) -> list[str]
	"""


	def chunk_text(text: str, min_words: int = 60, max_words: int = 300) -> list[str]:
	    """Split text into word-bounded chunks along paragraph boundaries.

	    Paragraphs (text separated by one or more blank lines) are accumulated
	    into a running chunk. The running chunk is closed when it already holds
	    at least ``min_words`` words and the next paragraph would push it past
	    ``max_words``. A running chunk that grows beyond ``max_words`` (for
	    example a single very long paragraph) is split at word boundaries into
	    pieces of exactly ``max_words`` words. Whitespace inside each chunk is
	    normalised to single spaces.

	    Args:
	        text: Raw text to segment.
	        min_words: Number of words a chunk must hold before it may be
	            closed at a paragraph boundary.
	        max_words: Maximum number of words per chunk; must be at least 1.

	    Returns:
	        The chunks in document order. Empty or whitespace-only text yields
	        an empty list. Text shorter than ``min_words`` is returned as a
	        single chunk instead of being dropped. Only the final chunk may
	        exceed ``max_words``, and by fewer than ``min_words`` words, when a
	        short trailing fragment is merged into it.

	    Raises:
	        ValueError: If ``max_words`` is less than 1.
	    """
	    import re

	    # A non-positive cap would make the hard-split loop below never finish.
	    if max_words < 1:
	        raise ValueError("max_words must be at least 1")

	    finished: list[list[str]] = []  # closed chunks, kept as word lists
	    current: list[str] = []  # words of the chunk being built

	    # A "blank" line may still contain spaces or a "\r" (CRLF input), so
	    # accept any whitespace between the two newlines.
	    for paragraph in re.split(r"\n\s*\n", text):
	        words = paragraph.split()
	        if not words:
	            continue

	        # Close the running chunk at a paragraph boundary when it is long
	        # enough and the next paragraph would overflow it. The truthiness
	        # check prevents emitting an empty chunk when min_words <= 0.
	        if current and len(current) >= min_words and len(current) + len(words) > max_words:
	            finished.append(current)
	            current = []

	        current.extend(words)

	        # Enforce the cap: oversized content is cut at word boundaries and
	        # the remainder keeps accumulating with the following paragraphs.
	        while len(current) > max_words:
	            finished.append(current[:max_words])
	            current = current[max_words:]

	    if current:
	        if len(current) >= min_words or not finished:
	            # Long enough on its own, or the only content there is: keep it
	            # rather than silently discarding a short document.
	            finished.append(current)
	        else:
	            # Merge a trailing fragment into the last chunk rather than
	            # leaving a chunk below the minimum size.
	            finished[-1].extend(current)

	    return [" ".join(chunk_words) for chunk_words in finished]