from __future__ import annotations import re from collections.abc import Iterable _WHITESPACE_RE = re.compile(r"\s+") def clean_text(text: str) -> str: """Normalize extracted lecture-note text without removing useful punctuation.""" text = text.replace("\x00", " ") text = re.sub(r"-\s*\n\s*", "", text) text = text.replace("\n", " ") text = _WHITESPACE_RE.sub(" ", text) return text.strip() def token_count(text: str) -> int: return len(text.split()) def split_into_chunks(text: str, min_tokens: int = 300, max_tokens: int = 500) -> list[str]: """Split text into roughly 300-500 token chunks using sentence boundaries.""" cleaned = clean_text(text) if not cleaned: return [] sentences = re.split(r"(?<=[.!?])\s+", cleaned) chunks: list[str] = [] current: list[str] = [] current_tokens = 0 for sentence in sentences: words = sentence.split() if not words: continue if len(words) > max_tokens: if current: chunks.append(" ".join(current).strip()) current = [] current_tokens = 0 chunks.extend(_split_long_sentence(words, max_tokens)) continue would_exceed = current_tokens + len(words) > max_tokens can_close = current_tokens >= min_tokens if current and would_exceed and can_close: chunks.append(" ".join(current).strip()) current = [sentence] current_tokens = len(words) else: current.append(sentence) current_tokens += len(words) if current: tail = " ".join(current).strip() if chunks and token_count(tail) < min_tokens // 2: chunks[-1] = f"{chunks[-1]} {tail}".strip() else: chunks.append(tail) return [chunk for chunk in chunks if chunk] def _split_long_sentence(words: Iterable[str], max_tokens: int) -> list[str]: word_list = list(words) return [ " ".join(word_list[index : index + max_tokens]).strip() for index in range(0, len(word_list), max_tokens) ] def first_sentences(text: str, limit: int = 3) -> str: sentences = re.split(r"(?<=[.!?])\s+", clean_text(text)) selected = [sentence for sentence in sentences if sentence][:limit] return " ".join(selected).strip()