Spaces:
Running
Running
| from __future__ import annotations | |
| import re | |
| from collections.abc import Iterable | |
_WHITESPACE_RE = re.compile(r"\s+")
# A hyphen that splits one word across a line break: a letter, the hyphen,
# optional whitespace, a newline, optional whitespace, then another letter.
# ``[^\W\d_]`` means "a letter" (a word character that is neither a digit nor
# an underscore), so dashes used as punctuation ("a -\nb") and numeric ranges
# ("3-\n4") are left alone.
_LINE_BREAK_HYPHEN_RE = re.compile(r"(?<=[^\W\d_])-\s*\n\s*(?=[^\W\d_])")


def clean_text(text: str) -> str:
    """Normalize extracted lecture-note text without removing useful punctuation.

    The following steps are applied in order:

    1. NUL characters are replaced with spaces.
    2. Words hyphenated across a line break are re-joined ("exam-\\nple"
       becomes "example"). The hyphen is dropped only when it has a letter on
       both sides of the break; any other hyphen or dash is kept.
    3. Every run of whitespace (including newlines) is collapsed to a single
       space and leading/trailing whitespace is removed.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized, single-line text; an empty string when the input
        contains nothing but whitespace and NUL characters.
    """
    text = text.replace("\x00", " ")
    text = _LINE_BREAK_HYPHEN_RE.sub("", text)
    # "\s+" already matches newlines, so no separate newline replacement is needed.
    return _WHITESPACE_RE.sub(" ", text).strip()
def token_count(text: str) -> int:
    """Return how many whitespace-separated tokens *text* contains."""
    tokens = text.split()
    return len(tokens)
def split_into_chunks(text: str, min_tokens: int = 300, max_tokens: int = 500) -> list[str]:
    """Split text into roughly ``min_tokens``-``max_tokens`` token chunks on sentence boundaries.

    A token is a whitespace-separated word. The text is first normalized with
    ``clean_text`` and split into sentences after ``.``, ``!`` or ``?``.
    Sentences are then accumulated into a chunk; the chunk is closed when the
    next sentence would push it past ``max_tokens`` *and* it already holds at
    least ``min_tokens``. Both limits are therefore targets rather than hard
    guarantees: a chunk below ``min_tokens`` keeps growing even if that takes
    it past ``max_tokens``.

    A single sentence longer than ``max_tokens`` is cut into ``max_tokens``
    sized pieces. A short leftover piece is not emitted as a tiny chunk of its
    own; it becomes the start of the next chunk.

    A final chunk smaller than half of ``min_tokens`` is appended to the
    previous chunk instead of being returned on its own.

    Args:
        text: Raw text to split.
        min_tokens: Size a chunk should reach before it may be closed.
        max_tokens: Size a chunk should not grow past once it has reached
            ``min_tokens``; also the slice size for over-long sentences.

    Returns:
        The chunks in document order; an empty list when the cleaned text is
        empty.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    # A non-positive limit cannot produce chunks: zero makes the long-sentence
    # slicing fail and a negative value would silently discard the text.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    cleaned = clean_text(text)
    if not cleaned:
        return []

    chunks: list[str] = []
    current: list[str] = []
    current_tokens = 0
    # A leftover piece of an over-long sentence is carried forward only when it
    # is below the target minimum and is not itself a full-size slice.
    carry_below = min(min_tokens, max_tokens)

    for sentence in re.split(r"(?<=[.!?])\s+", cleaned):
        words = sentence.split()
        if not words:
            continue
        size = len(words)

        if size > max_tokens:
            # Flush whatever was accumulated so the output keeps document order.
            if current:
                chunks.append(" ".join(current))
            pieces = _split_long_sentence(words, max_tokens)
            remainder = pieces[-1]
            remainder_tokens = len(remainder.split())
            if remainder_tokens < carry_below:
                # Let the short remainder seed the next chunk rather than
                # becoming a near-empty chunk of its own.
                pieces.pop()
                current = [remainder]
                current_tokens = remainder_tokens
            else:
                current = []
                current_tokens = 0
            chunks.extend(pieces)
            continue

        would_exceed = current_tokens + size > max_tokens
        can_close = current_tokens >= min_tokens
        if current and would_exceed and can_close:
            chunks.append(" ".join(current))
            current = [sentence]
            current_tokens = size
        else:
            current.append(sentence)
            current_tokens += size

    if current:
        tail = " ".join(current)
        # current_tokens already equals the tail's token count, so the tail
        # does not need to be tokenized again.
        if chunks and current_tokens < min_tokens // 2:
            chunks[-1] = f"{chunks[-1]} {tail}"
        else:
            chunks.append(tail)
    return [chunk for chunk in chunks if chunk]
| def _split_long_sentence(words: Iterable[str], max_tokens: int) -> list[str]: | |
| word_list = list(words) | |
| return [ | |
| " ".join(word_list[index : index + max_tokens]).strip() | |
| for index in range(0, len(word_list), max_tokens) | |
| ] | |
def first_sentences(text: str, limit: int = 3) -> str:
    """Return the first ``limit`` sentences of *text*, joined by single spaces.

    The text is normalized with ``clean_text`` and split into sentences after
    ``.``, ``!`` or ``?``.

    Args:
        text: Raw text to take the leading sentences from.
        limit: Maximum number of sentences to return. Zero or a negative
            value yields an empty string.

    Returns:
        The selected sentences as one string; empty when the text has no
        sentences or ``limit`` is not positive.
    """
    # Without this guard a negative limit would act as a negative slice bound
    # and return "all but the last N" sentences instead of none.
    if limit <= 0:
        return ""
    sentences = re.split(r"(?<=[.!?])\s+", clean_text(text))
    selected = [sentence for sentence in sentences if sentence][:limit]
    return " ".join(selected).strip()