pranshu dhiman
Initial commit with Docker and Streamlit
46b701f
Raw
History Blame Contribute Delete
2.37 kB
from __future__ import annotations
import re
from collections.abc import Iterable
# Any run of whitespace; used to collapse such runs into a single space.
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Flatten raw extracted text into one tidy, single-spaced line.

    The steps run in this order:

    1. NUL characters are replaced by spaces.
    2. A hyphen followed by a line break (with optional whitespace on
       either side of the break) is removed, rejoining words that were
       split across lines.
    3. Remaining newlines become spaces.
    4. Every run of whitespace collapses to a single space and the
       result is stripped at both ends.
    """
    without_nuls = text.replace("\x00", " ")
    # Must happen before newlines are flattened, otherwise the line-break
    # hyphenation pattern can no longer be recognised.
    rejoined = re.sub(r"-\s*\n\s*", "", without_nuls)
    single_line = rejoined.replace("\n", " ")
    return _WHITESPACE_RE.sub(" ", single_line).strip()
def token_count(text: str) -> int:
    """Return how many whitespace-separated words *text* contains."""
    words = text.split()
    return len(words)
def split_into_chunks(text: str, min_tokens: int = 300, max_tokens: int = 500) -> list[str]:
    """Split text into chunks of roughly ``min_tokens``-``max_tokens`` words.

    The text is normalized with ``clean_text`` and split into sentences
    after ``.``, ``!`` or ``?`` followed by whitespace. Sentences are then
    packed greedily into chunks; a "token" is a whitespace-separated word.

    ``max_tokens`` is a soft limit. A chunk can exceed it in two cases:

    * the running chunk has not yet reached ``min_tokens``, in which case
      the next sentence is appended even if that overshoots ``max_tokens``;
    * the final leftover is shorter than ``min_tokens // 2`` and is merged
      into the previous chunk instead of standing alone.

    A single sentence longer than ``max_tokens`` is flushed separately and
    hard-split into pieces of at most ``max_tokens`` words.

    Args:
        text: Raw text to chunk.
        min_tokens: Word count a chunk must reach before it may be closed.
        max_tokens: Target upper bound on words per chunk; must be >= 1.

    Returns:
        The chunks in document order; an empty list for empty or
        whitespace-only text.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    # Without this guard a negative max_tokens routes every sentence through
    # the long-sentence splitter, whose range() with a negative step is empty,
    # so all text would be silently dropped; zero fails with an opaque
    # range() error instead.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    cleaned = clean_text(text)
    if not cleaned:
        return []

    chunks: list[str] = []
    current: list[str] = []
    current_tokens = 0

    for sentence in re.split(r"(?<=[.!?])\s+", cleaned):
        words = sentence.split()
        if not words:
            continue

        if len(words) > max_tokens:
            # Oversized sentence: close whatever is pending, then hard-split it.
            if current:
                chunks.append(" ".join(current).strip())
                current = []
                current_tokens = 0
            chunks.extend(_split_long_sentence(words, max_tokens))
            continue

        would_exceed = current_tokens + len(words) > max_tokens
        can_close = current_tokens >= min_tokens
        if current and would_exceed and can_close:
            chunks.append(" ".join(current).strip())
            current = []
            current_tokens = 0

        # Reached either with a fresh chunk or with one still below
        # min_tokens; in the latter case the chunk may grow past max_tokens.
        current.append(sentence)
        current_tokens += len(words)

    if current:
        tail = " ".join(current).strip()
        # current_tokens already equals the word count of tail, so there is
        # no need to re-tokenize it.
        if chunks and current_tokens < min_tokens // 2:
            # Avoid a tiny trailing chunk by folding it into the previous one.
            chunks[-1] = f"{chunks[-1]} {tail}".strip()
        else:
            chunks.append(tail)

    # Defensive: never hand back empty strings.
    return [chunk for chunk in chunks if chunk]
def _split_long_sentence(words: Iterable[str], max_tokens: int) -> list[str]:
word_list = list(words)
return [
" ".join(word_list[index : index + max_tokens]).strip()
for index in range(0, len(word_list), max_tokens)
]
def first_sentences(text: str, limit: int = 3) -> str:
    """Return the leading sentences of *text* joined by single spaces.

    The text is normalized with ``clean_text`` and split after ``.``, ``!``
    or ``?`` followed by whitespace. Empty fragments are discarded and the
    remaining ones are sliced with ``[:limit]`` before being joined.
    """
    fragments = re.split(r"(?<=[.!?])\s+", clean_text(text))
    non_empty = list(filter(None, fragments))
    return " ".join(non_empty[:limit]).strip()