from __future__ import annotations

import re
from collections.abc import Iterable


# Matches any run of whitespace so it can be collapsed to a single space.
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Return *text* flattened to a single, whitespace-normalized line.

    Steps, in order:

    1. NUL characters become spaces.
    2. A hyphen followed by optional whitespace and a line break is removed
       together with the whitespace around the break, so a word split across
       lines ("exam-\\nple") is rejoined ("example").
    3. Remaining newlines become spaces.
    4. Every run of whitespace collapses to one space.
    5. Leading and trailing whitespace is stripped.

    All other punctuation is left untouched.
    """
    without_nulls = text.replace("\x00", " ")
    # Undo end-of-line hyphenation before the line breaks are flattened.
    rejoined = re.sub(r"-\s*\n\s*", "", without_nulls)
    single_line = rejoined.replace("\n", " ")
    return _WHITESPACE_RE.sub(" ", single_line).strip()


def token_count(text: str) -> int:
    """Return the number of whitespace-separated words in *text*."""
    words = text.split()
    return len(words)


def split_into_chunks(text: str, min_tokens: int = 300, max_tokens: int = 500) -> list[str]:
    """Split text into sentence-aligned chunks of roughly min_tokens-max_tokens words.

    The text is first normalized with ``clean_text`` and then split into
    sentences after ``.``, ``!`` or ``?`` followed by whitespace. A "token" is
    a whitespace-separated word. Sentences are accumulated greedily, and a
    chunk is closed only when adding the next sentence would push it past
    ``max_tokens`` *and* it already holds at least ``min_tokens``.

    Both bounds are therefore soft:

    * a chunk may exceed ``max_tokens`` when closing it earlier would leave it
      below ``min_tokens``;
    * a single sentence longer than ``max_tokens`` is cut into pieces of at
      most ``max_tokens`` words; the pending chunk is flushed first, so both
      it and the last piece may be shorter than ``min_tokens``;
    * a trailing remainder shorter than ``min_tokens // 2`` is appended to the
      previous chunk instead of becoming its own chunk.

    Args:
        text: Raw text to split.
        min_tokens: Size a chunk must reach before it may be closed.
        max_tokens: Size at which a chunk is closed if it is already large
            enough; must be at least 1.

    Returns:
        The non-empty chunks in document order, or ``[]`` when the cleaned
        text is empty.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    # Without this guard a negative max_tokens silently discards all text
    # (every sentence counts as "too long" and is sliced with a negative
    # step), and zero fails with an opaque range() error.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens}")

    cleaned = clean_text(text)
    if not cleaned:
        return []

    sentences = re.split(r"(?<=[.!?])\s+", cleaned)
    chunks: list[str] = []
    current: list[str] = []
    current_tokens = 0

    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue

        if len(words) > max_tokens:
            # An oversized sentence cannot share a chunk: flush whatever is
            # pending, then emit the sentence as fixed-size word windows.
            if current:
                chunks.append(" ".join(current).strip())
                current = []
                current_tokens = 0
            chunks.extend(_split_long_sentence(words, max_tokens))
            continue

        would_exceed = current_tokens + len(words) > max_tokens
        can_close = current_tokens >= min_tokens
        if current and would_exceed and can_close:
            chunks.append(" ".join(current).strip())
            current = [sentence]
            current_tokens = len(words)
        else:
            # Either there is still room, or the chunk is below min_tokens and
            # is allowed to overshoot max_tokens rather than be closed early.
            current.append(sentence)
            current_tokens += len(words)

    if current:
        tail = " ".join(current).strip()
        # current_tokens already equals the word count of tail, so there is
        # no need to re-split the joined string to measure it.
        if chunks and current_tokens < min_tokens // 2:
            # Avoid a tiny final chunk by folding it into the previous one.
            chunks[-1] = f"{chunks[-1]} {tail}".strip()
        else:
            chunks.append(tail)

    return [chunk for chunk in chunks if chunk]


def _split_long_sentence(words: Iterable[str], max_tokens: int) -> list[str]:
    """Cut *words* into consecutive space-joined pieces of at most *max_tokens* words.

    The words are materialized into a list and sliced in windows of
    ``max_tokens``; only the final piece may be shorter. An empty input
    yields an empty list.
    """
    tokens = list(words)
    pieces: list[str] = []
    for start in range(0, len(tokens), max_tokens):
        window = tokens[start : start + max_tokens]
        pieces.append(" ".join(window).strip())
    return pieces


def first_sentences(text: str, limit: int = 3) -> str:
    """Return the leading sentences of *text*, joined by single spaces.

    The text is normalized with ``clean_text`` and split after ``.``, ``!``
    or ``?`` followed by whitespace. Empty fragments are discarded and the
    remaining list is truncated with the slice ``[:limit]``.
    """
    cleaned = clean_text(text)
    non_empty = list(filter(None, re.split(r"(?<=[.!?])\s+", cleaned)))
    return " ".join(non_empty[:limit]).strip()