"""Split long text into overlapping chunks for embedding.

Approximation: 1500 chars ≈ 375 tokens for English/Latin scripts. That fits
the SmolLM2 2048-token context window comfortably when retrieving 3 chunks
plus a few chat turns. For CJK or other dense scripts each character maps to
more tokens, so a chunk holds more tokens than this estimate, but the embedder
doesn't care.
"""

from __future__ import annotations

import re


def chunk_text(
    text: str,
    chunk_chars: int = 1500,
    overlap_chars: int = 200,
) -> list[str]:
    """Split *text* into paragraph-aligned chunks of at most ``chunk_chars``.

    The text is normalised first (``\\r\\n``/``\\r`` become ``\\n``, runs of
    three or more newlines collapse to one blank line, outer whitespace is
    stripped). Paragraphs, i.e. spans separated by a blank line, are then
    packed greedily into a buffer:

      * If the paragraph fits in the current buffer, it is appended.
      * Otherwise the buffer is flushed as a chunk and its last
        ``overlap_chars`` characters seed the next chunk.
      * If the paragraph does not fit next to that overlap tail but fits on
        its own, the overlap is dropped and the paragraph starts a new chunk.
      * If the paragraph itself is longer than ``chunk_chars``, it is cut
        into fixed-size overlapping slices. No overlap is carried from the
        last slice into the following paragraph.

    Args:
        text: Input text. ``None`` or an empty string yields ``[]``.
        chunk_chars: Maximum length of a chunk, in characters. Must be > 0.
        overlap_chars: Number of trailing characters of a flushed chunk that
            are repeated at the start of the next one. Values <= 0 disable
            the overlap.

    Returns:
        The chunks in document order, each stripped of surrounding
        whitespace; whitespace-only chunks are omitted.

    Raises:
        ValueError: If ``chunk_chars`` is not positive.
    """
    if chunk_chars <= 0:
        # A non-positive size would otherwise yield empty or garbage slices
        # and silently lose the input.
        raise ValueError(f"chunk_chars must be positive, got {chunk_chars}")

    text = re.sub(r"\r\n?", "\n", text or "")
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    if not text:
        return []
    if len(text) <= chunk_chars:
        return [text]

    chunks: list[str] = []
    buffer = ""

    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue

        if _can_fit(buffer, para, chunk_chars):
            buffer = _append(buffer, para)
            continue

        # Doesn't fit: flush the buffer (if any) and keep only its tail as
        # the overlap that opens the next chunk.
        if buffer:
            chunks.append(buffer)
            buffer = buffer[-overlap_chars:] if overlap_chars > 0 else ""

        # Try again against the smaller (overlap-only) buffer.
        if _can_fit(buffer, para, chunk_chars):
            buffer = _append(buffer, para)
            continue

        # The paragraph fits on its own, just not next to the overlap tail:
        # drop the overlap rather than hard-splitting a paragraph that does
        # not need it (which would emit a duplicated tail slice).
        if len(para) <= chunk_chars:
            buffer = para
            continue

        # Paragraph alone exceeds chunk_chars: hard-split it.
        chunks.extend(_hard_split(para, chunk_chars, overlap_chars))
        buffer = ""

    if buffer:
        chunks.append(buffer)

    # Overlap tails and hard-split slices can start or end with whitespace.
    return [c.strip() for c in chunks if c.strip()]


def _can_fit(buffer: str, para: str, chunk_chars: int) -> bool:
    """Return whether *para* can join *buffer* within ``chunk_chars``.

    Accounts for the two-character blank-line separator that `_append`
    inserts when the buffer is non-empty.
    """
    sep = 2 if buffer else 0
    return len(buffer) + sep + len(para) <= chunk_chars


def _append(buffer: str, para: str) -> str:
    """Return *buffer* extended by *para*, separated by a blank line."""
    return f"{buffer}\n\n{para}" if buffer else para


def _hard_split(text: str, chunk_chars: int, overlap_chars: int) -> list[str]:
    """Cut *text* into slices of ``chunk_chars`` that overlap each other.

    Consecutive slices start ``chunk_chars - overlap_chars`` characters
    apart. A negative overlap is treated as 0 so no text is skipped, and the
    stride is at least 1 so the loop always advances even when the overlap
    is as large as the chunk. ``chunk_chars`` is expected to be positive.
    """
    overlap = max(0, overlap_chars)
    step = max(1, chunk_chars - overlap)
    pieces: list[str] = []
    for start in range(0, len(text), step):
        pieces.append(text[start : start + chunk_chars])
        # Once a slice reaches the end of the text, any further slice would
        # be wholly contained in it: stop instead of emitting duplicates.
        if start + chunk_chars >= len(text):
            break
    return pieces