"""Text splitting utilities for bulk processing.

Strategy: **1 paragraph = 1 bulk**, with no cross-paragraph packing.

Each paragraph is sent to the LLM as its own bulk so that:
- LLM never sees two similar adjacent paragraphs in one bulk
  (this triggered "같은 화제 반복" drops — e.g. inserted-pair case in
  the O42W2BIE25HZ 배터리 기사)
- No paragraph is ever split across bulks
  (this triggered "불완전 조각" drops — e.g. JABBR33ZE 홍보 기사)

A paragraph that exceeds `max_chars` is the only exception: it is split
into sentences via KSS (pecab backend is not thread-safe, so we serialize
with a module-level lock and fall back to regex on pecab asserts), and
those sentences are packed within that one paragraph's scope only. Bulk
boundaries still never cross into adjacent paragraphs.
"""

import logging
import re
import threading

import kss

logger = logging.getLogger(__name__)

# pecab (the default KSS backend) is not thread-safe: under concurrent calls
# its internal state gets mixed up, and cross-contamination has been observed
# where another thread's input is returned. Until mecab is installed on the
# system, calls are serialized with this global mutex.
_KSS_LOCK = threading.Lock()

# Korean + Latin sentence-terminal punctuation followed by whitespace.
# The lookbehind keeps the punctuation attached to the preceding sentence;
# only the whitespace run after it is consumed as the separator.
# NOTE(review): the character class lists `!` and `?` twice. Possibly the
# second pair was meant to be the full-width forms — confirm against the
# original file encoding.
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?…。!?])\s+")


def _safe_split_sentences(para: str) -> list[str]:
    """Split `para` into stripped, non-empty sentences.

    KSS is tried first, with the call serialized through `_KSS_LOCK`. If KSS
    (or the cleanup of its output) raises one of the caught exception types,
    a warning is logged and `para` is split with `_SENTENCE_SPLIT_RE` instead.

    Whichever path runs, an empty result is replaced by `[para]`, so the
    paragraph's text is never silently dropped.

    Args:
        para: The paragraph text to split.

    Returns:
        A non-empty list of sentences, or `[para]` when no sentence could be
        extracted.
    """
    try:
        with _KSS_LOCK:
            # list() is evaluated while the lock is still held, so the whole
            # KSS call completes before another thread can enter.
            raw = list(kss.split_sentences(para))
        sentences = [s.strip() for s in raw if s and s.strip()]
    except (AssertionError, IndexError, AttributeError, ValueError, TypeError) as e:
        logger.warning(
            "kss.split_sentences failed (%s); using regex fallback", type(e).__name__
        )
        sentences = [p.strip() for p in _SENTENCE_SPLIT_RE.split(para) if p.strip()]
    # Same guard for both paths: previously only the regex fallback had it,
    # so an empty KSS result made the caller lose the paragraph entirely.
    return sentences if sentences else [para]


def _pack_sentences(sentences: list[str], max_chars: int) -> list[str]:
    """Pack sentences joined by ' ' up to `max_chars`, never splitting a sentence."""
    bulks: list[str] = []
    current: list[str] = []
    current_len = 0
    for sent in sentences:
        if not sent:
            continue
        add = (1 + len(sent)) if current else len(sent)
        if current and current_len + add > max_chars:
            bulks.append(" ".join(current))
            current, current_len = [sent], len(sent)
        else:
            current.append(sent)
            current_len += add
    if current:
        bulks.append(" ".join(current))
    return bulks


def split_into_bulks(text: str, max_chars: int = 500) -> list[str]:
    """Return one bulk per paragraph. Long paragraphs are sentence-packed in-scope.

    - Paragraph delimiter is `\\n` (so `\\n\\n`-separated input works too —
      empty strings between are filtered).
    - A paragraph ≤ `max_chars` is emitted as a single bulk.
    - A paragraph > `max_chars` is split into sentences via KSS (regex
      fallback on pecab asserts) and those sentences are packed within
      that paragraph only — bulks never span paragraph boundaries.

    Args:
        text: The input text. Falsy input (`None`, `""`) yields no bulks;
            any other value is coerced with `str()` before splitting.
        max_chars: Length above which a paragraph is sentence-packed.

    Returns:
        The bulks in document order. Empty when `text` is falsy or contains
        only whitespace.
    """
    if not text:
        return []

    # Coerce before using any str method. The previous guard called
    # `text.strip()` first, so a truthy non-str input raised AttributeError
    # and never reached the `str(text)` coercion.
    # Whitespace-only input produces no paragraphs, so it needs no extra check.
    paragraphs = [p.strip() for p in str(text).split("\n") if p.strip()]

    bulks: list[str] = []
    for para in paragraphs:
        if len(para) <= max_chars:
            bulks.append(para)
        else:
            # Packing is scoped to this paragraph's own sentences only.
            bulks.extend(_pack_sentences(_safe_split_sentences(para), max_chars))
    return bulks