"""Text splitting utilities for bulk processing.

Strategy: **1 paragraph = 1 bulk**, with no cross-paragraph packing.
Each paragraph is sent to the LLM as its own bulk so that:

- The LLM never sees two similar adjacent paragraphs in one bulk (this
  triggered "같은 화제 반복" ("same topic repeated") drops — e.g. the
  inserted-pair case in the O42W2BIE25HZ battery article).
- No paragraph is ever split across bulks (this triggered "불완전 조각"
  ("incomplete fragment") drops — e.g. the JABBR33ZE promotional article).

A paragraph that exceeds `max_chars` is the only exception: it is split
into sentences via KSS (the pecab backend is not thread-safe, so we
serialize with a module-level lock and fall back to regex on pecab
asserts), and those sentences are packed within that one paragraph's
scope only. Bulk boundaries still never cross into adjacent paragraphs.
"""

import logging
import re
import threading

import kss

logger = logging.getLogger(__name__)

# pecab (the default KSS backend) is not thread-safe: under concurrent
# calls its internal state gets mixed up, and cross-contamination was
# observed where one thread receives the result for another thread's
# input. Until mecab is installed on the system, all KSS calls are
# serialized through this global mutex.
_KSS_LOCK = threading.Lock()

# Korean + Latin sentence-terminal punctuation followed by whitespace.
# The class also covers the ideographic full stop and the fullwidth `！`/`？`
# so that CJK-punctuated text is split by the regex fallback as well.
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?…。！？])\s+")


def _safe_split_sentences(para: str) -> list[str]:
    """Split `para` into stripped, non-empty sentences without losing text.

    KSS is tried first, with the call serialized through `_KSS_LOCK`. The
    regex splitter `_SENTENCE_SPLIT_RE` is used instead when KSS raises one
    of the handled exception types, or when KSS yields no usable sentence
    for a non-blank paragraph (otherwise the paragraph would silently
    vanish from the output).

    Args:
        para: The paragraph text to split.

    Returns:
        The sentences in order, each stripped of surrounding whitespace.
        On the regex path, `[para]` is returned when the regex produces no
        parts, so the paragraph text is never dropped there.
    """
    try:
        with _KSS_LOCK:
            raw = list(kss.split_sentences(para))
        # Kept inside the `try` so that unexpected item types coming back
        # from KSS (e.g. no `.strip`) also route to the regex fallback.
        sentences = [s.strip() for s in raw if s and s.strip()]
    except (AssertionError, IndexError, AttributeError, ValueError, TypeError) as e:
        logger.warning(
            "kss.split_sentences failed (%s); using regex fallback", type(e).__name__
        )
    else:
        if sentences or not para.strip():
            return sentences
        # KSS succeeded but returned nothing for real content: do not drop
        # the paragraph, let the regex splitter handle it instead.
        logger.warning(
            "kss.split_sentences returned no sentences; using regex fallback"
        )

    parts = [p.strip() for p in _SENTENCE_SPLIT_RE.split(para) if p.strip()]
    return parts if parts else [para]


def _pack_sentences(sentences: list[str], max_chars: int) -> list[str]:
    """Pack sentences joined by ' ' up to `max_chars`, never splitting a sentence.

    Args:
        sentences: Sentences in reading order; empty entries are skipped.
        max_chars: Upper bound on the length of a packed bulk, counting the
            single-space separators between sentences.

    Returns:
        The packed bulks in order. A single sentence longer than
        `max_chars` is emitted as its own bulk, because sentences are
        never split.
    """
    bulks: list[str] = []
    current: list[str] = []
    current_len = 0
    for sent in sentences:
        if not sent:
            continue
        # A separator space is only needed when the bulk already has content.
        add = (1 + len(sent)) if current else len(sent)
        if current and current_len + add > max_chars:
            bulks.append(" ".join(current))
            current, current_len = [sent], len(sent)
        else:
            current.append(sent)
            current_len += add
    if current:
        bulks.append(" ".join(current))
    return bulks


def split_into_bulks(text: str, max_chars: int = 500) -> list[str]:
    """Return one bulk per paragraph. Long paragraphs are sentence-packed in-scope.

    - Paragraph delimiter is `\\n` (so `\\n\\n`-separated input works too —
      empty strings between are filtered).
    - A paragraph ≤ `max_chars` is emitted as a single bulk.
    - A paragraph > `max_chars` is split into sentences via KSS (with a
      regex fallback) and those sentences are packed within that paragraph
      only — bulks never span paragraph boundaries.

    Args:
        text: The full input text. Falsy or whitespace-only input yields
            an empty list.
        max_chars: Maximum paragraph length that is emitted unsplit, and
            the packing limit for sentences of longer paragraphs.

    Returns:
        The bulks in document order, each stripped of surrounding
        whitespace.
    """
    if not text or not text.strip():
        return []

    paragraphs = [p.strip() for p in str(text).split("\n") if p.strip()]

    bulks: list[str] = []
    for para in paragraphs:
        if len(para) <= max_chars:
            bulks.append(para)
            continue
        sentences = _safe_split_sentences(para)
        bulks.extend(_pack_sentences(sentences, max_chars))
    return bulks