Spaces:
Sleeping
Sleeping
| """Text splitting utilities for bulk processing. | |
| Strategy: **1 paragraph = 1 bulk**, with no cross-paragraph packing. | |
| Each paragraph is sent to the LLM as its own bulk so that: | |
| - LLM never sees two similar adjacent paragraphs in one bulk | |
| (this triggered "같은 화제 반복" drops — e.g. inserted-pair case in | |
| the O42W2BIE25HZ 배터리 기사) | |
| - No paragraph is ever split across bulks | |
| (this triggered "불완전 조각" drops — e.g. JABBR33ZE 홍보 기사) | |
| A paragraph that exceeds `max_chars` is the only exception: it is split | |
| into sentences via KSS (pecab backend is not thread-safe, so we serialize | |
| with a module-level lock and fall back to regex on pecab asserts), and | |
| those sentences are packed within that one paragraph's scope only. Bulk | |
| boundaries still never cross into adjacent paragraphs. | |
| """ | |
| import logging | |
| import re | |
| import threading | |
| import kss | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# pecab (the default KSS backend) is not thread-safe: under concurrent calls
# its internal state gets mixed up, and cross-contamination has been observed
# where a call returns the input that belonged to another thread. Until mecab
# is installed on the system, KSS calls are serialized with this global mutex.
_KSS_LOCK = threading.Lock()

# Korean + Latin sentence-terminal punctuation followed by whitespace.
# The lookbehind keeps the punctuation attached to the preceding sentence;
# only the whitespace run after it is consumed by the split.
# NOTE(review): `!` and `?` each appear twice in the character class, which is
# harmless but redundant — possibly fullwidth forms were intended; confirm.
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?…。!?])\s+")
def _safe_split_sentences(para: str) -> list[str]:
    """Split `para` into stripped, non-empty sentences.

    KSS is tried first while holding `_KSS_LOCK`. If it raises one of the
    exception types listed below, a warning naming the exception type is
    logged and `_SENTENCE_SPLIT_RE` is used instead. When the regex split
    yields nothing usable, `para` itself is returned as the only element.
    """
    try:
        # Serialize access to KSS; see the comment on `_KSS_LOCK`.
        with _KSS_LOCK:
            raw_sentences = list(kss.split_sentences(para))
        cleaned: list[str] = []
        for sentence in raw_sentences:
            if sentence and sentence.strip():
                cleaned.append(sentence.strip())
        return cleaned
    except (AssertionError, IndexError, AttributeError, ValueError, TypeError) as exc:
        logger.warning(
            "kss.split_sentences failed (%s); using regex fallback", type(exc).__name__
        )

    # Regex fallback: split on whitespace that follows terminal punctuation.
    stripped_chunks = (chunk.strip() for chunk in _SENTENCE_SPLIT_RE.split(para))
    fallback = [chunk for chunk in stripped_chunks if chunk]
    return fallback or [para]
| def _pack_sentences(sentences: list[str], max_chars: int) -> list[str]: | |
| """Pack sentences joined by ' ' up to `max_chars`, never splitting a sentence.""" | |
| bulks: list[str] = [] | |
| current: list[str] = [] | |
| current_len = 0 | |
| for sent in sentences: | |
| if not sent: | |
| continue | |
| add = (1 + len(sent)) if current else len(sent) | |
| if current and current_len + add > max_chars: | |
| bulks.append(" ".join(current)) | |
| current, current_len = [sent], len(sent) | |
| else: | |
| current.append(sent) | |
| current_len += add | |
| if current: | |
| bulks.append(" ".join(current)) | |
| return bulks | |
def split_into_bulks(text: str, max_chars: int = 500) -> list[str]:
    """Return one bulk per paragraph. Long paragraphs are sentence-packed in-scope.

    - Paragraph delimiter is `\\n` (so `\\n\\n`-separated input works too:
      the empty strings between delimiters are filtered out).
    - Each paragraph is stripped of surrounding whitespace.
    - A paragraph of at most `max_chars` characters is emitted as a single bulk.
    - A longer paragraph is handed to `_safe_split_sentences` and the result
      is packed by `_pack_sentences` within that paragraph only, so bulks
      never span paragraph boundaries.

    Args:
        text: Input text. Falsy values (e.g. ``None`` or ``""``) yield ``[]``;
            any other non-string value is coerced with ``str()``.
        max_chars: Length above which a paragraph is sentence-split.

    Returns:
        Bulks in input order; ``[]`` when there is no non-blank paragraph.
    """
    if not text:
        return []

    # Coerce before calling any str method. Guarding with `text.strip()` on the
    # raw argument would raise AttributeError for non-str input and defeat the
    # coercion. Whitespace-only input simply produces no paragraphs below.
    paragraphs = [p for p in map(str.strip, str(text).split("\n")) if p]

    bulks: list[str] = []
    for para in paragraphs:
        if len(para) <= max_chars:
            bulks.append(para)
        else:
            bulks.extend(_pack_sentences(_safe_split_sentences(para), max_chars))
    return bulks