Spaces:

dev-strender
/

proofread-20261h-demo

Sleeping

App Files Files Community

proofread-20261h-demo / text_splitter.py

dev-strender

fix: 1-paragraph-per-bulk chunking + KSS sentence-split for overflow

a0be335 29 days ago

raw

history blame contribute delete

3.53 kB

	"""Text splitting utilities for bulk processing.

	Strategy: 1 paragraph = 1 bulk, with no cross-paragraph packing.

	Each paragraph is sent to the LLM as its own bulk so that:
	- LLM never sees two similar adjacent paragraphs in one bulk
	(this triggered "같은 화제 반복" drops — e.g. inserted-pair case in
	the O42W2BIE25HZ 배터리 기사)
	- No paragraph is ever split across bulks
	(this triggered "불완전 조각" drops — e.g. JABBR33ZE 홍보 기사)

	A paragraph that exceeds `max_chars` is the only exception: it is split
	into sentences via KSS (pecab backend is not thread-safe, so we serialize
	with a module-level lock and fall back to regex on pecab asserts), and
	those sentences are packed within that one paragraph's scope only. Bulk
	boundaries still never cross into adjacent paragraphs.
	"""

	import logging
	import re
	import threading

	import kss

	# Module-level logger; used to report KSS failures before the regex fallback.
	logger = logging.getLogger(__name__)

	# pecab (the default KSS backend) is not thread-safe: under concurrent calls
	# its internal state was observed to get mixed up, so that one thread's input
	# was returned to another thread (cross-contamination). Until mecab is
	# installed on the system, every KSS call is serialized behind this global
	# mutex.
	_KSS_LOCK = threading.Lock()

	# Korean + Latin sentence-terminal punctuation followed by whitespace.
	# The lookbehind keeps the terminal mark attached to the preceding sentence;
	# only the whitespace run after it is consumed by the split.
	# NOTE(review): `!?` appears twice in the character class, which is redundant
	# as written — possibly full-width variants were intended; confirm.
	_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?…。!?])\s+")


	def _safe_split_sentences(para: str) -> list[str]:
	    """Split one paragraph into stripped, non-empty sentences.

	    KSS is tried first, with the call serialized through `_KSS_LOCK` because
	    the pecab backend is not thread-safe. If KSS (or the clean-up of its
	    output) raises one of the exception types listed below, a warning is
	    logged and the paragraph is split with `_SENTENCE_SPLIT_RE` instead.

	    Args:
	        para: The paragraph text to split.

	    Returns:
	        The sentences with surrounding whitespace removed and empty entries
	        dropped. When the regex fallback yields nothing, `[para]` is returned
	        unchanged.
	    """
	    try:
	        # Hold the lock only for the KSS call itself.
	        with _KSS_LOCK:
	            raw_sentences = list(kss.split_sentences(para))
	        # Clean-up stays inside the try so that a malformed KSS result also
	        # routes to the regex fallback.
	        cleaned: list[str] = []
	        for candidate in raw_sentences:
	            if not candidate:
	                continue
	            trimmed = candidate.strip()
	            if trimmed:
	                cleaned.append(trimmed)
	        return cleaned
	    except (AssertionError, IndexError, AttributeError, ValueError, TypeError) as e:
	        logger.warning(
	            "kss.split_sentences failed (%s); using regex fallback", type(e).__name__
	        )

	    # Regex fallback: split after sentence-terminal punctuation.
	    fallback: list[str] = []
	    for fragment in _SENTENCE_SPLIT_RE.split(para):
	        fragment = fragment.strip()
	        if fragment:
	            fallback.append(fragment)
	    return fallback or [para]


	def _pack_sentences(sentences: list[str], max_chars: int) -> list[str]:
	"""Pack sentences joined by ' ' up to `max_chars`, never splitting a sentence."""
	bulks: list[str] = []
	current: list[str] = []
	current_len = 0
	for sent in sentences:
	if not sent:
	continue
	add = (1 + len(sent)) if current else len(sent)
	if current and current_len + add > max_chars:
	bulks.append(" ".join(current))
	current, current_len = [sent], len(sent)
	else:
	current.append(sent)
	current_len += add
	if current:
	bulks.append(" ".join(current))
	return bulks


	def split_into_bulks(text: str, max_chars: int = 500) -> list[str]:
	    """Return one bulk per paragraph; only oversized paragraphs are subdivided.

	    Paragraphs are the non-blank lines of `text`: the input is split on the
	    newline character, each piece is stripped, and empty pieces are discarded
	    (so input separated by blank lines works too).

	    - A paragraph of at most `max_chars` characters becomes exactly one bulk.
	    - A longer paragraph is split into sentences via `_safe_split_sentences`
	      and those sentences are packed by `_pack_sentences` within that
	      paragraph only, so bulks never span paragraph boundaries. Sentences are
	      never cut, so a single sentence longer than `max_chars` still yields an
	      oversized bulk.

	    Args:
	        text: The text to split. Falsy values (`None`, `""`) yield `[]`;
	            any other non-string value is coerced with `str()`.
	        max_chars: Length above which a paragraph is sentence-packed.

	    Returns:
	        The bulks in document order; `[]` when `text` has no non-blank line.
	    """
	    if not text:
	        return []

	    # Coerce before using any string method so non-str input is handled
	    # consistently. Blank-only input simply produces no paragraphs, so no
	    # separate whitespace check is needed.
	    stripped_lines = (line.strip() for line in str(text).split("\n"))
	    paragraphs = [line for line in stripped_lines if line]

	    bulks: list[str] = []
	    for para in paragraphs:
	        if len(para) <= max_chars:
	            bulks.append(para)
	            continue
	        # If the splitter returns nothing, keep the paragraph whole rather
	        # than silently dropping its content.
	        sentences = _safe_split_sentences(para) or [para]
	        bulks.extend(_pack_sentences(sentences, max_chars))

	    return bulks