cot-anc / app /analysis /sentence_split.py
BART-ender's picture
Deploy Thought Anchors
fda8fb3 verified
from __future__ import annotations
import re
from dataclasses import dataclass
# Matches a literal "<think>" or "</think>" tag, ignoring letter case.
THINK_TAG_RE = re.compile(r"</?think>", re.IGNORECASE)
@dataclass(slots=True)
class SentenceSpan:
    """A slice of a source string together with its character offsets."""

    # The characters covered by the span, i.e. source[start_char:end_char].
    text: str
    # Index of the span's first character in the source string.
    start_char: int
    # Index one past the span's last character (exclusive slice bound).
    end_char: int
def normalize_trace_text(raw_trace_text: str) -> str:
    """Return *raw_trace_text* with every match of ``THINK_TAG_RE`` deleted.

    Only the matched tags are removed; every other character, including any
    whitespace adjacent to a tag, is kept unchanged.
    """
    without_tags = re.sub(THINK_TAG_RE, "", raw_trace_text)
    return without_tags
def _non_whitespace_token_count(text: str) -> int:
return len(re.findall(r"\S+", text))
_SENTENCE_END_CHARS = ".!?"
_CLOSING_CHARS = "\"')]}"
_PARAGRAPH_BREAK = "\n\n"


def _scan_raw_spans(text: str) -> list[tuple[int, int]]:
    """Cut *text* into contiguous ``(start, end)`` candidate spans.

    A span ends either right after a paragraph break (two consecutive
    newlines) or after a run of sentence-ending punctuation, any closing
    quotes/brackets that follow it, and the trailing whitespace up to (but
    not including) the next paragraph break. The returned spans tile the
    whole of *text* without gaps or overlaps.
    """
    spans: list[tuple[int, int]] = []
    length = len(text)
    start = 0
    index = 0
    while index < length:
        if text.startswith(_PARAGRAPH_BREAK, index):
            end = index + len(_PARAGRAPH_BREAK)
        elif text[index] in _SENTENCE_END_CHARS:
            end = index + 1
            # Keep "..." / "?!" together instead of cutting after each mark.
            while end < length and text[end] in _SENTENCE_END_CHARS:
                end += 1
            while end < length and text[end] in _CLOSING_CHARS:
                end += 1
            if end < length and not text[end].isspace():
                # Punctuation glued to the following character ("3.14",
                # "file.py") is not a sentence boundary.
                index = end
                continue
            # Attach trailing whitespace to this span, but leave a paragraph
            # break to be handled as its own boundary.
            while (
                end < length
                and text[end].isspace()
                and not text.startswith(_PARAGRAPH_BREAK, end)
            ):
                end += 1
        else:
            index += 1
            continue
        spans.append((start, end))
        start = index = end
    if start < length:
        # Whatever follows the last boundary forms the final span.
        spans.append((start, length))
    return spans


def _merge_short_spans(
    text: str,
    raw_spans: list[tuple[int, int]],
    min_token_like_units: int,
) -> list[tuple[int, int]]:
    """Fold spans with too few whitespace-separated tokens into a neighbour.

    A short span is appended to the span before it. Short spans at the very
    beginning have no predecessor, so they are carried forward and prepended
    to the first span that reaches the threshold together with them.
    """
    merged: list[tuple[int, int]] = []
    carry_start: int | None = None
    for span_start, span_end in raw_spans:
        if span_start == span_end:
            continue
        if carry_start is not None:
            # Leading short material is evaluated together with this span.
            span_start = carry_start
        fragment = text[span_start:span_end]
        if _non_whitespace_token_count(fragment) >= min_token_like_units:
            merged.append((span_start, span_end))
            carry_start = None
        elif merged:
            previous_start, _ = merged[-1]
            merged[-1] = (previous_start, span_end)
        else:
            carry_start = span_start
    if carry_start is not None:
        # The whole text stayed below the threshold: return it as one span
        # rather than dropping it.
        merged.append((carry_start, raw_spans[-1][1]))
    return merged


def split_sentences(
    text: str,
    *,
    min_token_like_units: int = 2,
) -> list[SentenceSpan]:
    """Split *text* into sentence-like spans that together cover all of it.

    Boundaries are paragraph breaks (two consecutive newlines) and runs of
    ``.``, ``!`` or ``?`` that are followed, after optional closing quotes or
    brackets, by whitespace or the end of the text. Whitespace after a
    boundary stays with the span it terminates.

    Args:
        text: The string to split.
        min_token_like_units: Minimum number of whitespace-separated tokens
            a span should contain. Shorter spans are merged into the
            preceding span, or into the following one when they occur at
            the start of the text. Values of 0 or less disable merging.

    Returns:
        Spans in document order. Each span's ``text`` equals
        ``text[start_char:end_char]`` and consecutive spans are adjacent, so
        joining the span texts reproduces *text*. An empty *text* yields an
        empty list; a non-empty *text* always yields at least one span, even
        if it is below the token threshold.
    """
    if not text:
        return []
    raw_spans = _scan_raw_spans(text)
    merged = _merge_short_spans(text, raw_spans, min_token_like_units)
    return [
        SentenceSpan(
            text=text[span_start:span_end],
            start_char=span_start,
            end_char=span_end,
        )
        for span_start, span_end in merged
    ]