from __future__ import annotations import re from dataclasses import dataclass THINK_TAG_RE = re.compile(r"", re.IGNORECASE) @dataclass(slots=True) class SentenceSpan: text: str start_char: int end_char: int def normalize_trace_text(raw_trace_text: str) -> str: return THINK_TAG_RE.sub("", raw_trace_text) def _non_whitespace_token_count(text: str) -> int: return len(re.findall(r"\S+", text)) def split_sentences( text: str, *, min_token_like_units: int = 2, ) -> list[SentenceSpan]: if not text: return [] raw_spans: list[tuple[int, int]] = [] start = 0 index = 0 text_length = len(text) while index < text_length: if text[index : index + 2] == "\n\n": end = index + 2 raw_spans.append((start, end)) start = end index = end continue if text[index] in ".!?": end = index + 1 while end < text_length and text[end] in "\"')]}": end += 1 while end < text_length and text[end].isspace() and text[end : end + 2] != "\n\n": end += 1 raw_spans.append((start, end)) start = end index = end continue index += 1 if start < text_length: raw_spans.append((start, text_length)) merged: list[tuple[int, int]] = [] for span_start, span_end in raw_spans: fragment = text[span_start:span_end] if not fragment: continue if merged and _non_whitespace_token_count(fragment) < min_token_like_units: previous_start, _ = merged[-1] merged[-1] = (previous_start, span_end) continue merged.append((span_start, span_end)) if len(merged) > 1: last_start, last_end = merged[-1] if _non_whitespace_token_count(text[last_start:last_end]) < min_token_like_units: prev_start, _ = merged[-2] merged[-2] = (prev_start, last_end) merged.pop() return [ SentenceSpan( text=text[span_start:span_end], start_char=span_start, end_char=span_end, ) for span_start, span_end in merged if text[span_start:span_end] ]