Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass | |
# Matches an opening ``<think>`` or closing ``</think>`` tag in any letter case.
THINK_TAG_RE = re.compile(r"</?think>", flags=re.IGNORECASE)
@dataclass
class SentenceSpan:
    """A sentence-like slice of a larger text together with its offsets.

    The ``@dataclass`` decorator supplies the keyword-accepting ``__init__``
    that ``split_sentences`` relies on when it builds instances.
    """

    text: str  # the slice of the source text covered by this span
    start_char: int  # offset of the first character of the slice (inclusive)
    end_char: int  # offset just past the last character of the slice (exclusive)
def normalize_trace_text(raw_trace_text: str) -> str:
    """Return *raw_trace_text* with every match of ``THINK_TAG_RE`` removed.

    Only the matched tags themselves are deleted; all other text, including
    whatever sat between the tags, is left exactly as it was.
    """
    without_tags = THINK_TAG_RE.sub("", raw_trace_text)
    return without_tags
| def _non_whitespace_token_count(text: str) -> int: | |
| return len(re.findall(r"\S+", text)) | |
def split_sentences(
    text: str,
    *,
    min_token_like_units: int = 2,
) -> list[SentenceSpan]:
    """Split *text* into contiguous, sentence-like spans.

    The text is scanned once, left to right. A span ends:

    * right after a blank line (two consecutive ``"\\n"`` characters), or
    * after ``.``, ``!`` or ``?``, plus any closing quotes/brackets that
      immediately follow and any trailing whitespace up to (but not
      including) a blank line.

    Whatever remains after the last boundary becomes a final span. Spans are
    never trimmed, so consecutive spans tile *text* exactly: each one starts
    where the previous one ended and the last one ends at ``len(text)``.

    A span with fewer than *min_token_like_units* whitespace-separated tokens
    is then folded into the span before it.

    NOTE(review): the very first span has no predecessor, so it is kept even
    when it is short -- including a whitespace-only span when *text* starts
    with a blank line. Confirm with callers whether that is intended.

    Args:
        text: The text to split.
        min_token_like_units: Minimum number of non-whitespace runs a span
            needs in order to stand on its own.

    Returns:
        One ``SentenceSpan`` per resulting span, in document order, carrying
        the slice and its ``[start_char, end_char)`` offsets. An empty list
        for empty *text*.
    """
    if not text:
        return []

    text_length = len(text)
    raw_spans: list[tuple[int, int]] = []
    start = 0
    index = 0
    while index < text_length:
        if text.startswith("\n\n", index):
            # A blank line always closes the current span; both newlines
            # stay attached to the span they terminate.
            end = index + 2
        elif text[index] in ".!?":
            end = index + 1
            # Keep closing quotes/brackets with the sentence they close.
            while end < text_length and text[end] in "\"')]}":
                end += 1
            # Absorb trailing whitespace, but stop before a blank line so the
            # paragraph break is handled by the branch above.
            while (
                end < text_length
                and text[end].isspace()
                and not text.startswith("\n\n", end)
            ):
                end += 1
        else:
            index += 1
            continue
        raw_spans.append((start, end))
        start = index = end

    if start < text_length:
        raw_spans.append((start, text_length))

    # Fold short fragments into their predecessor. Every raw span is non-empty
    # and extending a span can only keep or raise its token count, so once the
    # loop finishes no span other than possibly the first can still be short;
    # no second pass over the tail is needed.
    merged: list[tuple[int, int]] = []
    for span_start, span_end in raw_spans:
        if (
            merged
            and _non_whitespace_token_count(text[span_start:span_end])
            < min_token_like_units
        ):
            merged[-1] = (merged[-1][0], span_end)
        else:
            merged.append((span_start, span_end))

    return [
        SentenceSpan(
            text=text[span_start:span_end],
            start_char=span_start,
            end_char=span_end,
        )
        for span_start, span_end in merged
    ]