File size: 2,297 Bytes
fda8fb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from __future__ import annotations

import re
from dataclasses import dataclass

# Matches an opening or closing think tag (`<think>` or `</think>`) in any letter case.
THINK_TAG_RE = re.compile(r"</?think>", re.IGNORECASE)


@dataclass(slots=True)
class SentenceSpan:
    """A slice of a source string together with the character offsets it came from."""

    # The slice itself; split_sentences fills it with source[start_char:end_char].
    text: str
    # Offset of the first character of the slice in the source string (inclusive).
    start_char: int
    # Offset one past the last character of the slice (exclusive, slice-style).
    end_char: int


def normalize_trace_text(raw_trace_text: str) -> str:
    """Return ``raw_trace_text`` with every think tag removed.

    Only the tag markers matched by ``THINK_TAG_RE`` are deleted; the text
    between and around them is left exactly as it was.
    """
    without_tags = THINK_TAG_RE.sub("", raw_trace_text)
    return without_tags


def _non_whitespace_token_count(text: str) -> int:
    return len(re.findall(r"\S+", text))


def split_sentences(
    text: str,
    *,
    min_token_like_units: int = 2,
) -> list[SentenceSpan]:
    """Split ``text`` into contiguous sentence-like spans with character offsets.

    A span ends at either of two boundaries:

    * a blank line (two consecutive newlines), which is included in the span;
    * a ``.``, ``!`` or ``?``, together with any closing quotes/brackets that
      follow it and the whitespace after those, stopping short of a blank
      line so the paragraph break is handled as its own boundary.

    Fragments with fewer than ``min_token_like_units`` whitespace-delimited
    units are folded into the span before them. The very first fragment has
    no predecessor, so it is kept as its own span even when it is short.

    The returned spans are in order, do not overlap, and together cover the
    whole of ``text``.

    Args:
        text: The string to split.
        min_token_like_units: Minimum number of whitespace-delimited units a
            fragment needs in order to start a new span.

    Returns:
        One ``SentenceSpan`` per resulting span; an empty list for empty input.
    """
    if not text:
        return []

    text_length = len(text)
    raw_spans: list[tuple[int, int]] = []
    start = 0
    index = 0

    while index < text_length:
        if text.startswith("\n\n", index):
            end = index + 2
        elif text[index] in ".!?":
            end = index + 1
            # Closing quotes and brackets belong to the sentence they terminate.
            while end < text_length and text[end] in "\"')]}":
                end += 1
            # Trailing whitespace stays with the sentence, but a blank line is
            # left in place so the next iteration treats it as a boundary.
            while (
                end < text_length
                and text[end].isspace()
                and not text.startswith("\n\n", end)
            ):
                end += 1
        else:
            index += 1
            continue

        raw_spans.append((start, end))
        start = index = end

    # Whatever follows the last boundary forms the final fragment.
    if start < text_length:
        raw_spans.append((start, text_length))

    # Every raw span is non-empty, and every span after the first is only
    # started once it meets the minimum, so one forward pass is sufficient.
    merged: list[tuple[int, int]] = []
    for span_start, span_end in raw_spans:
        if (
            merged
            and _non_whitespace_token_count(text[span_start:span_end]) < min_token_like_units
        ):
            merged[-1] = (merged[-1][0], span_end)
        else:
            merged.append((span_start, span_end))

    return [
        SentenceSpan(
            text=text[span_start:span_end],
            start_char=span_start,
            end_char=span_end,
        )
        for span_start, span_end in merged
    ]