File size: 1,771 Bytes
0748838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from __future__ import annotations

from typing import List, Optional, Tuple

from .constants import (
    SENTENCE_SPLIT_REGEX,
    Z1_ANCHOR_CHAR,
    Z1_ANCHOR_MAX_CHARS,
    Z4_ANCHOR_REGEX,
)


def find_z1_anchor(text: str, max_chars: int = Z1_ANCHOR_MAX_CHARS) -> Optional[int]:
    """Locate the Z1 anchor near the start of *text*.

    Searches the first ``max_chars`` characters of ``text`` for the last
    occurrence of ``Z1_ANCHOR_CHAR``.

    Args:
        text: The string to scan.
        max_chars: Size of the leading window to search. Values that are
            zero or negative yield an empty window (no anchor is found).

    Returns:
        The offset immediately after the anchor (``index + 1``), or ``None``
        when ``text`` is empty or the anchor does not occur in the window.
    """
    if not text:
        return None
    # Clamp to >= 0: str.rfind treats a negative end index as an offset from
    # the end of the string, which would silently search almost the whole
    # text instead of an empty window.
    limit = max(0, min(len(text), max_chars))
    idx = text.rfind(Z1_ANCHOR_CHAR, 0, limit)
    if idx == -1:
        return None
    # NOTE(review): "+ 1" assumes Z1_ANCHOR_CHAR is a single character --
    # confirm against its definition in constants.
    return idx + 1


def find_z4_anchor(text: str) -> Optional[int]:
    """Return the end offset of the first ``Z4_ANCHOR_REGEX`` match in *text*.

    Returns ``None`` when ``text`` is empty or the pattern does not match
    anywhere in it.
    """
    # Skip the search entirely for empty input.
    hit = Z4_ANCHOR_REGEX.search(text) if text else None
    return hit.end() if hit else None


def sentence_boundaries(text: str) -> List[int]:
    """
    Compute candidate boundary offsets for ``text``.

    The result is a sorted list of unique character offsets that always
    contains 0 and len(text). It is built from the end position of every
    SENTENCE_SPLIT_REGEX match, plus the Z1 and Z4 anchor positions when
    those are found and fall inside (0, len(text)].

    For empty ``text`` the result is ``[0]``.
    """
    if not text:
        return [0]

    total = len(text)
    # A set gives de-duplication for free; both endpoints are seeded up front.
    candidates = {0, total}
    candidates.update(hit.end() for hit in SENTENCE_SPLIT_REGEX.finditer(text))

    # Anchors are optional extra cut points; ignore missing or out-of-range ones.
    for anchor in (find_z1_anchor(text), find_z4_anchor(text)):
        if anchor is not None and 0 < anchor <= total:
            candidates.add(anchor)

    return sorted(candidates)


def build_sentence_slices(text: str) -> List[Tuple[int, int]]:
    """Turn the boundaries of *text* into ``(start, end)`` slice pairs.

    Each pair of consecutive offsets from ``sentence_boundaries(text)``
    becomes one slice; pairs that would be empty (``end <= start``) are
    dropped.
    """
    cuts = sentence_boundaries(text)
    # Pair each boundary with its successor.
    return [(lo, hi) for lo, hi in zip(cuts, cuts[1:]) if hi > lo]