from __future__ import annotations

from typing import List, Optional, Tuple

from .constants import (
    SENTENCE_SPLIT_REGEX,
    Z1_ANCHOR_CHAR,
    Z1_ANCHOR_MAX_CHARS,
    Z4_ANCHOR_REGEX,
)


def find_z1_anchor(text: str, max_chars: int = Z1_ANCHOR_MAX_CHARS) -> Optional[int]:
    """Locate the Z1 anchor near the start of *text*.

    Searches the first ``max_chars`` characters of ``text`` for the last
    occurrence of ``Z1_ANCHOR_CHAR``.

    Args:
        text: String to search.
        max_chars: Size of the leading window to search. Values <= 0 mean
            an empty window, so no anchor can be found.

    Returns:
        The offset one past the start of the last occurrence found, or
        ``None`` when ``text`` is empty, the window is empty, or the anchor
        does not occur inside the window.
    """
    # A negative ``end`` passed to str.rfind is interpreted slice-style
    # (relative to the end of the string), which would silently search
    # "everything but the last N characters". Treat non-positive windows
    # as empty instead.
    if not text or max_chars <= 0:
        return None

    limit = min(len(text), max_chars)
    idx = text.rfind(Z1_ANCHOR_CHAR, 0, limit)
    if idx == -1:
        return None
    # One past the match start; this is the end of the anchor provided
    # Z1_ANCHOR_CHAR is a single character (presumed from its name).
    return idx + 1


def find_z4_anchor(text: str) -> Optional[int]:
    """Locate the Z4 anchor in *text*.

    Returns:
        The end offset of the first ``Z4_ANCHOR_REGEX`` match in ``text``,
        or ``None`` when ``text`` is empty or nothing matches.
    """
    if not text:
        return None
    match = Z4_ANCHOR_REGEX.search(text)
    return match.end() if match else None


def sentence_boundaries(text: str) -> List[int]:
    """Return candidate boundary positions for *text*.

    The result is a sorted list of unique character offsets that always
    contains ``0`` and ``len(text)``. In between it holds the end offset of
    every ``SENTENCE_SPLIT_REGEX`` match, plus the Z1 and Z4 anchor
    positions when those anchors are found.

    For empty ``text`` the result is ``[0]``.
    """
    if not text:
        return [0]

    length = len(text)
    # Seeding the set with both endpoints guarantees they are present, and
    # the set itself removes duplicate offsets before sorting.
    candidates = {0, length}
    candidates.update(match.end() for match in SENTENCE_SPLIT_REGEX.finditer(text))

    for pos in (find_z1_anchor(text), find_z4_anchor(text)):
        # Defensive range check: only accept anchors that fall inside text.
        if pos is not None and 0 < pos <= length:
            candidates.add(pos)

    return sorted(candidates)


def build_sentence_slices(text: str) -> List[Tuple[int, int]]:
    """Return ``(start, end)`` offset pairs between consecutive boundaries.

    Pairs are built from adjacent entries of ``sentence_boundaries(text)``;
    zero-length pairs are skipped. Empty ``text`` yields an empty list.
    """
    bounds = sentence_boundaries(text)
    return [
        (start, end)
        for start, end in zip(bounds, bounds[1:])
        if end > start
    ]