hf-upload
Upload inference bundle
0748838
from __future__ import annotations
from typing import List, Optional, Tuple
from .constants import (
SENTENCE_SPLIT_REGEX,
Z1_ANCHOR_CHAR,
Z1_ANCHOR_MAX_CHARS,
Z4_ANCHOR_REGEX,
)
def find_z1_anchor(text: str, max_chars: int = Z1_ANCHOR_MAX_CHARS) -> Optional[int]:
    """
    Locate the last ``Z1_ANCHOR_CHAR`` within the leading window of *text*.

    Args:
        text: String to scan.
        max_chars: Size of the leading window, in characters. The window is
            capped at ``len(text)``; a value of zero or less searches nothing.

    Returns:
        The index of the last anchor occurrence in the window plus one, or
        ``None`` when *text* is empty, the window is empty, or no anchor is
        found inside it.
    """
    # Guard non-positive windows explicitly: a negative ``end`` handed to
    # ``str.rfind`` is interpreted relative to the end of the string, which
    # would silently search ``text[0:len(text) + max_chars]`` instead of nothing.
    if not text or max_chars <= 0:
        return None

    window_end = min(len(text), max_chars)
    last_hit = text.rfind(Z1_ANCHOR_CHAR, 0, window_end)
    if last_hit == -1:
        return None
    # NOTE(review): the +1 assumes Z1_ANCHOR_CHAR is a single character — confirm
    # against its definition in constants.
    return last_hit + 1
def find_z4_anchor(text: str) -> Optional[int]:
    """
    Find where the first ``Z4_ANCHOR_REGEX`` match in *text* ends.

    Returns:
        The end offset of the first match, or ``None`` when *text* is empty
        or the pattern does not match anywhere in it.
    """
    # Falsy text short-circuits before the pattern is consulted at all.
    hit = Z4_ANCHOR_REGEX.search(text) if text else None
    return hit.end() if hit else None
def sentence_boundaries(text: str) -> List[int]:
    """
    Return a sorted, de-duplicated list of candidate boundary offsets in *text*.

    The candidates are:
      * ``0`` and ``len(text)`` (always present),
      * the end offset of every ``SENTENCE_SPLIT_REGEX`` match,
      * the positions reported by ``find_z1_anchor`` and ``find_z4_anchor``,
        when they are not ``None`` and fall within ``(0, len(text)]``.

    Empty *text* yields ``[0]``.
    """
    if not text:
        return [0]

    total = len(text)
    # Seeding the set with both endpoints guarantees they are in the result,
    # so no patching of the first/last element is needed after sorting.
    candidates = {0, total}
    candidates.update(m.end() for m in SENTENCE_SPLIT_REGEX.finditer(text))

    # Anchor positions are extra candidates; ignore missing or out-of-range ones.
    for anchor in (find_z1_anchor(text), find_z4_anchor(text)):
        if anchor is not None and 0 < anchor <= total:
            candidates.add(anchor)

    return sorted(candidates)
def build_sentence_slices(text: str) -> List[Tuple[int, int]]:
    """
    Turn the boundaries of *text* into consecutive ``(start, end)`` spans.

    Each pair of adjacent offsets returned by ``sentence_boundaries`` becomes
    one span; pairs whose end is not strictly greater than their start are
    dropped. A single boundary produces an empty list.
    """
    offsets = sentence_boundaries(text)
    return [
        (start, stop)
        for start, stop in zip(offsets, offsets[1:])
        if stop > start
    ]