WilliamCHN
/

Legal_Document_Segment_Model

Model card Files Files and versions

Legal_Document_Segment_Model / src /judgment_partition_infer /text_utils.py

hf-upload

Upload inference bundle

0748838 about 2 months ago

history blame contribute delete

1.77 kB

	from __future__ import annotations

	from typing import List, Optional, Tuple

	from .constants import (
	SENTENCE_SPLIT_REGEX,
	Z1_ANCHOR_CHAR,
	Z1_ANCHOR_MAX_CHARS,
	Z4_ANCHOR_REGEX,
	)


	def find_z1_anchor(text: str, max_chars: int = Z1_ANCHOR_MAX_CHARS) -> Optional[int]:
	    """Return the offset just past the last Z1 anchor in the head of ``text``.

	    The search covers ``text[0:max_chars]``, or the whole text when it is
	    shorter than ``max_chars``.

	    Args:
	        text: Text to scan.
	        max_chars: Size of the leading window to search. Values <= 0 give an
	            empty window, so no anchor is reported.

	    Returns:
	        The index of the character following the last ``Z1_ANCHOR_CHAR`` found
	        in the window, or ``None`` when ``text`` is empty or the window holds
	        no anchor.
	    """
	    if not text:
	        return None
	    # Clamp at 0: str.rfind reads a negative end as a slice index counted from
	    # the end of the string, which would widen the window instead of emptying it.
	    window_end = max(0, min(len(text), max_chars))
	    last_hit = text.rfind(Z1_ANCHOR_CHAR, 0, window_end)
	    if last_hit == -1:
	        return None
	    # Report the position right after the anchor, not the anchor itself.
	    return last_hit + 1


	def find_z4_anchor(text: str) -> Optional[int]:
	    """Return the end offset of the first ``Z4_ANCHOR_REGEX`` match in ``text``.

	    Returns ``None`` when ``text`` is empty or the pattern matches nowhere.
	    """
	    # Empty text short-circuits before the pattern is consulted.
	    hit = Z4_ANCHOR_REGEX.search(text) if text else None
	    return hit.end() if hit else None


	def sentence_boundaries(text: str) -> List[int]:
	    """Return sorted, de-duplicated candidate boundary offsets for ``text``.

	    Candidates are the end offset of every ``SENTENCE_SPLIT_REGEX`` match plus
	    the Z1 and Z4 anchor offsets when those are found.

	    Args:
	        text: Text to split.

	    Returns:
	        Ascending, unique character offsets that always include ``0`` and
	        ``len(text)``. Empty text yields ``[0]``.
	    """
	    if not text:
	        return [0]
	    total = len(text)
	    # Seed both end points up front so nothing needs patching in after sorting;
	    # the set also absorbs duplicate offsets from the regex and the anchors.
	    candidates = {0, total}
	    candidates.update(m.end() for m in SENTENCE_SPLIT_REGEX.finditer(text))
	    for anchor in (find_z1_anchor(text), find_z4_anchor(text)):
	        # Skip missing anchors and any offset outside (0, len(text)].
	        if anchor is not None and 0 < anchor <= total:
	            candidates.add(anchor)
	    return sorted(candidates)


	def build_sentence_slices(text: str) -> List[Tuple[int, int]]:
	    """Pair consecutive boundary offsets of ``text`` into ``(start, end)`` spans.

	    Boundaries come from ``sentence_boundaries``; only spans with ``end > start``
	    are kept.
	    """
	    offsets = sentence_boundaries(text)
	    return [
	        (start, stop)
	        for start, stop in zip(offsets, offsets[1:])
	        if stop > start
	    ]