Spaces:

BART-ender
/

cot-anc

Sleeping

App Files Files Community

cot-anc / app /analysis /sentence_split.py

BART-ender

Deploy Thought Anchors

fda8fb3 verified 5 days ago

raw

history blame contribute delete

2.3 kB

	from __future__ import annotations

	import re
	from dataclasses import dataclass

	# Matches an opening or closing think tag (``<think>`` / ``</think>``) in any
	# letter case; compiled once at import time so callers can reuse it.
	THINK_TAG_RE = re.compile(r"</?think>", re.IGNORECASE)


	@dataclass(slots=True)
	class SentenceSpan:
	text: str
	start_char: int
	end_char: int


	def normalize_trace_text(raw_trace_text: str) -> str:
	    """Return *raw_trace_text* with every think tag removed.

	    Each match of ``THINK_TAG_RE`` (``<think>`` or ``</think>``, in any
	    letter case) is replaced by the empty string. Nothing else is touched:
	    whitespace around a removed tag stays in place, and character offsets in
	    the result are shifted relative to the raw text by the removed tags.
	    """
	    return THINK_TAG_RE.sub("", raw_trace_text)


	def _non_whitespace_token_count(text: str) -> int:
	return len(re.findall(r"\S+", text))


	def split_sentences(
	    text: str,
	    *,
	    min_token_like_units: int = 2,
	) -> list[SentenceSpan]:
	    """Split *text* into contiguous, sentence-like spans.

	    A boundary is placed after every ``.``, ``!`` or ``?`` (together with any
	    closing quotes/brackets and whitespace that immediately follow it) and
	    after every blank-line separator (``"\\n\\n"``). There is no special
	    handling for decimals or abbreviations: every terminator character is a
	    candidate boundary.

	    Fragments with fewer than ``min_token_like_units`` whitespace-delimited
	    units are folded into the span that precedes them. A short fragment at
	    the very start of the text has no predecessor and is kept as its own
	    span.

	    Args:
	        text: The string to split.
	        min_token_like_units: Minimum number of non-whitespace runs a
	            fragment needs in order to start a span of its own.

	    Returns:
	        Spans in source order. They are adjacent and cover the whole input,
	        so concatenating their ``text`` fields reproduces *text*. An empty
	        input yields an empty list.
	    """
	    if not text:
	        return []

	    closers = "\"')]}"
	    text_length = len(text)

	    # Pass 1: cut the text at every terminator and blank-line separator.
	    raw_spans: list[tuple[int, int]] = []
	    start = 0
	    index = 0
	    while index < text_length:
	        if text[index : index + 2] == "\n\n":
	            # The blank-line separator becomes (the end of) its own fragment.
	            end = index + 2
	        elif text[index] in ".!?":
	            end = index + 1
	            # Keep closing quotes/brackets with the sentence they close.
	            while end < text_length and text[end] in closers:
	                end += 1
	            # Absorb trailing whitespace, but stop before a blank-line
	            # separator so the branch above still sees it.
	            while (
	                end < text_length
	                and text[end].isspace()
	                and text[end : end + 2] != "\n\n"
	            ):
	                end += 1
	        else:
	            index += 1
	            continue
	        raw_spans.append((start, end))
	        start = index = end

	    # Whatever follows the last boundary is the final fragment.
	    if start < text_length:
	        raw_spans.append((start, text_length))

	    # Pass 2: fold short fragments into their predecessor. Every raw span is
	    # non-empty by construction (end > start), so no emptiness checks are
	    # needed. Extending a span never lowers its unit count, so every span
	    # after the first already meets the minimum and no trailing fix-up pass
	    # is required.
	    merged: list[tuple[int, int]] = []
	    for span_start, span_end in raw_spans:
	        fragment = text[span_start:span_end]
	        if merged and _non_whitespace_token_count(fragment) < min_token_like_units:
	            merged[-1] = (merged[-1][0], span_end)
	        else:
	            merged.append((span_start, span_end))

	    return [
	        SentenceSpan(
	            text=text[span_start:span_end],
	            start_char=span_start,
	            end_char=span_end,
	        )
	        for span_start, span_end in merged
	    ]