neuapi / api /src /inference /text_chunker.py
grimshaw's picture
Upload folder using huggingface_hub
35bb6f4 verified
Raw
History Blame Contribute Delete
2.54 kB
from __future__ import annotations
import re
def split_into_sentences(text: str) -> list[str]:
    """Break *text* into sentence-like fragments.

    A boundary is any run of whitespace that directly follows one of
    ``.``, ``!``, ``?`` or ``;``. The punctuation stays attached to the
    fragment it ends. Each fragment is stripped of surrounding whitespace
    and empty fragments are dropped, so blank input yields ``[]``.
    """
    sentences: list[str] = []
    # The lookbehind keeps the terminator with its sentence; only the
    # whitespace that follows it is consumed by the split.
    for fragment in re.split(r'(?<=[.!?;])\s+', text.strip()):
        cleaned = fragment.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences
def chunk_text(text: str, max_chars: int = 500) -> list[str]:
    """Group the sentences of *text* into chunks for TTS inference.

    The text is first split with ``split_into_sentences``. Consecutive
    sentences are then joined with a single space for as long as the
    joined chunk stays within ``max_chars``. A sentence that is itself
    longer than ``max_chars`` is handed to ``_split_long_sentence`` and
    its pieces are emitted as separate chunks; they are never merged with
    neighbouring sentences.

    Returns the chunks in reading order; blank input gives ``[]``.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        # Nothing sentence-like was found: pass non-blank text through
        # unchanged, otherwise report no chunks at all.
        if text.strip():
            return [text]
        return []

    chunks: list[str] = []
    pending = ""  # sentences accumulated for the chunk being built

    for sentence in sentences:
        if len(sentence) > max_chars:
            # Emit whatever was accumulated first so ordering is kept,
            # then emit the pieces of the oversized sentence.
            if pending:
                chunks.append(pending)
                pending = ""
            chunks.extend(_split_long_sentence(sentence, max_chars))
            continue

        # The "+ 1" accounts for the joining space.
        if len(pending) + len(sentence) + 1 > max_chars:
            if pending:
                chunks.append(pending)
            pending = sentence
            continue

        if not pending:
            pending = sentence
        else:
            pending = f"{pending} {sentence}".strip()

    if pending:
        chunks.append(pending)
    return chunks
def _split_long_sentence(sentence: str, max_chars: int) -> list[str]:
"""Split a long sentence at commas or word boundaries."""
# Try splitting at commas first
parts = re.split(r',\s*', sentence)
if len(parts) > 1:
result: list[str] = []
current = ""
for part in parts:
candidate = f"{current}, {part}".strip(", ") if current else part
if len(candidate) > max_chars and current:
result.append(current)
current = part
else:
current = candidate
if current:
result.append(current)
return result
# Fallback: split at word boundaries
words = sentence.split()
result = []
current = ""
for word in words:
candidate = f"{current} {word}".strip() if current else word
if len(candidate) > max_chars and current:
result.append(current)
current = word
else:
current = candidate
if current:
result.append(current)
return result