Spaces:

grimshaw
/

neuapi

Sleeping

File size: 2,538 Bytes

35bb6f4

from __future__ import annotations

import re


def split_into_sentences(text: str) -> list[str]:
    """Break ``text`` into trimmed, non-empty sentences.

    A boundary is any run of whitespace that directly follows one of
    ``.``, ``!``, ``?`` or ``;``. The punctuation mark stays attached to
    the sentence it terminates; the whitespace is discarded.
    """
    sentences: list[str] = []
    # The lookbehind keeps the terminator in the left-hand fragment.
    for fragment in re.split(r'(?<=[.!?;])\s+', text.strip()):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences


def chunk_text(text: str, max_chars: int = 500) -> list[str]:
    """Group the sentences of ``text`` into chunks for TTS inference.

    The text is first cut into sentences with ``split_into_sentences``.
    Consecutive sentences are then joined with a single space for as
    long as the joined length stays within ``max_chars``. A sentence
    that is by itself longer than ``max_chars`` is never merged with its
    neighbours; it is handed to ``_split_long_sentence`` and the pieces
    returned by that helper are appended as separate chunks.

    Returns the chunks in reading order; an empty list when ``text``
    holds nothing but whitespace.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        # Nothing was recognised as a sentence: pass non-blank text
        # through unchanged, otherwise report no chunks at all.
        if text.strip():
            return [text]
        return []

    chunks: list[str] = []
    pending = ""  # sentences accumulated for the chunk being built

    for sentence in sentences:
        if len(sentence) > max_chars:
            # Emit what has been gathered so far so ordering is kept,
            # then let the helper break the oversized sentence apart.
            if pending:
                chunks.append(pending)
                pending = ""
            chunks.extend(_split_long_sentence(sentence, max_chars))
            continue

        # The "+ 1" accounts for the joining space.
        if len(pending) + len(sentence) + 1 > max_chars:
            if pending:
                chunks.append(pending)
            pending = sentence
            continue

        if pending:
            pending = f"{pending} {sentence}".strip()
        else:
            pending = sentence

    if pending:
        chunks.append(pending)

    return chunks


def _split_long_sentence(sentence: str, max_chars: int) -> list[str]:
    """Split a long sentence at commas or word boundaries."""
    # Try splitting at commas first
    parts = re.split(r',\s*', sentence)
    if len(parts) > 1:
        result: list[str] = []
        current = ""
        for part in parts:
            candidate = f"{current}, {part}".strip(", ") if current else part
            if len(candidate) > max_chars and current:
                result.append(current)
                current = part
            else:
                current = candidate
        if current:
            result.append(current)
        return result

    # Fallback: split at word boundaries
    words = sentence.split()
    result = []
    current = ""
    for word in words:
        candidate = f"{current} {word}".strip() if current else word
        if len(candidate) > max_chars and current:
            result.append(current)
            current = word
        else:
            current = candidate
    if current:
        result.append(current)

    return result