Text-to-Speech
ONNX
GGUF
speech-translation
streaming-speech-translation
speech
audio
speech-recognition
automatic-speech-recognition
streaming-asr
ASR
NeMo
ONNX
cache-aware ASR
FastConformer
RNNT
Parakeet
neural-machine-translation
NMT
gemma3
llama-cpp
GGUF
conversational
TTS
xtts
xttsv2
voice-clone
gpt2
hifigan
multilingual
vq
perceiver-encoder
websocket
#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Streaming ASR Output Segmenter for Translation Pipeline.
Description
-----------
Segments streaming ASR output into optimal chunks for NMT translation.
Designed for NVIDIA NeMo cache-aware streaming ASR feeding TranslateGemma
via llama.cpp on CPU.
Implements:
- Word-count based segmentation (max_words with hold_back).
- Punctuation boundary detection (sentence and clause punctuation).
- Simple text buffer with split-based word counting.
Not yet wired into the active code path (defined in config but unused):
- Pause detection (use_pause_detection, pause_threshold_ms).
- ASR FINAL hypothesis integration (honor_asr_final).
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional, Tuple
log = logging.getLogger(__name__)
class BoundaryType(Enum):
    """Boundary conditions that can cause a buffered segment to be emitted."""

    FINAL = "final"              # ASR FINAL hypothesis (config flag exists; not wired yet)
    PUNCTUATION = "punctuation"  # sentence/clause punctuation reached
    PAUSE = "pause"              # pause threshold exceeded (not wired yet)
    MAX_TOKENS = "max_tokens"    # word-count ceiling reached
    FORCED = "forced"            # explicit flush
@dataclass
class ASRToken:
    """One ASR word/token together with optional timing and scoring metadata."""

    text: str                 # token text
    timestamp: float = 0.0    # emission time in seconds
    confidence: float = 1.0   # ASR confidence score
    is_final: bool = False    # whether the token belongs to a FINAL hypothesis
@dataclass
class SegmentResult:
    """
    Metadata describing one emitted segment.

    Attributes
    ----------
    text :
        Concatenated segment text.
    tokens :
        Tokens contributing to this segment.
    boundary_type :
        The boundary condition that triggered this segment.
    start_time :
        Segment start timestamp (seconds).
    end_time :
        Segment end timestamp (seconds).
    token_count :
        Number of tokens in the segment; always derived from ``tokens``.
    """

    text: str
    tokens: List[ASRToken]
    boundary_type: BoundaryType
    start_time: float
    end_time: float
    token_count: int = 0

    def __post_init__(self) -> None:
        # Derive the count from the supplied token list; any value passed
        # by the caller is overwritten.
        self.token_count = len(self.tokens)
@dataclass
class SegmenterConfig:
    """
    Tunable parameters for the streaming segmenter.

    Parameters
    ----------
    max_words :
        Word-count ceiling; reaching it triggers a MAX_TOKENS boundary.
    min_words :
        Minimum buffered words before punctuation may trigger a boundary.
    hold_back :
        Words retained in the buffer on a MAX_TOKENS cut to absorb ASR
        tail instability.
    sentence_punct :
        Characters treated as strong sentence punctuation.
    clause_punct :
        Characters treated as clause-level punctuation.
    use_punctuation :
        Enable punctuation-based boundaries.
    pause_threshold_ms :
        Pause duration in milliseconds for pause-based segmentation
        (declared in config but not wired into the main path).
    use_pause_detection :
        Enable pause-based segmentation (currently unused).
    honor_asr_final :
        Honor ASR FINAL hypotheses (currently unused).
    """

    max_words: int = 5
    min_words: int = 3
    hold_back: int = 2
    sentence_punct: str = ".!?"
    clause_punct: str = ",;:"
    use_punctuation: bool = True
    pause_threshold_ms: float = 700
    use_pause_detection: bool = True
    honor_asr_final: bool = True
class StreamingSegmenter:
    """
    Segment streaming ASR tokens for optimal NMT chunking.

    Incoming ASR text deltas are accumulated in a plain text buffer and a
    segment is emitted when a boundary condition fires.

    Boundary priority: sentence punctuation > clause punctuation > max_words.
    On a max_words trigger, ``hold_back`` trailing words remain buffered to
    absorb ASR tail instability.

    Parameters
    ----------
    config :
        Segmenter configuration. Defaults to :class:`SegmenterConfig`.
    """

    def __init__(self, config: Optional[SegmenterConfig] = None) -> None:
        self.config: SegmenterConfig = config or SegmenterConfig()
        # Text buffer state (legacy token buffer is kept for compatibility).
        self.text_buffer: str = ""
        self.text_split: List[str] = []
        self.buffer: List[ASRToken] = []
        self.last_token_time: Optional[float] = None
        self.segments_emitted: List[str] = []
        # Pre-computed punctuation sets for fast membership tests.
        self._all_punct = set(self.config.sentence_punct + self.config.clause_punct)
        self._strong_punct = set(self.config.sentence_punct)

    # --- Public API --------------------------------------------------------

    def add_token(self, new_text: str, new_timestamp: float) -> Optional[str]:
        """
        Add an ASR text delta and return a segment if a boundary is triggered.

        Parameters
        ----------
        new_text :
            Incremental ASR text token/delta.
        new_timestamp :
            Current timestamp (seconds). NOTE(review): currently unused --
            wall-clock time is recorded in ``last_token_time`` instead.

        Returns
        -------
        str or None
            Emitted text segment if a boundary was triggered; otherwise
            ``None``.
        """
        new_text = new_text.rstrip()
        # Scan the configured punctuation *strings* (not the derived sets) so
        # the mark chosen when several are present is deterministic; sentence
        # punctuation takes priority over clause punctuation.
        new_text_punct = ""
        for punct in self.config.sentence_punct:
            if punct in new_text:
                new_text_punct = punct
                break
        if not new_text_punct:
            for punct in self.config.clause_punct:
                if punct in new_text:
                    new_text_punct = punct
                    break
        log.debug("add-token text_buffer_before: %r", self.text_buffer)
        log.debug("add-token text_split_before: %r", self.text_split)
        # Append to the text buffer; strip one leading space on the very
        # first token so the buffer never starts with a separator.
        if self.text_buffer:
            self.text_buffer += new_text
        elif new_text.startswith(" "):
            self.text_buffer = new_text[1:]
        else:
            self.text_buffer = new_text
        # "".split(" ") would yield the phantom word [""]; keep the word
        # list truly empty when the buffer is empty.
        self.text_split = self.text_buffer.split(" ") if self.text_buffer else []
        log.debug("add-token new_text: %r", new_text)
        log.debug("add-token new_text_punct: %r", new_text_punct)
        log.debug("add-token text_buffer_after: %r", self.text_buffer)
        log.debug("add-token text_split_after: %r", self.text_split)
        self.last_token_time = time.time()
        boundary, index = self._check_boundary(new_text_punct)
        log.debug("boundary: %s, index: %s", boundary, index)
        if boundary:
            return self._emit(boundary, index)
        return None

    def flush(self) -> Optional[str]:
        """
        Force-flush remaining buffer contents as a segment.

        Unlike :meth:`reset`, only buffer and timing state is cleared: the
        flushed text is recorded in ``segments_emitted`` like any other
        segment, so :meth:`get_stats` stays accurate across a flush.

        Returns
        -------
        str or None
            Remaining buffered text, or ``None`` if the buffer is empty.
        """
        if not self.text_split:
            return None
        text = " ".join(self.text_split)
        self._clear_buffer_state()
        self.segments_emitted.append(text)
        return text

    def get_buffer_text(self) -> str:
        """Return current legacy token buffer content as a string."""
        return " ".join(t.text for t in self.buffer)

    def get_buffer_size(self) -> int:
        """Return number of token objects in the legacy buffer."""
        return len(self.buffer)

    def get_stats(self) -> Dict[str, int]:
        """
        Return basic segmentation statistics.

        Returns
        -------
        dict
            ``segments_emitted`` -- count of segments emitted so far.
            ``buffered_tokens`` -- number of words currently buffered.
        """
        return {
            "segments_emitted": len(self.segments_emitted),
            "buffered_tokens": len(self.text_split),
        }

    def reset(self) -> None:
        """Reset all buffer, timing, and emit-history state."""
        self._clear_buffer_state()
        self.segments_emitted.clear()

    # --- Internals ---------------------------------------------------------

    def _clear_buffer_state(self) -> None:
        """Clear buffered text and timing state, preserving emit history."""
        self.buffer.clear()
        self.text_split = []
        self.text_buffer = ""
        self.last_token_time = None

    def _check_boundary(self, punct_check: str = "") -> Tuple[Optional[BoundaryType], int]:
        """
        Check whether a segment boundary should be triggered.

        Parameters
        ----------
        punct_check :
            Punctuation character found in the latest token, or empty string.

        Returns
        -------
        (BoundaryType or None, int)
            Boundary type and split index, or (None, -1) if no boundary.
        """
        buf_len = len(self.text_split)
        # Punctuation boundary: requires at least min_words buffered words.
        # Cut at the LAST word containing the mark so everything up to and
        # including it is emitted.
        if self.config.use_punctuation and punct_check and buf_len >= self.config.min_words:
            for i in range(buf_len - 1, -1, -1):
                if punct_check in self.text_split[i]:
                    return BoundaryType.PUNCTUATION, i
            return None, -1
        # Max-token boundary: wait for max_words + hold_back so a tail of
        # hold_back words can stay buffered after the cut.
        if buf_len >= self.config.max_words + self.config.hold_back:
            return BoundaryType.MAX_TOKENS, -1
        return None, -1

    def _emit(self, boundary_type: BoundaryType, index_buffer_end: int = -1) -> str:
        """
        Emit a segment from the buffer.

        Parameters
        ----------
        boundary_type :
            The boundary condition that triggered this emit.
        index_buffer_end :
            Word index (inclusive) at which to cut the buffer for punctuation
            boundaries. Ignored for MAX_TOKENS (uses max_words cut).

        Returns
        -------
        str
            The emitted text segment (may be empty if misconfigured).
        """
        log.debug("_emit %s text_split_before: %r", boundary_type, self.text_split)
        log.debug("_emit %s text_buffer_before: %r", boundary_type, self.text_buffer)
        if boundary_type == BoundaryType.MAX_TOKENS:
            # BUGFIX: handle MAX_TOKENS regardless of hold_back. Previously
            # hold_back == 0 fell through to the punctuation branch with
            # index -1 and returned "" without ever cutting the buffer,
            # letting it grow without bound.
            needed = self.config.max_words + self.config.hold_back
            if len(self.text_split) < needed:
                log.warning(
                    "_emit MAX_TOKENS: buffer too short (%d < %d), returning empty",
                    len(self.text_split),
                    needed,
                )
                return ""
            cut = self.config.max_words
            emit_tokens = self.text_split[:cut]
            # Keep the hold_back tail buffered to absorb ASR instability.
            self.text_split = self.text_split[cut:]
            self.text_buffer = " ".join(self.text_split)
        else:
            if index_buffer_end < 0:
                log.warning("_emit punctuation: index_buffer_end < 0, returning empty")
                return ""
            emit_tokens = self.text_split[: index_buffer_end + 1]
            if index_buffer_end < len(self.text_split) - 1:
                self.text_split = self.text_split[index_buffer_end + 1:]
                self.text_buffer = " ".join(self.text_split)
            else:
                # BUGFIX: whole buffer consumed -- clear only buffer state.
                # Calling reset() here wiped segments_emitted, so get_stats()
                # reported 1 after every sentence-final emit.
                self._clear_buffer_state()
        text = " ".join(emit_tokens)
        log.debug("_emit %s emit_tokens: %r", boundary_type, emit_tokens)
        log.debug("_emit %s text_split_after: %r", boundary_type, self.text_split)
        log.debug("_emit %s text_buffer_after: %r", boundary_type, self.text_buffer)
        self.segments_emitted.append(text)
        return text