File size: 3,559 Bytes
f724af4
da63a34
 
 
 
 
 
 
 
f724af4
da63a34
f724af4
da63a34
f724af4
da63a34
 
f724af4
 
da63a34
f724af4
 
 
 
 
 
 
da63a34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f724af4
 
 
 
 
 
 
 
 
 
da63a34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f724af4
 
 
 
 
da63a34
f724af4
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Configs for cache-aware streaming audio and feature buffers.

Adapted from: https://github.com/NVIDIA-NeMo/NeMo/tree/main

Defines dataclasses used by the Nemotron cache-aware streaming ASR demo
to control chunking, cache sizes, and frame-level buffering.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import List


@dataclass
class TimestampedResult:
    """
    Timestamped recognition result from the streaming decoder.

    All list-valued fields are optional; when present they are expected to
    be per-token parallel lists (same length as ``tokens``) — not enforced
    here, the decoder that fills this in is responsible for alignment.

    Attributes
    ----------
    text :
        Full recognized text so far.
    timestamps :
        Optional per-token timestamps.
    tokens :
        Optional list of token strings.
    logprobs :
        Optional per-token log-probabilities.
    added_text :
        Incremental text added in the latest step (if any).
    """

    # NOTE: annotations use builtin generics (``list[...]``) to match the
    # PEP 604 union syntax already used in this module; they are lazy
    # strings under ``from __future__ import annotations``.
    text: str
    timestamps: list[float] | None = None
    tokens: list[str] | None = None
    logprobs: list[float] | None = None
    added_text: str | None = None


@dataclass
class CacheAwareStreamingConfig:
    """
    Configuration for cache-aware streaming audio/feature buffering.

    List-valued fields use ``default_factory`` so each instance gets its
    own fresh list (mutable defaults must never be shared across
    instances). Annotations use builtin generics (``list[...]``) to match
    the PEP 604 union syntax already used in this module.

    Parameters
    ----------
    chunk_size :
        Chunk size (in frames) per step. Can be a two-element list to
        specify different sizes for the first and subsequent steps.
    shift_size :
        Shift size (in frames) per step; same two-element semantics as
        ``chunk_size``.
    cache_drop_size :
        Number of steps to drop from the cache periodically.
    last_channel_cache_size :
        Cache size needed for the last channel layers.
    valid_encoder_out_len :
        Number of steps in the final output that are guaranteed to match
        offline encoder output.
    pre_encode_cache_size :
        Cache size for pre-encoding layers to avoid internal caching;
        two-element ``[first, subsequent]`` semantics like ``chunk_size``.
    drop_extra_pre_encoded :
        Number of extra pre-encoded steps to drop.
    last_channel_num, last_time_num :
        Number of channel/time layers that require cache maintenance.
    audio_chunk_frames, audio_chunk_frames_drop, audio_frame_size :
        Audio framing parameters for streaming input
        (``audio_frame_size`` is in samples, presumably 10 ms at 16 kHz —
        verify against the feature extractor).
    input_features :
        Input feature dimension (e.g., mel-spectrogram size).
    conv_context_size, len_layers, d_model :
        Model architecture parameters (convolution context, layers, hidden dim).
    max_tokens_per_step, window_step, subsampling_factor :
        Decoder step and alignment parameters (``window_step`` in seconds).
    """

    # Per-step chunking; [first step, subsequent steps] in frames.
    chunk_size: list[int] = field(default_factory=lambda: [49, 56])
    shift_size: list[int] = field(default_factory=lambda: [49, 56])

    cache_drop_size: int = 0
    last_channel_cache_size: int = 70

    valid_encoder_out_len: int = 7

    pre_encode_cache_size: list[int] = field(default_factory=lambda: [0, 9])
    drop_extra_pre_encoded: int = 2

    last_channel_num: int = 0
    last_time_num: int = 0

    audio_chunk_frames: int = 5
    audio_chunk_frames_drop: int = 2
    audio_frame_size: int = 160

    input_features: int = 128

    # [left, right] convolution context in frames.
    conv_context_size: list[int] = field(default_factory=lambda: [8, 0])
    len_layers: int = 24
    d_model: int = 1024

    max_tokens_per_step: int = 10
    window_step: float = 0.01
    subsampling_factor: int = 10