#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Configs for cache-aware streaming audio and feature buffers.

Adapted from: https://github.com/NVIDIA-NeMo/NeMo/tree/main

Defines dataclasses used by the Nemotron cache-aware streaming ASR demo to
control chunking, cache sizes, and frame-level buffering.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import List


def _int_list(*values: int) -> List[int]:
    """Return a dataclass field that defaults to a fresh ``list(values)``.

    A new list is built for every instance, so instances never share (and
    cannot accidentally mutate) one another's default list.
    """
    return field(default_factory=lambda: list(values))


@dataclass
class TimestampedResult:
    """
    Recognition result, with optional timing data, from the streaming decoder.

    Attributes
    ----------
    text :
        Full recognized text so far.
    timestamps :
        Optional per-token timestamps.
    tokens :
        Optional list of token strings.
    logprobs :
        Optional per-token log-probabilities.
    added_text :
        Incremental text added in the latest step (if any).
    """

    # Only ``text`` is required; every other field defaults to ``None``.
    text: str
    timestamps: List[float] | None = None
    tokens: List[str] | None = None
    logprobs: List[float] | None = None
    added_text: str | None = None


@dataclass
class CacheAwareStreamingConfig:
    """
    Settings for cache-aware streaming audio/feature buffering.

    Parameters
    ----------
    chunk_size :
        Chunk size (in frames) per step. May be a two-element list giving
        different sizes for the first and the subsequent steps.
    shift_size :
        Shift size (in frames) per step; same two-element semantics as
        ``chunk_size``.
    cache_drop_size :
        Number of steps to drop from the cache periodically.
    last_channel_cache_size :
        Cache size needed for the last channel layers.
    valid_encoder_out_len :
        Number of steps in the final output that are guaranteed to match
        offline encoder output.
    pre_encode_cache_size :
        Cache size for pre-encoding layers to avoid internal caching.
    drop_extra_pre_encoded :
        Number of extra pre-encoded steps to drop.
    last_channel_num, last_time_num :
        Number of channel/time layers that require cache maintenance.
    audio_chunk_frames, audio_chunk_frames_drop, audio_frame_size :
        Audio framing parameters for streaming input.
    input_features :
        Input feature dimension (e.g., mel-spectrogram size).
    conv_context_size, len_layers, d_model :
        Model architecture parameters (convolution context, layers,
        hidden dim).
    max_tokens_per_step, window_step, subsampling_factor :
        Decoder step and alignment parameters.
    """

    # --- chunking and cache sizes ---
    chunk_size: List[int] = _int_list(49, 56)
    shift_size: List[int] = _int_list(49, 56)
    cache_drop_size: int = 0
    last_channel_cache_size: int = 70
    valid_encoder_out_len: int = 7
    pre_encode_cache_size: List[int] = _int_list(0, 9)
    drop_extra_pre_encoded: int = 2
    last_channel_num: int = 0
    last_time_num: int = 0

    # --- audio framing ---
    audio_chunk_frames: int = 5
    audio_chunk_frames_drop: int = 2
    audio_frame_size: int = 160

    # --- model architecture ---
    input_features: int = 128
    conv_context_size: List[int] = _int_list(8, 0)
    len_layers: int = 24
    d_model: int = 1024

    # --- decoder step and alignment ---
    max_tokens_per_step: int = 10
    window_step: float = 0.01
    subsampling_factor: int = 10