Text-to-Speech
ONNX
GGUF
speech-translation
streaming-speech-translation
speech
audio
speech-recognition
automatic-speech-recognition
streaming-asr
ASR
NeMo
ONNX
cache-aware ASR
FastConformer
RNNT
Parakeet
neural-machine-translation
NMT
gemma3
llama-cpp
GGUF
conversational
TTS
xtts
xttsv2
voice-clone
gpt2
hifigan
multilingual
vq
perceiver-encoder
websocket
File size: 3,559 Bytes
#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Configs for cache-aware streaming audio and feature buffers.
Adapted from: https://github.com/NVIDIA-NeMo/NeMo/tree/main
Defines dataclasses used by the Nemotron cache-aware streaming ASR demo
to control chunking, cache sizes, and frame-level buffering.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List
@dataclass
class TimestampedResult:
    """
    A single recognition result emitted by the streaming decoder.

    Carries the cumulative transcript together with optional per-token
    detail produced during decoding.

    Attributes
    ----------
    text :
        The complete transcript recognized up to this point.
    timestamps :
        Per-token timestamps, when the decoder provides them.
    tokens :
        The decoded token strings, when available.
    logprobs :
        Log-probability of each decoded token, when available.
    added_text :
        Only the text appended by the most recent decoding step, if any.
    """
    text: str
    timestamps: list[float] | None = None
    tokens: list[str] | None = None
    logprobs: list[float] | None = None
    added_text: str | None = None
@dataclass
class CacheAwareStreamingConfig:
    """
    Buffering and cache parameters for cache-aware streaming ASR.

    Groups every knob needed to drive chunked streaming inference:
    how much audio/feature data is consumed per step, how large the
    encoder caches are, and the shape parameters of the model itself.
    Two-element list fields follow the convention
    ``[first step, subsequent steps]``.
    """
    # Frames consumed per step: [first step, later steps].
    chunk_size: list[int] = field(default_factory=lambda: list((49, 56)))
    # Frames the window advances per step; same two-element convention.
    shift_size: list[int] = field(default_factory=lambda: list((49, 56)))
    # How many steps to periodically drop from the cache.
    cache_drop_size: int = 0
    # Cache capacity required by the last-channel layers.
    last_channel_cache_size: int = 70
    # Steps of the final output guaranteed to match offline encoder output.
    valid_encoder_out_len: int = 7
    # Pre-encode layer cache sizes, avoiding internal caching in those layers.
    pre_encode_cache_size: list[int] = field(default_factory=lambda: list((0, 9)))
    # Extra pre-encoded steps to discard.
    drop_extra_pre_encoded: int = 2
    # Counts of channel/time layers needing cache maintenance.
    last_channel_num: int = 0
    last_time_num: int = 0
    # Raw-audio framing for streaming input.
    audio_chunk_frames: int = 5
    audio_chunk_frames_drop: int = 2
    audio_frame_size: int = 160
    # Input feature dimension (e.g., number of mel bins).
    input_features: int = 128
    # Convolution context [left, right] for the encoder.
    conv_context_size: list[int] = field(default_factory=lambda: list((8, 0)))
    # Encoder depth and hidden size.
    len_layers: int = 24
    d_model: int = 1024
    # Decoder step / alignment parameters.
    max_tokens_per_step: int = 10
    window_step: float = 0.01
    subsampling_factor: int = 10
|