| { |
| "model_type": "fastconformer_rnnt", |
| "source_model": "nvidia/nemotron-speech-streaming-en-0.6b", |
| "format": "coreml", |
|
|
| "audio": { |
| "sample_rate": 16000, |
| "sample_format": "S16_LE", |
| "bytes_per_second": 32000 |
| }, |
|
|
| "preprocessor": { |
| "n_mels": 128, |
| "n_fft": 512, |
| "hop_length": 160, |
| "win_length": 400, |
| "window": "hann", |
| "preemph": 0.97, |
| "dither": 1e-05, |
| "normalize": null, |
| "pad_to": 0, |
| "mel_norm": "slaney", |
| "mel_layout": "band_major", |
| "_comment": "mel_layout=band_major means output shape is [n_mels, n_frames], not [n_frames, n_mels]. Mel preprocessing runs on the host (not in CoreML)." |
| }, |
|
|
| "encoder": { |
| "model_file": "encoder.mlmodelc", |
| "layers": 24, |
| "dim": 1024, |
| "chunk_mel_frames": 56, |
| "pre_encode_cache_frames": 9, |
| "total_input_frames": 65, |
| "compute_units": "CPU_AND_NE", |
| "compute_precision": "FLOAT16", |
| "cache_last_channel_shape": [1, 24, 70, 1024], |
| "cache_last_time_shape": [1, 24, 1024, 8], |
| "_comment_cache_layout": "Cache tensors are batch-first [B, L, ...] in the CoreML model. NeMo uses layer-first [L, B, ...] internally; the wrapper transposes between the two layouts.", |
| "inputs": { |
| "audio_signal": {"dtype": "float32", "shape": [1, 128, 65]}, |
| "cache_last_channel": {"dtype": "float32", "shape": [1, 24, 70, 1024], "init": "zeros"}, |
| "cache_last_time": {"dtype": "float32", "shape": [1, 24, 1024, 8], "init": "zeros"}, |
| "cache_last_channel_len": {"dtype": "int32", "shape": [1], "init": "zeros"} |
| }, |
| "outputs": { |
| "encoded": {"dtype": "float16", "shape": [1, 1024, 7]}, |
| "encoded_length": {"dtype": "int32", "shape": [1]}, |
| "cache_channel_out": {"dtype": "float16", "shape": [1, 24, 70, 1024], "_comment": "feed back as cache_last_channel"}, |
| "cache_time_out": {"dtype": "float16", "shape": [1, 24, 1024, 8], "_comment": "feed back as cache_last_time"}, |
| "cache_len_out": {"dtype": "int32", "shape": [1], "_comment": "feed back as cache_last_channel_len"} |
| }, |
| "_comment_strides": "CoreML output MLMultiArrays may have non-contiguous strides due to ANE alignment padding (e.g. a stride of 32 for a dimension of size 7). Callers must copy using the array's reported strides (a stride-aware copy), not a flat memcpy." |
| }, |
|
|
| "decoder": { |
| "model_file": "decoder.mlmodelc", |
| "prediction_layers": 2, |
| "prediction_hidden": 640, |
| "vocab_size": 1025, |
| "blank_id": 1024, |
| "max_symbols_per_frame": 10, |
| "compute_units": "CPU_ONLY", |
| "compute_precision": "FLOAT16", |
| "inputs": { |
| "encoder_outputs": {"dtype": "float32", "shape": [1, 1024, 1], "_comment": "single encoder frame"}, |
| "targets": {"dtype": "int32", "shape": [1, 1], "_comment": "previously emitted non-blank token (use blank_id for the first step)"}, |
| "target_length": {"dtype": "int32", "shape": [1], "value": 1}, |
| "input_states_1": {"dtype": "float32", "shape": [2, 1, 640], "init": "zeros"}, |
| "input_states_2": {"dtype": "float32", "shape": [2, 1, 640], "init": "zeros"} |
| }, |
| "outputs": { |
| "outputs": {"dtype": "float16", "shape": [1, 1025], "_comment": "logits over vocab, argmax to get token"}, |
| "output_states_1": {"dtype": "float16", "shape": [2, 1, 640], "_comment": "feed back as input_states_1"}, |
| "output_states_2": {"dtype": "float16", "shape": [2, 1, 640], "_comment": "feed back as input_states_2"} |
| } |
| }, |
|
|
| "streaming": { |
| "chunk_duration_ms": 560, |
| "chunk_audio_samples": 8960, |
| "algorithm": "Cache-aware streaming FastConformer. Feed audio in 560 ms chunks (8960 samples). For each chunk: compute the mel spectrogram (56 frames), prepend the 9 cached pre-encode frames (65 input frames total), and run the encoder. Then, for each encoder output frame, run RNNT greedy decoding: feed that single encoder frame to the decoder and take the argmax of the logits; if the result is not blank, emit the token and repeat on the same frame (up to max_symbols_per_frame times); if it is blank, move on to the next frame. Feed the encoder's cache outputs back in as the next chunk's cache inputs, and feed the decoder's output states back in for the next symbol/frame." |
| }, |
|
|
| "ane_profile": { |
| "encoder_ane_ops": 1486, |
| "encoder_cpu_ops": 111, |
| "encoder_ane_pct": 93.0, |
| "cpu_op_breakdown": "select (72, attention masking), softmax (24, one per conformer layer), expand_dims/logical/cast (15, control flow)", |
| "decoder": "100% CPU (CPU_ONLY compute units)", |
| "inference_speed": "~15.5x realtime on Apple Silicon (M-series)" |
| } |
| } |
|
|