{
  "name": "Nemotron 3.5 ASR Streaming Multilingual 0.6B \u2014 CoreML",
  "base_model": "nvidia/nemotron-asr-streaming-multilingual-0.6b",
  "base_model_checkpoint": "2026-05-29 update",
  "architecture": "Conformer encoder + RNN-T decoder",
  "runtime": "CoreML / Apple Neural Engine",
  "benchmark_machine": "Apple M5 Pro / macOS 26.5",
  "benchmark_dataset": "FLEURS test (all languages)",
  "models": {
    "latin": {
      "folder": "latin",
      "vocab_size": 2828,
      "serves": [
        "en",
        "es",
        "fr",
        "it",
        "pt",
        "de"
      ]
    },
    "multilingual": {
      "folder": "multilingual",
      "vocab_size": 13087,
      "serves": [
        "zh",
        "ja",
        "100+ via prompt_id"
      ]
    }
  },
  "recipe": "LAYERPOS [42,13] mixed-precision encoder (INT8 cuff + 6-bit middle) + Latin-script vocab prune (latin model) / full vocab (multilingual) + B1 decoder-joint fusion + triple-stage pipelining. Encoder shared across both models per tier.",
  "no_retraining": true,
  "no_calibration": true,
  "tiers_ms": [
    560,
    1120,
    2240,
    4480
  ],
  "recommended_tier_ms": 2240,
  "ships": [
    {
      "path": "latin/560ms",
      "model": "latin",
      "languages_served": [
        "en",
        "es",
        "fr",
        "it",
        "pt",
        "de"
      ],
      "chunk_ms": 560,
      "latency_s": 0.56,
      "chunk_mel_frames": 56,
      "total_mel_frames": 65,
      "att_context": [
        42,
        13
      ],
      "vocab_size": 2828,
      "vocab_pruned": true,
      "vocab_prune_method": "latin-script (writing-system, domain-general)",
      "components": [
        "decoder",
        "decoder_joint",
        "encoder",
        "joint",
        "preprocessor"
      ],
      "formats": [
        "mlmodelc"
      ],
      "benchmarks": [
        {
          "language_code": "en",
          "rtfx": 57.5,
          "metric": "WER",
          "n": 647,
          "test_set": "FLEURS en_us",
          "wer_pct": 9.43
        },
        {
          "language_code": "es",
          "rtfx": 58.2,
          "metric": "WER",
          "n": 908,
          "test_set": "FLEURS es_419",
          "wer_pct": 4.95
        },
        {
          "language_code": "fr",
          "rtfx": 57.4,
          "metric": "WER",
          "n": 676,
          "test_set": "FLEURS fr_fr",
          "wer_pct": 9.68
        },
        {
          "language_code": "it",
          "rtfx": 59.0,
          "metric": "WER",
          "n": 865,
          "test_set": "FLEURS it_it",
          "wer_pct": 5.68
        },
        {
          "language_code": "pt",
          "rtfx": 58.7,
          "metric": "WER",
          "n": 919,
          "test_set": "FLEURS pt_br",
          "wer_pct": 6.38
        },
        {
          "language_code": "de",
          "rtfx": 58.8,
          "metric": "WER",
          "n": 862,
          "test_set": "FLEURS de_de",
          "wer_pct": 10.83
        }
      ]
    },
    {
      "path": "latin/1120ms",
      "model": "latin",
      "languages_served": [
        "en",
        "es",
        "fr",
        "it",
        "pt",
        "de"
      ],
      "chunk_ms": 1120,
      "latency_s": 1.12,
      "chunk_mel_frames": 112,
      "total_mel_frames": 121,
      "att_context": [
        42,
        13
      ],
      "vocab_size": 2828,
      "vocab_pruned": true,
      "vocab_prune_method": "latin-script (writing-system, domain-general)",
      "components": [
        "decoder",
        "decoder_joint",
        "encoder",
        "joint",
        "preprocessor"
      ],
      "formats": [
        "mlmodelc"
      ],
      "benchmarks": [
        {
          "language_code": "en",
          "rtfx": 102.9,
          "metric": "WER",
          "n": 647,
          "test_set": "FLEURS en_us",
          "wer_pct": 8.89
        },
        {
          "language_code": "es",
          "rtfx": 106.5,
          "metric": "WER",
          "n": 908,
          "test_set": "FLEURS es_419",
          "wer_pct": 4.76
        },
        {
          "language_code": "fr",
          "rtfx": 104.7,
          "metric": "WER",
          "n": 676,
          "test_set": "FLEURS fr_fr",
          "wer_pct": 9.44
        },
        {
          "language_code": "it",
          "rtfx": 109.0,
          "metric": "WER",
          "n": 865,
          "test_set": "FLEURS it_it",
          "wer_pct": 5.45
        },
        {
          "language_code": "pt",
          "rtfx": 107.6,
          "metric": "WER",
          "n": 919,
          "test_set": "FLEURS pt_br",
          "wer_pct": 6.11
        },
        {
          "language_code": "de",
          "rtfx": 107.2,
          "metric": "WER",
          "n": 862,
          "test_set": "FLEURS de_de",
          "wer_pct": 9.78
        }
      ]
    },
    {
      "path": "latin/2240ms",
      "model": "latin",
      "languages_served": [
        "en",
        "es",
        "fr",
        "it",
        "pt",
        "de"
      ],
      "chunk_ms": 2240,
      "latency_s": 2.24,
      "chunk_mel_frames": 224,
      "total_mel_frames": 233,
      "att_context": [
        42,
        13
      ],
      "vocab_size": 2828,
      "vocab_pruned": true,
      "vocab_prune_method": "latin-script (writing-system, domain-general)",
      "components": [
        "decoder",
        "decoder_joint",
        "encoder",
        "joint",
        "preprocessor"
      ],
      "formats": [
        "mlmodelc"
      ],
      "benchmarks": [
        {
          "language_code": "en",
          "rtfx": 130.2,
          "metric": "WER",
          "n": 647,
          "test_set": "FLEURS en_us",
          "wer_pct": 8.96
        },
        {
          "language_code": "es",
          "rtfx": 139.6,
          "metric": "WER",
          "n": 908,
          "test_set": "FLEURS es_419",
          "wer_pct": 4.8
        },
        {
          "language_code": "fr",
          "rtfx": 130.4,
          "metric": "WER",
          "n": 676,
          "test_set": "FLEURS fr_fr",
          "wer_pct": 9.52
        },
        {
          "language_code": "it",
          "rtfx": 146.7,
          "metric": "WER",
          "n": 865,
          "test_set": "FLEURS it_it",
          "wer_pct": 5.41
        },
        {
          "language_code": "pt",
          "rtfx": 141.0,
          "metric": "WER",
          "n": 919,
          "test_set": "FLEURS pt_br",
          "wer_pct": 6.14
        },
        {
          "language_code": "de",
          "rtfx": 144.4,
          "metric": "WER",
          "n": 862,
          "test_set": "FLEURS de_de",
          "wer_pct": 9.83
        }
      ]
    },
    {
      "path": "latin/4480ms",
      "model": "latin",
      "languages_served": [
        "en",
        "es",
        "fr",
        "it",
        "pt",
        "de"
      ],
      "chunk_ms": 4480,
      "latency_s": 4.48,
      "chunk_mel_frames": 448,
      "total_mel_frames": 457,
      "att_context": [
        42,
        13
      ],
      "vocab_size": 2828,
      "vocab_pruned": true,
      "vocab_prune_method": "latin-script (writing-system, domain-general)",
      "components": [
        "decoder",
        "decoder_joint",
        "encoder",
        "joint",
        "preprocessor"
      ],
      "formats": [
        "mlmodelc"
      ],
      "benchmarks": [
        {
          "language_code": "en",
          "rtfx": 122.1,
          "metric": "WER",
          "n": 647,
          "test_set": "FLEURS en_us",
          "wer_pct": 9.02
        },
        {
          "language_code": "es",
          "rtfx": 135.8,
          "metric": "WER",
          "n": 908,
          "test_set": "FLEURS es_419",
          "wer_pct": 4.77
        },
        {
          "language_code": "fr",
          "rtfx": 124.3,
          "metric": "WER",
          "n": 676,
          "test_set": "FLEURS fr_fr",
          "wer_pct": 9.42
        },
        {
          "language_code": "it",
          "rtfx": 150.5,
          "metric": "WER",
          "n": 865,
          "test_set": "FLEURS it_it",
          "wer_pct": 5.4
        },
        {
          "language_code": "pt",
          "rtfx": 141.3,
          "metric": "WER",
          "n": 919,
          "test_set": "FLEURS pt_br",
          "wer_pct": 6.18
        },
        {
          "language_code": "de",
          "rtfx": 141.5,
          "metric": "WER",
          "n": 862,
          "test_set": "FLEURS de_de",
          "wer_pct": 9.83
        }
      ]
    },
    {
      "path": "multilingual/560ms",
      "model": "multilingual",
      "languages_served": [
        "zh",
        "ja",
        "100+ via prompt_id"
      ],
      "chunk_ms": 560,
      "latency_s": 0.56,
      "chunk_mel_frames": 56,
      "total_mel_frames": 65,
      "att_context": [
        42,
        13
      ],
      "vocab_size": 13087,
      "vocab_pruned": false,
      "vocab_prune_method": "none (full vocab)",
      "components": [
        "decoder",
        "decoder_joint",
        "encoder",
        "joint",
        "preprocessor"
      ],
      "formats": [
        "mlmodelc"
      ],
      "benchmarks": [
        {
          "language_code": "zh",
          "rtfx": 22.5,
          "metric": "CER",
          "n": 945,
          "test_set": "FLEURS cmn_hans_cn",
          "cer_pct": 19.48
        },
        {
          "language_code": "ja",
          "rtfx": 20.7,
          "metric": "CER",
          "n": 650,
          "test_set": "FLEURS ja_jp",
          "cer_pct": 14.61
        },
        {
          "language_code": "multilingual",
          "rtfx": 23.4,
          "metric": "WER",
          "n": 647,
          "test_set": "FLEURS en_us",
          "wer_pct": 9.15
        }
      ]
    },
    {
      "path": "multilingual/1120ms",
      "model": "multilingual",
      "languages_served": [
        "zh",
        "ja",
        "100+ via prompt_id"
      ],
      "chunk_ms": 1120,
      "latency_s": 1.12,
      "chunk_mel_frames": 112,
      "total_mel_frames": 121,
      "att_context": [
        42,
        13
      ],
      "vocab_size": 13087,
      "vocab_pruned": false,
      "vocab_prune_method": "none (full vocab)",
      "components": [
        "decoder",
        "decoder_joint",
        "encoder",
        "joint",
        "preprocessor"
      ],
      "formats": [
        "mlmodelc"
      ],
      "benchmarks": [
        {
          "language_code": "zh",
          "rtfx": 26.6,
          "metric": "CER",
          "n": 945,
          "test_set": "FLEURS cmn_hans_cn",
          "cer_pct": 18.75
        },
        {
          "language_code": "ja",
          "rtfx": 25.9,
          "metric": "CER",
          "n": 650,
          "test_set": "FLEURS ja_jp",
          "cer_pct": 13.77
        },
        {
          "language_code": "multilingual",
          "rtfx": 70.9,
          "metric": "WER",
          "n": 647,
          "test_set": "FLEURS en_us",
          "wer_pct": 8.64
        }
      ]
    },
    {
      "path": "multilingual/2240ms",
      "model": "multilingual",
      "languages_served": [
        "zh",
        "ja",
        "100+ via prompt_id"
      ],
      "chunk_ms": 2240,
      "latency_s": 2.24,
      "chunk_mel_frames": 224,
      "total_mel_frames": 233,
      "att_context": [
        42,
        13
      ],
      "vocab_size": 13087,
      "vocab_pruned": false,
      "vocab_prune_method": "none (full vocab)",
      "components": [
        "decoder",
        "decoder_joint",
        "encoder",
        "joint",
        "preprocessor"
      ],
      "formats": [
        "mlmodelc"
      ],
      "benchmarks": [
        {
          "language_code": "zh",
          "rtfx": 89.0,
          "metric": "CER",
          "n": 945,
          "test_set": "FLEURS cmn_hans_cn",
          "cer_pct": 18.57
        },
        {
          "language_code": "ja",
          "rtfx": 84.2,
          "metric": "CER",
          "n": 650,
          "test_set": "FLEURS ja_jp",
          "cer_pct": 13.79
        },
        {
          "language_code": "multilingual",
          "rtfx": 80.4,
          "metric": "WER",
          "n": 647,
          "test_set": "FLEURS en_us",
          "wer_pct": 8.76
        }
      ]
    },
    {
      "path": "multilingual/4480ms",
      "model": "multilingual",
      "languages_served": [
        "zh",
        "ja",
        "100+ via prompt_id"
      ],
      "chunk_ms": 4480,
      "latency_s": 4.48,
      "chunk_mel_frames": 448,
      "total_mel_frames": 457,
      "att_context": [
        42,
        13
      ],
      "vocab_size": 13087,
      "vocab_pruned": false,
      "vocab_prune_method": "none (full vocab)",
      "components": [
        "decoder",
        "decoder_joint",
        "encoder",
        "joint",
        "preprocessor"
      ],
      "formats": [
        "mlmodelc"
      ],
      "benchmarks": [
        {
          "language_code": "zh",
          "rtfx": 89.6,
          "metric": "CER",
          "n": 945,
          "test_set": "FLEURS cmn_hans_cn",
          "cer_pct": 18.05
        },
        {
          "language_code": "ja",
          "rtfx": 89.3,
          "metric": "CER",
          "n": 650,
          "test_set": "FLEURS ja_jp",
          "cer_pct": 13.82
        },
        {
          "language_code": "multilingual",
          "rtfx": 78.0,
          "metric": "WER",
          "n": 647,
          "test_set": "FLEURS en_us",
          "wer_pct": 8.78
        }
      ]
    }
  ]
}