{ "name": "Nemotron 3.5 ASR Streaming Multilingual 0.6B \u2014 CoreML", "base_model": "nvidia/nemotron-asr-streaming-multilingual-0.6b", "base_model_checkpoint": "2026-05-29 update", "architecture": "Conformer encoder + RNN-T decoder", "runtime": "CoreML / Apple Neural Engine", "benchmark_machine": "Apple M5 Pro / macOS 26.5", "benchmark_dataset": "FLEURS test (all languages)", "models": { "latin": { "folder": "latin", "vocab_size": 2828, "serves": [ "en", "es", "fr", "it", "pt", "de" ] }, "multilingual": { "folder": "multilingual", "vocab_size": 13087, "serves": [ "zh", "ja", "100+ via prompt_id" ] } }, "recipe": "LAYERPOS [42,13] mixed-precision encoder (INT8 cuff + 6-bit middle) + Latin-script vocab prune (latin model) / full vocab (multilingual) + B1 decoder-joint fusion + triple-stage pipelining. Encoder shared across both models per tier.", "no_retraining": true, "no_calibration": true, "tiers_ms": [ 560, 1120, 2240, 4480 ], "recommended_tier_ms": 2240, "ships": [ { "path": "latin/560ms", "model": "latin", "languages_served": [ "en", "es", "fr", "it", "pt", "de" ], "chunk_ms": 560, "latency_s": 0.56, "chunk_mel_frames": 56, "total_mel_frames": 65, "att_context": [ 42, 13 ], "vocab_size": 2828, "vocab_pruned": true, "vocab_prune_method": "latin-script (writing-system, domain-general)", "components": [ "decoder", "decoder_joint", "encoder", "joint", "preprocessor" ], "formats": [ "mlmodelc" ], "benchmarks": [ { "language_code": "en", "rtfx": 57.5, "metric": "WER", "n": 647, "test_set": "FLEURS en_us", "wer_pct": 9.43 }, { "language_code": "es", "rtfx": 58.2, "metric": "WER", "n": 908, "test_set": "FLEURS es_419", "wer_pct": 4.95 }, { "language_code": "fr", "rtfx": 57.4, "metric": "WER", "n": 676, "test_set": "FLEURS fr_fr", "wer_pct": 9.68 }, { "language_code": "it", "rtfx": 59.0, "metric": "WER", "n": 865, "test_set": "FLEURS it_it", "wer_pct": 5.68 }, { "language_code": "pt", "rtfx": 58.7, "metric": "WER", "n": 919, "test_set": "FLEURS pt_br", "wer_pct": 6.38 }, { "language_code": "de", "rtfx": 58.8, "metric": "WER", "n": 862, "test_set": "FLEURS de_de", "wer_pct": 10.83 } ] }, { "path": "latin/1120ms", "model": "latin", "languages_served": [ "en", "es", "fr", "it", "pt", "de" ], "chunk_ms": 1120, "latency_s": 1.12, "chunk_mel_frames": 112, "total_mel_frames": 121, "att_context": [ 42, 13 ], "vocab_size": 2828, "vocab_pruned": true, "vocab_prune_method": "latin-script (writing-system, domain-general)", "components": [ "decoder", "decoder_joint", "encoder", "joint", "preprocessor" ], "formats": [ "mlmodelc" ], "benchmarks": [ { "language_code": "en", "rtfx": 102.9, "metric": "WER", "n": 647, "test_set": "FLEURS en_us", "wer_pct": 8.89 }, { "language_code": "es", "rtfx": 106.5, "metric": "WER", "n": 908, "test_set": "FLEURS es_419", "wer_pct": 4.76 }, { "language_code": "fr", "rtfx": 104.7, "metric": "WER", "n": 676, "test_set": "FLEURS fr_fr", "wer_pct": 9.44 }, { "language_code": "it", "rtfx": 109.0, "metric": "WER", "n": 865, "test_set": "FLEURS it_it", "wer_pct": 5.45 }, { "language_code": "pt", "rtfx": 107.6, "metric": "WER", "n": 919, "test_set": "FLEURS pt_br", "wer_pct": 6.11 }, { "language_code": "de", "rtfx": 107.2, "metric": "WER", "n": 862, "test_set": "FLEURS de_de", "wer_pct": 9.78 } ] }, { "path": "latin/2240ms", "model": "latin", "languages_served": [ "en", "es", "fr", "it", "pt", "de" ], "chunk_ms": 2240, "latency_s": 2.24, "chunk_mel_frames": 224, "total_mel_frames": 233, "att_context": [ 42, 13 ], "vocab_size": 2828, "vocab_pruned": true, "vocab_prune_method": "latin-script (writing-system, domain-general)", "components": [ "decoder", "decoder_joint", "encoder", "joint", "preprocessor" ], "formats": [ "mlmodelc" ], "benchmarks": [ { "language_code": "en", "rtfx": 130.2, "metric": "WER", "n": 647, "test_set": "FLEURS en_us", "wer_pct": 8.96 }, { "language_code": "es", "rtfx": 139.6, "metric": "WER", "n": 908, "test_set": "FLEURS es_419", "wer_pct": 4.8 }, { "language_code": "fr", "rtfx": 130.4, "metric": "WER", "n": 676, "test_set": "FLEURS fr_fr", "wer_pct": 9.52 }, { "language_code": "it", "rtfx": 146.7, "metric": "WER", "n": 865, "test_set": "FLEURS it_it", "wer_pct": 5.41 }, { "language_code": "pt", "rtfx": 141.0, "metric": "WER", "n": 919, "test_set": "FLEURS pt_br", "wer_pct": 6.14 }, { "language_code": "de", "rtfx": 144.4, "metric": "WER", "n": 862, "test_set": "FLEURS de_de", "wer_pct": 9.83 } ] }, { "path": "latin/4480ms", "model": "latin", "languages_served": [ "en", "es", "fr", "it", "pt", "de" ], "chunk_ms": 4480, "latency_s": 4.48, "chunk_mel_frames": 448, "total_mel_frames": 457, "att_context": [ 42, 13 ], "vocab_size": 2828, "vocab_pruned": true, "vocab_prune_method": "latin-script (writing-system, domain-general)", "components": [ "decoder", "decoder_joint", "encoder", "joint", "preprocessor" ], "formats": [ "mlmodelc" ], "benchmarks": [ { "language_code": "en", "rtfx": 122.1, "metric": "WER", "n": 647, "test_set": "FLEURS en_us", "wer_pct": 9.02 }, { "language_code": "es", "rtfx": 135.8, "metric": "WER", "n": 908, "test_set": "FLEURS es_419", "wer_pct": 4.77 }, { "language_code": "fr", "rtfx": 124.3, "metric": "WER", "n": 676, "test_set": "FLEURS fr_fr", "wer_pct": 9.42 }, { "language_code": "it", "rtfx": 150.5, "metric": "WER", "n": 865, "test_set": "FLEURS it_it", "wer_pct": 5.4 }, { "language_code": "pt", "rtfx": 141.3, "metric": "WER", "n": 919, "test_set": "FLEURS pt_br", "wer_pct": 6.18 }, { "language_code": "de", "rtfx": 141.5, "metric": "WER", "n": 862, "test_set": "FLEURS de_de", "wer_pct": 9.83 } ] }, { "path": "multilingual/560ms", "model": "multilingual", "languages_served": [ "zh", "ja", "100+ via prompt_id" ], "chunk_ms": 560, "latency_s": 0.56, "chunk_mel_frames": 56, "total_mel_frames": 65, "att_context": [ 42, 13 ], "vocab_size": 13087, "vocab_pruned": false, "vocab_prune_method": "none (full vocab)", "components": [ "decoder", "decoder_joint", "encoder", "joint", "preprocessor" ], "formats": [ "mlmodelc" ], "benchmarks": [ { "language_code": "zh", "rtfx": 22.5, "metric": "CER", "n": 945, "test_set": "FLEURS cmn_hans_cn", "cer_pct": 19.48 }, { "language_code": "ja", "rtfx": 20.7, "metric": "CER", "n": 650, "test_set": "FLEURS ja_jp", "cer_pct": 14.61 }, { "language_code": "multilingual", "rtfx": 23.4, "metric": "WER", "n": 647, "test_set": "FLEURS en_us", "wer_pct": 9.15 } ] }, { "path": "multilingual/1120ms", "model": "multilingual", "languages_served": [ "zh", "ja", "100+ via prompt_id" ], "chunk_ms": 1120, "latency_s": 1.12, "chunk_mel_frames": 112, "total_mel_frames": 121, "att_context": [ 42, 13 ], "vocab_size": 13087, "vocab_pruned": false, "vocab_prune_method": "none (full vocab)", "components": [ "decoder", "decoder_joint", "encoder", "joint", "preprocessor" ], "formats": [ "mlmodelc" ], "benchmarks": [ { "language_code": "zh", "rtfx": 26.6, "metric": "CER", "n": 945, "test_set": "FLEURS cmn_hans_cn", "cer_pct": 18.75 }, { "language_code": "ja", "rtfx": 25.9, "metric": "CER", "n": 650, "test_set": "FLEURS ja_jp", "cer_pct": 13.77 }, { "language_code": "multilingual", "rtfx": 70.9, "metric": "WER", "n": 647, "test_set": "FLEURS en_us", "wer_pct": 8.64 } ] }, { "path": "multilingual/2240ms", "model": "multilingual", "languages_served": [ "zh", "ja", "100+ via prompt_id" ], "chunk_ms": 2240, "latency_s": 2.24, "chunk_mel_frames": 224, "total_mel_frames": 233, "att_context": [ 42, 13 ], "vocab_size": 13087, "vocab_pruned": false, "vocab_prune_method": "none (full vocab)", "components": [ "decoder", "decoder_joint", "encoder", "joint", "preprocessor" ], "formats": [ "mlmodelc" ], "benchmarks": [ { "language_code": "zh", "rtfx": 89.0, "metric": "CER", "n": 945, "test_set": "FLEURS cmn_hans_cn", "cer_pct": 18.57 }, { "language_code": "ja", "rtfx": 84.2, "metric": "CER", "n": 650, "test_set": "FLEURS ja_jp", "cer_pct": 13.79 }, { "language_code": "multilingual", "rtfx": 80.4, "metric": "WER", "n": 647, "test_set": "FLEURS en_us", "wer_pct": 8.76 } ] }, { "path": "multilingual/4480ms", "model": "multilingual", "languages_served": [ "zh", "ja", "100+ via prompt_id" ], "chunk_ms": 4480, "latency_s": 4.48, "chunk_mel_frames": 448, "total_mel_frames": 457, "att_context": [ 42, 13 ], "vocab_size": 13087, "vocab_pruned": false, "vocab_prune_method": "none (full vocab)", "components": [ "decoder", "decoder_joint", "encoder", "joint", "preprocessor" ], "formats": [ "mlmodelc" ], "benchmarks": [ { "language_code": "zh", "rtfx": 89.6, "metric": "CER", "n": 945, "test_set": "FLEURS cmn_hans_cn", "cer_pct": 18.05 }, { "language_code": "ja", "rtfx": 89.3, "metric": "CER", "n": 650, "test_set": "FLEURS ja_jp", "cer_pct": 13.82 }, { "language_code": "multilingual", "rtfx": 78.0, "metric": "WER", "n": 647, "test_set": "FLEURS en_us", "wer_pct": 8.78 } ] } ] }