alexwengg's picture
manifest: 2 models (latin + multilingual) x 4 tiers, per-language benchmarks
87d194b verified
{
"name": "Nemotron 3.5 ASR Streaming Multilingual 0.6B \u2014 CoreML",
"base_model": "nvidia/nemotron-asr-streaming-multilingual-0.6b",
"base_model_checkpoint": "2026-05-29 update",
"architecture": "Conformer encoder + RNN-T decoder",
"runtime": "CoreML / Apple Neural Engine",
"benchmark_machine": "Apple M5 Pro / macOS 26.5",
"benchmark_dataset": "FLEURS test (all languages)",
"models": {
"latin": {
"folder": "latin",
"vocab_size": 2828,
"serves": [
"en",
"es",
"fr",
"it",
"pt",
"de"
]
},
"multilingual": {
"folder": "multilingual",
"vocab_size": 13087,
"serves": [
"zh",
"ja",
"100+ via prompt_id"
]
}
},
"recipe": "LAYERPOS [42,13] mixed-precision encoder (INT8 cuff + 6-bit middle) + Latin-script vocab prune (latin model) / full vocab (multilingual) + B1 decoder-joint fusion + triple-stage pipelining. Encoder shared across both models per tier.",
"no_retraining": true,
"no_calibration": true,
"tiers_ms": [
560,
1120,
2240,
4480
],
"recommended_tier_ms": 2240,
"ships": [
{
"path": "latin/560ms",
"model": "latin",
"languages_served": [
"en",
"es",
"fr",
"it",
"pt",
"de"
],
"chunk_ms": 560,
"latency_s": 0.56,
"chunk_mel_frames": 56,
"total_mel_frames": 65,
"att_context": [
42,
13
],
"vocab_size": 2828,
"vocab_pruned": true,
"vocab_prune_method": "latin-script (writing-system, domain-general)",
"components": [
"decoder",
"decoder_joint",
"encoder",
"joint",
"preprocessor"
],
"formats": [
"mlmodelc"
],
"benchmarks": [
{
"language_code": "en",
"rtfx": 57.5,
"metric": "WER",
"n": 647,
"test_set": "FLEURS en_us",
"wer_pct": 9.43
},
{
"language_code": "es",
"rtfx": 58.2,
"metric": "WER",
"n": 908,
"test_set": "FLEURS es_419",
"wer_pct": 4.95
},
{
"language_code": "fr",
"rtfx": 57.4,
"metric": "WER",
"n": 676,
"test_set": "FLEURS fr_fr",
"wer_pct": 9.68
},
{
"language_code": "it",
"rtfx": 59.0,
"metric": "WER",
"n": 865,
"test_set": "FLEURS it_it",
"wer_pct": 5.68
},
{
"language_code": "pt",
"rtfx": 58.7,
"metric": "WER",
"n": 919,
"test_set": "FLEURS pt_br",
"wer_pct": 6.38
},
{
"language_code": "de",
"rtfx": 58.8,
"metric": "WER",
"n": 862,
"test_set": "FLEURS de_de",
"wer_pct": 10.83
}
]
},
{
"path": "latin/1120ms",
"model": "latin",
"languages_served": [
"en",
"es",
"fr",
"it",
"pt",
"de"
],
"chunk_ms": 1120,
"latency_s": 1.12,
"chunk_mel_frames": 112,
"total_mel_frames": 121,
"att_context": [
42,
13
],
"vocab_size": 2828,
"vocab_pruned": true,
"vocab_prune_method": "latin-script (writing-system, domain-general)",
"components": [
"decoder",
"decoder_joint",
"encoder",
"joint",
"preprocessor"
],
"formats": [
"mlmodelc"
],
"benchmarks": [
{
"language_code": "en",
"rtfx": 102.9,
"metric": "WER",
"n": 647,
"test_set": "FLEURS en_us",
"wer_pct": 8.89
},
{
"language_code": "es",
"rtfx": 106.5,
"metric": "WER",
"n": 908,
"test_set": "FLEURS es_419",
"wer_pct": 4.76
},
{
"language_code": "fr",
"rtfx": 104.7,
"metric": "WER",
"n": 676,
"test_set": "FLEURS fr_fr",
"wer_pct": 9.44
},
{
"language_code": "it",
"rtfx": 109.0,
"metric": "WER",
"n": 865,
"test_set": "FLEURS it_it",
"wer_pct": 5.45
},
{
"language_code": "pt",
"rtfx": 107.6,
"metric": "WER",
"n": 919,
"test_set": "FLEURS pt_br",
"wer_pct": 6.11
},
{
"language_code": "de",
"rtfx": 107.2,
"metric": "WER",
"n": 862,
"test_set": "FLEURS de_de",
"wer_pct": 9.78
}
]
},
{
"path": "latin/2240ms",
"model": "latin",
"languages_served": [
"en",
"es",
"fr",
"it",
"pt",
"de"
],
"chunk_ms": 2240,
"latency_s": 2.24,
"chunk_mel_frames": 224,
"total_mel_frames": 233,
"att_context": [
42,
13
],
"vocab_size": 2828,
"vocab_pruned": true,
"vocab_prune_method": "latin-script (writing-system, domain-general)",
"components": [
"decoder",
"decoder_joint",
"encoder",
"joint",
"preprocessor"
],
"formats": [
"mlmodelc"
],
"benchmarks": [
{
"language_code": "en",
"rtfx": 130.2,
"metric": "WER",
"n": 647,
"test_set": "FLEURS en_us",
"wer_pct": 8.96
},
{
"language_code": "es",
"rtfx": 139.6,
"metric": "WER",
"n": 908,
"test_set": "FLEURS es_419",
"wer_pct": 4.8
},
{
"language_code": "fr",
"rtfx": 130.4,
"metric": "WER",
"n": 676,
"test_set": "FLEURS fr_fr",
"wer_pct": 9.52
},
{
"language_code": "it",
"rtfx": 146.7,
"metric": "WER",
"n": 865,
"test_set": "FLEURS it_it",
"wer_pct": 5.41
},
{
"language_code": "pt",
"rtfx": 141.0,
"metric": "WER",
"n": 919,
"test_set": "FLEURS pt_br",
"wer_pct": 6.14
},
{
"language_code": "de",
"rtfx": 144.4,
"metric": "WER",
"n": 862,
"test_set": "FLEURS de_de",
"wer_pct": 9.83
}
]
},
{
"path": "latin/4480ms",
"model": "latin",
"languages_served": [
"en",
"es",
"fr",
"it",
"pt",
"de"
],
"chunk_ms": 4480,
"latency_s": 4.48,
"chunk_mel_frames": 448,
"total_mel_frames": 457,
"att_context": [
42,
13
],
"vocab_size": 2828,
"vocab_pruned": true,
"vocab_prune_method": "latin-script (writing-system, domain-general)",
"components": [
"decoder",
"decoder_joint",
"encoder",
"joint",
"preprocessor"
],
"formats": [
"mlmodelc"
],
"benchmarks": [
{
"language_code": "en",
"rtfx": 122.1,
"metric": "WER",
"n": 647,
"test_set": "FLEURS en_us",
"wer_pct": 9.02
},
{
"language_code": "es",
"rtfx": 135.8,
"metric": "WER",
"n": 908,
"test_set": "FLEURS es_419",
"wer_pct": 4.77
},
{
"language_code": "fr",
"rtfx": 124.3,
"metric": "WER",
"n": 676,
"test_set": "FLEURS fr_fr",
"wer_pct": 9.42
},
{
"language_code": "it",
"rtfx": 150.5,
"metric": "WER",
"n": 865,
"test_set": "FLEURS it_it",
"wer_pct": 5.4
},
{
"language_code": "pt",
"rtfx": 141.3,
"metric": "WER",
"n": 919,
"test_set": "FLEURS pt_br",
"wer_pct": 6.18
},
{
"language_code": "de",
"rtfx": 141.5,
"metric": "WER",
"n": 862,
"test_set": "FLEURS de_de",
"wer_pct": 9.83
}
]
},
{
"path": "multilingual/560ms",
"model": "multilingual",
"languages_served": [
"zh",
"ja",
"100+ via prompt_id"
],
"chunk_ms": 560,
"latency_s": 0.56,
"chunk_mel_frames": 56,
"total_mel_frames": 65,
"att_context": [
42,
13
],
"vocab_size": 13087,
"vocab_pruned": false,
"vocab_prune_method": "none (full vocab)",
"components": [
"decoder",
"decoder_joint",
"encoder",
"joint",
"preprocessor"
],
"formats": [
"mlmodelc"
],
"benchmarks": [
{
"language_code": "zh",
"rtfx": 22.5,
"metric": "CER",
"n": 945,
"test_set": "FLEURS cmn_hans_cn",
"cer_pct": 19.48
},
{
"language_code": "ja",
"rtfx": 20.7,
"metric": "CER",
"n": 650,
"test_set": "FLEURS ja_jp",
"cer_pct": 14.61
},
{
"language_code": "en",
"rtfx": 23.4,
"metric": "WER",
"n": 647,
"test_set": "FLEURS en_us",
"wer_pct": 9.15
}
]
},
{
"path": "multilingual/1120ms",
"model": "multilingual",
"languages_served": [
"zh",
"ja",
"100+ via prompt_id"
],
"chunk_ms": 1120,
"latency_s": 1.12,
"chunk_mel_frames": 112,
"total_mel_frames": 121,
"att_context": [
42,
13
],
"vocab_size": 13087,
"vocab_pruned": false,
"vocab_prune_method": "none (full vocab)",
"components": [
"decoder",
"decoder_joint",
"encoder",
"joint",
"preprocessor"
],
"formats": [
"mlmodelc"
],
"benchmarks": [
{
"language_code": "zh",
"rtfx": 26.6,
"metric": "CER",
"n": 945,
"test_set": "FLEURS cmn_hans_cn",
"cer_pct": 18.75
},
{
"language_code": "ja",
"rtfx": 25.9,
"metric": "CER",
"n": 650,
"test_set": "FLEURS ja_jp",
"cer_pct": 13.77
},
{
"language_code": "en",
"rtfx": 70.9,
"metric": "WER",
"n": 647,
"test_set": "FLEURS en_us",
"wer_pct": 8.64
}
]
},
{
"path": "multilingual/2240ms",
"model": "multilingual",
"languages_served": [
"zh",
"ja",
"100+ via prompt_id"
],
"chunk_ms": 2240,
"latency_s": 2.24,
"chunk_mel_frames": 224,
"total_mel_frames": 233,
"att_context": [
42,
13
],
"vocab_size": 13087,
"vocab_pruned": false,
"vocab_prune_method": "none (full vocab)",
"components": [
"decoder",
"decoder_joint",
"encoder",
"joint",
"preprocessor"
],
"formats": [
"mlmodelc"
],
"benchmarks": [
{
"language_code": "zh",
"rtfx": 89.0,
"metric": "CER",
"n": 945,
"test_set": "FLEURS cmn_hans_cn",
"cer_pct": 18.57
},
{
"language_code": "ja",
"rtfx": 84.2,
"metric": "CER",
"n": 650,
"test_set": "FLEURS ja_jp",
"cer_pct": 13.79
},
{
"language_code": "en",
"rtfx": 80.4,
"metric": "WER",
"n": 647,
"test_set": "FLEURS en_us",
"wer_pct": 8.76
}
]
},
{
"path": "multilingual/4480ms",
"model": "multilingual",
"languages_served": [
"zh",
"ja",
"100+ via prompt_id"
],
"chunk_ms": 4480,
"latency_s": 4.48,
"chunk_mel_frames": 448,
"total_mel_frames": 457,
"att_context": [
42,
13
],
"vocab_size": 13087,
"vocab_pruned": false,
"vocab_prune_method": "none (full vocab)",
"components": [
"decoder",
"decoder_joint",
"encoder",
"joint",
"preprocessor"
],
"formats": [
"mlmodelc"
],
"benchmarks": [
{
"language_code": "zh",
"rtfx": 89.6,
"metric": "CER",
"n": 945,
"test_set": "FLEURS cmn_hans_cn",
"cer_pct": 18.05
},
{
"language_code": "ja",
"rtfx": 89.3,
"metric": "CER",
"n": 650,
"test_set": "FLEURS ja_jp",
"cer_pct": 13.82
},
{
"language_code": "en",
"rtfx": 78.0,
"metric": "WER",
"n": 647,
"test_set": "FLEURS en_us",
"wer_pct": 8.78
}
]
}
]
}