alexwengg commited on
Commit
aeae878
·
verified ·
1 Parent(s): 925e256

v2 manifest: 36 ships, FLEURS benchmarks, mlmodelc-only B1

Browse files
Files changed (1) hide show
  1. manifest.json +256 -378
manifest.json CHANGED
@@ -1,10 +1,12 @@
1
  {
2
- "name": "Nemotron 3.5 ASR Streaming Multilingual 0.6B CoreML",
3
  "base_model": "nvidia/nemotron-asr-streaming-multilingual-0.6b",
 
4
  "architecture": "Conformer encoder + RNN-T decoder",
5
  "runtime": "CoreML / Apple Neural Engine",
6
  "benchmark_machine": "Apple M5 Pro / macOS 26.5",
7
- "recipe": "LAYERPOS[42,13] mixed-precision encoder + per-language vocab pruning + B1 fusion + triple-stage pipelining + smart-spec K=4",
 
8
  "no_retraining": true,
9
  "no_calibration": true,
10
  "tiers_ms": [
@@ -14,26 +16,10 @@
14
  4480
15
  ],
16
  "recommended_tier_ms": 2240,
17
- "models": [
18
- "de",
19
- "en",
20
- "es",
21
- "fr",
22
- "it",
23
- "ja",
24
- "multilingual",
25
- "pt",
26
- "zh"
27
- ],
28
- "ship_count": 36,
29
- "notes": [
30
- "2240ms (2s) is the RTFx-per-latency sweet spot for every model (+20-44% over 1120ms, quality-neutral).",
31
- "Multilingual peaks at 2240ms (74.6 RTFx); its 4480ms tier craters (19.4) because the 13088-vocab joint exceeds ANE working-set.",
32
- "560ms is the lowest-latency tier but off the trained 14-frame attention tiling: lower RTFx (~57) and a small quality cost. en measured (57.2 RTFx); other 560ms ships shipped unbenched.",
33
- "Portuguese 2240ms WER is +3.1pp vs 1120ms (B1-fallback path, chunk-sensitive).",
34
- "de/zh/ja keep-sets were derived from FLEURS-test transcripts: in-domain/optimistic numbers + OOV risk on out-of-domain text. Rebuild keep-set from a broader corpus for production.",
35
- "Each <lang>/<tier>ms/ dir is a self-contained FluidAudio model bundle (point --model-dir at it)."
36
- ],
37
  "ships": [
38
  {
39
  "path": "en/560ms",
@@ -47,31 +33,27 @@
47
  42,
48
  13
49
  ],
50
- "vocab_size": 988,
51
  "vocab_pruned": true,
 
52
  "components": [
53
  "decoder",
54
  "decoder_joint",
55
- "decoder_joint_noencproj",
56
  "encoder",
57
  "joint",
58
- "joint_noencproj_batched",
59
  "preprocessor"
60
  ],
61
  "formats": [
62
- "mlpackage",
63
  "mlmodelc"
64
  ],
65
  "benchmark": {
66
- "rtfx": 57.2,
67
- "wer_pct": 4.1,
68
- "cer_pct": 1.5,
69
  "metric": "WER",
70
- "n": 2620,
71
- "test_set": "LibriSpeech test-clean",
72
  "benched": true
73
- },
74
- "recommended": false
75
  },
76
  {
77
  "path": "en/1120ms",
@@ -85,31 +67,27 @@
85
  42,
86
  13
87
  ],
88
- "vocab_size": 988,
89
  "vocab_pruned": true,
 
90
  "components": [
91
  "decoder",
92
  "decoder_joint",
93
- "decoder_joint_noencproj",
94
  "encoder",
95
  "joint",
96
- "joint_noencproj_batched",
97
  "preprocessor"
98
  ],
99
  "formats": [
100
- "mlpackage",
101
  "mlmodelc"
102
  ],
103
  "benchmark": {
104
- "rtfx": 101.4,
105
- "wer_pct": 3.81,
106
- "cer_pct": 1.4,
107
  "metric": "WER",
108
- "n": 2620,
109
- "test_set": "LibriSpeech test-clean",
110
  "benched": true
111
- },
112
- "recommended": false
113
  },
114
  {
115
  "path": "en/2240ms",
@@ -123,31 +101,27 @@
123
  42,
124
  13
125
  ],
126
- "vocab_size": 988,
127
  "vocab_pruned": true,
 
128
  "components": [
129
  "decoder",
130
  "decoder_joint",
131
- "decoder_joint_noencproj",
132
  "encoder",
133
  "joint",
134
- "joint_noencproj_batched",
135
  "preprocessor"
136
  ],
137
  "formats": [
138
- "mlpackage",
139
  "mlmodelc"
140
  ],
141
  "benchmark": {
142
- "rtfx": 134.7,
143
- "wer_pct": 3.7,
144
- "cer_pct": 1.4,
145
  "metric": "WER",
146
- "n": 2620,
147
- "test_set": "LibriSpeech test-clean",
148
  "benched": true
149
- },
150
- "recommended": true
151
  },
152
  {
153
  "path": "en/4480ms",
@@ -158,34 +132,30 @@
158
  "chunk_mel_frames": 448,
159
  "total_mel_frames": 457,
160
  "att_context": [
161
- 56,
162
  13
163
  ],
164
- "vocab_size": 989,
165
  "vocab_pruned": true,
 
166
  "components": [
167
  "decoder",
168
  "decoder_joint",
169
- "decoder_joint_noencproj",
170
  "encoder",
171
  "joint",
172
- "joint_noencproj_batched",
173
  "preprocessor"
174
  ],
175
  "formats": [
176
- "mlpackage",
177
  "mlmodelc"
178
  ],
179
  "benchmark": {
180
- "rtfx": 136.7,
181
- "wer_pct": 3.7,
182
- "cer_pct": 1.4,
183
  "metric": "WER",
184
- "n": 2620,
185
- "test_set": "LibriSpeech test-clean",
186
  "benched": true
187
- },
188
- "recommended": false
189
  },
190
  {
191
  "path": "es/560ms",
@@ -199,8 +169,9 @@
199
  42,
200
  13
201
  ],
202
- "vocab_size": 831,
203
  "vocab_pruned": true,
 
204
  "components": [
205
  "decoder",
206
  "decoder_joint",
@@ -209,19 +180,16 @@
209
  "preprocessor"
210
  ],
211
  "formats": [
212
- "mlpackage",
213
  "mlmodelc"
214
  ],
215
  "benchmark": {
216
- "rtfx": null,
217
- "wer_pct": null,
218
- "cer_pct": null,
219
  "metric": "WER",
220
- "n": 0,
221
- "test_set": "MLS es_419",
222
- "benched": false
223
- },
224
- "recommended": false
225
  },
226
  {
227
  "path": "es/1120ms",
@@ -235,8 +203,9 @@
235
  42,
236
  13
237
  ],
238
- "vocab_size": 831,
239
  "vocab_pruned": true,
 
240
  "components": [
241
  "decoder",
242
  "decoder_joint",
@@ -245,19 +214,16 @@
245
  "preprocessor"
246
  ],
247
  "formats": [
248
- "mlpackage",
249
  "mlmodelc"
250
  ],
251
  "benchmark": {
252
- "rtfx": 106.8,
253
- "wer_pct": 6.5,
254
- "cer_pct": null,
255
  "metric": "WER",
256
- "n": 2385,
257
- "test_set": "MLS es_419",
258
  "benched": true
259
- },
260
- "recommended": false
261
  },
262
  {
263
  "path": "es/2240ms",
@@ -271,8 +237,9 @@
271
  42,
272
  13
273
  ],
274
- "vocab_size": 831,
275
  "vocab_pruned": true,
 
276
  "components": [
277
  "decoder",
278
  "decoder_joint",
@@ -281,19 +248,16 @@
281
  "preprocessor"
282
  ],
283
  "formats": [
284
- "mlpackage",
285
  "mlmodelc"
286
  ],
287
  "benchmark": {
288
- "rtfx": 153.6,
289
- "wer_pct": 6.8,
290
- "cer_pct": null,
291
  "metric": "WER",
292
- "n": 2385,
293
- "test_set": "MLS es_419",
294
  "benched": true
295
- },
296
- "recommended": true
297
  },
298
  {
299
  "path": "es/4480ms",
@@ -307,31 +271,27 @@
307
  42,
308
  13
309
  ],
310
- "vocab_size": 831,
311
  "vocab_pruned": true,
 
312
  "components": [
313
  "decoder",
314
  "decoder_joint",
315
- "decoder_joint_noencproj",
316
  "encoder",
317
  "joint",
318
- "joint_noencproj_batched",
319
  "preprocessor"
320
  ],
321
  "formats": [
322
- "mlpackage",
323
  "mlmodelc"
324
  ],
325
  "benchmark": {
326
- "rtfx": 139.0,
327
- "wer_pct": 6.4,
328
- "cer_pct": null,
329
  "metric": "WER",
330
- "n": 2385,
331
- "test_set": "MLS es_419",
332
  "benched": true
333
- },
334
- "recommended": false
335
  },
336
  {
337
  "path": "fr/560ms",
@@ -345,31 +305,27 @@
345
  42,
346
  13
347
  ],
348
- "vocab_size": 848,
349
  "vocab_pruned": true,
 
350
  "components": [
351
  "decoder",
352
  "decoder_joint",
353
- "decoder_joint_noencproj",
354
  "encoder",
355
  "joint",
356
- "joint_noencproj_batched",
357
  "preprocessor"
358
  ],
359
  "formats": [
360
- "mlpackage",
361
  "mlmodelc"
362
  ],
363
  "benchmark": {
364
- "rtfx": null,
365
- "wer_pct": null,
366
- "cer_pct": null,
367
  "metric": "WER",
368
- "n": 0,
369
- "test_set": "MLS fr_fr",
370
- "benched": false
371
- },
372
- "recommended": false
373
  },
374
  {
375
  "path": "fr/1120ms",
@@ -383,31 +339,27 @@
383
  42,
384
  13
385
  ],
386
- "vocab_size": 848,
387
  "vocab_pruned": true,
 
388
  "components": [
389
  "decoder",
390
  "decoder_joint",
391
- "decoder_joint_noencproj",
392
  "encoder",
393
  "joint",
394
- "joint_noencproj_batched",
395
  "preprocessor"
396
  ],
397
  "formats": [
398
- "mlpackage",
399
  "mlmodelc"
400
  ],
401
  "benchmark": {
402
- "rtfx": 109.5,
403
- "wer_pct": 9.9,
404
- "cer_pct": null,
405
  "metric": "WER",
406
- "n": 2426,
407
- "test_set": "MLS fr_fr",
408
  "benched": true
409
- },
410
- "recommended": false
411
  },
412
  {
413
  "path": "fr/2240ms",
@@ -421,31 +373,27 @@
421
  42,
422
  13
423
  ],
424
- "vocab_size": 848,
425
  "vocab_pruned": true,
 
426
  "components": [
427
  "decoder",
428
  "decoder_joint",
429
- "decoder_joint_noencproj",
430
  "encoder",
431
  "joint",
432
- "joint_noencproj_batched",
433
  "preprocessor"
434
  ],
435
  "formats": [
436
- "mlpackage",
437
  "mlmodelc"
438
  ],
439
  "benchmark": {
440
- "rtfx": 134.4,
441
- "wer_pct": 10.4,
442
- "cer_pct": null,
443
  "metric": "WER",
444
- "n": 2426,
445
- "test_set": "MLS fr_fr",
446
  "benched": true
447
- },
448
- "recommended": true
449
  },
450
  {
451
  "path": "fr/4480ms",
@@ -459,31 +407,27 @@
459
  42,
460
  13
461
  ],
462
- "vocab_size": 848,
463
  "vocab_pruned": true,
 
464
  "components": [
465
  "decoder",
466
  "decoder_joint",
467
- "decoder_joint_noencproj",
468
  "encoder",
469
  "joint",
470
- "joint_noencproj_batched",
471
  "preprocessor"
472
  ],
473
  "formats": [
474
- "mlpackage",
475
  "mlmodelc"
476
  ],
477
  "benchmark": {
478
- "rtfx": 130.2,
479
- "wer_pct": 16.8,
480
- "cer_pct": null,
481
  "metric": "WER",
482
- "n": 100,
483
- "test_set": "MLS fr_fr",
484
  "benched": true
485
- },
486
- "recommended": false
487
  },
488
  {
489
  "path": "it/560ms",
@@ -497,31 +441,27 @@
497
  42,
498
  13
499
  ],
500
- "vocab_size": 805,
501
  "vocab_pruned": true,
 
502
  "components": [
503
  "decoder",
504
  "decoder_joint",
505
- "decoder_joint_noencproj",
506
  "encoder",
507
  "joint",
508
- "joint_noencproj_batched",
509
  "preprocessor"
510
  ],
511
  "formats": [
512
- "mlpackage",
513
  "mlmodelc"
514
  ],
515
  "benchmark": {
516
- "rtfx": null,
517
- "wer_pct": null,
518
- "cer_pct": null,
519
  "metric": "WER",
520
- "n": 0,
521
- "test_set": "MLS it_it",
522
- "benched": false
523
- },
524
- "recommended": false
525
  },
526
  {
527
  "path": "it/1120ms",
@@ -535,31 +475,27 @@
535
  42,
536
  13
537
  ],
538
- "vocab_size": 805,
539
  "vocab_pruned": true,
 
540
  "components": [
541
  "decoder",
542
  "decoder_joint",
543
- "decoder_joint_noencproj",
544
  "encoder",
545
  "joint",
546
- "joint_noencproj_batched",
547
  "preprocessor"
548
  ],
549
  "formats": [
550
- "mlpackage",
551
  "mlmodelc"
552
  ],
553
  "benchmark": {
554
- "rtfx": 109.2,
555
- "wer_pct": 23.0,
556
- "cer_pct": null,
557
  "metric": "WER",
558
- "n": 1262,
559
- "test_set": "MLS it_it",
560
  "benched": true
561
- },
562
- "recommended": false
563
  },
564
  {
565
  "path": "it/2240ms",
@@ -573,31 +509,27 @@
573
  42,
574
  13
575
  ],
576
- "vocab_size": 805,
577
  "vocab_pruned": true,
 
578
  "components": [
579
  "decoder",
580
  "decoder_joint",
581
- "decoder_joint_noencproj",
582
  "encoder",
583
  "joint",
584
- "joint_noencproj_batched",
585
  "preprocessor"
586
  ],
587
  "formats": [
588
- "mlpackage",
589
  "mlmodelc"
590
  ],
591
  "benchmark": {
592
- "rtfx": 136.8,
593
- "wer_pct": 18.0,
594
- "cer_pct": null,
595
  "metric": "WER",
596
- "n": 1262,
597
- "test_set": "MLS it_it",
598
  "benched": true
599
- },
600
- "recommended": true
601
  },
602
  {
603
  "path": "it/4480ms",
@@ -611,31 +543,27 @@
611
  42,
612
  13
613
  ],
614
- "vocab_size": 805,
615
  "vocab_pruned": true,
 
616
  "components": [
617
  "decoder",
618
  "decoder_joint",
619
- "decoder_joint_noencproj",
620
  "encoder",
621
  "joint",
622
- "joint_noencproj_batched",
623
  "preprocessor"
624
  ],
625
  "formats": [
626
- "mlpackage",
627
  "mlmodelc"
628
  ],
629
  "benchmark": {
630
- "rtfx": 134.6,
631
- "wer_pct": 25.6,
632
- "cer_pct": null,
633
  "metric": "WER",
634
- "n": 100,
635
- "test_set": "MLS it_it",
636
  "benched": true
637
- },
638
- "recommended": false
639
  },
640
  {
641
  "path": "pt/560ms",
@@ -649,8 +577,9 @@
649
  42,
650
  13
651
  ],
652
- "vocab_size": 870,
653
  "vocab_pruned": true,
 
654
  "components": [
655
  "decoder",
656
  "decoder_joint",
@@ -659,19 +588,16 @@
659
  "preprocessor"
660
  ],
661
  "formats": [
662
- "mlpackage",
663
  "mlmodelc"
664
  ],
665
  "benchmark": {
666
- "rtfx": null,
667
- "wer_pct": null,
668
- "cer_pct": null,
669
  "metric": "WER",
670
- "n": 0,
671
- "test_set": "MLS pt_br",
672
- "benched": false
673
- },
674
- "recommended": false
675
  },
676
  {
677
  "path": "pt/1120ms",
@@ -685,8 +611,9 @@
685
  42,
686
  13
687
  ],
688
- "vocab_size": 870,
689
  "vocab_pruned": true,
 
690
  "components": [
691
  "decoder",
692
  "decoder_joint",
@@ -695,19 +622,16 @@
695
  "preprocessor"
696
  ],
697
  "formats": [
698
- "mlpackage",
699
  "mlmodelc"
700
  ],
701
  "benchmark": {
702
- "rtfx": 111.4,
703
- "wer_pct": 9.8,
704
- "cer_pct": null,
705
  "metric": "WER",
706
- "n": 871,
707
- "test_set": "MLS pt_br",
708
  "benched": true
709
- },
710
- "recommended": false
711
  },
712
  {
713
  "path": "pt/2240ms",
@@ -721,8 +645,9 @@
721
  42,
722
  13
723
  ],
724
- "vocab_size": 870,
725
  "vocab_pruned": true,
 
726
  "components": [
727
  "decoder",
728
  "decoder_joint",
@@ -731,19 +656,16 @@
731
  "preprocessor"
732
  ],
733
  "formats": [
734
- "mlpackage",
735
  "mlmodelc"
736
  ],
737
  "benchmark": {
738
- "rtfx": 155.2,
739
- "wer_pct": 12.9,
740
- "cer_pct": null,
741
  "metric": "WER",
742
- "n": 871,
743
- "test_set": "MLS pt_br",
744
  "benched": true
745
- },
746
- "recommended": true
747
  },
748
  {
749
  "path": "pt/4480ms",
@@ -757,31 +679,27 @@
757
  42,
758
  13
759
  ],
760
- "vocab_size": 870,
761
  "vocab_pruned": true,
 
762
  "components": [
763
  "decoder",
764
  "decoder_joint",
765
- "decoder_joint_noencproj",
766
  "encoder",
767
  "joint",
768
- "joint_noencproj_batched",
769
  "preprocessor"
770
  ],
771
  "formats": [
772
- "mlpackage",
773
  "mlmodelc"
774
  ],
775
  "benchmark": {
776
- "rtfx": 134.9,
777
- "wer_pct": 10.3,
778
- "cer_pct": null,
779
  "metric": "WER",
780
- "n": 871,
781
- "test_set": "MLS pt_br",
782
  "benched": true
783
- },
784
- "recommended": false
785
  },
786
  {
787
  "path": "de/560ms",
@@ -795,8 +713,9 @@
795
  42,
796
  13
797
  ],
798
- "vocab_size": 795,
799
  "vocab_pruned": true,
 
800
  "components": [
801
  "decoder",
802
  "decoder_joint",
@@ -805,19 +724,16 @@
805
  "preprocessor"
806
  ],
807
  "formats": [
808
- "mlpackage",
809
  "mlmodelc"
810
  ],
811
  "benchmark": {
812
- "rtfx": null,
813
- "wer_pct": null,
814
- "cer_pct": null,
815
  "metric": "WER",
816
- "n": 0,
817
  "test_set": "FLEURS de_de",
818
- "benched": false
819
- },
820
- "recommended": false
821
  },
822
  {
823
  "path": "de/1120ms",
@@ -831,8 +747,9 @@
831
  42,
832
  13
833
  ],
834
- "vocab_size": 795,
835
  "vocab_pruned": true,
 
836
  "components": [
837
  "decoder",
838
  "decoder_joint",
@@ -841,19 +758,16 @@
841
  "preprocessor"
842
  ],
843
  "formats": [
844
- "mlpackage",
845
  "mlmodelc"
846
  ],
847
  "benchmark": {
848
- "rtfx": 107.5,
849
- "wer_pct": 11.2,
850
- "cer_pct": 3.6,
851
  "metric": "WER",
852
  "n": 862,
853
  "test_set": "FLEURS de_de",
854
  "benched": true
855
- },
856
- "recommended": false
857
  },
858
  {
859
  "path": "de/2240ms",
@@ -867,8 +781,9 @@
867
  42,
868
  13
869
  ],
870
- "vocab_size": 795,
871
  "vocab_pruned": true,
 
872
  "components": [
873
  "decoder",
874
  "decoder_joint",
@@ -877,19 +792,16 @@
877
  "preprocessor"
878
  ],
879
  "formats": [
880
- "mlpackage",
881
  "mlmodelc"
882
  ],
883
  "benchmark": {
884
- "rtfx": 150.1,
885
- "wer_pct": 11.0,
886
- "cer_pct": 3.8,
887
  "metric": "WER",
888
  "n": 862,
889
  "test_set": "FLEURS de_de",
890
  "benched": true
891
- },
892
- "recommended": true
893
  },
894
  {
895
  "path": "de/4480ms",
@@ -903,8 +815,9 @@
903
  42,
904
  13
905
  ],
906
- "vocab_size": 795,
907
  "vocab_pruned": true,
 
908
  "components": [
909
  "decoder",
910
  "decoder_joint",
@@ -913,19 +826,16 @@
913
  "preprocessor"
914
  ],
915
  "formats": [
916
- "mlpackage",
917
  "mlmodelc"
918
  ],
919
  "benchmark": {
920
- "rtfx": 151.9,
921
- "wer_pct": 11.0,
922
- "cer_pct": 3.9,
923
  "metric": "WER",
924
  "n": 862,
925
  "test_set": "FLEURS de_de",
926
  "benched": true
927
- },
928
- "recommended": false
929
  },
930
  {
931
  "path": "zh/560ms",
@@ -939,8 +849,9 @@
939
  42,
940
  13
941
  ],
942
- "vocab_size": 1875,
943
- "vocab_pruned": true,
 
944
  "components": [
945
  "decoder",
946
  "decoder_joint",
@@ -949,19 +860,16 @@
949
  "preprocessor"
950
  ],
951
  "formats": [
952
- "mlpackage",
953
  "mlmodelc"
954
  ],
955
  "benchmark": {
956
- "rtfx": null,
957
- "wer_pct": null,
958
- "cer_pct": null,
959
  "metric": "CER",
960
- "n": 0,
961
  "test_set": "FLEURS cmn_hans_cn",
962
- "benched": false
963
- },
964
- "recommended": false
965
  },
966
  {
967
  "path": "zh/1120ms",
@@ -975,8 +883,9 @@
975
  42,
976
  13
977
  ],
978
- "vocab_size": 1875,
979
- "vocab_pruned": true,
 
980
  "components": [
981
  "decoder",
982
  "decoder_joint",
@@ -985,19 +894,16 @@
985
  "preprocessor"
986
  ],
987
  "formats": [
988
- "mlpackage",
989
  "mlmodelc"
990
  ],
991
  "benchmark": {
992
- "rtfx": 106.3,
993
- "wer_pct": null,
994
- "cer_pct": 21.9,
995
  "metric": "CER",
996
  "n": 945,
997
  "test_set": "FLEURS cmn_hans_cn",
998
  "benched": true
999
- },
1000
- "recommended": false
1001
  },
1002
  {
1003
  "path": "zh/2240ms",
@@ -1011,8 +917,9 @@
1011
  42,
1012
  13
1013
  ],
1014
- "vocab_size": 1875,
1015
- "vocab_pruned": true,
 
1016
  "components": [
1017
  "decoder",
1018
  "decoder_joint",
@@ -1021,19 +928,16 @@
1021
  "preprocessor"
1022
  ],
1023
  "formats": [
1024
- "mlpackage",
1025
  "mlmodelc"
1026
  ],
1027
  "benchmark": {
1028
- "rtfx": 146.0,
1029
- "wer_pct": null,
1030
- "cer_pct": 21.4,
1031
  "metric": "CER",
1032
  "n": 945,
1033
  "test_set": "FLEURS cmn_hans_cn",
1034
  "benched": true
1035
- },
1036
- "recommended": true
1037
  },
1038
  {
1039
  "path": "zh/4480ms",
@@ -1047,8 +951,9 @@
1047
  42,
1048
  13
1049
  ],
1050
- "vocab_size": 1875,
1051
- "vocab_pruned": true,
 
1052
  "components": [
1053
  "decoder",
1054
  "decoder_joint",
@@ -1057,19 +962,16 @@
1057
  "preprocessor"
1058
  ],
1059
  "formats": [
1060
- "mlpackage",
1061
  "mlmodelc"
1062
  ],
1063
  "benchmark": {
1064
- "rtfx": 140.2,
1065
- "wer_pct": null,
1066
- "cer_pct": 21.4,
1067
  "metric": "CER",
1068
  "n": 945,
1069
  "test_set": "FLEURS cmn_hans_cn",
1070
  "benched": true
1071
- },
1072
- "recommended": false
1073
  },
1074
  {
1075
  "path": "ja/560ms",
@@ -1083,8 +985,9 @@
1083
  42,
1084
  13
1085
  ],
1086
- "vocab_size": 1403,
1087
- "vocab_pruned": true,
 
1088
  "components": [
1089
  "decoder",
1090
  "decoder_joint",
@@ -1093,19 +996,16 @@
1093
  "preprocessor"
1094
  ],
1095
  "formats": [
1096
- "mlpackage",
1097
  "mlmodelc"
1098
  ],
1099
  "benchmark": {
1100
- "rtfx": null,
1101
- "wer_pct": null,
1102
- "cer_pct": null,
1103
  "metric": "CER",
1104
- "n": 0,
1105
  "test_set": "FLEURS ja_jp",
1106
- "benched": false
1107
- },
1108
- "recommended": false
1109
  },
1110
  {
1111
  "path": "ja/1120ms",
@@ -1119,8 +1019,9 @@
1119
  42,
1120
  13
1121
  ],
1122
- "vocab_size": 1403,
1123
- "vocab_pruned": true,
 
1124
  "components": [
1125
  "decoder",
1126
  "decoder_joint",
@@ -1129,19 +1030,16 @@
1129
  "preprocessor"
1130
  ],
1131
  "formats": [
1132
- "mlpackage",
1133
  "mlmodelc"
1134
  ],
1135
  "benchmark": {
1136
- "rtfx": 108.5,
1137
- "wer_pct": null,
1138
- "cer_pct": 15.6,
1139
  "metric": "CER",
1140
  "n": 650,
1141
  "test_set": "FLEURS ja_jp",
1142
  "benched": true
1143
- },
1144
- "recommended": false
1145
  },
1146
  {
1147
  "path": "ja/2240ms",
@@ -1155,8 +1053,9 @@
1155
  42,
1156
  13
1157
  ],
1158
- "vocab_size": 1403,
1159
- "vocab_pruned": true,
 
1160
  "components": [
1161
  "decoder",
1162
  "decoder_joint",
@@ -1165,19 +1064,16 @@
1165
  "preprocessor"
1166
  ],
1167
  "formats": [
1168
- "mlpackage",
1169
  "mlmodelc"
1170
  ],
1171
  "benchmark": {
1172
- "rtfx": 150.5,
1173
- "wer_pct": null,
1174
- "cer_pct": 15.4,
1175
  "metric": "CER",
1176
  "n": 650,
1177
  "test_set": "FLEURS ja_jp",
1178
  "benched": true
1179
- },
1180
- "recommended": true
1181
  },
1182
  {
1183
  "path": "ja/4480ms",
@@ -1191,8 +1087,9 @@
1191
  42,
1192
  13
1193
  ],
1194
- "vocab_size": 1403,
1195
- "vocab_pruned": true,
 
1196
  "components": [
1197
  "decoder",
1198
  "decoder_joint",
@@ -1201,23 +1098,20 @@
1201
  "preprocessor"
1202
  ],
1203
  "formats": [
1204
- "mlpackage",
1205
  "mlmodelc"
1206
  ],
1207
  "benchmark": {
1208
- "rtfx": 147.8,
1209
- "wer_pct": null,
1210
- "cer_pct": 15.3,
1211
  "metric": "CER",
1212
  "n": 650,
1213
  "test_set": "FLEURS ja_jp",
1214
  "benched": true
1215
- },
1216
- "recommended": false
1217
  },
1218
  {
1219
  "path": "multilingual/560ms",
1220
- "language": "Multilingual (100+ langs via prompt_id)",
1221
  "language_code": "auto",
1222
  "chunk_ms": 560,
1223
  "latency_s": 0.56,
@@ -1229,33 +1123,29 @@
1229
  ],
1230
  "vocab_size": 13087,
1231
  "vocab_pruned": false,
 
1232
  "components": [
1233
  "decoder",
1234
  "decoder_joint",
1235
- "decoder_joint_noencproj",
1236
  "encoder",
1237
  "joint",
1238
- "joint_noencproj_batched",
1239
  "preprocessor"
1240
  ],
1241
  "formats": [
1242
- "mlpackage",
1243
  "mlmodelc"
1244
  ],
1245
  "benchmark": {
1246
- "rtfx": null,
1247
- "wer_pct": null,
1248
- "cer_pct": null,
1249
  "metric": "WER",
1250
- "n": 0,
1251
- "test_set": "LibriSpeech test-clean (en)",
1252
- "benched": false
1253
- },
1254
- "recommended": false
1255
  },
1256
  {
1257
  "path": "multilingual/1120ms",
1258
- "language": "Multilingual (100+ langs via prompt_id)",
1259
  "language_code": "auto",
1260
  "chunk_ms": 1120,
1261
  "latency_s": 1.12,
@@ -1267,33 +1157,29 @@
1267
  ],
1268
  "vocab_size": 13087,
1269
  "vocab_pruned": false,
 
1270
  "components": [
1271
  "decoder",
1272
  "decoder_joint",
1273
- "decoder_joint_noencproj",
1274
  "encoder",
1275
  "joint",
1276
- "joint_noencproj_batched",
1277
  "preprocessor"
1278
  ],
1279
  "formats": [
1280
- "mlpackage",
1281
  "mlmodelc"
1282
  ],
1283
  "benchmark": {
1284
- "rtfx": 62.2,
1285
- "wer_pct": 3.8,
1286
- "cer_pct": 1.4,
1287
  "metric": "WER",
1288
- "n": 2620,
1289
- "test_set": "LibriSpeech test-clean (en)",
1290
  "benched": true
1291
- },
1292
- "recommended": false
1293
  },
1294
  {
1295
  "path": "multilingual/2240ms",
1296
- "language": "Multilingual (100+ langs via prompt_id)",
1297
  "language_code": "auto",
1298
  "chunk_ms": 2240,
1299
  "latency_s": 2.24,
@@ -1305,33 +1191,29 @@
1305
  ],
1306
  "vocab_size": 13087,
1307
  "vocab_pruned": false,
 
1308
  "components": [
1309
  "decoder",
1310
  "decoder_joint",
1311
- "decoder_joint_noencproj",
1312
  "encoder",
1313
  "joint",
1314
- "joint_noencproj_batched",
1315
  "preprocessor"
1316
  ],
1317
  "formats": [
1318
- "mlpackage",
1319
  "mlmodelc"
1320
  ],
1321
  "benchmark": {
1322
- "rtfx": 74.6,
1323
- "wer_pct": 3.7,
1324
- "cer_pct": 1.4,
1325
  "metric": "WER",
1326
- "n": 2620,
1327
- "test_set": "LibriSpeech test-clean (en)",
1328
  "benched": true
1329
- },
1330
- "recommended": true
1331
  },
1332
  {
1333
  "path": "multilingual/4480ms",
1334
- "language": "Multilingual (100+ langs via prompt_id)",
1335
  "language_code": "auto",
1336
  "chunk_ms": 4480,
1337
  "latency_s": 4.48,
@@ -1343,29 +1225,25 @@
1343
  ],
1344
  "vocab_size": 13087,
1345
  "vocab_pruned": false,
 
1346
  "components": [
1347
  "decoder",
1348
  "decoder_joint",
1349
- "decoder_joint_noencproj",
1350
  "encoder",
1351
  "joint",
1352
- "joint_noencproj_batched",
1353
  "preprocessor"
1354
  ],
1355
  "formats": [
1356
- "mlpackage",
1357
  "mlmodelc"
1358
  ],
1359
  "benchmark": {
1360
- "rtfx": 19.4,
1361
- "wer_pct": 3.7,
1362
- "cer_pct": 1.4,
1363
  "metric": "WER",
1364
- "n": 2620,
1365
- "test_set": "LibriSpeech test-clean (en)",
1366
  "benched": true
1367
- },
1368
- "recommended": false
1369
  }
1370
  ]
1371
  }
 
1
  {
2
+ "name": "Nemotron 3.5 ASR Streaming Multilingual 0.6B \u2014 CoreML",
3
  "base_model": "nvidia/nemotron-asr-streaming-multilingual-0.6b",
4
+ "base_model_checkpoint": "2026-05-29 update",
5
  "architecture": "Conformer encoder + RNN-T decoder",
6
  "runtime": "CoreML / Apple Neural Engine",
7
  "benchmark_machine": "Apple M5 Pro / macOS 26.5",
8
+ "benchmark_dataset": "FLEURS test (all languages)",
9
+ "recipe": "LAYERPOS [42,13] mixed-precision encoder (INT8 cuff + 6-bit middle) + Latin-script vocab prune (shared en/es/fr/it/pt/de) / full vocab (zh/ja/multilingual) + B1 decoder-joint fusion + triple-stage pipelining",
10
  "no_retraining": true,
11
  "no_calibration": true,
12
  "tiers_ms": [
 
16
  4480
17
  ],
18
  "recommended_tier_ms": 2240,
19
+ "vocab": {
20
+ "latin_shared": 2828,
21
+ "full": 13087
22
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  "ships": [
24
  {
25
  "path": "en/560ms",
 
33
  42,
34
  13
35
  ],
36
+ "vocab_size": 2828,
37
  "vocab_pruned": true,
38
+ "vocab_prune_method": "latin-script",
39
  "components": [
40
  "decoder",
41
  "decoder_joint",
 
42
  "encoder",
43
  "joint",
 
44
  "preprocessor"
45
  ],
46
  "formats": [
 
47
  "mlmodelc"
48
  ],
49
  "benchmark": {
50
+ "rtfx": 57.5,
51
+ "wer_pct": 9.43,
 
52
  "metric": "WER",
53
+ "n": 647,
54
+ "test_set": "FLEURS en_us",
55
  "benched": true
56
+ }
 
57
  },
58
  {
59
  "path": "en/1120ms",
 
67
  42,
68
  13
69
  ],
70
+ "vocab_size": 2828,
71
  "vocab_pruned": true,
72
+ "vocab_prune_method": "latin-script",
73
  "components": [
74
  "decoder",
75
  "decoder_joint",
 
76
  "encoder",
77
  "joint",
 
78
  "preprocessor"
79
  ],
80
  "formats": [
 
81
  "mlmodelc"
82
  ],
83
  "benchmark": {
84
+ "rtfx": 102.9,
85
+ "wer_pct": 8.89,
 
86
  "metric": "WER",
87
+ "n": 647,
88
+ "test_set": "FLEURS en_us",
89
  "benched": true
90
+ }
 
91
  },
92
  {
93
  "path": "en/2240ms",
 
101
  42,
102
  13
103
  ],
104
+ "vocab_size": 2828,
105
  "vocab_pruned": true,
106
+ "vocab_prune_method": "latin-script",
107
  "components": [
108
  "decoder",
109
  "decoder_joint",
 
110
  "encoder",
111
  "joint",
 
112
  "preprocessor"
113
  ],
114
  "formats": [
 
115
  "mlmodelc"
116
  ],
117
  "benchmark": {
118
+ "rtfx": 130.2,
119
+ "wer_pct": 8.96,
 
120
  "metric": "WER",
121
+ "n": 647,
122
+ "test_set": "FLEURS en_us",
123
  "benched": true
124
+ }
 
125
  },
126
  {
127
  "path": "en/4480ms",
 
132
  "chunk_mel_frames": 448,
133
  "total_mel_frames": 457,
134
  "att_context": [
135
+ 42,
136
  13
137
  ],
138
+ "vocab_size": 2828,
139
  "vocab_pruned": true,
140
+ "vocab_prune_method": "latin-script",
141
  "components": [
142
  "decoder",
143
  "decoder_joint",
 
144
  "encoder",
145
  "joint",
 
146
  "preprocessor"
147
  ],
148
  "formats": [
 
149
  "mlmodelc"
150
  ],
151
  "benchmark": {
152
+ "rtfx": 122.1,
153
+ "wer_pct": 9.02,
 
154
  "metric": "WER",
155
+ "n": 647,
156
+ "test_set": "FLEURS en_us",
157
  "benched": true
158
+ }
 
159
  },
160
  {
161
  "path": "es/560ms",
 
169
  42,
170
  13
171
  ],
172
+ "vocab_size": 2828,
173
  "vocab_pruned": true,
174
+ "vocab_prune_method": "latin-script",
175
  "components": [
176
  "decoder",
177
  "decoder_joint",
 
180
  "preprocessor"
181
  ],
182
  "formats": [
 
183
  "mlmodelc"
184
  ],
185
  "benchmark": {
186
+ "rtfx": 58.2,
187
+ "wer_pct": 4.95,
 
188
  "metric": "WER",
189
+ "n": 908,
190
+ "test_set": "FLEURS es_419",
191
+ "benched": true
192
+ }
 
193
  },
194
  {
195
  "path": "es/1120ms",
 
203
  42,
204
  13
205
  ],
206
+ "vocab_size": 2828,
207
  "vocab_pruned": true,
208
+ "vocab_prune_method": "latin-script",
209
  "components": [
210
  "decoder",
211
  "decoder_joint",
 
214
  "preprocessor"
215
  ],
216
  "formats": [
 
217
  "mlmodelc"
218
  ],
219
  "benchmark": {
220
+ "rtfx": 106.5,
221
+ "wer_pct": 4.76,
 
222
  "metric": "WER",
223
+ "n": 908,
224
+ "test_set": "FLEURS es_419",
225
  "benched": true
226
+ }
 
227
  },
228
  {
229
  "path": "es/2240ms",
 
237
  42,
238
  13
239
  ],
240
+ "vocab_size": 2828,
241
  "vocab_pruned": true,
242
+ "vocab_prune_method": "latin-script",
243
  "components": [
244
  "decoder",
245
  "decoder_joint",
 
248
  "preprocessor"
249
  ],
250
  "formats": [
 
251
  "mlmodelc"
252
  ],
253
  "benchmark": {
254
+ "rtfx": 139.6,
255
+ "wer_pct": 4.8,
 
256
  "metric": "WER",
257
+ "n": 908,
258
+ "test_set": "FLEURS es_419",
259
  "benched": true
260
+ }
 
261
  },
262
  {
263
  "path": "es/4480ms",
 
271
  42,
272
  13
273
  ],
274
+ "vocab_size": 2828,
275
  "vocab_pruned": true,
276
+ "vocab_prune_method": "latin-script",
277
  "components": [
278
  "decoder",
279
  "decoder_joint",
 
280
  "encoder",
281
  "joint",
 
282
  "preprocessor"
283
  ],
284
  "formats": [
 
285
  "mlmodelc"
286
  ],
287
  "benchmark": {
288
+ "rtfx": 135.8,
289
+ "wer_pct": 4.77,
 
290
  "metric": "WER",
291
+ "n": 908,
292
+ "test_set": "FLEURS es_419",
293
  "benched": true
294
+ }
 
295
  },
296
  {
297
  "path": "fr/560ms",
 
305
  42,
306
  13
307
  ],
308
+ "vocab_size": 2828,
309
  "vocab_pruned": true,
310
+ "vocab_prune_method": "latin-script",
311
  "components": [
312
  "decoder",
313
  "decoder_joint",
 
314
  "encoder",
315
  "joint",
 
316
  "preprocessor"
317
  ],
318
  "formats": [
 
319
  "mlmodelc"
320
  ],
321
  "benchmark": {
322
+ "rtfx": 57.4,
323
+ "wer_pct": 9.68,
 
324
  "metric": "WER",
325
+ "n": 676,
326
+ "test_set": "FLEURS fr_fr",
327
+ "benched": true
328
+ }
 
329
  },
330
  {
331
  "path": "fr/1120ms",
 
339
  42,
340
  13
341
  ],
342
+ "vocab_size": 2828,
343
  "vocab_pruned": true,
344
+ "vocab_prune_method": "latin-script",
345
  "components": [
346
  "decoder",
347
  "decoder_joint",
 
348
  "encoder",
349
  "joint",
 
350
  "preprocessor"
351
  ],
352
  "formats": [
 
353
  "mlmodelc"
354
  ],
355
  "benchmark": {
356
+ "rtfx": 104.7,
357
+ "wer_pct": 9.44,
 
358
  "metric": "WER",
359
+ "n": 676,
360
+ "test_set": "FLEURS fr_fr",
361
  "benched": true
362
+ }
 
363
  },
364
  {
365
  "path": "fr/2240ms",
 
373
  42,
374
  13
375
  ],
376
+ "vocab_size": 2828,
377
  "vocab_pruned": true,
378
+ "vocab_prune_method": "latin-script",
379
  "components": [
380
  "decoder",
381
  "decoder_joint",
 
382
  "encoder",
383
  "joint",
 
384
  "preprocessor"
385
  ],
386
  "formats": [
 
387
  "mlmodelc"
388
  ],
389
  "benchmark": {
390
+ "rtfx": 130.4,
391
+ "wer_pct": 9.52,
 
392
  "metric": "WER",
393
+ "n": 676,
394
+ "test_set": "FLEURS fr_fr",
395
  "benched": true
396
+ }
 
397
  },
398
  {
399
  "path": "fr/4480ms",
 
407
  42,
408
  13
409
  ],
410
+ "vocab_size": 2828,
411
  "vocab_pruned": true,
412
+ "vocab_prune_method": "latin-script",
413
  "components": [
414
  "decoder",
415
  "decoder_joint",
 
416
  "encoder",
417
  "joint",
 
418
  "preprocessor"
419
  ],
420
  "formats": [
 
421
  "mlmodelc"
422
  ],
423
  "benchmark": {
424
+ "rtfx": 124.3,
425
+ "wer_pct": 9.42,
 
426
  "metric": "WER",
427
+ "n": 676,
428
+ "test_set": "FLEURS fr_fr",
429
  "benched": true
430
+ }
 
431
  },
432
  {
433
  "path": "it/560ms",
 
441
  42,
442
  13
443
  ],
444
+ "vocab_size": 2828,
445
  "vocab_pruned": true,
446
+ "vocab_prune_method": "latin-script",
447
  "components": [
448
  "decoder",
449
  "decoder_joint",
 
450
  "encoder",
451
  "joint",
 
452
  "preprocessor"
453
  ],
454
  "formats": [
 
455
  "mlmodelc"
456
  ],
457
  "benchmark": {
458
+ "rtfx": 59.0,
459
+ "wer_pct": 5.68,
 
460
  "metric": "WER",
461
+ "n": 865,
462
+ "test_set": "FLEURS it_it",
463
+ "benched": true
464
+ }
 
465
  },
466
  {
467
  "path": "it/1120ms",
 
475
  42,
476
  13
477
  ],
478
+ "vocab_size": 2828,
479
  "vocab_pruned": true,
480
+ "vocab_prune_method": "latin-script",
481
  "components": [
482
  "decoder",
483
  "decoder_joint",
 
484
  "encoder",
485
  "joint",
 
486
  "preprocessor"
487
  ],
488
  "formats": [
 
489
  "mlmodelc"
490
  ],
491
  "benchmark": {
492
+ "rtfx": 109.0,
493
+ "wer_pct": 5.45,
 
494
  "metric": "WER",
495
+ "n": 865,
496
+ "test_set": "FLEURS it_it",
497
  "benched": true
498
+ }
 
499
  },
500
  {
501
  "path": "it/2240ms",
 
509
  42,
510
  13
511
  ],
512
+ "vocab_size": 2828,
513
  "vocab_pruned": true,
514
+ "vocab_prune_method": "latin-script",
515
  "components": [
516
  "decoder",
517
  "decoder_joint",
 
518
  "encoder",
519
  "joint",
 
520
  "preprocessor"
521
  ],
522
  "formats": [
 
523
  "mlmodelc"
524
  ],
525
  "benchmark": {
526
+ "rtfx": 146.7,
527
+ "wer_pct": 5.41,
 
528
  "metric": "WER",
529
+ "n": 865,
530
+ "test_set": "FLEURS it_it",
531
  "benched": true
532
+ }
 
533
  },
534
  {
535
  "path": "it/4480ms",
 
543
  42,
544
  13
545
  ],
546
+ "vocab_size": 2828,
547
  "vocab_pruned": true,
548
+ "vocab_prune_method": "latin-script",
549
  "components": [
550
  "decoder",
551
  "decoder_joint",
 
552
  "encoder",
553
  "joint",
 
554
  "preprocessor"
555
  ],
556
  "formats": [
 
557
  "mlmodelc"
558
  ],
559
  "benchmark": {
560
+ "rtfx": 150.5,
561
+ "wer_pct": 5.4,
 
562
  "metric": "WER",
563
+ "n": 865,
564
+ "test_set": "FLEURS it_it",
565
  "benched": true
566
+ }
 
567
  },
568
  {
569
  "path": "pt/560ms",
 
577
  42,
578
  13
579
  ],
580
+ "vocab_size": 2828,
581
  "vocab_pruned": true,
582
+ "vocab_prune_method": "latin-script",
583
  "components": [
584
  "decoder",
585
  "decoder_joint",
 
588
  "preprocessor"
589
  ],
590
  "formats": [
 
591
  "mlmodelc"
592
  ],
593
  "benchmark": {
594
+ "rtfx": 58.7,
595
+ "wer_pct": 6.38,
 
596
  "metric": "WER",
597
+ "n": 919,
598
+ "test_set": "FLEURS pt_br",
599
+ "benched": true
600
+ }
 
601
  },
602
  {
603
  "path": "pt/1120ms",
 
611
  42,
612
  13
613
  ],
614
+ "vocab_size": 2828,
615
  "vocab_pruned": true,
616
+ "vocab_prune_method": "latin-script",
617
  "components": [
618
  "decoder",
619
  "decoder_joint",
 
622
  "preprocessor"
623
  ],
624
  "formats": [
 
625
  "mlmodelc"
626
  ],
627
  "benchmark": {
628
+ "rtfx": 107.6,
629
+ "wer_pct": 6.11,
 
630
  "metric": "WER",
631
+ "n": 919,
632
+ "test_set": "FLEURS pt_br",
633
  "benched": true
634
+ }
 
635
  },
636
  {
637
  "path": "pt/2240ms",
 
645
  42,
646
  13
647
  ],
648
+ "vocab_size": 2828,
649
  "vocab_pruned": true,
650
+ "vocab_prune_method": "latin-script",
651
  "components": [
652
  "decoder",
653
  "decoder_joint",
 
656
  "preprocessor"
657
  ],
658
  "formats": [
 
659
  "mlmodelc"
660
  ],
661
  "benchmark": {
662
+ "rtfx": 141.0,
663
+ "wer_pct": 6.14,
 
664
  "metric": "WER",
665
+ "n": 919,
666
+ "test_set": "FLEURS pt_br",
667
  "benched": true
668
+ }
 
669
  },
670
  {
671
  "path": "pt/4480ms",
 
679
  42,
680
  13
681
  ],
682
+ "vocab_size": 2828,
683
  "vocab_pruned": true,
684
+ "vocab_prune_method": "latin-script",
685
  "components": [
686
  "decoder",
687
  "decoder_joint",
 
688
  "encoder",
689
  "joint",
 
690
  "preprocessor"
691
  ],
692
  "formats": [
 
693
  "mlmodelc"
694
  ],
695
  "benchmark": {
696
+ "rtfx": 141.3,
697
+ "wer_pct": 6.18,
 
698
  "metric": "WER",
699
+ "n": 919,
700
+ "test_set": "FLEURS pt_br",
701
  "benched": true
702
+ }
 
703
  },
704
  {
705
  "path": "de/560ms",
 
713
  42,
714
  13
715
  ],
716
+ "vocab_size": 2828,
717
  "vocab_pruned": true,
718
+ "vocab_prune_method": "latin-script",
719
  "components": [
720
  "decoder",
721
  "decoder_joint",
 
724
  "preprocessor"
725
  ],
726
  "formats": [
 
727
  "mlmodelc"
728
  ],
729
  "benchmark": {
730
+ "rtfx": 58.8,
731
+ "wer_pct": 10.83,
 
732
  "metric": "WER",
733
+ "n": 862,
734
  "test_set": "FLEURS de_de",
735
+ "benched": true
736
+ }
 
737
  },
738
  {
739
  "path": "de/1120ms",
 
747
  42,
748
  13
749
  ],
750
+ "vocab_size": 2828,
751
  "vocab_pruned": true,
752
+ "vocab_prune_method": "latin-script",
753
  "components": [
754
  "decoder",
755
  "decoder_joint",
 
758
  "preprocessor"
759
  ],
760
  "formats": [
 
761
  "mlmodelc"
762
  ],
763
  "benchmark": {
764
+ "rtfx": 107.2,
765
+ "wer_pct": 9.78,
 
766
  "metric": "WER",
767
  "n": 862,
768
  "test_set": "FLEURS de_de",
769
  "benched": true
770
+ }
 
771
  },
772
  {
773
  "path": "de/2240ms",
 
781
  42,
782
  13
783
  ],
784
+ "vocab_size": 2828,
785
  "vocab_pruned": true,
786
+ "vocab_prune_method": "latin-script",
787
  "components": [
788
  "decoder",
789
  "decoder_joint",
 
792
  "preprocessor"
793
  ],
794
  "formats": [
 
795
  "mlmodelc"
796
  ],
797
  "benchmark": {
798
+ "rtfx": 144.4,
799
+ "wer_pct": 9.83,
 
800
  "metric": "WER",
801
  "n": 862,
802
  "test_set": "FLEURS de_de",
803
  "benched": true
804
+ }
 
805
  },
806
  {
807
  "path": "de/4480ms",
 
815
  42,
816
  13
817
  ],
818
+ "vocab_size": 2828,
819
  "vocab_pruned": true,
820
+ "vocab_prune_method": "latin-script",
821
  "components": [
822
  "decoder",
823
  "decoder_joint",
 
826
  "preprocessor"
827
  ],
828
  "formats": [
 
829
  "mlmodelc"
830
  ],
831
  "benchmark": {
832
+ "rtfx": 141.5,
833
+ "wer_pct": 9.83,
 
834
  "metric": "WER",
835
  "n": 862,
836
  "test_set": "FLEURS de_de",
837
  "benched": true
838
+ }
 
839
  },
840
  {
841
  "path": "zh/560ms",
 
849
  42,
850
  13
851
  ],
852
+ "vocab_size": 13087,
853
+ "vocab_pruned": false,
854
+ "vocab_prune_method": "none (full vocab)",
855
  "components": [
856
  "decoder",
857
  "decoder_joint",
 
860
  "preprocessor"
861
  ],
862
  "formats": [
 
863
  "mlmodelc"
864
  ],
865
  "benchmark": {
866
+ "rtfx": 22.5,
867
+ "cer_pct": 19.48,
 
868
  "metric": "CER",
869
+ "n": 945,
870
  "test_set": "FLEURS cmn_hans_cn",
871
+ "benched": true
872
+ }
 
873
  },
874
  {
875
  "path": "zh/1120ms",
 
883
  42,
884
  13
885
  ],
886
+ "vocab_size": 13087,
887
+ "vocab_pruned": false,
888
+ "vocab_prune_method": "none (full vocab)",
889
  "components": [
890
  "decoder",
891
  "decoder_joint",
 
894
  "preprocessor"
895
  ],
896
  "formats": [
 
897
  "mlmodelc"
898
  ],
899
  "benchmark": {
900
+ "rtfx": 26.6,
901
+ "cer_pct": 18.75,
 
902
  "metric": "CER",
903
  "n": 945,
904
  "test_set": "FLEURS cmn_hans_cn",
905
  "benched": true
906
+ }
 
907
  },
908
  {
909
  "path": "zh/2240ms",
 
917
  42,
918
  13
919
  ],
920
+ "vocab_size": 13087,
921
+ "vocab_pruned": false,
922
+ "vocab_prune_method": "none (full vocab)",
923
  "components": [
924
  "decoder",
925
  "decoder_joint",
 
928
  "preprocessor"
929
  ],
930
  "formats": [
 
931
  "mlmodelc"
932
  ],
933
  "benchmark": {
934
+ "rtfx": 89.0,
935
+ "cer_pct": 18.57,
 
936
  "metric": "CER",
937
  "n": 945,
938
  "test_set": "FLEURS cmn_hans_cn",
939
  "benched": true
940
+ }
 
941
  },
942
  {
943
  "path": "zh/4480ms",
 
951
  42,
952
  13
953
  ],
954
+ "vocab_size": 13087,
955
+ "vocab_pruned": false,
956
+ "vocab_prune_method": "none (full vocab)",
957
  "components": [
958
  "decoder",
959
  "decoder_joint",
 
962
  "preprocessor"
963
  ],
964
  "formats": [
 
965
  "mlmodelc"
966
  ],
967
  "benchmark": {
968
+ "rtfx": 89.6,
969
+ "cer_pct": 18.05,
 
970
  "metric": "CER",
971
  "n": 945,
972
  "test_set": "FLEURS cmn_hans_cn",
973
  "benched": true
974
+ }
 
975
  },
976
  {
977
  "path": "ja/560ms",
 
985
  42,
986
  13
987
  ],
988
+ "vocab_size": 13087,
989
+ "vocab_pruned": false,
990
+ "vocab_prune_method": "none (full vocab)",
991
  "components": [
992
  "decoder",
993
  "decoder_joint",
 
996
  "preprocessor"
997
  ],
998
  "formats": [
 
999
  "mlmodelc"
1000
  ],
1001
  "benchmark": {
1002
+ "rtfx": 20.7,
1003
+ "cer_pct": 14.61,
 
1004
  "metric": "CER",
1005
+ "n": 650,
1006
  "test_set": "FLEURS ja_jp",
1007
+ "benched": true
1008
+ }
 
1009
  },
1010
  {
1011
  "path": "ja/1120ms",
 
1019
  42,
1020
  13
1021
  ],
1022
+ "vocab_size": 13087,
1023
+ "vocab_pruned": false,
1024
+ "vocab_prune_method": "none (full vocab)",
1025
  "components": [
1026
  "decoder",
1027
  "decoder_joint",
 
1030
  "preprocessor"
1031
  ],
1032
  "formats": [
 
1033
  "mlmodelc"
1034
  ],
1035
  "benchmark": {
1036
+ "rtfx": 25.9,
1037
+ "cer_pct": 13.77,
 
1038
  "metric": "CER",
1039
  "n": 650,
1040
  "test_set": "FLEURS ja_jp",
1041
  "benched": true
1042
+ }
 
1043
  },
1044
  {
1045
  "path": "ja/2240ms",
 
1053
  42,
1054
  13
1055
  ],
1056
+ "vocab_size": 13087,
1057
+ "vocab_pruned": false,
1058
+ "vocab_prune_method": "none (full vocab)",
1059
  "components": [
1060
  "decoder",
1061
  "decoder_joint",
 
1064
  "preprocessor"
1065
  ],
1066
  "formats": [
 
1067
  "mlmodelc"
1068
  ],
1069
  "benchmark": {
1070
+ "rtfx": 84.2,
1071
+ "cer_pct": 13.79,
 
1072
  "metric": "CER",
1073
  "n": 650,
1074
  "test_set": "FLEURS ja_jp",
1075
  "benched": true
1076
+ }
 
1077
  },
1078
  {
1079
  "path": "ja/4480ms",
 
1087
  42,
1088
  13
1089
  ],
1090
+ "vocab_size": 13087,
1091
+ "vocab_pruned": false,
1092
+ "vocab_prune_method": "none (full vocab)",
1093
  "components": [
1094
  "decoder",
1095
  "decoder_joint",
 
1098
  "preprocessor"
1099
  ],
1100
  "formats": [
 
1101
  "mlmodelc"
1102
  ],
1103
  "benchmark": {
1104
+ "rtfx": 89.3,
1105
+ "cer_pct": 13.82,
 
1106
  "metric": "CER",
1107
  "n": 650,
1108
  "test_set": "FLEURS ja_jp",
1109
  "benched": true
1110
+ }
 
1111
  },
1112
  {
1113
  "path": "multilingual/560ms",
1114
+ "language": "Multilingual",
1115
  "language_code": "auto",
1116
  "chunk_ms": 560,
1117
  "latency_s": 0.56,
 
1123
  ],
1124
  "vocab_size": 13087,
1125
  "vocab_pruned": false,
1126
+ "vocab_prune_method": "none (full vocab)",
1127
  "components": [
1128
  "decoder",
1129
  "decoder_joint",
 
1130
  "encoder",
1131
  "joint",
 
1132
  "preprocessor"
1133
  ],
1134
  "formats": [
 
1135
  "mlmodelc"
1136
  ],
1137
  "benchmark": {
1138
+ "rtfx": 23.4,
1139
+ "wer_pct": 9.15,
 
1140
  "metric": "WER",
1141
+ "n": 647,
1142
+ "test_set": "FLEURS en_us",
1143
+ "benched": true
1144
+ }
 
1145
  },
1146
  {
1147
  "path": "multilingual/1120ms",
1148
+ "language": "Multilingual",
1149
  "language_code": "auto",
1150
  "chunk_ms": 1120,
1151
  "latency_s": 1.12,
 
1157
  ],
1158
  "vocab_size": 13087,
1159
  "vocab_pruned": false,
1160
+ "vocab_prune_method": "none (full vocab)",
1161
  "components": [
1162
  "decoder",
1163
  "decoder_joint",
 
1164
  "encoder",
1165
  "joint",
 
1166
  "preprocessor"
1167
  ],
1168
  "formats": [
 
1169
  "mlmodelc"
1170
  ],
1171
  "benchmark": {
1172
+ "rtfx": 70.9,
1173
+ "wer_pct": 8.64,
 
1174
  "metric": "WER",
1175
+ "n": 647,
1176
+ "test_set": "FLEURS en_us",
1177
  "benched": true
1178
+ }
 
1179
  },
1180
  {
1181
  "path": "multilingual/2240ms",
1182
+ "language": "Multilingual",
1183
  "language_code": "auto",
1184
  "chunk_ms": 2240,
1185
  "latency_s": 2.24,
 
1191
  ],
1192
  "vocab_size": 13087,
1193
  "vocab_pruned": false,
1194
+ "vocab_prune_method": "none (full vocab)",
1195
  "components": [
1196
  "decoder",
1197
  "decoder_joint",
 
1198
  "encoder",
1199
  "joint",
 
1200
  "preprocessor"
1201
  ],
1202
  "formats": [
 
1203
  "mlmodelc"
1204
  ],
1205
  "benchmark": {
1206
+ "rtfx": 80.4,
1207
+ "wer_pct": 8.76,
 
1208
  "metric": "WER",
1209
+ "n": 647,
1210
+ "test_set": "FLEURS en_us",
1211
  "benched": true
1212
+ }
 
1213
  },
1214
  {
1215
  "path": "multilingual/4480ms",
1216
+ "language": "Multilingual",
1217
  "language_code": "auto",
1218
  "chunk_ms": 4480,
1219
  "latency_s": 4.48,
 
1225
  ],
1226
  "vocab_size": 13087,
1227
  "vocab_pruned": false,
1228
+ "vocab_prune_method": "none (full vocab)",
1229
  "components": [
1230
  "decoder",
1231
  "decoder_joint",
 
1232
  "encoder",
1233
  "joint",
 
1234
  "preprocessor"
1235
  ],
1236
  "formats": [
 
1237
  "mlmodelc"
1238
  ],
1239
  "benchmark": {
1240
+ "rtfx": 78.0,
1241
+ "wer_pct": 8.78,
 
1242
  "metric": "WER",
1243
+ "n": 647,
1244
+ "test_set": "FLEURS en_us",
1245
  "benched": true
1246
+ }
 
1247
  }
1248
  ]
1249
  }