alexwengg commited on
Commit
87d194b
·
verified ·
1 Parent(s): 0c10c12

manifest: 2 models (latin + multilingual) x 4 tiers, per-language benchmarks

Browse files
Files changed (1) hide show
  1. manifest.json +442 -1091
manifest.json CHANGED
@@ -6,705 +6,51 @@
6
  "runtime": "CoreML / Apple Neural Engine",
7
  "benchmark_machine": "Apple M5 Pro / macOS 26.5",
8
  "benchmark_dataset": "FLEURS test (all languages)",
9
- "recipe": "LAYERPOS [42,13] mixed-precision encoder (INT8 cuff + 6-bit middle) + Latin-script vocab prune (shared en/es/fr/it/pt/de) / full vocab (zh/ja/multilingual) + B1 decoder-joint fusion + triple-stage pipelining",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  "no_retraining": true,
11
  "no_calibration": true,
12
  "tiers_ms": [
13
  560,
14
  1120,
15
  2240,
16
- 4480
17
- ],
18
- "recommended_tier_ms": 2240,
19
- "vocab": {
20
- "latin_shared": 2828,
21
- "full": 13087
22
- },
23
- "ships": [
24
- {
25
- "path": "en/560ms",
26
- "language": "English",
27
- "language_code": "en-US",
28
- "chunk_ms": 560,
29
- "latency_s": 0.56,
30
- "chunk_mel_frames": 56,
31
- "total_mel_frames": 65,
32
- "att_context": [
33
- 42,
34
- 13
35
- ],
36
- "vocab_size": 2828,
37
- "vocab_pruned": true,
38
- "vocab_prune_method": "latin-script",
39
- "components": [
40
- "decoder",
41
- "decoder_joint",
42
- "encoder",
43
- "joint",
44
- "preprocessor"
45
- ],
46
- "formats": [
47
- "mlmodelc"
48
- ],
49
- "benchmark": {
50
- "rtfx": 57.5,
51
- "wer_pct": 9.43,
52
- "metric": "WER",
53
- "n": 647,
54
- "test_set": "FLEURS en_us",
55
- "benched": true
56
- }
57
- },
58
- {
59
- "path": "en/1120ms",
60
- "language": "English",
61
- "language_code": "en-US",
62
- "chunk_ms": 1120,
63
- "latency_s": 1.12,
64
- "chunk_mel_frames": 112,
65
- "total_mel_frames": 121,
66
- "att_context": [
67
- 42,
68
- 13
69
- ],
70
- "vocab_size": 2828,
71
- "vocab_pruned": true,
72
- "vocab_prune_method": "latin-script",
73
- "components": [
74
- "decoder",
75
- "decoder_joint",
76
- "encoder",
77
- "joint",
78
- "preprocessor"
79
- ],
80
- "formats": [
81
- "mlmodelc"
82
- ],
83
- "benchmark": {
84
- "rtfx": 102.9,
85
- "wer_pct": 8.89,
86
- "metric": "WER",
87
- "n": 647,
88
- "test_set": "FLEURS en_us",
89
- "benched": true
90
- }
91
- },
92
- {
93
- "path": "en/2240ms",
94
- "language": "English",
95
- "language_code": "en-US",
96
- "chunk_ms": 2240,
97
- "latency_s": 2.24,
98
- "chunk_mel_frames": 224,
99
- "total_mel_frames": 233,
100
- "att_context": [
101
- 42,
102
- 13
103
- ],
104
- "vocab_size": 2828,
105
- "vocab_pruned": true,
106
- "vocab_prune_method": "latin-script",
107
- "components": [
108
- "decoder",
109
- "decoder_joint",
110
- "encoder",
111
- "joint",
112
- "preprocessor"
113
- ],
114
- "formats": [
115
- "mlmodelc"
116
- ],
117
- "benchmark": {
118
- "rtfx": 130.2,
119
- "wer_pct": 8.96,
120
- "metric": "WER",
121
- "n": 647,
122
- "test_set": "FLEURS en_us",
123
- "benched": true
124
- }
125
- },
126
- {
127
- "path": "en/4480ms",
128
- "language": "English",
129
- "language_code": "en-US",
130
- "chunk_ms": 4480,
131
- "latency_s": 4.48,
132
- "chunk_mel_frames": 448,
133
- "total_mel_frames": 457,
134
- "att_context": [
135
- 42,
136
- 13
137
- ],
138
- "vocab_size": 2828,
139
- "vocab_pruned": true,
140
- "vocab_prune_method": "latin-script",
141
- "components": [
142
- "decoder",
143
- "decoder_joint",
144
- "encoder",
145
- "joint",
146
- "preprocessor"
147
- ],
148
- "formats": [
149
- "mlmodelc"
150
- ],
151
- "benchmark": {
152
- "rtfx": 122.1,
153
- "wer_pct": 9.02,
154
- "metric": "WER",
155
- "n": 647,
156
- "test_set": "FLEURS en_us",
157
- "benched": true
158
- }
159
- },
160
- {
161
- "path": "es/560ms",
162
- "language": "Spanish",
163
- "language_code": "es-ES",
164
- "chunk_ms": 560,
165
- "latency_s": 0.56,
166
- "chunk_mel_frames": 56,
167
- "total_mel_frames": 65,
168
- "att_context": [
169
- 42,
170
- 13
171
- ],
172
- "vocab_size": 2828,
173
- "vocab_pruned": true,
174
- "vocab_prune_method": "latin-script",
175
- "components": [
176
- "decoder",
177
- "decoder_joint",
178
- "encoder",
179
- "joint",
180
- "preprocessor"
181
- ],
182
- "formats": [
183
- "mlmodelc"
184
- ],
185
- "benchmark": {
186
- "rtfx": 58.2,
187
- "wer_pct": 4.95,
188
- "metric": "WER",
189
- "n": 908,
190
- "test_set": "FLEURS es_419",
191
- "benched": true
192
- }
193
- },
194
- {
195
- "path": "es/1120ms",
196
- "language": "Spanish",
197
- "language_code": "es-ES",
198
- "chunk_ms": 1120,
199
- "latency_s": 1.12,
200
- "chunk_mel_frames": 112,
201
- "total_mel_frames": 121,
202
- "att_context": [
203
- 42,
204
- 13
205
- ],
206
- "vocab_size": 2828,
207
- "vocab_pruned": true,
208
- "vocab_prune_method": "latin-script",
209
- "components": [
210
- "decoder",
211
- "decoder_joint",
212
- "encoder",
213
- "joint",
214
- "preprocessor"
215
- ],
216
- "formats": [
217
- "mlmodelc"
218
- ],
219
- "benchmark": {
220
- "rtfx": 106.5,
221
- "wer_pct": 4.76,
222
- "metric": "WER",
223
- "n": 908,
224
- "test_set": "FLEURS es_419",
225
- "benched": true
226
- }
227
- },
228
- {
229
- "path": "es/2240ms",
230
- "language": "Spanish",
231
- "language_code": "es-ES",
232
- "chunk_ms": 2240,
233
- "latency_s": 2.24,
234
- "chunk_mel_frames": 224,
235
- "total_mel_frames": 233,
236
- "att_context": [
237
- 42,
238
- 13
239
- ],
240
- "vocab_size": 2828,
241
- "vocab_pruned": true,
242
- "vocab_prune_method": "latin-script",
243
- "components": [
244
- "decoder",
245
- "decoder_joint",
246
- "encoder",
247
- "joint",
248
- "preprocessor"
249
- ],
250
- "formats": [
251
- "mlmodelc"
252
- ],
253
- "benchmark": {
254
- "rtfx": 139.6,
255
- "wer_pct": 4.8,
256
- "metric": "WER",
257
- "n": 908,
258
- "test_set": "FLEURS es_419",
259
- "benched": true
260
- }
261
- },
262
- {
263
- "path": "es/4480ms",
264
- "language": "Spanish",
265
- "language_code": "es-ES",
266
- "chunk_ms": 4480,
267
- "latency_s": 4.48,
268
- "chunk_mel_frames": 448,
269
- "total_mel_frames": 457,
270
- "att_context": [
271
- 42,
272
- 13
273
- ],
274
- "vocab_size": 2828,
275
- "vocab_pruned": true,
276
- "vocab_prune_method": "latin-script",
277
- "components": [
278
- "decoder",
279
- "decoder_joint",
280
- "encoder",
281
- "joint",
282
- "preprocessor"
283
- ],
284
- "formats": [
285
- "mlmodelc"
286
- ],
287
- "benchmark": {
288
- "rtfx": 135.8,
289
- "wer_pct": 4.77,
290
- "metric": "WER",
291
- "n": 908,
292
- "test_set": "FLEURS es_419",
293
- "benched": true
294
- }
295
- },
296
- {
297
- "path": "fr/560ms",
298
- "language": "French",
299
- "language_code": "fr-FR",
300
- "chunk_ms": 560,
301
- "latency_s": 0.56,
302
- "chunk_mel_frames": 56,
303
- "total_mel_frames": 65,
304
- "att_context": [
305
- 42,
306
- 13
307
- ],
308
- "vocab_size": 2828,
309
- "vocab_pruned": true,
310
- "vocab_prune_method": "latin-script",
311
- "components": [
312
- "decoder",
313
- "decoder_joint",
314
- "encoder",
315
- "joint",
316
- "preprocessor"
317
- ],
318
- "formats": [
319
- "mlmodelc"
320
- ],
321
- "benchmark": {
322
- "rtfx": 57.4,
323
- "wer_pct": 9.68,
324
- "metric": "WER",
325
- "n": 676,
326
- "test_set": "FLEURS fr_fr",
327
- "benched": true
328
- }
329
- },
330
- {
331
- "path": "fr/1120ms",
332
- "language": "French",
333
- "language_code": "fr-FR",
334
- "chunk_ms": 1120,
335
- "latency_s": 1.12,
336
- "chunk_mel_frames": 112,
337
- "total_mel_frames": 121,
338
- "att_context": [
339
- 42,
340
- 13
341
- ],
342
- "vocab_size": 2828,
343
- "vocab_pruned": true,
344
- "vocab_prune_method": "latin-script",
345
- "components": [
346
- "decoder",
347
- "decoder_joint",
348
- "encoder",
349
- "joint",
350
- "preprocessor"
351
- ],
352
- "formats": [
353
- "mlmodelc"
354
- ],
355
- "benchmark": {
356
- "rtfx": 104.7,
357
- "wer_pct": 9.44,
358
- "metric": "WER",
359
- "n": 676,
360
- "test_set": "FLEURS fr_fr",
361
- "benched": true
362
- }
363
- },
364
- {
365
- "path": "fr/2240ms",
366
- "language": "French",
367
- "language_code": "fr-FR",
368
- "chunk_ms": 2240,
369
- "latency_s": 2.24,
370
- "chunk_mel_frames": 224,
371
- "total_mel_frames": 233,
372
- "att_context": [
373
- 42,
374
- 13
375
- ],
376
- "vocab_size": 2828,
377
- "vocab_pruned": true,
378
- "vocab_prune_method": "latin-script",
379
- "components": [
380
- "decoder",
381
- "decoder_joint",
382
- "encoder",
383
- "joint",
384
- "preprocessor"
385
- ],
386
- "formats": [
387
- "mlmodelc"
388
- ],
389
- "benchmark": {
390
- "rtfx": 130.4,
391
- "wer_pct": 9.52,
392
- "metric": "WER",
393
- "n": 676,
394
- "test_set": "FLEURS fr_fr",
395
- "benched": true
396
- }
397
- },
398
- {
399
- "path": "fr/4480ms",
400
- "language": "French",
401
- "language_code": "fr-FR",
402
- "chunk_ms": 4480,
403
- "latency_s": 4.48,
404
- "chunk_mel_frames": 448,
405
- "total_mel_frames": 457,
406
- "att_context": [
407
- 42,
408
- 13
409
- ],
410
- "vocab_size": 2828,
411
- "vocab_pruned": true,
412
- "vocab_prune_method": "latin-script",
413
- "components": [
414
- "decoder",
415
- "decoder_joint",
416
- "encoder",
417
- "joint",
418
- "preprocessor"
419
- ],
420
- "formats": [
421
- "mlmodelc"
422
- ],
423
- "benchmark": {
424
- "rtfx": 124.3,
425
- "wer_pct": 9.42,
426
- "metric": "WER",
427
- "n": 676,
428
- "test_set": "FLEURS fr_fr",
429
- "benched": true
430
- }
431
- },
432
- {
433
- "path": "it/560ms",
434
- "language": "Italian",
435
- "language_code": "it-IT",
436
- "chunk_ms": 560,
437
- "latency_s": 0.56,
438
- "chunk_mel_frames": 56,
439
- "total_mel_frames": 65,
440
- "att_context": [
441
- 42,
442
- 13
443
- ],
444
- "vocab_size": 2828,
445
- "vocab_pruned": true,
446
- "vocab_prune_method": "latin-script",
447
- "components": [
448
- "decoder",
449
- "decoder_joint",
450
- "encoder",
451
- "joint",
452
- "preprocessor"
453
- ],
454
- "formats": [
455
- "mlmodelc"
456
- ],
457
- "benchmark": {
458
- "rtfx": 59.0,
459
- "wer_pct": 5.68,
460
- "metric": "WER",
461
- "n": 865,
462
- "test_set": "FLEURS it_it",
463
- "benched": true
464
- }
465
- },
466
- {
467
- "path": "it/1120ms",
468
- "language": "Italian",
469
- "language_code": "it-IT",
470
- "chunk_ms": 1120,
471
- "latency_s": 1.12,
472
- "chunk_mel_frames": 112,
473
- "total_mel_frames": 121,
474
- "att_context": [
475
- 42,
476
- 13
477
- ],
478
- "vocab_size": 2828,
479
- "vocab_pruned": true,
480
- "vocab_prune_method": "latin-script",
481
- "components": [
482
- "decoder",
483
- "decoder_joint",
484
- "encoder",
485
- "joint",
486
- "preprocessor"
487
- ],
488
- "formats": [
489
- "mlmodelc"
490
- ],
491
- "benchmark": {
492
- "rtfx": 109.0,
493
- "wer_pct": 5.45,
494
- "metric": "WER",
495
- "n": 865,
496
- "test_set": "FLEURS it_it",
497
- "benched": true
498
- }
499
- },
500
- {
501
- "path": "it/2240ms",
502
- "language": "Italian",
503
- "language_code": "it-IT",
504
- "chunk_ms": 2240,
505
- "latency_s": 2.24,
506
- "chunk_mel_frames": 224,
507
- "total_mel_frames": 233,
508
- "att_context": [
509
- 42,
510
- 13
511
- ],
512
- "vocab_size": 2828,
513
- "vocab_pruned": true,
514
- "vocab_prune_method": "latin-script",
515
- "components": [
516
- "decoder",
517
- "decoder_joint",
518
- "encoder",
519
- "joint",
520
- "preprocessor"
521
- ],
522
- "formats": [
523
- "mlmodelc"
524
- ],
525
- "benchmark": {
526
- "rtfx": 146.7,
527
- "wer_pct": 5.41,
528
- "metric": "WER",
529
- "n": 865,
530
- "test_set": "FLEURS it_it",
531
- "benched": true
532
- }
533
- },
534
- {
535
- "path": "it/4480ms",
536
- "language": "Italian",
537
- "language_code": "it-IT",
538
- "chunk_ms": 4480,
539
- "latency_s": 4.48,
540
- "chunk_mel_frames": 448,
541
- "total_mel_frames": 457,
542
- "att_context": [
543
- 42,
544
- 13
545
- ],
546
- "vocab_size": 2828,
547
- "vocab_pruned": true,
548
- "vocab_prune_method": "latin-script",
549
- "components": [
550
- "decoder",
551
- "decoder_joint",
552
- "encoder",
553
- "joint",
554
- "preprocessor"
555
- ],
556
- "formats": [
557
- "mlmodelc"
558
- ],
559
- "benchmark": {
560
- "rtfx": 150.5,
561
- "wer_pct": 5.4,
562
- "metric": "WER",
563
- "n": 865,
564
- "test_set": "FLEURS it_it",
565
- "benched": true
566
- }
567
- },
568
- {
569
- "path": "pt/560ms",
570
- "language": "Portuguese",
571
- "language_code": "pt-BR",
572
- "chunk_ms": 560,
573
- "latency_s": 0.56,
574
- "chunk_mel_frames": 56,
575
- "total_mel_frames": 65,
576
- "att_context": [
577
- 42,
578
- 13
579
- ],
580
- "vocab_size": 2828,
581
- "vocab_pruned": true,
582
- "vocab_prune_method": "latin-script",
583
- "components": [
584
- "decoder",
585
- "decoder_joint",
586
- "encoder",
587
- "joint",
588
- "preprocessor"
589
- ],
590
- "formats": [
591
- "mlmodelc"
592
- ],
593
- "benchmark": {
594
- "rtfx": 58.7,
595
- "wer_pct": 6.38,
596
- "metric": "WER",
597
- "n": 919,
598
- "test_set": "FLEURS pt_br",
599
- "benched": true
600
- }
601
- },
602
- {
603
- "path": "pt/1120ms",
604
- "language": "Portuguese",
605
- "language_code": "pt-BR",
606
- "chunk_ms": 1120,
607
- "latency_s": 1.12,
608
- "chunk_mel_frames": 112,
609
- "total_mel_frames": 121,
610
- "att_context": [
611
- 42,
612
- 13
613
- ],
614
- "vocab_size": 2828,
615
- "vocab_pruned": true,
616
- "vocab_prune_method": "latin-script",
617
- "components": [
618
- "decoder",
619
- "decoder_joint",
620
- "encoder",
621
- "joint",
622
- "preprocessor"
623
- ],
624
- "formats": [
625
- "mlmodelc"
626
- ],
627
- "benchmark": {
628
- "rtfx": 107.6,
629
- "wer_pct": 6.11,
630
- "metric": "WER",
631
- "n": 919,
632
- "test_set": "FLEURS pt_br",
633
- "benched": true
634
- }
635
- },
636
- {
637
- "path": "pt/2240ms",
638
- "language": "Portuguese",
639
- "language_code": "pt-BR",
640
- "chunk_ms": 2240,
641
- "latency_s": 2.24,
642
- "chunk_mel_frames": 224,
643
- "total_mel_frames": 233,
644
- "att_context": [
645
- 42,
646
- 13
647
- ],
648
- "vocab_size": 2828,
649
- "vocab_pruned": true,
650
- "vocab_prune_method": "latin-script",
651
- "components": [
652
- "decoder",
653
- "decoder_joint",
654
- "encoder",
655
- "joint",
656
- "preprocessor"
657
- ],
658
- "formats": [
659
- "mlmodelc"
660
- ],
661
- "benchmark": {
662
- "rtfx": 141.0,
663
- "wer_pct": 6.14,
664
- "metric": "WER",
665
- "n": 919,
666
- "test_set": "FLEURS pt_br",
667
- "benched": true
668
- }
669
- },
670
  {
671
- "path": "pt/4480ms",
672
- "language": "Portuguese",
673
- "language_code": "pt-BR",
674
- "chunk_ms": 4480,
675
- "latency_s": 4.48,
676
- "chunk_mel_frames": 448,
677
- "total_mel_frames": 457,
678
- "att_context": [
679
- 42,
680
- 13
681
- ],
682
- "vocab_size": 2828,
683
- "vocab_pruned": true,
684
- "vocab_prune_method": "latin-script",
685
- "components": [
686
- "decoder",
687
- "decoder_joint",
688
- "encoder",
689
- "joint",
690
- "preprocessor"
691
  ],
692
- "formats": [
693
- "mlmodelc"
694
- ],
695
- "benchmark": {
696
- "rtfx": 141.3,
697
- "wer_pct": 6.18,
698
- "metric": "WER",
699
- "n": 919,
700
- "test_set": "FLEURS pt_br",
701
- "benched": true
702
- }
703
- },
704
- {
705
- "path": "de/560ms",
706
- "language": "German",
707
- "language_code": "de-DE",
708
  "chunk_ms": 560,
709
  "latency_s": 0.56,
710
  "chunk_mel_frames": 56,
@@ -715,30 +61,79 @@
715
  ],
716
  "vocab_size": 2828,
717
  "vocab_pruned": true,
718
- "vocab_prune_method": "latin-script",
719
- "components": [
720
- "decoder",
721
- "decoder_joint",
722
- "encoder",
723
- "joint",
724
- "preprocessor"
725
- ],
726
- "formats": [
727
- "mlmodelc"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
  ],
729
- "benchmark": {
730
- "rtfx": 58.8,
731
- "wer_pct": 10.83,
732
- "metric": "WER",
733
- "n": 862,
734
- "test_set": "FLEURS de_de",
735
- "benched": true
736
- }
737
- },
738
- {
739
- "path": "de/1120ms",
740
- "language": "German",
741
- "language_code": "de-DE",
742
  "chunk_ms": 1120,
743
  "latency_s": 1.12,
744
  "chunk_mel_frames": 112,
@@ -749,30 +144,79 @@
749
  ],
750
  "vocab_size": 2828,
751
  "vocab_pruned": true,
752
- "vocab_prune_method": "latin-script",
753
- "components": [
754
- "decoder",
755
- "decoder_joint",
756
- "encoder",
757
- "joint",
758
- "preprocessor"
759
- ],
760
- "formats": [
761
- "mlmodelc"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
762
  ],
763
- "benchmark": {
764
- "rtfx": 107.2,
765
- "wer_pct": 9.78,
766
- "metric": "WER",
767
- "n": 862,
768
- "test_set": "FLEURS de_de",
769
- "benched": true
770
- }
771
- },
772
- {
773
- "path": "de/2240ms",
774
- "language": "German",
775
- "language_code": "de-DE",
776
  "chunk_ms": 2240,
777
  "latency_s": 2.24,
778
  "chunk_mel_frames": 224,
@@ -783,30 +227,79 @@
783
  ],
784
  "vocab_size": 2828,
785
  "vocab_pruned": true,
786
- "vocab_prune_method": "latin-script",
787
- "components": [
788
- "decoder",
789
- "decoder_joint",
790
- "encoder",
791
- "joint",
792
- "preprocessor"
793
- ],
794
- "formats": [
795
- "mlmodelc"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
796
  ],
797
- "benchmark": {
798
- "rtfx": 144.4,
799
- "wer_pct": 9.83,
800
- "metric": "WER",
801
- "n": 862,
802
- "test_set": "FLEURS de_de",
803
- "benched": true
804
- }
805
- },
806
- {
807
- "path": "de/4480ms",
808
- "language": "German",
809
- "language_code": "de-DE",
810
  "chunk_ms": 4480,
811
  "latency_s": 4.48,
812
  "chunk_mel_frames": 448,
@@ -817,302 +310,76 @@
817
  ],
818
  "vocab_size": 2828,
819
  "vocab_pruned": true,
820
- "vocab_prune_method": "latin-script",
821
- "components": [
822
- "decoder",
823
- "decoder_joint",
824
- "encoder",
825
- "joint",
826
- "preprocessor"
827
- ],
828
- "formats": [
829
- "mlmodelc"
830
- ],
831
- "benchmark": {
832
- "rtfx": 141.5,
833
- "wer_pct": 9.83,
834
- "metric": "WER",
835
- "n": 862,
836
- "test_set": "FLEURS de_de",
837
- "benched": true
838
- }
839
- },
840
- {
841
- "path": "zh/560ms",
842
- "language": "Chinese",
843
- "language_code": "zh-CN",
844
- "chunk_ms": 560,
845
- "latency_s": 0.56,
846
- "chunk_mel_frames": 56,
847
- "total_mel_frames": 65,
848
- "att_context": [
849
- 42,
850
- 13
851
- ],
852
- "vocab_size": 13087,
853
- "vocab_pruned": false,
854
- "vocab_prune_method": "none (full vocab)",
855
- "components": [
856
- "decoder",
857
- "decoder_joint",
858
- "encoder",
859
- "joint",
860
- "preprocessor"
861
- ],
862
- "formats": [
863
- "mlmodelc"
864
- ],
865
- "benchmark": {
866
- "rtfx": 22.5,
867
- "cer_pct": 19.48,
868
- "metric": "CER",
869
- "n": 945,
870
- "test_set": "FLEURS cmn_hans_cn",
871
- "benched": true
872
- }
873
- },
874
- {
875
- "path": "zh/1120ms",
876
- "language": "Chinese",
877
- "language_code": "zh-CN",
878
- "chunk_ms": 1120,
879
- "latency_s": 1.12,
880
- "chunk_mel_frames": 112,
881
- "total_mel_frames": 121,
882
- "att_context": [
883
- 42,
884
- 13
885
- ],
886
- "vocab_size": 13087,
887
- "vocab_pruned": false,
888
- "vocab_prune_method": "none (full vocab)",
889
- "components": [
890
- "decoder",
891
- "decoder_joint",
892
- "encoder",
893
- "joint",
894
- "preprocessor"
895
- ],
896
- "formats": [
897
- "mlmodelc"
898
- ],
899
- "benchmark": {
900
- "rtfx": 26.6,
901
- "cer_pct": 18.75,
902
- "metric": "CER",
903
- "n": 945,
904
- "test_set": "FLEURS cmn_hans_cn",
905
- "benched": true
906
- }
907
- },
908
- {
909
- "path": "zh/2240ms",
910
- "language": "Chinese",
911
- "language_code": "zh-CN",
912
- "chunk_ms": 2240,
913
- "latency_s": 2.24,
914
- "chunk_mel_frames": 224,
915
- "total_mel_frames": 233,
916
- "att_context": [
917
- 42,
918
- 13
919
- ],
920
- "vocab_size": 13087,
921
- "vocab_pruned": false,
922
- "vocab_prune_method": "none (full vocab)",
923
- "components": [
924
- "decoder",
925
- "decoder_joint",
926
- "encoder",
927
- "joint",
928
- "preprocessor"
929
- ],
930
- "formats": [
931
- "mlmodelc"
932
- ],
933
- "benchmark": {
934
- "rtfx": 89.0,
935
- "cer_pct": 18.57,
936
- "metric": "CER",
937
- "n": 945,
938
- "test_set": "FLEURS cmn_hans_cn",
939
- "benched": true
940
- }
941
- },
942
- {
943
- "path": "zh/4480ms",
944
- "language": "Chinese",
945
- "language_code": "zh-CN",
946
- "chunk_ms": 4480,
947
- "latency_s": 4.48,
948
- "chunk_mel_frames": 448,
949
- "total_mel_frames": 457,
950
- "att_context": [
951
- 42,
952
- 13
953
- ],
954
- "vocab_size": 13087,
955
- "vocab_pruned": false,
956
- "vocab_prune_method": "none (full vocab)",
957
- "components": [
958
- "decoder",
959
- "decoder_joint",
960
- "encoder",
961
- "joint",
962
- "preprocessor"
963
- ],
964
- "formats": [
965
- "mlmodelc"
966
- ],
967
- "benchmark": {
968
- "rtfx": 89.6,
969
- "cer_pct": 18.05,
970
- "metric": "CER",
971
- "n": 945,
972
- "test_set": "FLEURS cmn_hans_cn",
973
- "benched": true
974
- }
975
- },
976
- {
977
- "path": "ja/560ms",
978
- "language": "Japanese",
979
- "language_code": "ja-JP",
980
- "chunk_ms": 560,
981
- "latency_s": 0.56,
982
- "chunk_mel_frames": 56,
983
- "total_mel_frames": 65,
984
- "att_context": [
985
- 42,
986
- 13
987
- ],
988
- "vocab_size": 13087,
989
- "vocab_pruned": false,
990
- "vocab_prune_method": "none (full vocab)",
991
- "components": [
992
- "decoder",
993
- "decoder_joint",
994
- "encoder",
995
- "joint",
996
- "preprocessor"
997
- ],
998
- "formats": [
999
- "mlmodelc"
1000
- ],
1001
- "benchmark": {
1002
- "rtfx": 20.7,
1003
- "cer_pct": 14.61,
1004
- "metric": "CER",
1005
- "n": 650,
1006
- "test_set": "FLEURS ja_jp",
1007
- "benched": true
1008
- }
1009
- },
1010
- {
1011
- "path": "ja/1120ms",
1012
- "language": "Japanese",
1013
- "language_code": "ja-JP",
1014
- "chunk_ms": 1120,
1015
- "latency_s": 1.12,
1016
- "chunk_mel_frames": 112,
1017
- "total_mel_frames": 121,
1018
- "att_context": [
1019
- 42,
1020
- 13
1021
- ],
1022
- "vocab_size": 13087,
1023
- "vocab_pruned": false,
1024
- "vocab_prune_method": "none (full vocab)",
1025
- "components": [
1026
- "decoder",
1027
- "decoder_joint",
1028
- "encoder",
1029
- "joint",
1030
- "preprocessor"
1031
- ],
1032
- "formats": [
1033
- "mlmodelc"
1034
- ],
1035
- "benchmark": {
1036
- "rtfx": 25.9,
1037
- "cer_pct": 13.77,
1038
- "metric": "CER",
1039
- "n": 650,
1040
- "test_set": "FLEURS ja_jp",
1041
- "benched": true
1042
- }
1043
- },
1044
- {
1045
- "path": "ja/2240ms",
1046
- "language": "Japanese",
1047
- "language_code": "ja-JP",
1048
- "chunk_ms": 2240,
1049
- "latency_s": 2.24,
1050
- "chunk_mel_frames": 224,
1051
- "total_mel_frames": 233,
1052
- "att_context": [
1053
- 42,
1054
- 13
1055
- ],
1056
- "vocab_size": 13087,
1057
- "vocab_pruned": false,
1058
- "vocab_prune_method": "none (full vocab)",
1059
- "components": [
1060
- "decoder",
1061
- "decoder_joint",
1062
- "encoder",
1063
- "joint",
1064
- "preprocessor"
1065
- ],
1066
- "formats": [
1067
- "mlmodelc"
1068
- ],
1069
- "benchmark": {
1070
- "rtfx": 84.2,
1071
- "cer_pct": 13.79,
1072
- "metric": "CER",
1073
- "n": 650,
1074
- "test_set": "FLEURS ja_jp",
1075
- "benched": true
1076
- }
1077
- },
1078
- {
1079
- "path": "ja/4480ms",
1080
- "language": "Japanese",
1081
- "language_code": "ja-JP",
1082
- "chunk_ms": 4480,
1083
- "latency_s": 4.48,
1084
- "chunk_mel_frames": 448,
1085
- "total_mel_frames": 457,
1086
- "att_context": [
1087
- 42,
1088
- 13
1089
- ],
1090
- "vocab_size": 13087,
1091
- "vocab_pruned": false,
1092
- "vocab_prune_method": "none (full vocab)",
1093
- "components": [
1094
- "decoder",
1095
- "decoder_joint",
1096
- "encoder",
1097
- "joint",
1098
- "preprocessor"
1099
- ],
1100
- "formats": [
1101
- "mlmodelc"
1102
- ],
1103
- "benchmark": {
1104
- "rtfx": 89.3,
1105
- "cer_pct": 13.82,
1106
- "metric": "CER",
1107
- "n": 650,
1108
- "test_set": "FLEURS ja_jp",
1109
- "benched": true
1110
- }
1111
  },
1112
  {
1113
  "path": "multilingual/560ms",
1114
- "language": "Multilingual",
1115
- "language_code": "auto",
 
 
 
 
1116
  "chunk_ms": 560,
1117
  "latency_s": 0.56,
1118
  "chunk_mel_frames": 56,
@@ -1134,19 +401,41 @@
1134
  "formats": [
1135
  "mlmodelc"
1136
  ],
1137
- "benchmark": {
1138
- "rtfx": 23.4,
1139
- "wer_pct": 9.15,
1140
- "metric": "WER",
1141
- "n": 647,
1142
- "test_set": "FLEURS en_us",
1143
- "benched": true
1144
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1145
  },
1146
  {
1147
  "path": "multilingual/1120ms",
1148
- "language": "Multilingual",
1149
- "language_code": "auto",
 
 
 
 
1150
  "chunk_ms": 1120,
1151
  "latency_s": 1.12,
1152
  "chunk_mel_frames": 112,
@@ -1168,19 +457,41 @@
1168
  "formats": [
1169
  "mlmodelc"
1170
  ],
1171
- "benchmark": {
1172
- "rtfx": 70.9,
1173
- "wer_pct": 8.64,
1174
- "metric": "WER",
1175
- "n": 647,
1176
- "test_set": "FLEURS en_us",
1177
- "benched": true
1178
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1179
  },
1180
  {
1181
  "path": "multilingual/2240ms",
1182
- "language": "Multilingual",
1183
- "language_code": "auto",
 
 
 
 
1184
  "chunk_ms": 2240,
1185
  "latency_s": 2.24,
1186
  "chunk_mel_frames": 224,
@@ -1202,19 +513,41 @@
1202
  "formats": [
1203
  "mlmodelc"
1204
  ],
1205
- "benchmark": {
1206
- "rtfx": 80.4,
1207
- "wer_pct": 8.76,
1208
- "metric": "WER",
1209
- "n": 647,
1210
- "test_set": "FLEURS en_us",
1211
- "benched": true
1212
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1213
  },
1214
  {
1215
  "path": "multilingual/4480ms",
1216
- "language": "Multilingual",
1217
- "language_code": "auto",
 
 
 
 
1218
  "chunk_ms": 4480,
1219
  "latency_s": 4.48,
1220
  "chunk_mel_frames": 448,
@@ -1236,14 +569,32 @@
1236
  "formats": [
1237
  "mlmodelc"
1238
  ],
1239
- "benchmark": {
1240
- "rtfx": 78.0,
1241
- "wer_pct": 8.78,
1242
- "metric": "WER",
1243
- "n": 647,
1244
- "test_set": "FLEURS en_us",
1245
- "benched": true
1246
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1247
  }
1248
  ]
1249
  }
 
6
  "runtime": "CoreML / Apple Neural Engine",
7
  "benchmark_machine": "Apple M5 Pro / macOS 26.5",
8
  "benchmark_dataset": "FLEURS test (all languages)",
9
+ "models": {
10
+ "latin": {
11
+ "folder": "latin",
12
+ "vocab_size": 2828,
13
+ "serves": [
14
+ "en",
15
+ "es",
16
+ "fr",
17
+ "it",
18
+ "pt",
19
+ "de"
20
+ ]
21
+ },
22
+ "multilingual": {
23
+ "folder": "multilingual",
24
+ "vocab_size": 13087,
25
+ "serves": [
26
+ "zh",
27
+ "ja",
28
+ "100+ via prompt_id"
29
+ ]
30
+ }
31
+ },
32
+ "recipe": "LAYERPOS [42,13] mixed-precision encoder (INT8 cuff + 6-bit middle) + Latin-script vocab prune (latin model) / full vocab (multilingual) + B1 decoder-joint fusion + triple-stage pipelining. Encoder shared across both models per tier.",
33
  "no_retraining": true,
34
  "no_calibration": true,
35
  "tiers_ms": [
36
  560,
37
  1120,
38
  2240,
39
+ 4480
40
+ ],
41
+ "recommended_tier_ms": 2240,
42
+ "ships": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  {
44
+ "path": "latin/560ms",
45
+ "model": "latin",
46
+ "languages_served": [
47
+ "en",
48
+ "es",
49
+ "fr",
50
+ "it",
51
+ "pt",
52
+ "de"
 
 
 
 
 
 
 
 
 
 
 
53
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  "chunk_ms": 560,
55
  "latency_s": 0.56,
56
  "chunk_mel_frames": 56,
 
61
  ],
62
  "vocab_size": 2828,
63
  "vocab_pruned": true,
64
+ "vocab_prune_method": "latin-script (writing-system, domain-general)",
65
+ "components": [
66
+ "decoder",
67
+ "decoder_joint",
68
+ "encoder",
69
+ "joint",
70
+ "preprocessor"
71
+ ],
72
+ "formats": [
73
+ "mlmodelc"
74
+ ],
75
+ "benchmarks": [
76
+ {
77
+ "language_code": "en",
78
+ "rtfx": 57.5,
79
+ "metric": "WER",
80
+ "n": 647,
81
+ "test_set": "FLEURS en_us",
82
+ "wer_pct": 9.43
83
+ },
84
+ {
85
+ "language_code": "es",
86
+ "rtfx": 58.2,
87
+ "metric": "WER",
88
+ "n": 908,
89
+ "test_set": "FLEURS es_419",
90
+ "wer_pct": 4.95
91
+ },
92
+ {
93
+ "language_code": "fr",
94
+ "rtfx": 57.4,
95
+ "metric": "WER",
96
+ "n": 676,
97
+ "test_set": "FLEURS fr_fr",
98
+ "wer_pct": 9.68
99
+ },
100
+ {
101
+ "language_code": "it",
102
+ "rtfx": 59.0,
103
+ "metric": "WER",
104
+ "n": 865,
105
+ "test_set": "FLEURS it_it",
106
+ "wer_pct": 5.68
107
+ },
108
+ {
109
+ "language_code": "pt",
110
+ "rtfx": 58.7,
111
+ "metric": "WER",
112
+ "n": 919,
113
+ "test_set": "FLEURS pt_br",
114
+ "wer_pct": 6.38
115
+ },
116
+ {
117
+ "language_code": "de",
118
+ "rtfx": 58.8,
119
+ "metric": "WER",
120
+ "n": 862,
121
+ "test_set": "FLEURS de_de",
122
+ "wer_pct": 10.83
123
+ }
124
+ ]
125
+ },
126
+ {
127
+ "path": "latin/1120ms",
128
+ "model": "latin",
129
+ "languages_served": [
130
+ "en",
131
+ "es",
132
+ "fr",
133
+ "it",
134
+ "pt",
135
+ "de"
136
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  "chunk_ms": 1120,
138
  "latency_s": 1.12,
139
  "chunk_mel_frames": 112,
 
144
  ],
145
  "vocab_size": 2828,
146
  "vocab_pruned": true,
147
+ "vocab_prune_method": "latin-script (writing-system, domain-general)",
148
+ "components": [
149
+ "decoder",
150
+ "decoder_joint",
151
+ "encoder",
152
+ "joint",
153
+ "preprocessor"
154
+ ],
155
+ "formats": [
156
+ "mlmodelc"
157
+ ],
158
+ "benchmarks": [
159
+ {
160
+ "language_code": "en",
161
+ "rtfx": 102.9,
162
+ "metric": "WER",
163
+ "n": 647,
164
+ "test_set": "FLEURS en_us",
165
+ "wer_pct": 8.89
166
+ },
167
+ {
168
+ "language_code": "es",
169
+ "rtfx": 106.5,
170
+ "metric": "WER",
171
+ "n": 908,
172
+ "test_set": "FLEURS es_419",
173
+ "wer_pct": 4.76
174
+ },
175
+ {
176
+ "language_code": "fr",
177
+ "rtfx": 104.7,
178
+ "metric": "WER",
179
+ "n": 676,
180
+ "test_set": "FLEURS fr_fr",
181
+ "wer_pct": 9.44
182
+ },
183
+ {
184
+ "language_code": "it",
185
+ "rtfx": 109.0,
186
+ "metric": "WER",
187
+ "n": 865,
188
+ "test_set": "FLEURS it_it",
189
+ "wer_pct": 5.45
190
+ },
191
+ {
192
+ "language_code": "pt",
193
+ "rtfx": 107.6,
194
+ "metric": "WER",
195
+ "n": 919,
196
+ "test_set": "FLEURS pt_br",
197
+ "wer_pct": 6.11
198
+ },
199
+ {
200
+ "language_code": "de",
201
+ "rtfx": 107.2,
202
+ "metric": "WER",
203
+ "n": 862,
204
+ "test_set": "FLEURS de_de",
205
+ "wer_pct": 9.78
206
+ }
207
+ ]
208
+ },
209
+ {
210
+ "path": "latin/2240ms",
211
+ "model": "latin",
212
+ "languages_served": [
213
+ "en",
214
+ "es",
215
+ "fr",
216
+ "it",
217
+ "pt",
218
+ "de"
219
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  "chunk_ms": 2240,
221
  "latency_s": 2.24,
222
  "chunk_mel_frames": 224,
 
227
  ],
228
  "vocab_size": 2828,
229
  "vocab_pruned": true,
230
+ "vocab_prune_method": "latin-script (writing-system, domain-general)",
231
+ "components": [
232
+ "decoder",
233
+ "decoder_joint",
234
+ "encoder",
235
+ "joint",
236
+ "preprocessor"
237
+ ],
238
+ "formats": [
239
+ "mlmodelc"
240
+ ],
241
+ "benchmarks": [
242
+ {
243
+ "language_code": "en",
244
+ "rtfx": 130.2,
245
+ "metric": "WER",
246
+ "n": 647,
247
+ "test_set": "FLEURS en_us",
248
+ "wer_pct": 8.96
249
+ },
250
+ {
251
+ "language_code": "es",
252
+ "rtfx": 139.6,
253
+ "metric": "WER",
254
+ "n": 908,
255
+ "test_set": "FLEURS es_419",
256
+ "wer_pct": 4.8
257
+ },
258
+ {
259
+ "language_code": "fr",
260
+ "rtfx": 130.4,
261
+ "metric": "WER",
262
+ "n": 676,
263
+ "test_set": "FLEURS fr_fr",
264
+ "wer_pct": 9.52
265
+ },
266
+ {
267
+ "language_code": "it",
268
+ "rtfx": 146.7,
269
+ "metric": "WER",
270
+ "n": 865,
271
+ "test_set": "FLEURS it_it",
272
+ "wer_pct": 5.41
273
+ },
274
+ {
275
+ "language_code": "pt",
276
+ "rtfx": 141.0,
277
+ "metric": "WER",
278
+ "n": 919,
279
+ "test_set": "FLEURS pt_br",
280
+ "wer_pct": 6.14
281
+ },
282
+ {
283
+ "language_code": "de",
284
+ "rtfx": 144.4,
285
+ "metric": "WER",
286
+ "n": 862,
287
+ "test_set": "FLEURS de_de",
288
+ "wer_pct": 9.83
289
+ }
290
+ ]
291
+ },
292
+ {
293
+ "path": "latin/4480ms",
294
+ "model": "latin",
295
+ "languages_served": [
296
+ "en",
297
+ "es",
298
+ "fr",
299
+ "it",
300
+ "pt",
301
+ "de"
302
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  "chunk_ms": 4480,
304
  "latency_s": 4.48,
305
  "chunk_mel_frames": 448,
 
310
  ],
311
  "vocab_size": 2828,
312
  "vocab_pruned": true,
313
+ "vocab_prune_method": "latin-script (writing-system, domain-general)",
314
+ "components": [
315
+ "decoder",
316
+ "decoder_joint",
317
+ "encoder",
318
+ "joint",
319
+ "preprocessor"
320
+ ],
321
+ "formats": [
322
+ "mlmodelc"
323
+ ],
324
+ "benchmarks": [
325
+ {
326
+ "language_code": "en",
327
+ "rtfx": 122.1,
328
+ "metric": "WER",
329
+ "n": 647,
330
+ "test_set": "FLEURS en_us",
331
+ "wer_pct": 9.02
332
+ },
333
+ {
334
+ "language_code": "es",
335
+ "rtfx": 135.8,
336
+ "metric": "WER",
337
+ "n": 908,
338
+ "test_set": "FLEURS es_419",
339
+ "wer_pct": 4.77
340
+ },
341
+ {
342
+ "language_code": "fr",
343
+ "rtfx": 124.3,
344
+ "metric": "WER",
345
+ "n": 676,
346
+ "test_set": "FLEURS fr_fr",
347
+ "wer_pct": 9.42
348
+ },
349
+ {
350
+ "language_code": "it",
351
+ "rtfx": 150.5,
352
+ "metric": "WER",
353
+ "n": 865,
354
+ "test_set": "FLEURS it_it",
355
+ "wer_pct": 5.4
356
+ },
357
+ {
358
+ "language_code": "pt",
359
+ "rtfx": 141.3,
360
+ "metric": "WER",
361
+ "n": 919,
362
+ "test_set": "FLEURS pt_br",
363
+ "wer_pct": 6.18
364
+ },
365
+ {
366
+ "language_code": "de",
367
+ "rtfx": 141.5,
368
+ "metric": "WER",
369
+ "n": 862,
370
+ "test_set": "FLEURS de_de",
371
+ "wer_pct": 9.83
372
+ }
373
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  },
375
  {
376
  "path": "multilingual/560ms",
377
+ "model": "multilingual",
378
+ "languages_served": [
379
+ "zh",
380
+ "ja",
381
+ "100+ via prompt_id"
382
+ ],
383
  "chunk_ms": 560,
384
  "latency_s": 0.56,
385
  "chunk_mel_frames": 56,
 
401
  "formats": [
402
  "mlmodelc"
403
  ],
404
+ "benchmarks": [
405
+ {
406
+ "language_code": "zh",
407
+ "rtfx": 22.5,
408
+ "metric": "CER",
409
+ "n": 945,
410
+ "test_set": "FLEURS cmn_hans_cn",
411
+ "cer_pct": 19.48
412
+ },
413
+ {
414
+ "language_code": "ja",
415
+ "rtfx": 20.7,
416
+ "metric": "CER",
417
+ "n": 650,
418
+ "test_set": "FLEURS ja_jp",
419
+ "cer_pct": 14.61
420
+ },
421
+ {
422
+ "language_code": "multilingual",
423
+ "rtfx": 23.4,
424
+ "metric": "WER",
425
+ "n": 647,
426
+ "test_set": "FLEURS en_us",
427
+ "wer_pct": 9.15
428
+ }
429
+ ]
430
  },
431
  {
432
  "path": "multilingual/1120ms",
433
+ "model": "multilingual",
434
+ "languages_served": [
435
+ "zh",
436
+ "ja",
437
+ "100+ via prompt_id"
438
+ ],
439
  "chunk_ms": 1120,
440
  "latency_s": 1.12,
441
  "chunk_mel_frames": 112,
 
457
  "formats": [
458
  "mlmodelc"
459
  ],
460
+ "benchmarks": [
461
+ {
462
+ "language_code": "zh",
463
+ "rtfx": 26.6,
464
+ "metric": "CER",
465
+ "n": 945,
466
+ "test_set": "FLEURS cmn_hans_cn",
467
+ "cer_pct": 18.75
468
+ },
469
+ {
470
+ "language_code": "ja",
471
+ "rtfx": 25.9,
472
+ "metric": "CER",
473
+ "n": 650,
474
+ "test_set": "FLEURS ja_jp",
475
+ "cer_pct": 13.77
476
+ },
477
+ {
478
+ "language_code": "multilingual",
479
+ "rtfx": 70.9,
480
+ "metric": "WER",
481
+ "n": 647,
482
+ "test_set": "FLEURS en_us",
483
+ "wer_pct": 8.64
484
+ }
485
+ ]
486
  },
487
  {
488
  "path": "multilingual/2240ms",
489
+ "model": "multilingual",
490
+ "languages_served": [
491
+ "zh",
492
+ "ja",
493
+ "100+ via prompt_id"
494
+ ],
495
  "chunk_ms": 2240,
496
  "latency_s": 2.24,
497
  "chunk_mel_frames": 224,
 
513
  "formats": [
514
  "mlmodelc"
515
  ],
516
+ "benchmarks": [
517
+ {
518
+ "language_code": "zh",
519
+ "rtfx": 89.0,
520
+ "metric": "CER",
521
+ "n": 945,
522
+ "test_set": "FLEURS cmn_hans_cn",
523
+ "cer_pct": 18.57
524
+ },
525
+ {
526
+ "language_code": "ja",
527
+ "rtfx": 84.2,
528
+ "metric": "CER",
529
+ "n": 650,
530
+ "test_set": "FLEURS ja_jp",
531
+ "cer_pct": 13.79
532
+ },
533
+ {
534
+ "language_code": "multilingual",
535
+ "rtfx": 80.4,
536
+ "metric": "WER",
537
+ "n": 647,
538
+ "test_set": "FLEURS en_us",
539
+ "wer_pct": 8.76
540
+ }
541
+ ]
542
  },
543
  {
544
  "path": "multilingual/4480ms",
545
+ "model": "multilingual",
546
+ "languages_served": [
547
+ "zh",
548
+ "ja",
549
+ "100+ via prompt_id"
550
+ ],
551
  "chunk_ms": 4480,
552
  "latency_s": 4.48,
553
  "chunk_mel_frames": 448,
 
569
  "formats": [
570
  "mlmodelc"
571
  ],
572
+ "benchmarks": [
573
+ {
574
+ "language_code": "zh",
575
+ "rtfx": 89.6,
576
+ "metric": "CER",
577
+ "n": 945,
578
+ "test_set": "FLEURS cmn_hans_cn",
579
+ "cer_pct": 18.05
580
+ },
581
+ {
582
+ "language_code": "ja",
583
+ "rtfx": 89.3,
584
+ "metric": "CER",
585
+ "n": 650,
586
+ "test_set": "FLEURS ja_jp",
587
+ "cer_pct": 13.82
588
+ },
589
+ {
590
+ "language_code": "multilingual",
591
+ "rtfx": 78.0,
592
+ "metric": "WER",
593
+ "n": 647,
594
+ "test_set": "FLEURS en_us",
595
+ "wer_pct": 8.78
596
+ }
597
+ ]
598
  }
599
  ]
600
  }