File size: 4,761 Bytes
53a73e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Model parameter counts for the PazaBench leaderboard.
# Used as bubble sizes in the speech vs accuracy tradeoff chart.
#
# Each model family is kept in its own private table for readability; they
# are merged — in the same order — into the public MODEL_PARAMETER_COUNTS
# mapping at the bottom, so insertion order matches the family grouping.

# Facebook Data2Vec family
_DATA2VEC_PARAMS = {
    "facebook/data2vec-audio-base-960h": 94_400_000,       # 94.4M
    "facebook/data2vec-audio-large-960h": 315_000_000,     # ~315M (large architecture)
}

# Facebook MMS family
_MMS_PARAMS = {
    "facebook/mms-1b-all": 1_000_000_000,                  # 1B params
    "facebook/mms-1b-fl102": 1_000_000_000,                # 1B params
}

# Facebook Wav2Vec2 family
_WAV2VEC2_PARAMS = {
    "facebook/wav2vec2-base-960h": 94_400_000,             # 94.4M
    "facebook/wav2vec2-large-960h": 315_000_000,           # ~315M (large)
    "facebook/wav2vec2-large-960h-lv60-self": 315_000_000,  # ~315M (large)
    "facebook/wav2vec2-large-robust-ft-libri-960h": 315_000_000,  # ~315M
}

# Facebook Wav2Vec2 Conformer family
_WAV2VEC2_CONFORMER_PARAMS = {
    "facebook/wav2vec2-conformer-rel-pos-large-960h-ft": 600_000_000,  # 600M
    "facebook/wav2vec2-conformer-rope-large-960h-ft": 600_000_000,     # 600M
}

# Facebook HuBERT family
_HUBERT_PARAMS = {
    "facebook/hubert-large-ls960-ft": 315_000_000,         # ~315M (large)
    "facebook/hubert-xlarge-ls960-ft": 1_000_000_000,      # 1B params
}

# IBM Granite Speech family.
# Totals include the speech encoder plus the LLM backbone.
_GRANITE_PARAMS = {
    "ibm/granite-granite-speech-3.3-2b": 3_000_000_000,    # 3B params
    "ibm/granite-granite-speech-3.3-8b": 9_000_000_000,    # 9B params
}

# Kyutai family
_KYUTAI_PARAMS = {
    "kyutai/stt-2.6b-en": 2_600_000_000,                   # 2.6B params
}

# Lite ASR / EfficientSpeech family (compressed Whisper variants)
_LITE_WHISPER_PARAMS = {
    "efficient/speech-lite-whisper-large-v3": 1_000_000_000,          # ~1B (large-v3 compression)
    "efficient/speech-lite-whisper-large-v3-acc": 1_000_000_000,      # ~1B (accuracy-optimized)
    "efficient/speech-lite-whisper-large-v3-fast": 1_000_000_000,     # ~1B (speed-optimized)
    "efficient/speech-lite-whisper-large-v3-turbo": 600_000_000,      # ~0.6B (turbo compression)
    "efficient/speech-lite-whisper-large-v3-turbo-acc": 600_000_000,  # ~0.6B (turbo accuracy)
    "efficient/speech-lite-whisper-large-v3-turbo-fast": 600_000_000,  # ~0.6B (turbo fast)
}

# Moonshine family (Useful Sensors)
_MOONSHINE_PARAMS = {
    "usefulsensors/moonshine-tiny": 27_100_000,            # 27.1M params
    "usefulsensors/moonshine-base": 61_500_000,            # 61.5M params
}

# OpenAI Whisper family
_WHISPER_PARAMS = {
    "openai/whisper-tiny.en": 37_800_000,                  # 37.8M
    "openai/whisper-base.en": 72_600_000,                  # 72.6M
    "openai/whisper-small.en": 244_000_000,                # 244M
    "openai/whisper-medium.en": 769_000_000,               # 769M
    "openai/whisper-large": 1_550_000_000,                 # 1550M
    "openai/whisper-large-v2": 1_550_000_000,              # 1550M
    "openai/whisper-large-v3": 1_550_000_000,              # 1550M
    "openai/whisper-large-v3-turbo": 809_000_000,          # 809M
}

# Distil-Whisper family
_DISTIL_WHISPER_PARAMS = {
    "distil/whisper-distil-large-v2": 756_000_000,         # 756M
    "distil/whisper-distil-large-v3": 756_000_000,         # 756M
    "distil/whisper-distil-medium.en": 394_000_000,        # 394M
}

# Paza family
_PAZA_PARAMS = {
    "paza/microsoft-paza-Phi-4-multimodal-instruct": 5_600_000_000,  # 5.6B
    "paza/microsoft-paza-mms-1b-all": 1_000_000_000,       # 1B
    "paza/microsoft-paza-whisper-large-v3-turbo": 809_000_000,  # 809M
}

# Qwen2 Audio family
_QWEN2_AUDIO_PARAMS = {
    "Qwen/Qwen2-Audio-7B": 8_000_000_000,                  # 8B params
    "Qwen/Qwen2-Audio-7B-Instruct": 8_000_000_000,         # 8B params
}

# OmniASR family - CTC models
_OMNIASR_CTC_PARAMS = {
    "facebook/omniASR-CTC-300M": 325_494_996,              # 325M
    "facebook/omniASR-CTC-1B": 975_065_300,                # 975M
    "facebook/omniASR-CTC-3B": 3_080_423_636,              # 3.08B
    "facebook/omniASR-CTC-7B": 6_504_786_132,              # 6.5B
}

# OmniASR family - LLM models (including the zero-shot variant)
_OMNIASR_LLM_PARAMS = {
    "facebook/omniASR-LLM-300M": 1_627_603_584,            # 1.6B
    "facebook/omniASR-LLM-1B": 2_275_710_592,              # 2.3B
    "facebook/omniASR-LLM-3B": 4_376_679_040,              # 4.4B
    "facebook/omniASR-LLM-7B": 7_801_041_536,              # 7.8B
    "facebook/omniASR-LLM-7B-ZS": 7_810_900_608,           # 7.8B (zero-shot)
}

# Microsoft Phi-4 family
_PHI4_PARAMS = {
    "microsoft/Phi-4-multimodal-instruct": 5_600_000_000,  # 5.6B params
}

# NVIDIA NeMo ASR family
_NVIDIA_PARAMS = {
    "nvidia/canary-1b": 1_000_000_000,                     # ~1B (FastConformer encoder-decoder)
    "nvidia/canary-1b-v2": 1_000_000_000,                  # ~1B params
    "nvidia/canary-qwen-2.5b": 2_500_000_000,              # 2.5B params
    "nvidia/parakeet-tdt-0.6b-v2": 600_000_000,            # 600M
    "nvidia/parakeet-tdt-0.6b-v3": 600_000_000,            # 600M params
}

# Public mapping: model identifier -> total parameter count.
MODEL_PARAMETER_COUNTS: dict[str, int] = {
    **_DATA2VEC_PARAMS,
    **_MMS_PARAMS,
    **_WAV2VEC2_PARAMS,
    **_WAV2VEC2_CONFORMER_PARAMS,
    **_HUBERT_PARAMS,
    **_GRANITE_PARAMS,
    **_KYUTAI_PARAMS,
    **_LITE_WHISPER_PARAMS,
    **_MOONSHINE_PARAMS,
    **_WHISPER_PARAMS,
    **_DISTIL_WHISPER_PARAMS,
    **_PAZA_PARAMS,
    **_QWEN2_AUDIO_PARAMS,
    **_OMNIASR_CTC_PARAMS,
    **_OMNIASR_LLM_PARAMS,
    **_PHI4_PARAMS,
    **_NVIDIA_PARAMS,
}