# Model Parameter Counts Dictionary for PazaBench Leaderboard
# Used as bubble sizes in the speed vs accuracy tradeoff chart
MODEL_PARAMETER_COUNTS = {
    # Facebook Data2Vec family
    "facebook/data2vec-audio-base-960h": 94_400_000,  # 94.4M
    "facebook/data2vec-audio-large-960h": 315_000_000,  # ~315M (large architecture)

    # Facebook MMS family
    "facebook/mms-1b-all": 1_000_000_000,  # 1B params
    "facebook/mms-1b-fl102": 1_000_000_000,  # 1B params

    # Facebook Wav2Vec2 family
    "facebook/wav2vec2-base-960h": 94_400_000,  # 94.4M
    "facebook/wav2vec2-large-960h": 315_000_000,  # ~315M (large)
    "facebook/wav2vec2-large-960h-lv60-self": 315_000_000,  # ~315M (large)
    "facebook/wav2vec2-large-robust-ft-libri-960h": 315_000_000,  # ~315M

    # Facebook Wav2Vec2 Conformer family
    "facebook/wav2vec2-conformer-rel-pos-large-960h-ft": 600_000_000,  # 600M
    "facebook/wav2vec2-conformer-rope-large-960h-ft": 600_000_000,  # 600M

    # Facebook HuBERT family
    "facebook/hubert-large-ls960-ft": 315_000_000,  # ~315M (large)
    "facebook/hubert-xlarge-ls960-ft": 1_000_000_000,  # 1B params

    # IBM Granite Speech family
    # Total params include speech encoder + LLM backbone
    "ibm/granite-granite-speech-3.3-2b": 3_000_000_000,  # 3B params
    "ibm/granite-granite-speech-3.3-8b": 9_000_000_000,  # 9B params

    # Kyutai family
    "kyutai/stt-2.6b-en": 2_600_000_000,  # 2.6B params

    # Lite ASR / EfficientSpeech family
    "efficient/speech-lite-whisper-large-v3": 1_000_000_000,  # ~1B (large-v3 compression)
    "efficient/speech-lite-whisper-large-v3-acc": 1_000_000_000,  # ~1B (accuracy-optimized)
    "efficient/speech-lite-whisper-large-v3-fast": 1_000_000_000,  # ~1B (speed-optimized)
    "efficient/speech-lite-whisper-large-v3-turbo": 600_000_000,  # ~0.6B (turbo compression)
    "efficient/speech-lite-whisper-large-v3-turbo-acc": 600_000_000,  # ~0.6B (turbo accuracy)
    "efficient/speech-lite-whisper-large-v3-turbo-fast": 600_000_000,  # ~0.6B (turbo fast)

    # Moonshine family (Useful Sensors)
    "usefulsensors/moonshine-tiny": 27_100_000,  # 27.1M params
    "usefulsensors/moonshine-base": 61_500_000,  # 61.5M params

    # OpenAI Whisper family
    "openai/whisper-tiny.en": 37_800_000,  # 37.8M
    "openai/whisper-base.en": 72_600_000,  # 72.6M
    "openai/whisper-small.en": 244_000_000,  # 244M
    "openai/whisper-medium.en": 769_000_000,  # 769M
    "openai/whisper-large": 1_550_000_000,  # 1550M
    "openai/whisper-large-v2": 1_550_000_000,  # 1550M
    "openai/whisper-large-v3": 1_550_000_000,  # 1550M
    "openai/whisper-large-v3-turbo": 809_000_000,  # 809M

    # Distil-Whisper family
    "distil/whisper-distil-large-v2": 756_000_000,  # 756M
    "distil/whisper-distil-large-v3": 756_000_000,  # 756M
    "distil/whisper-distil-medium.en": 394_000_000,  # 394M

    # Paza family
    "paza/microsoft-paza-Phi-4-multimodal-instruct": 5_600_000_000,  # 5.6B
    "paza/microsoft-paza-mms-1b-all": 1_000_000_000,  # 1B
    "paza/microsoft-paza-whisper-large-v3-turbo": 809_000_000,  # 809M

    # Qwen2 Audio family
    "Qwen/Qwen2-Audio-7B": 8_000_000_000,  # 8B params
    "Qwen/Qwen2-Audio-7B-Instruct": 8_000_000_000,  # 8B params

    # OmniASR family - CTC models
    "facebook/omniASR-CTC-300M": 325_494_996,  # 325M
    "facebook/omniASR-CTC-1B": 975_065_300,  # 975M
    "facebook/omniASR-CTC-3B": 3_080_423_636,  # 3.08B
    "facebook/omniASR-CTC-7B": 6_504_786_132,  # 6.5B

    # OmniASR family - LLM models
    "facebook/omniASR-LLM-300M": 1_627_603_584,  # 1.6B
    "facebook/omniASR-LLM-1B": 2_275_710_592,  # 2.3B
    "facebook/omniASR-LLM-3B": 4_376_679_040,  # 4.4B
    "facebook/omniASR-LLM-7B": 7_801_041_536,  # 7.8B

    # OmniASR family - Zero-shot model
    "facebook/omniASR-LLM-7B-ZS": 7_810_900_608,  # 7.8B

    # Microsoft Phi-4 family
    "microsoft/Phi-4-multimodal-instruct": 5_600_000_000,  # 5.6B params

    # NVIDIA NeMo ASR family
    "nvidia/canary-1b": 1_000_000_000,  # ~1B (FastConformer encoder-decoder)
    "nvidia/canary-1b-v2": 1_000_000_000,  # ~1B params
    "nvidia/canary-qwen-2.5b": 2_500_000_000,  # 2.5B params
    "nvidia/parakeet-tdt-0.6b-v2": 600_000_000,  # 600M
    "nvidia/parakeet-tdt-0.6b-v3": 600_000_000,  # 600M params
}