# paza-bench / src / about.py
# (Hugging Face Space metadata: uploaded by muchai-mercy, commit "update pazabench space", 53a73e0)
# Your leaderboard name
# Raw HTML heading rendered at the top of the Space (presumably via a Gradio
# HTML/Markdown component — confirm in the app entry point).
TITLE = """<h1 align="center" id="space-title">PazaBench Leaderboard</h1>"""
# What does your leaderboard evaluate?
# Markdown blurb shown under the title: scope (39 African languages, 52 models)
# and the three reported metrics (CER, WER, RTFx).
INTRODUCTION_TEXT = """
The PazaBench Leaderboard is an Automatic Speech Recognition (ASR) benchmark for low-resource languages developed by the **[Microsoft Research Africa, Nairobi Lab](https://www.microsoft.com/en-us/research/lab/microsoft-research-lab-africa-nairobi/)**. Launching with **39 African Languages** across **52 State-of-the-Art ASR** and **Language Models**, PazaBench compares three key metrics: **Character Error Rate (CER)**, **Word Error Rate (WER)**, and **RTFx (Inverse Real-Time Factor)**.
"""
# Which evaluations are you running? how can people reproduce what you have?
# Markdown describing the unified evaluation dataset: 7 source corpora,
# 61 test splits, 204,492 samples of 16 kHz mono speech with transcriptions.
LLM_BENCHMARKS_TEXT = """
## PazaBench Inputs
### Evaluation Dataset
The PazaBench evaluation dataset is unified from 7 datasets: [African Next Voices Kenya](https://huggingface.co/datasets/MCAA1-MSU/anv_data_ke), [African Next Voices South Africa](https://huggingface.co/datasets/dsfsi-anv/za-african-next-voices), [ALFFA](https://openslr.org/25/), [DigiGreen Kikuyu ASR](https://huggingface.co/datasets/DigiGreen/KikuyuASR_trainingdataset), [Google FLEURS](https://huggingface.co/datasets/google/fleurs), [Mozilla Common Voice 23.0](https://commonvoice.mozilla.org/), and [Naija Voices](https://huggingface.co/datasets/naijavoices/naijavoices-dataset). It captures **61 test splits** across the listed **39 languages** and adds up to **204,492 samples** with modalities limited to 16 kHz mono speech with aligned transcriptions and per-split metadata. For each language, the dataset groups representing that language are unified to provide a balanced measure of model performance.
"""
# Markdown for the datasets/models documentation tab: per-dataset breakdown,
# the 16 model families (52 models) evaluated, and acknowledgements.
# FIX: the Mozilla Common Voice row enumerates 23 languages but was labeled
# "(**20 languages**)"; the count is corrected to 23 to match the list.
LLM_BENCHMARKS_DATASETS_TEXT = """
## Datasets
| Dataset | Description | Languages | Total Samples | License |
|---------|-------------|-----------|------------------|---------|
| [Mozilla Common Voice 23.0](https://datacollective.mozillafoundation.org/datasets) | Crowdsourced speech | Afrikaans, Amharic, Arabic, Basaa, Dholuo, Dioula, Ekoti, Hausa, Igbo, Kabyle, Kalenjin, Kidaw'ida, Kinyarwanda, Luganda, Nyungwe, Setswana, Swahili, Tamazight, Tigre, Tigrinya, Twi, Yoruba, Zulu (**23 languages**) | 76,747 | CC0 1.0 |
| [Naija Voices](https://huggingface.co/datasets/naijavoices/naijavoices-dataset) | Conversational speech from Nigeria | Hausa, Igbo, Yoruba (**3 languages**) | 57,529 | CC BY-NC-SA 4.0 |
| [African Next Voices Kenya](https://huggingface.co/datasets/MCAA1-MSU/anv_data_ke) | Conversational speech | Dholuo, Kalenjin, Kikuyu, Maasai, Somali (**5 languages**) | 26,415 | CC BY 4.0 |
| [African Next Voices South Africa](https://huggingface.co/datasets/dsfsi-anv/za-african-next-voices) | Conversational speech | Sesotho, Setswana, Tshivenda, Xhosa, Xitsonga, Zulu (**6 languages**) | 23,701 | CC BY 4.0 |
| [Google FLEURS](https://huggingface.co/datasets/google/fleurs) | Read speech | Afrikaans, Amharic, Dholuo, Fula, Ganda, Hausa, Igbo, Kamba, Lingala, Northern Sotho, Nyanja, Oromo, Shona, Somali, Swahili, Umbundu, Wolof, Xhosa, Yoruba, Zulu (**20 languages**) | 12,418 | CC BY 4.0 |
| [DigiGreen Kikuyu ASR](https://huggingface.co/datasets/DigiGreen/KikuyuASR_trainingdataset) | Agricultural speech from farmers | Kikuyu (**1 language**) | 5,054 | Apache 2.0 |
| [ALFFA](https://openslr.org/25/) | Broadcast news and read speech | Amharic, Swahili, Wolof (**3 languages**) | 4,350 | MIT |
---
## Evaluated Models
PazaBench evaluates **16 SOTA ASR model families across 52 individual models** listed below:
| Model Family | Model |
|--------------|-------|
| Paza by Microsoft Research Africa, Nairobi | [microsoft/paza-Phi-4-multimodal-instruct](https://huggingface.co/microsoft/paza-Phi-4-multimodal-instruct), [microsoft/paza-mms-1b-all](https://huggingface.co/microsoft/paza-mms-1b-all), [microsoft/paza-whisper-large-v3-turbo](https://huggingface.co/microsoft/paza-whisper-large-v3-turbo) |
| Distil Whisper | [distil-whisper/distil-large-v2](https://huggingface.co/distil-whisper/distil-large-v2), [distil-whisper/distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3), [distil-whisper/distil-medium.en](https://huggingface.co/distil-whisper/distil-medium.en) |
| Facebook Data2Vec | [facebook/data2vec-audio-base-960h](https://huggingface.co/facebook/data2vec-audio-base-960h), [facebook/data2vec-audio-large-960h](https://huggingface.co/facebook/data2vec-audio-large-960h) |
| Facebook HuBERT | [facebook/hubert-large-ls960-ft](https://huggingface.co/facebook/hubert-large-ls960-ft), [facebook/hubert-xlarge-ls960-ft](https://huggingface.co/facebook/hubert-xlarge-ls960-ft) |
| Facebook MMS | [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all), [facebook/mms-1b-fl102](https://huggingface.co/facebook/mms-1b-fl102) |
| Facebook Omnilingual ASR | [facebook/omniASR-CTC-300M](https://huggingface.co/facebook/omniASR-CTC-300M), [facebook/omniASR-CTC-1B](https://huggingface.co/facebook/omniASR-CTC-1B), [facebook/omniASR-CTC-3B](https://huggingface.co/facebook/omniASR-CTC-3B), [facebook/omniASR-CTC-7B](https://huggingface.co/facebook/omniASR-CTC-7B), [facebook/omniASR-LLM-300M](https://huggingface.co/facebook/omniASR-LLM-300M), [facebook/omniASR-LLM-1B](https://huggingface.co/facebook/omniASR-LLM-1B), [facebook/omniASR-LLM-3B](https://huggingface.co/facebook/omniASR-LLM-3B), [facebook/omniASR-LLM-7B](https://huggingface.co/facebook/omniASR-LLM-7B), [facebook/omniASR-LLM-7B-ZS](https://huggingface.co/facebook/omniASR-LLM-7B-ZS) |
| Facebook Wav2Vec2 | [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h), [facebook/wav2vec2-large-960h](https://huggingface.co/facebook/wav2vec2-large-960h), [facebook/wav2vec2-large-960h-lv60-self](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), [facebook/wav2vec2-large-robust-ft-libri-960h](https://huggingface.co/facebook/wav2vec2-large-robust-ft-libri-960h) |
| Facebook Wav2Vec2 Conformer | [facebook/wav2vec2-conformer-rel-pos-large-960h-ft](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large-960h-ft), [facebook/wav2vec2-conformer-rope-large-960h-ft](https://huggingface.co/facebook/wav2vec2-conformer-rope-large-960h-ft) |
| IBM Granite Speech | [ibm-granite/granite-speech-3.3-2b](https://huggingface.co/ibm-granite/granite-speech-3.3-2b), [ibm-granite/granite-speech-3.3-8b](https://huggingface.co/ibm-granite/granite-speech-3.3-8b) |
| Kyutai STT | [kyutai/stt-2.6b-en](https://huggingface.co/kyutai/stt-2.6b-en) |
| Lite ASR (EfficientSpeech) | [efficient-speech/lite-whisper-large-v3](https://huggingface.co/efficient-speech/lite-whisper-large-v3), [efficient-speech/lite-whisper-large-v3-acc](https://huggingface.co/efficient-speech/lite-whisper-large-v3-acc), [efficient-speech/lite-whisper-large-v3-fast](https://huggingface.co/efficient-speech/lite-whisper-large-v3-fast), [efficient-speech/lite-whisper-large-v3-turbo](https://huggingface.co/efficient-speech/lite-whisper-large-v3-turbo), [efficient-speech/lite-whisper-large-v3-turbo-acc](https://huggingface.co/efficient-speech/lite-whisper-large-v3-turbo-acc), [efficient-speech/lite-whisper-large-v3-turbo-fast](https://huggingface.co/efficient-speech/lite-whisper-large-v3-turbo-fast) |
| Microsoft Phi-4 | [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) |
| Moonshine | [usefulsensors/moonshine-base](https://huggingface.co/usefulsensors/moonshine-base), [usefulsensors/moonshine-tiny](https://huggingface.co/usefulsensors/moonshine-tiny) |
| NVIDIA NeMo ASR | [nvidia/canary-1b-v2](https://huggingface.co/nvidia/canary-1b-v2), [nvidia/canary-qwen-2.5b](https://huggingface.co/nvidia/canary-qwen-2.5b), [nvidia/parakeet-tdt-0.6b-v3](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3) |
| OpenAI Whisper | [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3), [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo), [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2), [openai/whisper-large](https://huggingface.co/openai/whisper-large), [openai/whisper-medium.en](https://huggingface.co/openai/whisper-medium.en), [openai/whisper-small.en](https://huggingface.co/openai/whisper-small.en), [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en), [openai/whisper-tiny.en](https://huggingface.co/openai/whisper-tiny.en) |
| Qwen2 Audio | [Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B), [Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct) |
**Whisper Post-Processing:** Whisper model results include a duration-based truncation step to mitigate hallucination and known over-generation behavior.
---
## Acknowledgements
We gratefully acknowledge the dataset creators and leaderboard teams whose contributions made PazaBench possible:
**Datasets:** We extend our gratitude to the creators, community contributors, and maintainers of [African Next Voices Kenya](https://huggingface.co/datasets/MCAA1-MSU/anv_data_ke), [African Next Voices South Africa](https://huggingface.co/datasets/dsfsi-anv/za-african-next-voices), [ALFFA](https://openslr.org/25/), [DigiGreen Kikuyu ASR](https://huggingface.co/datasets/DigiGreen/KikuyuASR_trainingdataset), [Google FLEURS](https://huggingface.co/datasets/google/fleurs), [Mozilla Common Voice](https://commonvoice.mozilla.org/) and [Naija Voices](https://huggingface.co/datasets/naijavoices/naijavoices-dataset) whose efforts have been invaluable in advancing African languages speech data.
**Reference Implementation:** We recognize the foundational work of the [Open ASR Leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard) by Hugging Face Audio team and appreciate the contributors of the [open_asr_leaderboard repository](https://github.com/huggingface/open_asr_leaderboard) for creating reproducible evaluation scripts.
"""
# Dataset group metadata with descriptions and language counts.
# Used for adding descriptors to the dataset filter dropdown.
# FIX: Mozilla Common Voice listed 23 languages but carried
# "language_count": 20 — corrected to match len(languages).
DATASET_GROUP_METADATA = {
    "ALFFA": {
        "description": "Read speech & broadcast news",
        "languages": ["Amharic", "Swahili", "Wolof"],
        "language_count": 3,
    },
    "African Next Voices Kenya": {
        "description": "Conversational speech",
        "languages": ["Dholuo", "Kalenjin", "Kikuyu", "Maasai", "Somali"],
        "language_count": 5,
    },
    "African Next Voices South Africa": {
        "description": "Conversational speech",
        "languages": ["Sesotho", "Setswana", "Xhosa", "Xitsonga", "Tshivenda", "Zulu"],
        "language_count": 6,
    },
    "Google FLEURS": {
        "description": "Read speech",
        "languages": ["Afrikaans", "Amharic", "Fula", "Ganda", "Hausa", "Igbo", "Kamba", "Lingala", "Dholuo", "Northern Sotho", "Nyanja", "Oromo", "Shona", "Somali", "Swahili", "Umbundu", "Wolof", "Xhosa", "Yoruba", "Zulu"],
        "language_count": 20,
    },
    "DigiGreen Kikuyu ASR": {
        "description": "Agricultural speech",
        "languages": ["Kikuyu"],
        "language_count": 1,
    },
    "Mozilla Common Voice 23.0": {
        "description": "Crowdsourced speech",
        "languages": ["Afrikaans", "Amharic", "Arabic", "Basaa", "Dholuo", "Dioula", "Ekoti", "Hausa", "Igbo", "Kabyle", "Kalenjin", "Kinyarwanda", "Kidaw'ida", "Luganda", "Nyungwe", "Setswana", "Swahili", "Tamazight", "Tigre", "Tigrinya", "Twi", "Yoruba", "Zulu"],
        "language_count": 23,  # was 20; the list above has 23 entries
    },
    "Naija Voices": {
        "description": "Conversational speech",
        "languages": ["Hausa", "Igbo", "Yoruba"],
        "language_count": 3,
    },
}


def get_dataset_group_label(dataset_group: str) -> str:
    """Return a dropdown label of the form "Name (description, N languages)".

    Falls back to the bare group name when the group is unknown.
    FIX: pluralizes correctly ("1 language", not "1 languages").
    """
    meta = DATASET_GROUP_METADATA.get(dataset_group)
    if meta is None:
        return dataset_group
    count = meta["language_count"]
    noun = "language" if count == 1 else "languages"
    return f"{dataset_group} ({meta['description']}, {count} {noun})"


def get_dataset_group_languages(dataset_group: str) -> list[str]:
    """Return the list of languages for a dataset group ([] if unknown).

    Returns a copy so callers cannot mutate the shared metadata table.
    """
    meta = DATASET_GROUP_METADATA.get(dataset_group)
    return list(meta["languages"]) if meta else []
# Markdown instructions shown with the "request a language evaluation" form:
# dataset accessibility, transcription, and audio-format requirements.
EVALUATION_LANGUAGE_TEXT = """
### Request Language Evaluation
Submit a language dataset from any region for evaluation on PazaBench. We'll benchmark it using all supported ASR models. Provide the dataset source in the form below.
**Requirements:**
- Dataset must be publicly accessible on Hugging Face Hub or via a public URL
- Must contain audio samples with text transcriptions
- Audio should be 16kHz mono WAV format (will be resampled if needed)
"""
# Markdown instructions shown with the "submit a model" form: model must be
# public on the Hub, support ASR, and be loadable (transformers AutoModel
# or documented loading instructions).
EVALUATION_MODEL_TEXT = """
### Submit a Model for Evaluation
Add a new ASR model to PazaBench. We'll evaluate it across all 39 African languages.
**Requirements:**
- Model must be **publicly available** on [Hugging Face Hub](https://huggingface.co/models)
- Must support speech-to-text / ASR tasks
- Should be compatible with `transformers` AutoModel or provide clear loading instructions
"""
# Label for the citation copy widget in the UI.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entry offered for citation; raw string so the backslash in \url is
# kept literally rather than treated as an escape sequence.
CITATION_BUTTON_TEXT = r"""
@misc{pazabench2026,
title={PazaBench: A Benchmark for Automatic Speech Recognition on Low Resource Languages},
author={Microsoft Research Africa, Nairobi},
year={2026},
howpublished={\url{https://www.microsoft.com/en-us/research/project/project-gecko/}},
note={Alpha version. Part of Project Gecko - Equitable Generative AI for the Global Majority}
}
"""