File size: 31,429 Bytes
dc9d270 bb039ee 83b9fa5 1a8b506 83b9fa5 4317f0a 135a93d 4317f0a bb039ee 03449f2 2612b68 784327a 20a4ed4 f6cf615 9788c5f 03449f2 784327a 4317f0a 2612b68 bb039ee 4317f0a bb039ee 4317f0a 135a93d 4317f0a bb039ee cab1797 bb039ee 135a93d bb039ee 468ff44 a1923a3 468ff44 ba4ffeb 91f9ff3 ba4ffeb 2dfa1bc 91f9ff3 ba4ffeb 2dfa1bc 91f9ff3 ba4ffeb 83b9fa5 ba4ffeb 2dfa1bc ba4ffeb 2dfa1bc ba4ffeb 2dfa1bc ba4ffeb 20a4ed4 2612b68 20a4ed4 b643ef9 f577fb0 20a4ed4 eb23303 20a4ed4 18cbfb7 20a4ed4 770fb91 9788c5f 1a8b506 9788c5f 1a8b506 9788c5f 2bd506a 9788c5f 71506d5 9788c5f 6c47cc7 71506d5 2bd506a 71506d5 6c47cc7 2bd506a 71506d5 2bd506a 6c47cc7 6126e1c 2bd506a 6c47cc7 2bd506a 6126e1c 2bd506a 6c47cc7 6126e1c 2bd506a 6c47cc7 6126e1c 6c47cc7 6126e1c 6c47cc7 6126e1c 03449f2 bd0de02 03449f2 bd0de02 03449f2 bd0de02 b8465ed bd0de02 03449f2 bd0de02 03449f2 bd0de02 b8465ed bd0de02 03449f2 bd0de02 03449f2 bd0de02 03449f2 bd0de02 03449f2 bd0de02 03449f2 bd0de02 03449f2 c218ecd 03449f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 |
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
from pathlib import Path
import os
import pyperclip
# Page configuration must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Automatic Speech Recognition for African Languages",
    layout="wide",
)

st.title("ASR for African Languages Model Hub")

# Top-level navigation: one tab per section of the hub.
_TAB_LABELS = [
    "About",
    "Benchmark Dataset",
    "Model Collections",
    "Evaluation Scenarios",
    "ASR models demo",
    "Quantitative Results",
    "Human Evaluation of ASR Models",
]
tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs(_TAB_LABELS)
# --- Tab 5: live ASR demo ---
with tab5:
    st.header("Demo")

    # Option 1: embed the hosted Hugging Face Space in an iframe.
    demo_url = "https://asr-africa-asr-african-languages.hf.space/"
    components.iframe(demo_url, height=800, scrolling=True)

    # Option 2 (kept for reference): link out instead of embedding.
    # st.markdown(
    #     "[Open full demo in a new tab](https://asr-africa-asr-african-languages.hf.space/)"
    # )
# --- Tab 2: Benchmark Dataset ---
with tab2:
    # Render the benchmark description from a Markdown file when present;
    # otherwise prompt the maintainer to supply one.
    md_file = Path("src/benchmark.md")
    if not md_file.exists():
        st.info("Upload .md file")
    else:
        st.markdown(md_file.read_text())
# --- Tab 3: Model Collections ---
with tab3:
    st.header("Model Collections")
    st.write("Explore available ASR model collections, grouped by language:")

    # Hugging Face collection URL for each supported language.
    languages = {
        "Ewe": "https://huggingface.co/collections/asr-africa/ewe-68d3d85e015eea82e1355e95",
        "Swahili": "https://huggingface.co/collections/asr-africa/swahili-new-676666b26fd924e18fa8781a",
        "Lingala": "https://huggingface.co/collections/asr-africa/lingala-new-676666a913beb149ccc22243",
        "Luganda": "https://huggingface.co/collections/asr-africa/luganda-new-67666690a7812f6a52248d66",
        "Wolof": "https://huggingface.co/collections/asr-africa/wolof-66fbeddd8f3b78428e0bdd57",
        "Hausa": "https://huggingface.co/collections/asr-africa/hausa-66e14b187658eb2032f2d80b",
        "Igbo": "https://huggingface.co/collections/asr-africa/igbo-66e14e30a533df3d8277334d",
        "Yoruba": "https://huggingface.co/collections/asr-africa/yoruba-66e15043c177114958255eaa",
        "Bambara": "https://huggingface.co/collections/asr-africa/bambara-66e152a56048d62cd8e6750b",
        "Zulu": "https://huggingface.co/collections/asr-africa/zulu-66e1d8c419ce4dfba1d500b1",
        "Xhosa": "https://huggingface.co/collections/asr-africa/xhosa-66e1da92a4fcbc413b4699eb",
        "Afrikaans": "https://huggingface.co/collections/asr-africa/afrikaans-66e1dc2e07da322da51ca415",
        "Bemba": "https://huggingface.co/collections/asr-africa/bemba-66e1dd3adce93c72498d12c3",
        "Shona": "https://huggingface.co/collections/asr-africa/shona-66e1de0a076e2b2237b7c5a8",
        "Kinyarwanda": "https://huggingface.co/collections/asr-africa/kinyarwanda-66e2e97e15879154e1f47fb7",
        "Fula": "https://huggingface.co/collections/asr-africa/fula-66e97b9370af82f2d163e80d",
        "Akan": "https://huggingface.co/collections/asr-africa/akan-66e97d0da2f86f17cad499f0"
    }

    # Pretrained checkpoints the fine-tuned models are derived from.
    base_models = {
        "Wav2Vec2 XLS-R (300M)": "https://huggingface.co/facebook/wav2vec2-xls-r-300m",
        "Whisper-Small": "https://huggingface.co/openai/whisper-small",
        "MMS-1B": "https://huggingface.co/facebook/mms-1b-all",
        "W2V2-BERT 2.0": "https://huggingface.co/facebook/w2v-bert-2.0"
    }

    st.subheader("Base Architectures")
    for model_name, model_url in base_models.items():
        st.markdown(f"- [{model_name}]({model_url})")

    st.subheader("Language-Specific Collections")
    # One expander per language, in alphabetical order.
    for lang, link in sorted(languages.items()):
        with st.expander(f"{lang} Models"):
            st.markdown(f"[View full {lang} collection on Hugging Face]({link})")
            st.write(
                "Models fine-tuned from Wav2Vec2 XLS-R, Whisper, MMS-1B, and W2V2-BERT "
                "to support high-quality speech recognition in this language."
            )
# --- Tab 4: Evaluation Scenarios ---
with tab4:
    st.header("Evaluation Scenarios")
    st.write(
        "To benchmark ASR models for African languages, we design evaluation scenarios "
        "that mimic real-world challenges such as limited training data, domain shift, "
        "and variation in speech style."
    )

    # Summary table of the three benchmark scenarios.
    st.subheader("Scenario Overview")
    scenarios = pd.DataFrame([
        {
            "Scenario": "Data Efficiency Benchmark",
            "Focus": "Low-resource training (1 hour per language)",
            "Languages": "Multiple African languages",
            "Dataset": "asr-africa/ASRAfricaDataEfficiencyBenchmark"
        },
        {
            "Scenario": "Domain Adaptation Benchmark",
            "Focus": "Performance shift across domains",
            "Languages": "Akan (Finance), Wolof (Agriculture)",
            "Dataset": "asr-africa/African-ASR-Domain-Adaptation-Evaluation"
        },
        {
            "Scenario": "Speech Type Adaptation",
            "Focus": "Different speech types (read, conversational, etc.)",
            "Languages": "Luganda, Wolof",
            "Dataset": "asr-africa/African-ASR-Speech-Type-Adaptation"
        }
    ])
    st.dataframe(scenarios, width='stretch')

    st.subheader("Explore Scenarios")
    # NOTE(review): stray 'π'/'β' characters in the markdown below were mojibake
    # from an encoding round-trip; restored to 🔗 and → — confirm intended glyphs.
    with st.expander("Data Efficiency Benchmark"):
        st.markdown("""
- **Goal:** Evaluate ASR performance in low-resource conditions.
- **Design:** 1 hour of transcribed audio per language.
- **Includes:** audio + metadata.
- **Use case:** Encourage data-efficient ASR systems.

🔗 [View dataset](https://huggingface.co/datasets/asr-africa/ASRAfricaDataEfficiencyBenchmark)
""")
    with st.expander("Domain Adaptation Benchmark"):
        st.markdown("""
- **Goal:** Test ASR generalization across domains.
- **Languages:**
  - Akan → Financial domain testing.
  - Wolof → Agricultural domain testing.
- **Challenge:** Many ASR systems degrade when tested on new domains.

🔗 [View dataset](https://huggingface.co/datasets/asr-africa/African-ASR-Domain-Adaptation-Evaluation)
""")
    with st.expander("Speech Type Adaptation"):
        st.markdown("""
- **Goal:** Measure ASR performance on different types of speech.
- **Types of Speech:** Read speech, conversational, spontaneous speech.

🔗 [View dataset](https://huggingface.co/datasets/asr-africa/African-ASR-Speech-Type-Adaptation)
""")
# --- Tab 1: About / project overview ---
with tab1:
    st.header("About")
    st.write(
        "Automatic Speech Recognition for African Languages: How much speech data is required for a good domain-specific Automatic Speech Recognition model?"
    )

    # Project background. NOTE(review): stray 'β' characters in the original
    # text were mojibake standing in for quotes/apostrophes; restored below.
    st.markdown("""
Previous studies have led to the collection of a considerable number of hours of open-source ASR data,
for example, the work done in India where over 1000s of hours of data were collected for low-resource
Indian languages. In this research, we would like to transfer the learnings from these successes and
replicate the same model for low-resource African languages. For example, the aspects around the use of
speech data from noisy and non-noisy environments. However, to ensure that we proceed in a cost-efficient
and sustainable approach, we deem it necessary to understand the amount of data that we need to collect
for African languages. Hence, we propose to leverage the Mozilla Common Voice (MCV) platform and other
appropriate and openly available / open-source repositories of African language datasets
to build automatic speech recognition models and test their performance to learn if the data collected was sufficient.

### What is a "Good ASR Model"?
We will need to define what a "good" ASR model is for African languages. The aspects to consider will include
the performance measures of ASR models, the performance indicators and the vocabulary size for domain-specific ASR models.
- **Performance metrics**: Word Error Rate (WER) and Character Error Rate (CER).
- **Target benchmarks**: < 10% WER and < 5% CER in lab settings.
- **Lab vs Non-Lab**: Lab data = controlled clean audio; Non-lab = noisy, diverse, real-world audio.

### Performance Indicators
WER is not always the best indicator, especially for languages with diacritics. We need to test models on
different **domains, distributions, and languages** to avoid over/underestimation.
Usefulness will be measured on **generalization** and **accuracy**.

### Sectors of Interest
We will also further investigate the performance of domain-specific ASR models. We would like to investigate three specific domains: Health, Agriculture, and Education, which are key focus areas for the Foundation and for which we can have usable end-user applications in the African context. The idea is to obtain open-source speech datasets for these specific domains and evaluate ASR model performance across these domains.

### Benchmark Dataset
We will build a test set that can be used for benchmarking ASR models in some of the 30 most spoken African languages. The benchmark dataset will be structured to consist of unique MP3 files and corresponding text files. We will ensure as much as possible that the benchmark datasets are as diverse as possible with dataset characteristics like gender, age, accent, variant, vocabulary, acoustic characteristics to help improve the accuracy of speech recognition models. The speech benchmark dataset will be reviewed, deemed highly quality, and split into dev, test and train sets. Due to the largely acoustic nature of African languages (mostly tonal, diacritical, etc.), a careful speech analysis of African languages is necessary and the benchmark dataset is important to spur more research in the African context.
""")

    # Citation widget, disabled for now (needs pyperclip, which is unreliable
    # on headless deployments). Kept for reference.
    # CITATION_TEXT = """@misc{asr-africa-2025,
    #     title = {Automatic Speech Recognition for African Languages},
    #     author = {Dr Joyce Nakatumba-Nabende, Dr Peter Nabende, Dr Andrew Katumba, Alvin Nahabwe},
    #     year = 2025,
    #     publisher = {Hugging Face},
    #     howpublished = "\\url{https://huggingface.co/spaces/asr-africa/Automatic_Speech_Recognition_for_African_Languages}"
    # }"""
    # with st.expander("📖 Citation", expanded=False):
    #     st.text_area(
    #         "BibTeX snippet to cite this source",
    #         value=CITATION_TEXT,
    #         height=150,
    #         disabled=True
    #     )
    #     if st.button("📋 Copy to Clipboard"):
    #         try:
    #             pyperclip.copy(CITATION_TEXT)
    #             st.success("Citation copied to clipboard!")
    #         except pyperclip.PyperclipException:
    #             st.error("Could not copy automatically. Please copy manually.")

    # --- Platform preview for About tab ---
    st.markdown("""
## Platform overview
A preview of what the platform contains and how to navigate. Use the links and tabs in the top navigation to jump to demos, datasets, results, or evaluation details.

1. **Benchmark Datasets:**
A multilingual collection covering over **17 African languages**, built from open corpora (e.g., Common Voice, Fleurs, NCHLT, ALFFA, Naija Voices).
Each dataset is cleaned, validated, and partitioned into training, development, and test splits to ensure fair benchmarking.
2. **Model Collections:**
Fine-tuned ASR models derived from **Wav2Vec2 XLS-R**, **Whisper**, **MMS**, and **W2V-BERT**, adapted for African phonetic, tonal, and orthographic features.
These are hosted as public collections on [Hugging Face](https://huggingface.co/asr-africa).
3. **Evaluation Scenarios:**
Designed to test **data efficiency**, **domain adaptation**, and **speech-type robustness** — e.g., how models generalize from read speech to spontaneous dialogue,
or from education to agricultural domains.
4. **ASR Demo Interface:**
A **Gradio-powered live testing tool**, allowing users to upload or record audio, view transcriptions, and submit structured feedback via the integrated backend API.
5. **Quantitative Results:**
Comprehensive analysis of model performance across training hours and data scales (1–400 hours), visualized through **Word Error Rate (WER)** and **Character Error Rate (CER)** trends.
Findings show clear **data scaling laws**, with XLS-R and W2V-BERT models performing best under low-resource conditions.
6. **Human Evaluation Framework:**
A structured qualitative evaluation conducted with **20 native-language evaluators** across 12 languages.
Evaluators assessed **accuracy**, **meaning preservation**, **orthography**, and **error types** (e.g., named entities, punctuation, diacritics).
This data is publicly available in the curated [ASR_Evaluation_dataset](https://huggingface.co/datasets/asr-africa/ASR_Evaluation_dataset).
""")
# --- Tab 6: Quantitative results (WER vs training-data size) ---
with tab6:
    st.header("Quantitative Results: WER vs Dataset Size")

    # --- Introduction ---
    st.subheader("Introduction")
    st.write("""
Automatic Speech Recognition (ASR) for African languages remains challenging due to the scarcity of labeled data and limited methodological guidance for low-resource settings. While interest in multilingual and low-resource ASR is growing, there is still limited understanding of how different pretrained models perform across diverse African languages, data sizes, and decoding strategies.

In this study, we benchmark four state-of-the-art ASR models, Wav2Vec2 XLS-R, Whisper, MMS, and W2V-BERT, across 17 African languages representing East, West, and Southern Africa. These include Luganda, Swahili, Kinyarwanda, Wolof, Akan, Ewe, Xhosa, Lingala, Amharic, Bambara, Bemba, Zulu, Igbo, Shona, Afrikaans, Hausa, and Fula. Our findings contribute empirical insights into model robustness and data efficiency in low-resource scenarios.
""")

    # --- Datasets ---
    # Fixed: "perlanguage" typo and missing sentence-final period.
    st.subheader("Datasets")
    st.write("""
We trained each ASR model on 1, 5, 10, 20, 50, 100, 200 and 400-hour splits, based on labelled data available per language. For Wav2Vec2-XLS-R and W2V-BERT, we also trained 5-gram language models using available textual data to assess the impact of language model integration.
""")

    # --- Results ---
    st.subheader("Results")
    st.write("""
Overall, the Word Error Rate (WER) decreases as the number of training hours increases across all models and
languages. This highlights the importance of dataset size in improving ASR performance, although the rate of
improvement varies significantly between models.
""")

    # Per-model result sections: short narrative followed by a WER-vs-hours plot.
    st.subheader("XLS-R")
    st.write("""
XLS-R shows a steep decline in log WER as the dataset size increases, especially in low-to-moderate data regimes.
The improvement slows as the dataset becomes larger, suggesting diminishing returns in high-data settings.
""")
    st.image("src/Images/13e2495b-81ba-44ad-8646-4576a555ce7a.jpg", caption="Log WER vs Training Hours for XLS-R")

    st.subheader("XLS-R + LM")
    st.write("""
Incorporating a 5-gram language model (LM) further improves performance in low-data settings (<50 hours).
However, the effect of the LM diminishes as more supervised training data becomes available.
""")
    st.image("src/Images/bdfd0b3c-e789-4b5d-811f-866bcb84a230.jpg", caption="Log WER vs Training Hours for XLS-R + LM")

    st.subheader("W2v-BERT")
    st.write("""
W2v-BERT exhibits a more gradual decline in log WER. It performs well in low-data settings, showing stable reduction
in WER as dataset size increases. This makes it suitable for low-resource languages.
""")
    st.image("src/Images/bertlog.png", caption="Log WER vs Training Hours for W2v-BERT")

    st.subheader("W2v-BERT + LM")
    st.write("""
Adding an LM enhances W2v-BERT's performance in extremely low-data scenarios, though the relative
benefits diminish as dataset size increases.
""")
    st.image("src/Images/9fe3960d-937d-4dca-9051-84a2b216abb9.jpg", caption="Log WER vs Training Hours for W2v-BERT+LM")

    st.subheader("Whisper")
    st.write("""
Whisper shows a consistent but moderate decline in log WER. Improvements are more linear compared to XLS-R, benefiting
steadily from additional data, but it does not reach XLS-R's high-data performance.
""")
    st.image("src/Images/4560beb1-33f4-416e-8bf1-fe9e7778835b.jpg", caption="Log WER vs Training Hours for Whisper")

    st.subheader("MMS")
    st.write("""
MMS shows significant improvement between 1–5 hours of training across multiple languages. However, the rate of
improvement declines as more data is added. MMS performs strongly in both low- and high-data settings.
""")
    st.image("src/Images/5148df6b-d65e-434b-b2b3-a09a18df0368.jpg", caption="Log WER vs Training Hours for MMS")

    # Takeaways
    st.write("""
### Key Takeaways
- **More data generally leads to better performance**, though gains diminish beyond ~100 hours for many languages.
- **Language models (LMs) provide the greatest benefit in low-data regimes (<50 hours)** by supplying additional contextual information.
- As supervised training data increases, **the added value of LMs decreases**, though their effectiveness varies somewhat across languages.
- **Model choice matters**: XLS-R benefits most from scaling data, while W2v-BERT shines in extremely low-resource scenarios.
""")
# --- Tab 7: Human evaluation of ASR models ---
with tab7:
    st.header("Human Evaluation of ASR Models")

    # --- Introduction ---
    st.subheader("Introduction")
    st.write("""
ASR systems are typically assessed using automatic metrics such as Word Error Rate (WER) or Character Error Rate (CER).
While these provide valuable quantitative insights, they do not fully capture how well transcriptions preserve meaning, respect language orthography, or handle specific features such as tone, diacritics, or named entities.
To address these gaps, we conducted a human evaluation of ASR systems across African languages to obtain the qualitative performance of the best performing models.
""")

    # --- Guidelines --- ('1β5' mojibake restored to the intended en dash '1–5')
    st.subheader("Evaluation Guidelines")
    st.write("""
Evaluators were provided with structured instructions to ensure consistency in their assessments. The main criteria included:
- **Accuracy (1–5 scale):** How correctly the model transcribed the audio.
- **Meaning Preservation (1–5 scale):** Whether the transcription retained the original meaning.
- **Orthography:** Whether the transcription followed standard writing conventions, including accents, diacritics, and special characters.
- **Recording Environment:** Evaluators noted the type of environment (quiet, professional studio, or noisy background) since background noise impacts ASR performance.
- **Device Used:** Information on whether the recording was made with a mobile phone, laptop microphone, or dedicated mic, as device quality affects clarity.
- **Domain/Topic of Speech:** Evaluators indicated if the speech belonged to a specific topic such as education, health, law, or everyday conversation, to assess domain adaptability.
- **Error Types:** Evaluators identified common error categories, such as:
  - Substitutions (wrong words used)
  - Omissions (missing words)
  - Insertions (extra words added)
  - Pronunciation-related errors
  - Diacritic/Tone/Special character errors
  - Named Entity errors (people, places, currencies)
  - Punctuation errors
- **Performance Description:** Free text where evaluators described strengths and weaknesses of the models in their own words.
""")

    # --- Setup ---
    # Count fixed: the original said "12 languages" but listed 13 names.
    st.subheader("Evaluation Setup")
    st.write("""
- **Languages Evaluated:** 13 languages: Afrikaans, Amharic, Bemba, Hausa, Igbo, Kinyarwanda, Lingala, Luganda, Oromo, Swahili, Wolof, Xhosa, and Yoruba.
- **Participants:** 20 evaluators (native speakers or fluent linguists), aged 18–50, majority with postgraduate education.
- **Platform:** A Gradio-based interface allowed evaluators to upload/record audio, view transcriptions, and complete the feedback form directly online.
""")
st.subheader("Evaluator Contributions")
data = [
{"Evaluator ID": "eval_001", "Contributions": 65, "Languages": "Afrikaans"},
{"Evaluator ID": "eval_002", "Contributions": 50, "Languages": "Afrikaans"},
{"Evaluator ID": "eval_005", "Contributions": 63, "Languages": "Amharic"},
{"Evaluator ID": "eval_006", "Contributions": 69, "Languages": "Amharic"},
{"Evaluator ID": "eval_007", "Contributions": 50, "Languages": "Bemba"},
{"Evaluator ID": "eval_008", "Contributions": 53, "Languages": "Bemba"},
{"Evaluator ID": "eval_009", "Contributions": 60, "Languages": "Hausa"},
{"Evaluator ID": "eval_010", "Contributions": 53, "Languages": "Igbo"},
{"Evaluator ID": "eval_011", "Contributions": 12, "Languages": "Lingala"},
{"Evaluator ID": "eval_012", "Contributions": 115, "Languages": "Oromo"},
{"Evaluator ID": "eval_014", "Contributions": 52, "Languages": "Wolof"},
{"Evaluator ID": "eval_015", "Contributions": 8, "Languages": "Xhosa"},
{"Evaluator ID": "eval_017", "Contributions": 59, "Languages": "Yoruba"},
{"Evaluator ID": "eval_018", "Contributions": 58, "Languages": "Yoruba"},
{"Evaluator ID": "eval_019", "Contributions": 52, "Languages": "Luganda"},
{"Evaluator ID": "eval_020", "Contributions": 55, "Languages": "Luganda"},
{"Evaluator ID": "eval_021", "Contributions": 66, "Languages": "Swahili"},
{"Evaluator ID": "eval_022", "Contributions": 64, "Languages": "Swahili"},
{"Evaluator ID": "eval_023", "Contributions": 50, "Languages": "Kinyarwanda"},
{"Evaluator ID": "eval_024", "Contributions": 53, "Languages": "Kinyarwanda"},
]
df_evaluators = pd.DataFrame(data)
# Show contributions as charts (by evaluator and aggregated by language)
st.markdown("**Contributions by evaluator**")
df_evals_sorted = df_evaluators.sort_values('Contributions', ascending=False)
chart_evaluator = alt.Chart(df_evals_sorted).mark_bar().encode(
x=alt.X('Evaluator ID:N', sort=df_evals_sorted['Evaluator ID'].tolist(), title='Evaluator ID'),
y=alt.Y('Contributions:Q', title='Contributions'),
color=alt.Color('Languages:N', legend=alt.Legend(title='Language')),
tooltip=['Evaluator ID', 'Contributions', 'Languages']
).properties(height=300)
st.altair_chart(chart_evaluator, use_container_width=True)
st.markdown("**Total contributions by language**")
df_lang = df_evaluators.groupby('Languages', as_index=False).sum()
chart_lang = alt.Chart(df_lang).mark_bar().encode(
x=alt.X('Languages:N', sort='-y', title='Language'),
y=alt.Y('Contributions:Q', title='Total Contributions'),
tooltip=['Languages', 'Contributions']
).properties(height=300)
st.altair_chart(chart_lang, use_container_width=True)
# Optional: also show totals and raw table inside an expander
with st.expander("View raw evaluator table"):
st.dataframe(df_evaluators, width="stretch")
st.write("### Summary")
st.write(f"- **Total Evaluators:** {df_evaluators['Evaluator ID'].nunique()}")
st.write(f"- **Total Contributions:** {df_evaluators['Contributions'].sum()}")
# --- Findings ---
st.subheader("Findings")
st.write("""
ASR performance varied significantly across languages, reflecting differences in data availability,
orthography complexity, and domain coverage. Below we summarize the average **Accuracy** and
**Meaning Preservation** scores (1β5 scale) by language.
""")
# Data table of results
results_data = [
{"Language": "Swahili", "Audios Evaluated": 132, "Accuracy": 4.96, "Meaning": 4.97},
{"Language": "Luganda", "Audios Evaluated": 110, "Accuracy": 4.70, "Meaning": 4.78},
{"Language": "Amharic", "Audios Evaluated": 132, "Accuracy": 4.65, "Meaning": 4.82},
{"Language": "Lingala", "Audios Evaluated": 30, "Accuracy": 4.63, "Meaning": 4.70},
{"Language": "Hausa", "Audios Evaluated": 60, "Accuracy": 4.58, "Meaning": 4.97},
{"Language": "Oromo", "Audios Evaluated": 115, "Accuracy": 4.54, "Meaning": 4.52},
{"Language": "Bemba", "Audios Evaluated": 116, "Accuracy": 4.39, "Meaning": 4.86},
{"Language": "Yoruba", "Audios Evaluated": 122, "Accuracy": 4.22, "Meaning": 4.48},
{"Language": "Wolof", "Audios Evaluated": 53, "Accuracy": 3.98, "Meaning": 4.13},
{"Language": "Kinyarwanda", "Audios Evaluated": 103, "Accuracy": 3.75, "Meaning": 4.81},
{"Language": "Xhosa", "Audios Evaluated": 8, "Accuracy": 3.62, "Meaning": 3.38},
{"Language": "Afrikaans", "Audios Evaluated": 116, "Accuracy": 3.59, "Meaning": 4.10},
{"Language": "Igbo", "Audios Evaluated": 55, "Accuracy": 2.25, "Meaning": 2.15},
]
df_results = pd.DataFrame(results_data)
# Visualize results with charts
# Grouped bars for Accuracy and Meaning per language
df_long = df_results.melt(id_vars=['Language', 'Audios Evaluated'], value_vars=['Accuracy', 'Meaning'],
var_name='Metric', value_name='Score')
# Use xOffset for grouped bars when available
try:
chart_metrics = alt.Chart(df_long).mark_bar().encode(
x=alt.X('Language:N', sort=df_results['Language'].tolist(), title='Language'),
y=alt.Y('Score:Q', title='Score (1-5)'),
color=alt.Color('Metric:N', legend=alt.Legend(title='Metric')),
tooltip=['Language', 'Metric', 'Score'],
xOffset='Metric:N'
).properties(height=360)
except Exception:
# Fallback when xOffset is not supported: side-by-side by using column
chart_metrics = alt.Chart(df_long).mark_bar().encode(
x=alt.X('Language:N', sort=df_results['Language'].tolist(), title='Language'),
y=alt.Y('Score:Q', title='Score (1-5)'),
color=alt.Color('Metric:N', legend=alt.Legend(title='Metric')),
tooltip=['Language', 'Metric', 'Score']
).properties(height=360)
st.altair_chart(chart_metrics, use_container_width=True)
# Scatter: Accuracy vs Audios Evaluated (size / color by language)
chart_scatter = alt.Chart(df_results).mark_circle(size=120).encode(
x=alt.X('Audios Evaluated:Q', title='Audios Evaluated'),
y=alt.Y('Accuracy:Q', title='Accuracy'),
color=alt.Color('Language:N', legend=None),
tooltip=['Language', 'Audios Evaluated', 'Accuracy', 'Meaning']
).properties(height=360)
st.markdown("**Accuracy vs. Dataset Size**")
st.altair_chart(chart_scatter, use_container_width=True)
# Audios evaluated per language (bar)
chart_audios = alt.Chart(df_results).mark_bar().encode(
x=alt.X('Language:N', sort=df_results['Language'].tolist(), title='Language'),
y=alt.Y('Audios Evaluated:Q', title='Audios Evaluated'),
tooltip=['Language', 'Audios Evaluated']
).properties(height=320)
st.markdown("**Number of audios evaluated by language**")
st.altair_chart(chart_audios, use_container_width=True)
# Optional: raw table in expander
with st.expander('View raw findings table'):
st.dataframe(df_results, width='stretch')
# Narrative summary
st.markdown("""
### Key Takeaways
- **High-Performing Languages:**
- Swahili (Accuracy 4.96, Meaning 4.97)
- Luganda (Accuracy 4.70, Meaning 4.78)
- Amharic (Accuracy 4.65, Meaning 4.82)
These models produced highly accurate transcriptions with minimal meaning loss.
- **Moderate Performance:**
Hausa, Oromo, Bemba, Yoruba, Wolof, and Kinyarwanda β generally understandable, but often with orthography and punctuation issues.
- **Low-Performing Languages from evaluation:**
- Igbo (Accuracy 2.25, Meaning 2.15)
- Afrikaans (Accuracy 3.59, Meaning 4.10)
- Xhosa (Accuracy 3.62, Meaning 3.38)
""")
# --- Error Patterns ---
st.subheader("Common Error Patterns")
st.write("""
Evaluators highlighted several recurring challenges and areas for improvement across
different languages. These reflect both linguistic complexities and system limitations.
""")
error_data = [
{"Issue": "Punctuation and Formatting",
"Comments": "Absence of punctuation, lack of capitalisation"},
{"Issue": "Spelling and Grammar",
"Comments": "Word merging, frequent spelling mistakes in individual words"},
{"Issue": "Named Entity Recognition",
"Comments": "Inaccurate handling of numbers, currencies, and names"},
{"Issue": "Device Compatibility & Performance",
"Comments": "Better performance on laptops than on mobile phones"},
]
df_errors = pd.DataFrame(error_data)
st.dataframe(df_errors, width="stretch")
st.markdown("""
### Summary
1. **Punctuation and formatting inconsistencies** make transcriptions harder to read.
2. **Word merging and spelling errors** were frequent, particularly in morphologically rich languages.
3. **Named entity recognition** (e.g., names, currencies, numbers) was a common source of error.
4. **Platform performance** was reported as better on laptops than mobile devices.
""")
# --- Human Evaluation Dataset ---
st.subheader("Human Evaluation Dataset")
st.write("""
To support reproducibility and enable further community research, all human evaluation
submissions have been curated into a Hugging Face dataset.
This dataset contains transcriptions, evaluator edits, metadata about recording
environments, devices, domains, and error annotations.
Key fields include:
- **transcription**: Model-generated output.
- **transcript_edit**: Human-corrected transcription.
- **evaluated_language**: Language of the audio.
- **environment**: Recording environment (quiet, noisy, studio).
- **device**: Type of recording device used.
- **accuracy & meaning**: Evaluator ratings on 1β5 scale.
- **errors**: Categories of common transcription errors.
- **performance**: Free-text qualitative assessment.
- **audio**: Audio file with aligned metadata.
π [Explore the dataset on Hugging Face](https://huggingface.co/datasets/asr-africa/ASR_Evaluation_dataset)
""")
# --- Takeaways ---
st.subheader("Takeaways")
st.write("""
- Human ratings generally aligned with automatic metrics: languages with larger datasets (Swahili, Luganda, Amharic) scored highest.
- WER alone misses issues such as meaning drift, orthography violations, and named entity errors.
""")
|