import altair as alt
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
from pathlib import Path
# import pyperclip  # only needed if the commented-out clipboard citation button is re-enabled

st.set_page_config(
    page_title="Automatic Speech Recognition for African Languages",
    layout="wide"
)

st.title("ASR for African Languages Model Hub")

# Create tabs
tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs([
    "About",
    "Benchmark Dataset",
    "Model Collections",
    "Evaluation Scenarios",
    "ASR models demo",
    "Quantitative Results",
    "Human Evaluation of ASR Models"
])

with tab5:
    st.header("Demo")

    # Option 1: Embed Hugging Face Space with iframe
    components.iframe(
        "https://asr-africa-asr-african-languages.hf.space/",
        height=800,
        scrolling=True
    )

    # # Option 2:
    # st.markdown(
    #     "[Open full demo in a new tab](https://asr-africa-asr-african-languages.hf.space/)"
    # )

# --- Tab 2: Benchmark Dataset ---
with tab2:
    # Load Markdown file if provided
    md_file = Path("src/benchmark.md")  
    if md_file.exists():
        st.markdown(md_file.read_text())
    else:
        st.info("Benchmark description file not found (expected at src/benchmark.md).")

with tab3:
    st.header("Model Collections")
    st.write("Explore available ASR model collections, grouped by language:")

    languages = {
        "Ewe": "https://huggingface.co/collections/asr-africa/ewe-68d3d85e015eea82e1355e95",
        "Swahili": "https://huggingface.co/collections/asr-africa/swahili-new-676666b26fd924e18fa8781a",
        "Lingala": "https://huggingface.co/collections/asr-africa/lingala-new-676666a913beb149ccc22243",
        "Luganda": "https://huggingface.co/collections/asr-africa/luganda-new-67666690a7812f6a52248d66",
        "Wolof": "https://huggingface.co/collections/asr-africa/wolof-66fbeddd8f3b78428e0bdd57",
        "Hausa": "https://huggingface.co/collections/asr-africa/hausa-66e14b187658eb2032f2d80b",
        "Igbo": "https://huggingface.co/collections/asr-africa/igbo-66e14e30a533df3d8277334d",
        "Yoruba": "https://huggingface.co/collections/asr-africa/yoruba-66e15043c177114958255eaa",
        "Bambara": "https://huggingface.co/collections/asr-africa/bambara-66e152a56048d62cd8e6750b",
        "Zulu": "https://huggingface.co/collections/asr-africa/zulu-66e1d8c419ce4dfba1d500b1",
        "Xhosa": "https://huggingface.co/collections/asr-africa/xhosa-66e1da92a4fcbc413b4699eb",
        "Afrikaans": "https://huggingface.co/collections/asr-africa/afrikaans-66e1dc2e07da322da51ca415",
        "Bemba": "https://huggingface.co/collections/asr-africa/bemba-66e1dd3adce93c72498d12c3",
        "Shona": "https://huggingface.co/collections/asr-africa/shona-66e1de0a076e2b2237b7c5a8",
        "Kinyarwanda": "https://huggingface.co/collections/asr-africa/kinyarwanda-66e2e97e15879154e1f47fb7",
        "Fula": "https://huggingface.co/collections/asr-africa/fula-66e97b9370af82f2d163e80d",
        "Akan": "https://huggingface.co/collections/asr-africa/akan-66e97d0da2f86f17cad499f0"
    }

    base_models = {
        "Wav2Vec2 XLS-R (300M)": "https://huggingface.co/facebook/wav2vec2-xls-r-300m",
        "Whisper-Small": "https://huggingface.co/openai/whisper-small",
        "MMS-1B": "https://huggingface.co/facebook/mms-1b-all",
        "W2V2-BERT 2.0": "https://huggingface.co/facebook/w2v-bert-2.0"
    }

    st.subheader("Base Architectures")
    for name, link in base_models.items():
        st.markdown(f"- [{name}]({link})")

    st.subheader("Language-Specific Collections")
    
    # sort languages by key (language name)
    for lang in sorted(languages.keys()):
        link = languages[lang]
        with st.expander(f"{lang} Models"):
            st.markdown(f"[View full {lang} collection on Hugging Face]({link})")
            st.write(
                "Models fine-tuned from Wav2Vec2 XLS-R, Whisper, MMS-1B, and W2V2-BERT "
                "to support high-quality speech recognition in this language."
            )
# --- Tab 4: Evaluation Scenarios ---
with tab4:
    st.header("Evaluation Scenarios")
    st.write(
        "To benchmark ASR models for African languages, we design evaluation scenarios "
        "that mimic real-world challenges such as limited training data, domain shift, "
        "and variation in speech style."
    )

    # Summary Table
    st.subheader("Scenario Overview")
    scenarios = pd.DataFrame([
        {
            "Scenario": "Data Efficiency Benchmark",
            "Focus": "Low-resource training (1 hour per language)",
            "Languages": "Multiple African languages",
            "Dataset": "asr-africa/ASRAfricaDataEfficiencyBenchmark"
        },
        {
            "Scenario": "Domain Adaptation Benchmark",
            "Focus": "Performance shift across domains",
            "Languages": "Akan (Finance), Wolof (Agriculture)",
            "Dataset": "asr-africa/African-ASR-Domain-Adaptation-Evaluation"
        },
        {
            "Scenario": "Speech Type Adaptation",
            "Focus": "Different speech types (read, conversational, etc.)",
            "Languages": "Luganda, Wolof",
            "Dataset": "asr-africa/African-ASR-Speech-Type-Adaptation"
        }
    ])

    st.dataframe(scenarios, width='stretch')

    st.subheader("Explore Scenarios")

    with st.expander("Data Efficiency Benchmark"):
        st.markdown("""
        - **Goal:** Evaluate ASR performance in low-resource conditions.  
        - **Design:** 1 hour of transcribed audio per language.  
        - **Includes:** audio + metadata.  
        - **Use case:** Encourage data-efficient ASR systems.  
        🔗 [View dataset](https://huggingface.co/datasets/asr-africa/ASRAfricaDataEfficiencyBenchmark)
        """)

    with st.expander("Domain Adaptation Benchmark"):
        st.markdown("""
        - **Goal:** Test ASR generalization across domains.  
        - **Languages:**  
          - Akan → Financial domain testing.  
          - Wolof → Agricultural domain testing.  
        - **Challenge:** Many ASR systems degrade when tested on new domains.  
        🔗 [View dataset](https://huggingface.co/datasets/asr-africa/African-ASR-Domain-Adaptation-Evaluation)
        """)

    with st.expander("Speech Type Adaptation"):
        st.markdown("""
        - **Goal:** Measure ASR performance on different types of speech.  
        - **Types of Speech:** Read speech, conversational, spontaneous speech.  
        🔗 [View dataset](https://huggingface.co/datasets/asr-africa/African-ASR-Speech-Type-Adaptation)
        """)

with tab1:
    st.header("About")
    st.write(
        "Automatic Speech Recognition for African Languages: How much speech data is required for a good domain-specific Automatic Speech Recognition model?"
    )
    st.markdown("""
        Previous studies have led to the collection of a considerable number of hours of open-source ASR data; 
        for example, work in India has gathered thousands of hours of data for low-resource Indian languages. 
        In this research, we would like to transfer the learnings from these successes and replicate that approach 
        for low-resource African languages, for example in how speech data from noisy and clean environments is used. 
        However, to ensure that we proceed in a cost-efficient and sustainable way, we deem it necessary to understand 
        how much data we need to collect for African languages. Hence, we propose to leverage the Mozilla Common Voice (MCV) 
        platform and other openly available, open-source repositories of African language datasets to build automatic 
        speech recognition models and test their performance to learn whether the data collected was sufficient.
        
        ### What is a "Good ASR Model"?
        We will need to define what a "good" ASR model is for African languages. The aspects to consider include 
        the performance measures of ASR models, the performance indicators, and the vocabulary size for domain-specific ASR models.
        
        - **Performance metrics**: Word Error Rate (WER) and Character Error Rate (CER).  
        - **Target benchmarks**: < 10% WER and < 5% CER in lab settings.  
        - **Lab vs Non-Lab**: Lab data = controlled clean audio; Non-lab = noisy, diverse, real-world audio.  
        
        ### Performance Indicators
        WER is not always the best indicator, especially for languages with diacritics. We need to test models on 
        different **domains, distributions, and languages** to avoid over/underestimation.  
        Usefulness will be measured in terms of **generalization** and **accuracy**.

        ### Sectors of Interest
        We will also further investigate the performance of domain-specific ASR models. We would like to investigate three specific domains: Health, Agriculture, and Education, which are key focus areas for the Foundation and for which we can have usable end-user applications in the African context. The idea is to obtain open-source speech datasets for these specific domains and evaluate ASR model performance across these domains.

        
        ### Benchmark Dataset
        We will build a test set that can be used for benchmarking ASR models in some of the 30 most spoken African languages. The benchmark dataset will consist of unique MP3 files and corresponding text files. We will ensure that the benchmark dataset is as diverse as possible, covering characteristics such as gender, age, accent, variant, vocabulary, and acoustic conditions, to help improve the accuracy of speech recognition models. The speech benchmark dataset will be reviewed for quality and split into train, dev, and test sets. Because many African languages are tonal and make heavy use of diacritics, careful speech analysis is necessary, and the benchmark dataset is important to spur more research in the African context.
                
    """)
    # # Citation
    # CITATION_TEXT = """@misc{asr-africa-2025,
    # title        = {Automatic Speech Recognition for African Languages},
    # author       = {Dr Joyce Nakatumba-Nabende, Dr Peter Nabende, Dr Andrew Katumba, Alvin Nahabwe},
    # year         = 2025,
    # publisher    = {Hugging Face},
    # howpublished = "\\url{https://huggingface.co/spaces/asr-africa/Automatic_Speech_Recognition_for_African_Languages}"
    # }"""
    
    # with st.expander("πŸ“™ Citation", expanded=False):
    #     st.text_area(
    #         "BibTeX snippet to cite this source",
    #         value=CITATION_TEXT,
    #         height=150,
    #         disabled=True
    #     )
        
        # if st.button("πŸ“‹ Copy to Clipboard"):
        #     try:
        #         pyperclip.copy(CITATION_TEXT)
        #         st.success("Citation copied to clipboard!")
        #     except pyperclip.PyperclipException:
        #         st.error("Could not copy automatically. Please copy manually.")

    # --- Platform preview for About tab ---
    st.markdown("""
## Platform overview

A preview of what the platform contains and how to navigate it. Use the links and tabs in the top navigation to jump to demos, datasets, results, or evaluation details.

1. **Benchmark Datasets:**  
   A multilingual collection covering over **17 African languages**, built from open corpora (e.g., Common Voice, Fleurs, NCHLT, ALFFA, Naija Voices).  
   Each dataset is cleaned, validated, and partitioned into training, development, and test splits to ensure fair benchmarking.

2. **Model Collections:**  
   Fine-tuned ASR models derived from **Wav2Vec2 XLS-R**, **Whisper**, **MMS**, and **W2V-BERT**, adapted for African phonetic, tonal, and orthographic features.  
   These are hosted as public collections on [Hugging Face](https://huggingface.co/asr-africa).

3. **Evaluation Scenarios:**  
   Designed to test **data efficiency**, **domain adaptation**, and **speech-type robustness**, e.g., how models generalize from read speech to spontaneous dialogue, 
   or from education to agricultural domains.

4. **ASR Demo Interface:**  
   A **Gradio-powered live testing tool**, allowing users to upload or record audio, view transcriptions, and submit structured feedback via the integrated backend API.

5. **Quantitative Results:**  
   Comprehensive analysis of model performance across training hours and data scales (1–400 hours), visualized through **Word Error Rate (WER)** and **Character Error Rate (CER)** trends.  
   Findings show clear **data scaling laws**, with XLS-R and W2V-BERT models performing best under low-resource conditions.

6. **Human Evaluation Framework:**  
   A structured qualitative evaluation conducted with **20 native-language evaluators** across 13 languages.  
   Evaluators assessed **accuracy**, **meaning preservation**, **orthography**, and **error types** (e.g., named entities, punctuation, diacritics).  
   This data is publicly available in the curated [ASR_Evaluation_dataset](https://huggingface.co/datasets/asr-africa/ASR_Evaluation_dataset).
""")
with tab6:
    st.header("Quantitative Results: WER vs Dataset Size")

    # --- Introduction ---
    st.subheader("Introduction")
    st.write("""
    Automatic Speech Recognition (ASR) for African languages remains challenging due to the scarcity of labeled data and limited methodological guidance for low-resource settings. While interest in multilingual and low-resource ASR is growing, there is still limited understanding of how different pretrained models perform across diverse African languages, data sizes, and decoding strategies.

    In this study, we benchmark four state-of-the-art ASR models (Wav2Vec2 XLS-R, Whisper, MMS, and W2V-BERT) across 17 African languages representing East, West, and Southern Africa: Luganda, Swahili, Kinyarwanda, Wolof, Akan, Ewe, Xhosa, Lingala, Amharic, Bambara, Bemba, Zulu, Igbo, Shona, Afrikaans, Hausa, and Fula. Our findings contribute empirical insights into model robustness and data efficiency in low-resource scenarios.

    """)

    # --- Datasets ---
    st.subheader("Datasets")
    st.write("""
    We trained each ASR model on 1, 5, 10, 20, 50, 100, 200, and 400-hour splits, based on the labelled data available per language. For Wav2Vec2 XLS-R and W2V-BERT, we also trained 5-gram language models using available textual data to assess the impact of language model integration.

    """)

    # --- Results ---
    st.subheader("Results")
    st.write("""
    Overall, the Word Error Rate (WER) decreases as the number of training hours increases across all models and 
    languages. This highlights the importance of dataset size in improving ASR performance, although the rate of 
    improvement varies significantly between models.
    """)
    # XLS-R
    st.subheader("XLS-R")
    st.write("""
    XLS-R shows a steep decline in log WER as the dataset size increases, especially in low-to-moderate data regimes.
    The improvement slows as the dataset becomes larger, suggesting diminishing returns in high-data settings.
    """)
    st.image("src/Images/13e2495b-81ba-44ad-8646-4576a555ce7a.jpg", caption="Log WER vs Training Hours for XLS-R")

    st.subheader("XLS-R + LM")
    st.write("""
    Incorporating a 5-gram language model (LM) further improves performance in low-data settings (<50 hours). 
    However, the effect of the LM diminishes as more supervised training data becomes available.
    """)
    st.image("src/Images/bdfd0b3c-e789-4b5d-811f-866bcb84a230.jpg", caption="Log WER vs Training Hours for XLS-R + LM")
    

    # W2v-BERT
    st.subheader("W2v-BERT")
    st.write("""
    W2v-BERT exhibits a more gradual decline in log WER. It performs well in low-data settings, showing a stable reduction
    in WER as dataset size increases. This makes it suitable for low-resource languages.
    """)
    st.image("src/Images/bertlog.png", caption="Log WER vs Training Hours for W2v-BERT")


    st.subheader("W2v-BERT + LM")
    st.write("""
    Adding an LM enhances W2V-BERT's performance in extremely low-data scenarios, though the relative 
    benefits diminish as dataset size increases.
    """)
    st.image("src/Images/9fe3960d-937d-4dca-9051-84a2b216abb9.jpg", caption="Log WER vs Training Hours for W2v-BERT+LM")


    # Whisper
    st.subheader("Whisper")
    st.write("""
    Whisper shows a consistent but moderate decline in log WER. Improvements are more linear compared to XLS-R, benefiting
    steadily from additional data, but it does not reach XLS-R's high-data performance.
    """)
    st.image("src/Images/4560beb1-33f4-416e-8bf1-fe9e7778835b.jpg", caption="Log WER vs Training Hours for Whisper")

    # MMS
    st.subheader("MMS")
    st.write("""
    MMS shows significant improvement between 1–5 hours of training across multiple languages. However, the rate of
    improvement declines as more data is added. MMS performs strongly in both low- and high-data settings.
    """)
    st.image("src/Images/5148df6b-d65e-434b-b2b3-a09a18df0368.jpg", caption="Log WER vs Training Hours for MMS")

    # Takeaways
    st.write("""
    ### Key Takeaways
    - **More data generally leads to better performance**, though gains diminish beyond ~100 hours for many languages.  
    - **Language models (LMs) provide the greatest benefit in low-data regimes (<50 hours)** by supplying additional contextual information.  
    - As supervised training data increases, **the added value of LMs decreases**, though their effectiveness varies somewhat across languages.  
    - **Model choice matters**: XLS-R benefits most from scaling data, while W2v-BERT shines in extremely low-resource scenarios.  
    """)

with tab7:
    st.header("Human Evaluation of ASR Models")

    # --- Introduction ---
    st.subheader("Introduction")
    st.write("""
    ASR systems are typically assessed using automatic metrics such as Word Error Rate (WER) or Character Error Rate (CER).
    While these provide valuable quantitative insights, they do not fully capture how well transcriptions preserve meaning, respect language orthography, or handle specific features such as tone, diacritics, or named entities. 
    To address these gaps, we conducted a human evaluation of ASR systems across African languages to assess the qualitative performance of the best-performing models.
    """)

    # --- Guidelines ---
    st.subheader("Evaluation Guidelines")
    st.write("""
    Evaluators were provided with structured instructions to ensure consistency in their assessments. The main criteria included:
    
    - **Accuracy (1–5 scale):** How correctly the model transcribed the audio.  
    - **Meaning Preservation (1–5 scale):** Whether the transcription retained the original meaning.  
    - **Orthography:** Whether the transcription followed standard writing conventions, including accents, diacritics, and special characters.  
    - **Recording Environment:** Evaluators noted the type of environment (quiet, professional studio, or noisy background) since background noise impacts ASR performance.  
    - **Device Used:** Information on whether the recording was made with a mobile phone, laptop microphone, or dedicated mic, as device quality affects clarity.  
    - **Domain/Topic of Speech:** Evaluators indicated if the speech belonged to a specific topic such as education, health, law, or everyday conversation, to assess domain adaptability.  
    - **Error Types:** Evaluators identified common error categories, such as:  
      - Substitutions (wrong words used)  
      - Omissions (missing words)  
      - Insertions (extra words added)  
      - Pronunciation-related errors  
      - Diacritic/Tone/Special character errors  
      - Named Entity errors (people, places, currencies)  
      - Punctuation errors  
    - **Performance Description:** Free text where evaluators described strengths and weaknesses of the models in their own words.
    """)


    # --- Setup ---
    st.subheader("Evaluation Setup")
    st.write("""
    - **Languages Evaluated:** 13 languages: Afrikaans, Amharic, Bemba, Hausa, Igbo, Kinyarwanda, Lingala, Luganda, Oromo, Swahili, Wolof, Xhosa, and Yoruba.  
    - **Participants:** 20 evaluators (native speakers or fluent linguists), aged 18–50, the majority with postgraduate education.  
    - **Platform:** A Gradio-based interface allowed evaluators to upload/record audio, view transcriptions, and complete the feedback form directly online.
    """)
    st.subheader("Evaluator Contributions")
    data = [
        {"Evaluator ID": "eval_001", "Contributions": 65, "Languages": "Afrikaans"},
        {"Evaluator ID": "eval_002", "Contributions": 50, "Languages": "Afrikaans"},
        {"Evaluator ID": "eval_005", "Contributions": 63, "Languages": "Amharic"},
        {"Evaluator ID": "eval_006", "Contributions": 69, "Languages": "Amharic"},
        {"Evaluator ID": "eval_007", "Contributions": 50, "Languages": "Bemba"},
        {"Evaluator ID": "eval_008", "Contributions": 53, "Languages": "Bemba"},
        {"Evaluator ID": "eval_009", "Contributions": 60, "Languages": "Hausa"},
        {"Evaluator ID": "eval_010", "Contributions": 53, "Languages": "Igbo"},
        {"Evaluator ID": "eval_011", "Contributions": 12, "Languages": "Lingala"},
        {"Evaluator ID": "eval_012", "Contributions": 115, "Languages": "Oromo"},
        {"Evaluator ID": "eval_014", "Contributions": 52, "Languages": "Wolof"},
        {"Evaluator ID": "eval_015", "Contributions": 8, "Languages": "Xhosa"},
        {"Evaluator ID": "eval_017", "Contributions": 59, "Languages": "Yoruba"},
        {"Evaluator ID": "eval_018", "Contributions": 58, "Languages": "Yoruba"},
        {"Evaluator ID": "eval_019", "Contributions": 52, "Languages": "Luganda"},
        {"Evaluator ID": "eval_020", "Contributions": 55, "Languages": "Luganda"},
        {"Evaluator ID": "eval_021", "Contributions": 66, "Languages": "Swahili"},
        {"Evaluator ID": "eval_022", "Contributions": 64, "Languages": "Swahili"},
        {"Evaluator ID": "eval_023", "Contributions": 50, "Languages": "Kinyarwanda"},
        {"Evaluator ID": "eval_024", "Contributions": 53, "Languages": "Kinyarwanda"},
    ]
    
    df_evaluators = pd.DataFrame(data)
    # Show contributions as charts (by evaluator and aggregated by language)
    st.markdown("**Contributions by evaluator**")
    df_evals_sorted = df_evaluators.sort_values('Contributions', ascending=False)

    chart_evaluator = alt.Chart(df_evals_sorted).mark_bar().encode(
        x=alt.X('Evaluator ID:N', sort=df_evals_sorted['Evaluator ID'].tolist(), title='Evaluator ID'),
        y=alt.Y('Contributions:Q', title='Contributions'),
        color=alt.Color('Languages:N', legend=alt.Legend(title='Language')),
        tooltip=['Evaluator ID', 'Contributions', 'Languages']
    ).properties(height=300)

    st.altair_chart(chart_evaluator, use_container_width=True)

    st.markdown("**Total contributions by language**")
    df_lang = df_evaluators.groupby('Languages', as_index=False)['Contributions'].sum()
    chart_lang = alt.Chart(df_lang).mark_bar().encode(
        x=alt.X('Languages:N', sort='-y', title='Language'),
        y=alt.Y('Contributions:Q', title='Total Contributions'),
        tooltip=['Languages', 'Contributions']
    ).properties(height=300)

    st.altair_chart(chart_lang, use_container_width=True)

    # Optional: also show totals and raw table inside an expander
    with st.expander("View raw evaluator table"):
        st.dataframe(df_evaluators, width="stretch")

    st.write("### Summary")
    st.write(f"- **Total Evaluators:** {df_evaluators['Evaluator ID'].nunique()}")
    st.write(f"- **Total Contributions:** {df_evaluators['Contributions'].sum()}")
    
    # --- Findings ---
    st.subheader("Findings")
    
    st.write("""
    ASR performance varied significantly across languages, reflecting differences in data availability,
    orthography complexity, and domain coverage. Below we summarize the average **Accuracy** and
    **Meaning Preservation** scores (1–5 scale) by language.
    """)
    
    # Data table of results
    results_data = [
        {"Language": "Swahili", "Audios Evaluated": 132, "Accuracy": 4.96, "Meaning": 4.97},
        {"Language": "Luganda", "Audios Evaluated": 110, "Accuracy": 4.70, "Meaning": 4.78},
        {"Language": "Amharic", "Audios Evaluated": 132, "Accuracy": 4.65, "Meaning": 4.82},
        {"Language": "Lingala", "Audios Evaluated": 30, "Accuracy": 4.63, "Meaning": 4.70},
        {"Language": "Hausa", "Audios Evaluated": 60, "Accuracy": 4.58, "Meaning": 4.97},
        {"Language": "Oromo", "Audios Evaluated": 115, "Accuracy": 4.54, "Meaning": 4.52},
        {"Language": "Bemba", "Audios Evaluated": 116, "Accuracy": 4.39, "Meaning": 4.86},
        {"Language": "Yoruba", "Audios Evaluated": 122, "Accuracy": 4.22, "Meaning": 4.48},
        {"Language": "Wolof", "Audios Evaluated": 53, "Accuracy": 3.98, "Meaning": 4.13},
        {"Language": "Kinyarwanda", "Audios Evaluated": 103, "Accuracy": 3.75, "Meaning": 4.81},
        {"Language": "Xhosa", "Audios Evaluated": 8, "Accuracy": 3.62, "Meaning": 3.38},
        {"Language": "Afrikaans", "Audios Evaluated": 116, "Accuracy": 3.59, "Meaning": 4.10},
        {"Language": "Igbo", "Audios Evaluated": 55, "Accuracy": 2.25, "Meaning": 2.15},
    ]
    
    df_results = pd.DataFrame(results_data)
    # Visualize results with charts
    # Grouped bars for Accuracy and Meaning per language
    df_long = df_results.melt(id_vars=['Language', 'Audios Evaluated'], value_vars=['Accuracy', 'Meaning'],
                              var_name='Metric', value_name='Score')

    # Use xOffset for grouped bars when available
    try:
        chart_metrics = alt.Chart(df_long).mark_bar().encode(
            x=alt.X('Language:N', sort=df_results['Language'].tolist(), title='Language'),
            y=alt.Y('Score:Q', title='Score (1-5)'),
            color=alt.Color('Metric:N', legend=alt.Legend(title='Metric')),
            tooltip=['Language', 'Metric', 'Score'],
            xOffset='Metric:N'
        ).properties(height=360)
    except Exception:
        # Fallback when xOffset is not supported: bars are stacked by metric rather than grouped
        chart_metrics = alt.Chart(df_long).mark_bar().encode(
            x=alt.X('Language:N', sort=df_results['Language'].tolist(), title='Language'),
            y=alt.Y('Score:Q', title='Score (1-5)'),
            color=alt.Color('Metric:N', legend=alt.Legend(title='Metric')),
            tooltip=['Language', 'Metric', 'Score']
        ).properties(height=360)

    st.altair_chart(chart_metrics, use_container_width=True)

    # Scatter: Accuracy vs Audios Evaluated (size / color by language)
    chart_scatter = alt.Chart(df_results).mark_circle(size=120).encode(
        x=alt.X('Audios Evaluated:Q', title='Audios Evaluated'),
        y=alt.Y('Accuracy:Q', title='Accuracy'),
        color=alt.Color('Language:N', legend=None),
        tooltip=['Language', 'Audios Evaluated', 'Accuracy', 'Meaning']
    ).properties(height=360)

    st.markdown("**Accuracy vs. Dataset Size**")
    st.altair_chart(chart_scatter, use_container_width=True)

    # Audios evaluated per language (bar)
    chart_audios = alt.Chart(df_results).mark_bar().encode(
        x=alt.X('Language:N', sort=df_results['Language'].tolist(), title='Language'),
        y=alt.Y('Audios Evaluated:Q', title='Audios Evaluated'),
        tooltip=['Language', 'Audios Evaluated']
    ).properties(height=320)

    st.markdown("**Number of audios evaluated by language**")
    st.altair_chart(chart_audios, use_container_width=True)

    # Optional: raw table in expander
    with st.expander('View raw findings table'):
        st.dataframe(df_results, width='stretch')

    # Narrative summary
    st.markdown("""
    ### Key Takeaways
    - **High-Performing Languages:**  
      - Swahili (Accuracy 4.96, Meaning 4.97)  
      - Luganda (Accuracy 4.70, Meaning 4.78)  
      - Amharic (Accuracy 4.65, Meaning 4.82)  
      These models produced highly accurate transcriptions with minimal meaning loss.
    
    - **Moderate Performance:**  
      Hausa, Oromo, Bemba, Yoruba, Wolof, and Kinyarwanda: generally understandable, but often with orthography and punctuation issues.
    
    - **Low-Performing Languages:**  
      - Igbo (Accuracy 2.25, Meaning 2.15)  
      - Afrikaans (Accuracy 3.59, Meaning 4.10)  
      - Xhosa (Accuracy 3.62, Meaning 3.38)  
    """)

    # --- Error Patterns ---
    st.subheader("Common Error Patterns")
    
    st.write("""
    Evaluators highlighted several recurring challenges and areas for improvement across
    different languages. These reflect both linguistic complexities and system limitations.
    """)
    
    error_data = [
        {"Issue": "Punctuation and Formatting", 
         "Comments": "Absence of punctuation, lack of capitalisation"},
        {"Issue": "Spelling and Grammar", 
         "Comments": "Word merging, frequent spelling mistakes in individual words"},
        {"Issue": "Named Entity Recognition", 
         "Comments": "Inaccurate handling of numbers, currencies, and names"},
        {"Issue": "Device Compatibility & Performance", 
         "Comments": "Better performance on laptops than on mobile phones"},
    ]
    
    df_errors = pd.DataFrame(error_data)
    st.dataframe(df_errors, width="stretch")
    
    st.markdown("""
    ### Summary
    1. **Punctuation and formatting inconsistencies** make transcriptions harder to read.  
    2. **Word merging and spelling errors** were frequent, particularly in morphologically rich languages.  
    3. **Named entity recognition** (e.g., names, currencies, numbers) was a common source of error.  
    4. **Platform performance** was reported as better on laptops than mobile devices.  
    """)


    # --- Human Evaluation Dataset ---
    st.subheader("Human Evaluation Dataset")
    
    st.write("""
    To support reproducibility and enable further community research, all human evaluation
    submissions have been curated into a Hugging Face dataset.  
    This dataset contains transcriptions, evaluator edits, metadata about recording
    environments, devices, domains, and error annotations.
    
    Key fields include:
    - **transcription**: Model-generated output.  
    - **transcript_edit**: Human-corrected transcription.  
    - **evaluated_language**: Language of the audio.  
    - **environment**: Recording environment (quiet, noisy, studio).  
    - **device**: Type of recording device used.  
    - **accuracy & meaning**: Evaluator ratings on 1–5 scale.  
    - **errors**: Categories of common transcription errors.  
    - **performance**: Free-text qualitative assessment.  
    - **audio**: Audio file with aligned metadata.  
    
    🔗 [Explore the dataset on Hugging Face](https://huggingface.co/datasets/asr-africa/ASR_Evaluation_dataset)
    """)

    # --- Takeaways ---
    st.subheader("Takeaways")
    st.write("""
    - Human ratings generally aligned with automatic metrics: languages with larger datasets (Swahili, Luganda, Amharic) scored highest.  
    - WER alone misses issues such as meaning drift, orthography violations, and named entity errors.  
    """)