"""Gradio demo that checks a recitation of Surah Al-Fatihah against its reference text."""

import difflib
import html
import unicodedata
from typing import List, Tuple

import gradio as gr
from transformers import pipeline

# Initialize the ASR pipeline (model loaded once at startup)
asr_pipeline = pipeline("automatic-speech-recognition", model="tarteel-ai/whisper-base-ar-quran")

# Ground truth for Surah Al-Fatiha (each ayah)
fateha_ayahs = {
    1: "بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ",
    2: "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
    3: "ٱلرَّحْمَنِ ٱلرَّحِيمِ",
    4: "مَالِكِ يَوْمِ الدِّينِ",
    5: "إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ",
    6: "اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
    7: "صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ",
}

INVALID_VERSE_MESSAGE = "Invalid verse number. Please choose a number between 1 and 7."


def remove_diacritics(text: str) -> str:
    """Remove Arabic diacritics from text using Unicode normalization.

    The text is NFKD-decomposed and every combining character is dropped.
    Because decomposition happens first, marks that are part of a precomposed
    letter (for example the hamza on an alef) are removed as well.
    """
    normalized_text = unicodedata.normalize("NFKD", text)
    return "".join(c for c in normalized_text if not unicodedata.combining(c))


def _highlight(word: str) -> str:
    """Return *word* HTML-escaped and wrapped in a red ``<span>``."""
    return f'<span style="color:red">{html.escape(word)}</span>'


def _error_html(message: str) -> str:
    """Return *message* as a red HTML paragraph for the result panel."""
    return f'<p style="color:red">{html.escape(message)}</p>'


def compare_texts(
    ref: str, hyp: str, ignore_diacritics: bool = True
) -> Tuple[str, List[str], List[Tuple[str, str]], List[str]]:
    """Compare the reference and hypothesis texts word by word.

    Both texts are split on whitespace and aligned with
    ``difflib.SequenceMatcher``.

    Args:
        ref: Ground-truth text.
        hyp: ASR output to check against ``ref``.
        ignore_diacritics: When true, both texts are passed through
            ``remove_diacritics`` before they are compared.

    Returns:
        A tuple ``(highlighted_str, missed, incorrect, extra)``:

        - highlighted_str: the hypothesis words joined by spaces as an HTML
          fragment; every word is HTML-escaped and wrong/extra words are
          wrapped in a red ``<span>``.
        - missed: words present in ``ref`` but not in ``hyp``.
        - incorrect: ``(expected, produced)`` tuples for substitutions.
        - extra: words inserted in ``hyp``.

        The words in the three lists are plain (unescaped) text.
    """
    if ignore_diacritics:
        ref = remove_diacritics(ref)
        hyp = remove_diacritics(hyp)

    ref_words = ref.split()
    hyp_words = hyp.split()
    matcher = difflib.SequenceMatcher(None, ref_words, hyp_words)

    highlighted: List[str] = []
    missed: List[str] = []
    incorrect: List[Tuple[str, str]] = []
    extra: List[str] = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        ref_chunk = ref_words[i1:i2]
        hyp_chunk = hyp_words[j1:j2]
        if tag == "equal":
            highlighted.extend(html.escape(word) for word in hyp_chunk)
        elif tag == "replace":
            # Pair words positionally; whatever is left over on the longer
            # side counts as missed (reference) or extra (hypothesis).
            paired = min(len(ref_chunk), len(hyp_chunk))
            for expected, produced in zip(ref_chunk, hyp_chunk):
                highlighted.append(_highlight(produced))
                incorrect.append((expected, produced))
            missed.extend(ref_chunk[paired:])
            for word in hyp_chunk[paired:]:
                highlighted.append(_highlight(word))
                extra.append(word)
        elif tag == "delete":
            missed.extend(ref_chunk)
        elif tag == "insert":
            for word in hyp_chunk:
                highlighted.append(_highlight(word))
                extra.append(word)

    return " ".join(highlighted), missed, incorrect, extra


def process_audio(verse_from, verse_to, audio_file):
    """Transcribe a recitation and report how it differs from the reference.

    Args:
        verse_from: First ayah of the range (anything ``int()`` accepts).
        verse_to: Last ayah of the range, inclusive.
        audio_file: Path to the recorded/uploaded audio, or ``None`` when the
            user has not provided any.

    Returns:
        An HTML string for the result panel: either an error message or the
        ground truth, the raw transcription, the highlighted transcription
        and the lists of missed, incorrect and extra words.
    """
    print("[PROCESS] Initializing...")

    # The dropdowns allow custom values, so the inputs may not be numeric.
    try:
        verse_from = int(verse_from)
        verse_to = int(verse_to)
    except (TypeError, ValueError):
        return _error_html(INVALID_VERSE_MESSAGE)

    if verse_from not in fateha_ayahs or verse_to not in fateha_ayahs:
        return _error_html(INVALID_VERSE_MESSAGE)
    if verse_from > verse_to:
        return _error_html("Invalid verse range. The first verse must not come after the last verse.")
    if not audio_file:
        return _error_html("No audio provided. Please upload a file or record with the microphone.")

    verse_number = f"{verse_from}" if verse_from == verse_to else f"{verse_from} - {verse_to}"
    print(f"[PROCESS] Processing ayah: {verse_number}")

    ground_truth = " ".join(fateha_ayahs[n] for n in range(verse_from, verse_to + 1))
    print(f"[PROCESS] Ayah ref: {ground_truth}")

    # audio_file is a file path because we use type="filepath"
    result = asr_pipeline(audio_file)
    print(f"[PROCESS] Result: {result}")
    transcription = result["text"]

    highlighted_transcription, missed, incorrect, extra = compare_texts(
        ground_truth, transcription, ignore_diacritics=False
    )

    # The word lists are plain text; escape them before embedding in HTML.
    missed_text = html.escape(" ".join(missed)) if missed else "None"
    incorrect_text = (
        "; ".join(f"{html.escape(exp)} -&gt; {html.escape(prod)}" for exp, prod in incorrect)
        if incorrect
        else "None"
    )
    extra_text = html.escape(" ".join(extra)) if extra else "None"

    html_output = f"""
<h3>Ground Truth (Verse {verse_number}):</h3>
<p>{html.escape(ground_truth)}</p>
<h3>Model Transcription:</h3>
<p>{html.escape(transcription)}</p>
<h3>Highlighted Transcription (mismatches in red):</h3>
<p>{highlighted_transcription}</p>
<h3>Differences:</h3>
<p>Missed Words: {missed_text}</p>
<p>Incorrect Words (Expected -&gt; Produced): {incorrect_text}</p>
<p>Extra Words: {extra_text}</p>
"""
    return html_output


def update_verse_to(verse_from):
    """Restrict the "to" dropdown to verses from ``verse_from`` onwards.

    Args:
        verse_from: Value currently selected in the "from" dropdown.

    Returns:
        A ``gr.update`` whose choices run from ``verse_from`` to the last
        verse and whose value is ``verse_from``. When ``verse_from`` is not
        numeric, all verses are offered and the value is left untouched.
    """
    try:
        first = int(verse_from)
    except (TypeError, ValueError):
        return gr.update(choices=list(fateha_ayahs), interactive=True)

    choices = list(range(first, max(fateha_ayahs) + 1))
    return gr.update(choices=choices, value=first, interactive=True)


with gr.Blocks(title="ASR Surah Al-Fatihah") as demo:
    gr.HTML("<h1>ASR Surah Al-Fatihah</h1>")
    gr.Markdown("Demo pengecekan bacaan Al-Fatihah")

    with gr.Row():
        with gr.Column():
            with gr.Row():
                a_from = gr.Dropdown(
                    choices=list(fateha_ayahs.keys()),
                    value=1,
                    label="Dari ayah",
                    interactive=True,
                    allow_custom_value=True,
                )
                a_to = gr.Dropdown(
                    choices=list(fateha_ayahs.keys()),
                    value=1,
                    label="Hingga ayah",
                    interactive=True,
                    allow_custom_value=True,
                )
            # Keep the "to" dropdown in step with the "from" dropdown.
            a_from.change(
                fn=update_verse_to,
                inputs=[a_from],
                outputs=[a_to],
            )
            audio = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Unggah file atau rekam dengan mikrofon",
            )
            btn = gr.Button("Kirim", variant="primary")
        with gr.Column():
            output = gr.HTML(label="Hasil Analisis")

    btn.click(
        fn=process_audio,
        inputs=[a_from, a_to, audio],
        outputs=[output],
    )

# Launch
if __name__ == "__main__":
    demo.launch(share=True)