File size: 7,146 Bytes
7b51de6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import gradio as gr
import difflib
from transformers import pipeline
import unicodedata

# Initialize the ASR pipeline (model loaded once at startup)
asr_pipeline = pipeline("automatic-speech-recognition", model="tarteel-ai/whisper-base-ar-quran")

# Ground truth for Surah Al-Fatiha (each ayah)
fateha_ayahs = {
    1: "ุจูุณู’ู…ู ุงู„ู„ู‘ูŽู‡ู ุงู„ุฑู‘ูŽุญู’ู…ูŽู†ู ุงู„ุฑู‘ูŽุญููŠู…ู",
    2: "ุงู„ู’ุญูŽู…ู’ุฏู ู„ูู„ู‘ูŽู‡ู ุฑูŽุจู‘ู ุงู„ู’ุนูŽุงู„ูŽู…ููŠู†ูŽ",
    3: "ูฑู„ุฑู‘ูŽุญู’ู…ูŽู†ู ูฑู„ุฑู‘ูŽุญููŠู…ู",
    4: "ู…ูŽุงู„ููƒู ูŠูŽูˆู’ู…ู ุงู„ุฏู‘ููŠู†ู",
    5: "ุฅููŠู‘ูŽุงูƒูŽ ู†ูŽุนู’ุจูุฏู ูˆูŽุฅููŠู‘ูŽุงูƒูŽ ู†ูŽุณู’ุชูŽุนููŠู†ู",
    6: "ุงู‡ู’ุฏูู†ูŽุง ุงู„ุตู‘ูุฑูŽุงุทูŽ ุงู„ู’ู…ูุณู’ุชูŽู‚ููŠู…ูŽ",
    7: "ุตูุฑูŽุงุทูŽ ุงู„ู‘ูŽุฐููŠู†ูŽ ุฃูŽู†ู’ุนูŽู…ู’ุชูŽ ุนูŽู„ูŽูŠู’ู‡ูู…ู’ ุบูŽูŠู’ุฑู ุงู„ู’ู…ูŽุบู’ุถููˆุจู ุนูŽู„ูŽูŠู’ู‡ูู…ู’ ูˆูŽู„ูŽุง ุงู„ุถู‘ูŽุงู„ู‘ููŠู†ูŽ"
}

def remove_diacritics(text: str) -> str:
    """Strip Arabic diacritical marks (harakat) from *text*.

    NFKD decomposition splits each character into a base code point plus
    separate combining marks; keeping only the non-combining code points
    drops the diacritics while leaving the base letters intact.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

def compare_texts(ref: str, hyp: str, ignore_diacritics: bool = True):
    """Align the reference (ground truth) and hypothesis (ASR output) word-by-word.

    Classifies every difference found by difflib.SequenceMatcher as:
      - missed: words in *ref* that never appear in *hyp*;
      - incorrect: (expected, produced) substitution pairs;
      - extra: words in *hyp* with no counterpart in *ref*.

    Returns a 4-tuple ``(highlighted_str, missed, incorrect, extra)`` where
    ``highlighted_str`` is the hypothesis rendered as HTML with every wrong or
    extra word wrapped in a red <span>.
    """
    if ignore_diacritics:
        ref = remove_diacritics(ref)
        hyp = remove_diacritics(hyp)

    ref_words = ref.split()
    hyp_words = hyp.split()

    def _red(word: str) -> str:
        # Wrap a mismatching word in red markup for the HTML report.
        return f"<span style='color:red'>{word}</span>"

    pieces: list = []
    missed: list = []
    incorrect: list = []
    extra: list = []

    opcodes = difflib.SequenceMatcher(None, ref_words, hyp_words).get_opcodes()
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == "equal":
            pieces.extend(hyp_words[j1:j2])
        elif tag == "replace":
            # Pair up as many ref/hyp words as both sides allow; the leftover
            # on either side becomes a pure deletion or insertion.
            paired = min(i2 - i1, j2 - j1)
            for expected, produced in zip(ref_words[i1:i1 + paired],
                                          hyp_words[j1:j1 + paired]):
                pieces.append(_red(produced))
                incorrect.append((expected, produced))
            missed.extend(ref_words[i1 + paired:i2])
            for surplus in hyp_words[j1 + paired:j2]:
                pieces.append(_red(surplus))
                extra.append(surplus)
        elif tag == "delete":
            missed.extend(ref_words[i1:i2])
        elif tag == "insert":
            for surplus in hyp_words[j1:j2]:
                pieces.append(_red(surplus))
                extra.append(surplus)

    return " ".join(pieces), missed, incorrect, extra

def process_audio(verse_from, verse_to, audio_file):
    """Transcribe *audio_file* and grade it against ayahs verse_from..verse_to.

    Parameters
    ----------
    verse_from, verse_to : int-like
        First and last ayah numbers (1-7, inclusive) of the expected recitation.
    audio_file : str
        Path to the uploaded/recorded audio (gr.Audio uses type="filepath").

    Returns
    -------
    str
        An HTML report with the ground truth, the raw transcription, a
        mismatch-highlighted transcription, and the word-level differences —
        or an HTML error message for an invalid verse selection.
    """
    print("[PROCESS] Initializing...")
    verse_from = int(verse_from)
    verse_to = int(verse_to)

    if verse_from not in fateha_ayahs or verse_to not in fateha_ayahs:
        return "<p style='color:red'>Invalid verse number. Please choose a number between 1 and 7.</p>"
    # Guard against an inverted range: the original loop would silently build
    # an empty ground truth and flag every transcribed word as "extra".
    if verse_from > verse_to:
        return "<p style='color:red'>Invalid verse range: the first verse must not come after the last.</p>"

    verse_number = f"{verse_from}" if verse_from == verse_to else f"{verse_from} - {verse_to}"
    print(f"[PROCESS] Processing ayah: {verse_number}")

    # Concatenate the requested ayahs into a single space-separated reference.
    ground_truth = " ".join(fateha_ayahs[n] for n in range(verse_from, verse_to + 1))
    print(f"[PROCESS] Ayah ref: {ground_truth}")

    # audio_file is a file path because gr.Audio is configured with type="filepath".
    result = asr_pipeline(audio_file)
    print(f"[PROCESS] Result: {result}")
    transcription = result["text"]

    # NOTE(review): diacritics are kept (ignore_diacritics=False) — every
    # vowel-mark difference between the reference and the ASR output counts
    # as a mismatch. Confirm this strictness is intended.
    highlighted_transcription, missed, incorrect, extra = compare_texts(
        ground_truth, transcription, ignore_diacritics=False
    )

    html_output = f"""
    <html>
      <head>
        <style>
          body {{ font-family: Arial, sans-serif; margin: 20px; }}
          table, th, td {{ border: 1px solid #ccc; border-collapse: collapse; padding: 8px; }}
        </style>
      </head>
      <body>
        <h2>Ground Truth (Verse {verse_number}):</h2>
        <p>{ground_truth}</p>
        <h2>Model Transcription:</h2>
        <p>{transcription}</p>
        <h2>Highlighted Transcription (mismatches in red):</h2>
        <p>{highlighted_transcription}</p>
        <h2>Differences:</h2>
        <p><strong>Missed Words:</strong> {" ".join(missed) if missed else "None"}</p>
        <p><strong>Incorrect Words (Expected -> Produced):</strong> {"; ".join([f"{exp} -> {prod}" for exp, prod in incorrect]) if incorrect else "None"}</p>
        <p><strong>Extra Words:</strong> {" ".join(extra) if extra else "None"}</p>
      </body>
    </html>
    """
    return html_output

def update_verse_to(verse_from):
    """Narrow the "to ayah" dropdown to ayahs >= *verse_from* (up to 7).

    Wired to a_from.change; also resets the selected value to *verse_from*
    so the chosen range can never be inverted in the UI.
    """
    # range() replaces the original hand-rolled while-loop accumulator.
    choices = list(range(verse_from, 8))
    return gr.update(choices=choices, value=verse_from, interactive=True)

# UI layout: verse-range selectors + audio input on the left, HTML report on
# the right. Labels are in Indonesian ("Dari ayah" = from ayah, "Hingga ayah"
# = to ayah, "Kirim" = submit).
with gr.Blocks(title="ASR Surah Al-Fatihah") as demo:
    # Centered page header.
    gr.HTML(
            f"""
            <div style="text-align: center;">
                <h1 style="margin-bottom: 0;">ASR Surah Al-Fatihah</h1>
            </div>
            """
        )
    gr.Markdown("Demo pengecekan bacaan Al-Fatihah")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                # First ayah of the range (1-7).
                a_from = gr.Dropdown(
                    choices=list(fateha_ayahs.keys()), 
                    value=1, 
                    label="Dari ayah", 
                    interactive=True, 
                    allow_custom_value=True
                )
                # Last ayah of the range; its choices are narrowed by
                # update_verse_to whenever a_from changes.
                a_to = gr.Dropdown(
                    choices=list(fateha_ayahs.keys()), 
                    value=1, 
                    label="Hingga ayah", 
                    interactive=True, 
                    allow_custom_value=True
                )
                a_from.change(
                    fn=update_verse_to,
                    inputs=[a_from],
                    outputs=[a_to]
                )
            # type="filepath" means process_audio receives a path, not raw audio.
            audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Unggah file atau rekam dengan mikrofon")
            btn = gr.Button("Kirim", variant="primary")
        with gr.Column():
            output = gr.HTML(label="Hasil Analisis")
    btn.click(
        fn=process_audio, 
        inputs=[a_from, a_to, audio], 
        outputs=[output]
    )

# Launch the Gradio app; share=True also exposes a temporary public URL.
if __name__ == "__main__":
    demo.launch(share=True)