File size: 7,146 Bytes
7b51de6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import gradio as gr
import difflib
from transformers import pipeline
import unicodedata

# Initialize the ASR pipeline (model loaded once at startup)
asr_pipeline = pipeline("automatic-speech-recognition", model="tarteel-ai/whisper-base-ar-quran")

# Ground truth for Surah Al-Fatiha (each ayah)
fateha_ayahs = {
    1: "ุจูุณู’ู…ู ุงู„ู„ู‘ูŽู‡ู ุงู„ุฑู‘ูŽุญู’ู…ูŽู†ู ุงู„ุฑู‘ูŽุญููŠู…ู",
    2: "ุงู„ู’ุญูŽู…ู’ุฏู ู„ูู„ู‘ูŽู‡ู ุฑูŽุจู‘ู ุงู„ู’ุนูŽุงู„ูŽู…ููŠู†ูŽ",
    3: "ูฑู„ุฑู‘ูŽุญู’ู…ูŽู†ู ูฑู„ุฑู‘ูŽุญููŠู…ู",
    4: "ู…ูŽุงู„ููƒู ูŠูŽูˆู’ู…ู ุงู„ุฏู‘ููŠู†ู",
    5: "ุฅููŠู‘ูŽุงูƒูŽ ู†ูŽุนู’ุจูุฏู ูˆูŽุฅููŠู‘ูŽุงูƒูŽ ู†ูŽุณู’ุชูŽุนููŠู†ู",
    6: "ุงู‡ู’ุฏูู†ูŽุง ุงู„ุตู‘ูุฑูŽุงุทูŽ ุงู„ู’ู…ูุณู’ุชูŽู‚ููŠู…ูŽ",
    7: "ุตูุฑูŽุงุทูŽ ุงู„ู‘ูŽุฐููŠู†ูŽ ุฃูŽู†ู’ุนูŽู…ู’ุชูŽ ุนูŽู„ูŽูŠู’ู‡ูู…ู’ ุบูŽูŠู’ุฑู ุงู„ู’ู…ูŽุบู’ุถููˆุจู ุนูŽู„ูŽูŠู’ู‡ูู…ู’ ูˆูŽู„ูŽุง ุงู„ุถู‘ูŽุงู„ู‘ููŠู†ูŽ"
}

def remove_diacritics(text: str) -> str:
    """Strip Arabic diacritical marks (harakat) from *text*.

    NFKD decomposition splits each character into a base code point plus
    separate combining marks; keeping only the non-combining code points
    drops the diacritics while leaving the base letters intact.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

def compare_texts(ref: str, hyp: str, ignore_diacritics: bool = True):
    """Align the reference (ground truth) and hypothesis (ASR output) word-by-word.

    Classifies every difference found by difflib.SequenceMatcher as:
      - missed: words in *ref* that never appear in *hyp*;
      - incorrect: (expected, produced) substitution pairs;
      - extra: words in *hyp* with no counterpart in *ref*.

    Returns a 4-tuple ``(highlighted_str, missed, incorrect, extra)`` where
    ``highlighted_str`` is the hypothesis rendered as HTML with every wrong or
    extra word wrapped in a red <span>.
    """
    if ignore_diacritics:
        ref = remove_diacritics(ref)
        hyp = remove_diacritics(hyp)

    ref_words = ref.split()
    hyp_words = hyp.split()

    def _red(word: str) -> str:
        # Wrap a mismatching word in red markup for the HTML report.
        return f"<span style='color:red'>{word}</span>"

    pieces: list = []
    missed: list = []
    incorrect: list = []
    extra: list = []

    opcodes = difflib.SequenceMatcher(None, ref_words, hyp_words).get_opcodes()
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == "equal":
            pieces.extend(hyp_words[j1:j2])
        elif tag == "replace":
            # Pair up as many ref/hyp words as both sides allow; the leftover
            # on either side becomes a pure deletion or insertion.
            paired = min(i2 - i1, j2 - j1)
            for expected, produced in zip(ref_words[i1:i1 + paired],
                                          hyp_words[j1:j1 + paired]):
                pieces.append(_red(produced))
                incorrect.append((expected, produced))
            missed.extend(ref_words[i1 + paired:i2])
            for surplus in hyp_words[j1 + paired:j2]:
                pieces.append(_red(surplus))
                extra.append(surplus)
        elif tag == "delete":
            missed.extend(ref_words[i1:i2])
        elif tag == "insert":
            for surplus in hyp_words[j1:j2]:
                pieces.append(_red(surplus))
                extra.append(surplus)

    return " ".join(pieces), missed, incorrect, extra

def process_audio(verse_from, verse_to, audio_file):
    """Transcribe *audio_file* and grade it against ayahs verse_from..verse_to.

    Parameters
    ----------
    verse_from, verse_to : int-like
        First and last ayah numbers (1-7, inclusive) of the expected recitation.
    audio_file : str
        Path to the uploaded/recorded audio (gr.Audio uses type="filepath").

    Returns
    -------
    str
        An HTML report with the ground truth, the raw transcription, a
        mismatch-highlighted transcription, and the word-level differences —
        or an HTML error message for an invalid verse selection.
    """
    print("[PROCESS] Initializing...")
    verse_from = int(verse_from)
    verse_to = int(verse_to)

    if verse_from not in fateha_ayahs or verse_to not in fateha_ayahs:
        return "<p style='color:red'>Invalid verse number. Please choose a number between 1 and 7.</p>"
    # Guard against an inverted range: the original loop would silently build
    # an empty ground truth and flag every transcribed word as "extra".
    if verse_from > verse_to:
        return "<p style='color:red'>Invalid verse range: the first verse must not come after the last.</p>"

    verse_number = f"{verse_from}" if verse_from == verse_to else f"{verse_from} - {verse_to}"
    print(f"[PROCESS] Processing ayah: {verse_number}")

    # Concatenate the requested ayahs into a single space-separated reference.
    ground_truth = " ".join(fateha_ayahs[n] for n in range(verse_from, verse_to + 1))
    print(f"[PROCESS] Ayah ref: {ground_truth}")

    # audio_file is a file path because gr.Audio is configured with type="filepath".
    result = asr_pipeline(audio_file)
    print(f"[PROCESS] Result: {result}")
    transcription = result["text"]

    # NOTE(review): diacritics are kept (ignore_diacritics=False) — every
    # vowel-mark difference between the reference and the ASR output counts
    # as a mismatch. Confirm this strictness is intended.
    highlighted_transcription, missed, incorrect, extra = compare_texts(
        ground_truth, transcription, ignore_diacritics=False
    )

    html_output = f"""
    <html>
      <head>
        <style>
          body {{ font-family: Arial, sans-serif; margin: 20px; }}
          table, th, td {{ border: 1px solid #ccc; border-collapse: collapse; padding: 8px; }}
        </style>
      </head>
      <body>
        <h2>Ground Truth (Verse {verse_number}):</h2>
        <p>{ground_truth}</p>
        <h2>Model Transcription:</h2>
        <p>{transcription}</p>
        <h2>Highlighted Transcription (mismatches in red):</h2>
        <p>{highlighted_transcription}</p>
        <h2>Differences:</h2>
        <p><strong>Missed Words:</strong> {" ".join(missed) if missed else "None"}</p>
        <p><strong>Incorrect Words (Expected -> Produced):</strong> {"; ".join([f"{exp} -> {prod}" for exp, prod in incorrect]) if incorrect else "None"}</p>
        <p><strong>Extra Words:</strong> {" ".join(extra) if extra else "None"}</p>
      </body>
    </html>
    """
    return html_output

def update_verse_to(verse_from):
    """Narrow the "to ayah" dropdown to ayahs >= *verse_from* (up to 7).

    Wired to a_from.change; also resets the selected value to *verse_from*
    so the chosen range can never be inverted in the UI.
    """
    # range() replaces the original hand-rolled while-loop accumulator.
    choices = list(range(verse_from, 8))
    return gr.update(choices=choices, value=verse_from, interactive=True)

# UI layout: verse-range selectors + audio input on the left, HTML report on
# the right. Labels are in Indonesian ("Dari ayah" = from ayah, "Hingga ayah"
# = to ayah, "Kirim" = submit).
with gr.Blocks(title="ASR Surah Al-Fatihah") as demo:
    # Centered page header.
    gr.HTML(
            f"""
            <div style="text-align: center;">
                <h1 style="margin-bottom: 0;">ASR Surah Al-Fatihah</h1>
            </div>
            """
        )
    gr.Markdown("Demo pengecekan bacaan Al-Fatihah")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                # First ayah of the range (1-7).
                a_from = gr.Dropdown(
                    choices=list(fateha_ayahs.keys()), 
                    value=1, 
                    label="Dari ayah", 
                    interactive=True, 
                    allow_custom_value=True
                )
                # Last ayah of the range; its choices are narrowed by
                # update_verse_to whenever a_from changes.
                a_to = gr.Dropdown(
                    choices=list(fateha_ayahs.keys()), 
                    value=1, 
                    label="Hingga ayah", 
                    interactive=True, 
                    allow_custom_value=True
                )
                a_from.change(
                    fn=update_verse_to,
                    inputs=[a_from],
                    outputs=[a_to]
                )
            # type="filepath" means process_audio receives a path, not raw audio.
            audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Unggah file atau rekam dengan mikrofon")
            btn = gr.Button("Kirim", variant="primary")
        with gr.Column():
            output = gr.HTML(label="Hasil Analisis")
    btn.click(
        fn=process_audio, 
        inputs=[a_from, a_to, audio], 
        outputs=[output]
    )

# Launch the Gradio app; share=True also exposes a temporary public URL.
if __name__ == "__main__":
    demo.launch(share=True)