# ASR-Quran / app.py
# mark-muhammad's picture
# Add initial implementation of ASR for Surah Al-Fatihah with Gradio interface
# 7b51de6
import difflib
import html
import unicodedata

import gradio as gr
from transformers import pipeline
# Build the speech-recognition pipeline once, at import time, so the model is
# loaded a single time and shared by every call that uses `asr_pipeline`.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="tarteel-ai/whisper-base-ar-quran",
)
# Ground truth for Surah Al-Fatiha, keyed by ayah number (1-7).
# Each value is the vocalised Arabic text of one ayah, stored as real Arabic
# code points (U+0600-U+06FF) so it can be compared word-by-word with the
# transcription produced by the ASR model.
# NOTE(review): ayah 3 spells the definite article with alef wasla (U+0671)
# while the other ayahs use plain alef (U+0627) - confirm which form the model
# emits, since the comparison is exact when diacritics are not ignored.
fateha_ayahs = {
    1: "بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ",
    2: "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
    3: "ٱلرَّحْمَنِ ٱلرَّحِيمِ",
    4: "مَالِكِ يَوْمِ الدِّينِ",
    5: "إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ",
    6: "اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
    7: "صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ",
}
def remove_diacritics(text: str) -> str:
    """Return *text* with every combining mark stripped.

    The input is first decomposed with Unicode NFKD, then every character
    whose canonical combining class is non-zero is dropped. For Arabic this
    removes the short-vowel marks (fatha, kasra, damma, sukun, shadda, ...).
    Because the decomposition runs first, marks that NFKD splits off a
    precomposed letter are removed too (e.g. alef-with-hamza becomes a bare
    alef, and Latin "é" becomes "e").
    """
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
    return "".join(base_chars)
def compare_texts(ref: str, hyp: str, ignore_diacritics: bool = True):
    """Align two texts word-by-word and classify the differences.

    The reference (ground truth) and the hypothesis (ASR output) are split on
    whitespace and aligned with ``difflib.SequenceMatcher``. Three kinds of
    differences are collected:

    - missed words: present in ``ref`` but absent from ``hyp``;
    - incorrect words: substitutions, paired position by position;
    - extra words: present in ``hyp`` but absent from ``ref``.

    Args:
        ref: Reference text.
        hyp: Hypothesis text to check against the reference.
        ignore_diacritics: When true, both texts are passed through
            ``remove_diacritics`` before being split and compared, so all
            returned words are the stripped forms.

    Returns:
        A tuple ``(highlighted_str, missed, incorrect, extra)`` where

        - ``highlighted_str`` is an HTML fragment containing the hypothesis
          words joined by single spaces, with wrong/extra words wrapped in a
          red ``<span>``; every word is HTML-escaped;
        - ``missed`` is a list of reference words with no counterpart;
        - ``incorrect`` is a list of ``(expected, produced)`` tuples;
        - ``extra`` is a list of hypothesis words with no counterpart.

        The three lists hold plain (unescaped) words.
    """

    def _mark_wrong(word: str) -> str:
        # Words end up inside markup, so escape them before wrapping.
        return f"<span style='color:red'>{html.escape(word)}</span>"

    if ignore_diacritics:
        ref = remove_diacritics(ref)
        hyp = remove_diacritics(hyp)

    ref_words = ref.split()
    hyp_words = hyp.split()

    # autojunk=False: the default heuristic treats frequent items as junk once
    # the second sequence reaches 200 elements, which would distort the
    # alignment of long recitations with many repeated words.
    matcher = difflib.SequenceMatcher(None, ref_words, hyp_words, autojunk=False)

    highlighted_transcription = []
    missed = []
    incorrect = []
    extra = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            highlighted_transcription.extend(
                html.escape(word) for word in hyp_words[j1:j2]
            )
        elif tag == "replace":
            # Pair words position by position; whatever is left over on either
            # side is a missed (reference) or extra (hypothesis) word.
            paired = min(i2 - i1, j2 - j1)
            for r_word, h_word in zip(
                ref_words[i1:i1 + paired], hyp_words[j1:j1 + paired]
            ):
                highlighted_transcription.append(_mark_wrong(h_word))
                incorrect.append((r_word, h_word))
            missed.extend(ref_words[i1 + paired:i2])
            for word in hyp_words[j1 + paired:j2]:
                highlighted_transcription.append(_mark_wrong(word))
                extra.append(word)
        elif tag == "delete":
            missed.extend(ref_words[i1:i2])
        elif tag == "insert":
            for word in hyp_words[j1:j2]:
                highlighted_transcription.append(_mark_wrong(word))
                extra.append(word)

    highlighted_str = " ".join(highlighted_transcription)
    return highlighted_str, missed, incorrect, extra
def process_audio(verse_from, verse_to, audio_file):
    """Transcribe a recitation and report how it differs from the chosen ayahs.

    Args:
        verse_from: Number of the first ayah in the range (int or a value
            convertible with ``int``).
        verse_to: Number of the last ayah in the range (same convention).
        audio_file: Path of the audio to transcribe; falsy (``None`` or empty)
            when the user submitted nothing.

    Returns:
        An HTML string. On success it is a report containing the ground truth,
        the raw transcription, the highlighted transcription and the lists of
        missed / incorrect / extra words. When the input is unusable it is a
        single red ``<p>`` element describing the problem.
    """
    print("[PROCESS] Initializing...")

    # Nothing to transcribe: fail with a message instead of crashing the ASR call.
    if not audio_file:
        return "<p style='color:red'>No audio provided. Please upload a file or record with the microphone.</p>"

    # The dropdowns accept custom values, so the numbers may not be parseable.
    try:
        verse_from = int(verse_from)
        verse_to = int(verse_to)
    except (TypeError, ValueError):
        return "<p style='color:red'>Invalid verse number. Please choose a number between 1 and 7.</p>"

    # A reversed range would otherwise yield an empty ground truth and flag
    # every transcribed word as "extra".
    if verse_from > verse_to:
        return "<p style='color:red'>Invalid verse range. The first verse must not come after the last verse.</p>"

    if verse_from not in fateha_ayahs or verse_to not in fateha_ayahs:
        return "<p style='color:red'>Invalid verse number. Please choose a number between 1 and 7.</p>"

    verse_number = f"{verse_from}" if verse_from == verse_to else f"{verse_from} - {verse_to}"
    print(f"[PROCESS] Processing ayah: {verse_number}")

    ground_truth = " ".join(
        fateha_ayahs[n] for n in range(verse_from, verse_to + 1)
    )
    print(f"[PROCESS] Ayah ref: {ground_truth}")

    # audio_file is a file path because the Audio component uses type="filepath"
    result = asr_pipeline(audio_file)
    print(f"[PROCESS] Result: {result}")
    transcription = result["text"]

    # Diacritics are kept: the recitation is checked against the vocalised text.
    highlighted_transcription, missed, incorrect, extra = compare_texts(
        ground_truth, transcription, ignore_diacritics=False
    )

    # Plain text placed into the report is escaped; highlighted_transcription
    # is already an HTML fragment and is inserted as-is.
    transcription_text = html.escape(transcription)
    missed_text = html.escape(" ".join(missed)) if missed else "None"
    incorrect_text = (
        html.escape("; ".join(f"{exp} -> {prod}" for exp, prod in incorrect))
        if incorrect
        else "None"
    )
    extra_text = html.escape(" ".join(extra)) if extra else "None"

    html_output = f"""
<html>
<head>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
table, th, td {{ border: 1px solid #ccc; border-collapse: collapse; padding: 8px; }}
</style>
</head>
<body>
<h2>Ground Truth (Verse {verse_number}):</h2>
<p>{ground_truth}</p>
<h2>Model Transcription:</h2>
<p>{transcription_text}</p>
<h2>Highlighted Transcription (mismatches in red):</h2>
<p>{highlighted_transcription}</p>
<h2>Differences:</h2>
<p><strong>Missed Words:</strong> {missed_text}</p>
<p><strong>Incorrect Words (Expected -> Produced):</strong> {incorrect_text}</p>
<p><strong>Extra Words:</strong> {extra_text}</p>
</body>
</html>
"""
    return html_output
def update_verse_to(verse_from):
    """Restrict the "to" dropdown to ayahs at or after the chosen first ayah.

    Args:
        verse_from: Value currently selected in the "from" dropdown. It may be
            a non-integer because that dropdown allows custom values.

    Returns:
        A Gradio update that sets the choices to ``verse_from`` through the
        last ayah of ``fateha_ayahs`` and selects ``verse_from``. If
        ``verse_from`` cannot be converted to an int, an empty update is
        returned so the target component is left unchanged.
    """
    try:
        first = int(verse_from)
    except (TypeError, ValueError):
        # Unparseable custom input: do not touch the "to" dropdown.
        return gr.update()

    last = max(fateha_ayahs)
    return gr.update(
        choices=list(range(first, last + 1)),
        value=first,
        interactive=True,
    )
# Assemble the Gradio interface: verse range, audio input and submit button in
# one column, the analysis report in the other.
# NOTE(review): the component nesting below is reconstructed from a source
# whose indentation was flattened - confirm it matches the intended layout.
with gr.Blocks(title="ASR Surah Al-Fatihah") as demo:
    # Page heading.
    gr.HTML(
        """
<div style="text-align: center;">
<h1 style="margin-bottom: 0;">ASR Surah Al-Fatihah</h1>
</div>
"""
    )
    gr.Markdown("Demo pengecekan bacaan Al-Fatihah")

    with gr.Row():
        with gr.Column():
            # First and last ayah of the range to check, side by side.
            with gr.Row():
                a_from = gr.Dropdown(
                    label="Dari ayah",
                    choices=list(fateha_ayahs),
                    value=1,
                    interactive=True,
                    allow_custom_value=True,
                )
                a_to = gr.Dropdown(
                    label="Hingga ayah",
                    choices=list(fateha_ayahs),
                    value=1,
                    interactive=True,
                    allow_custom_value=True,
                )

            # Changing the first ayah refreshes what the "to" dropdown offers.
            a_from.change(fn=update_verse_to, inputs=[a_from], outputs=[a_to])

            audio = gr.Audio(
                label="Unggah file atau rekam dengan mikrofon",
                sources=["upload", "microphone"],
                type="filepath",
            )
            btn = gr.Button("Kirim", variant="primary")

        with gr.Column():
            output = gr.HTML(label="Hasil Analisis")

    # Submitting runs the transcription and renders the HTML report.
    btn.click(fn=process_audio, inputs=[a_from, a_to, audio], outputs=[output])

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch(share=True)