Spaces:

mark-muhammad
/

ASR-Quran

Sleeping

App Files Files Community

ASR-Quran / app.py

mark-muhammad

Add initial implementation of ASR for Surah Al-Fatihah with Gradio interface

7b51de6 12 days ago

raw

history blame contribute delete

7.15 kB

	import gradio as gr
	import difflib
	from transformers import pipeline
	import unicodedata

	# Build the speech-recognition pipeline a single time at import, so that
	# every call to process_audio reuses the already-loaded model.
	asr_pipeline = pipeline(
	    task="automatic-speech-recognition",
	    model="tarteel-ai/whisper-base-ar-quran",
	)

	# Reference text of Surah Al-Fatiha, keyed by ayah number (1-7).
	# Every ayah uses the same plain-alef spelling of the definite article, so a
	# word that occurs in several ayahs (e.g. in ayah 1 and ayah 3) is written
	# identically everywhere and compares equal in an exact word-by-word match.
	fateha_ayahs = {
	    1: "بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ",
	    2: "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
	    3: "الرَّحْمَنِ الرَّحِيمِ",
	    4: "مَالِكِ يَوْمِ الدِّينِ",
	    5: "إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ",
	    6: "اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
	    7: "صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ",
	}

	def remove_diacritics(text: str) -> str:
	    """Return *text* with every Unicode combining mark dropped.

	    The text is first decomposed with NFKD, so precomposed characters are
	    split into a base character plus marks before the marks are filtered
	    out. Besides the Arabic short-vowel signs this therefore also strips
	    marks that NFKD separates from a letter (for example the hamza of "أ",
	    which comes back as a bare alef).
	    """
	    decomposed = unicodedata.normalize("NFKD", text)
	    # combining() is 0 for base characters and non-zero for combining marks.
	    base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
	    return "".join(base_chars)

	def compare_texts(ref: str, hyp: str, ignore_diacritics: bool = True):
	    """
	    Align the reference text with the ASR hypothesis one word at a time.

	    Both texts are split on whitespace (after diacritics are removed from
	    each when ignore_diacritics is true) and aligned with
	    difflib.SequenceMatcher. Each alignment step is classified as:
	    - missed: reference words with no counterpart in the hypothesis.
	    - incorrect: a reference word paired with a different hypothesis word.
	    - extra: hypothesis words with no counterpart in the reference.

	    Returns a 4-tuple:
	    - the hypothesis words joined by single spaces, where every incorrect
	      or extra word is wrapped in a red <span> (HTML);
	    - the list of missed words;
	    - the list of (expected, produced) tuples for substitutions;
	    - the list of extra words.
	    """
	    if ignore_diacritics:
	        ref, hyp = remove_diacritics(ref), remove_diacritics(hyp)

	    expected_tokens = ref.split()
	    spoken_tokens = hyp.split()

	    def flag(token):
	        # Markup used for every hypothesis word that is not an exact match.
	        return f"<span style='color:red'>{token}</span>"

	    rendered, missed, incorrect, extra = [], [], [], []

	    opcodes = difflib.SequenceMatcher(None, expected_tokens, spoken_tokens).get_opcodes()
	    for tag, ref_lo, ref_hi, hyp_lo, hyp_hi in opcodes:
	        ref_chunk = expected_tokens[ref_lo:ref_hi]
	        hyp_chunk = spoken_tokens[hyp_lo:hyp_hi]

	        if tag == "equal":
	            rendered += hyp_chunk
	        elif tag == "delete":
	            missed += ref_chunk
	        elif tag == "insert":
	            rendered += [flag(token) for token in hyp_chunk]
	            extra += hyp_chunk
	        elif tag == "replace":
	            # Pair words position by position; whatever is left over on the
	            # longer side counts as missed (reference) or extra (hypothesis).
	            paired = list(zip(ref_chunk, hyp_chunk))
	            incorrect += paired
	            rendered += [flag(spoken) for _, spoken in paired]
	            missed += ref_chunk[len(paired):]
	            leftover = hyp_chunk[len(paired):]
	            rendered += [flag(token) for token in leftover]
	            extra += leftover

	    return " ".join(rendered), missed, incorrect, extra

	def process_audio(verse_from, verse_to, audio_file):
	    """Transcribe a recitation and report how it differs from the chosen ayahs.

	    Args:
	        verse_from: First ayah of the range, as an int or a numeric string.
	        verse_to: Last ayah of the range, as an int or a numeric string.
	        audio_file: Path of the audio file to transcribe; falsy when the
	            user submitted without uploading or recording anything.

	    Returns:
	        An HTML string. On valid input it is the full report (reference
	        text, raw transcription, highlighted transcription and the lists of
	        missed / incorrect / extra words). On invalid input it is a single
	        red error paragraph and the ASR model is not run.
	    """
	    print("[PROCESS] Initializing...")
	    invalid_verse = "<p style='color:red'>Invalid verse number. Please choose a number between 1 and 7.</p>"

	    # The dropdowns accept custom values, so these may be arbitrary text or
	    # None rather than one of the offered integers.
	    try:
	        verse_from = int(verse_from)
	        verse_to = int(verse_to)
	    except (TypeError, ValueError):
	        return invalid_verse

	    # Fail early with a readable message instead of letting the pipeline
	    # raise on a missing input.
	    if not audio_file:
	        return "<p style='color:red'>No audio provided. Please upload or record a recitation first.</p>"

	    if verse_from not in fateha_ayahs or verse_to not in fateha_ayahs:
	        return invalid_verse
	    if verse_from > verse_to:
	        # An inverted range would otherwise yield an empty reference and
	        # report every transcribed word as "extra".
	        return "<p style='color:red'>Invalid verse range. The starting verse must not come after the ending verse.</p>"

	    verse_number = f"{verse_from}" if verse_from == verse_to else f"{verse_from} - {verse_to}"
	    print(f"[PROCESS] Processing ayah: {verse_number}")

	    ground_truth = " ".join(fateha_ayahs[n] for n in range(verse_from, verse_to + 1))
	    print(f"[PROCESS] Ayah ref: {ground_truth}")

	    # audio_file is a file path because the audio component uses type="filepath"
	    result = asr_pipeline(audio_file)
	    print(f"[PROCESS] Result: {result}")
	    transcription = result["text"]

	    # Diacritics are kept, so a word only matches when its vocalisation does too.
	    highlighted_transcription, missed, incorrect, extra = compare_texts(
	        ground_truth, transcription, ignore_diacritics=False
	    )

	    missed_text = " ".join(missed) if missed else "None"
	    incorrect_text = "; ".join(f"{exp} -> {prod}" for exp, prod in incorrect) if incorrect else "None"
	    extra_text = " ".join(extra) if extra else "None"

	    html_output = f"""
	<html>
	<head>
	<style>
	body {{ font-family: Arial, sans-serif; margin: 20px; }}
	table, th, td {{ border: 1px solid #ccc; border-collapse: collapse; padding: 8px; }}
	</style>
	</head>
	<body>
	<h2>Ground Truth (Verse {verse_number}):</h2>
	<p>{ground_truth}</p>
	<h2>Model Transcription:</h2>
	<p>{transcription}</p>
	<h2>Highlighted Transcription (mismatches in red):</h2>
	<p>{highlighted_transcription}</p>
	<h2>Differences:</h2>
	<p><strong>Missed Words:</strong> {missed_text}</p>
	<p><strong>Incorrect Words (Expected -> Produced):</strong> {incorrect_text}</p>
	<p><strong>Extra Words:</strong> {extra_text}</p>
	</body>
	</html>
	"""
	    return html_output

	def update_verse_to(verse_from):
	    """Restrict the "to ayah" dropdown to the ayahs from *verse_from* onwards.

	    Args:
	        verse_from: The value currently selected in the "from ayah"
	            dropdown. Because that dropdown accepts custom values, this may
	            be typed text instead of an int.

	    Returns:
	        A Gradio update that sets the remaining ayah numbers as choices and
	        selects *verse_from* itself, or an empty update (component left
	        unchanged) when *verse_from* is not a known ayah number.
	    """
	    try:
	        first = int(verse_from)
	    except (TypeError, ValueError):
	        # Typed text that is not a number: keep the dropdown as it is.
	        return gr.update()
	    if first not in fateha_ayahs:
	        return gr.update()

	    remaining = [ayah for ayah in fateha_ayahs if ayah >= first]
	    return gr.update(choices=remaining, value=first, interactive=True)

	# Page layout: a centred title, then two columns side by side -- the inputs
	# (ayah range, audio, submit button) on the left, the HTML report on the right.
	with gr.Blocks(title="ASR Surah Al-Fatihah") as demo:
	    gr.HTML(
	        """
	<div style="text-align: center;">
	<h1 style="margin-bottom: 0;">ASR Surah Al-Fatihah</h1>
	</div>
	"""
	    )
	    gr.Markdown("Demo pengecekan bacaan Al-Fatihah")

	    with gr.Row():
	        with gr.Column():
	            # Ayah range selectors share one row.
	            with gr.Row():
	                a_from = gr.Dropdown(
	                    label="Dari ayah",
	                    choices=list(fateha_ayahs),
	                    value=1,
	                    allow_custom_value=True,
	                    interactive=True,
	                )
	                a_to = gr.Dropdown(
	                    label="Hingga ayah",
	                    choices=list(fateha_ayahs),
	                    value=1,
	                    allow_custom_value=True,
	                    interactive=True,
	                )
	            audio = gr.Audio(
	                label="Unggah file atau rekam dengan mikrofon",
	                sources=["upload", "microphone"],
	                type="filepath",
	            )
	            btn = gr.Button("Kirim", variant="primary")
	        with gr.Column():
	            output = gr.HTML(label="Hasil Analisis")

	    # Event wiring. Changing the start ayah rebuilds the end-ayah dropdown;
	    # the button runs the transcription and comparison.
	    a_from.change(fn=update_verse_to, inputs=[a_from], outputs=[a_to])
	    btn.click(fn=process_audio, inputs=[a_from, a_to, audio], outputs=[output])

	# Start the app only when this file is executed directly.
	if __name__ == "__main__":
	    demo.launch(share=True)