"""Gradio app comparing four Quran-specialized Whisper ASR models side-by-side.

The user records or uploads a short recitation; the audio is run through all
four pipelines and the transcriptions are shown in a 2x2 grid of textboxes.
"""

import gradio as gr
import torch
from transformers import pipeline

# Detect device and dtype for efficiency/memory: fp16 on GPU, fp32 on CPU
# (fp16 on CPU is unsupported/slow for these models).
device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# The four compared models; list order fixes the order of the output boxes
# and of the tuple returned by transcribe().
MODEL_IDS = [
    "IJyad/whisper-large-v3-Tarteel",
    "deepdml/whisper-medium-ar-quran-mix-norm",
    "naazimsnh02/whisper-large-v3-turbo-ar-quran",
    "Habib-HF/tarbiyah-ai-whisper-medium-merged",
]

# Load all 4 models once at startup (chunk_length_s=30 enables chunked
# decoding so inputs longer than Whisper's 30s window still transcribe).
pipes = [
    pipeline(
        "automatic-speech-recognition",
        model=model_id,
        torch_dtype=dtype,
        device=device,
        chunk_length_s=30,
    )
    for model_id in MODEL_IDS
]


def transcribe(audio):
    """Transcribe one audio file with every loaded pipeline.

    Args:
        audio: Filepath to the recorded/uploaded audio, or None when the
            user pressed the button without providing audio.

    Returns:
        A 4-tuple of strings, one per model in MODEL_IDS order. Each entry
        is the model's transcription, "No audio" when no input was given,
        or an error message if that particular model failed — so a single
        failing model does not blank the other three boxes.
    """
    if audio is None:
        return "No audio", "No audio", "No audio", "No audio"
    # Force Arabic language for consistency (these models are Arabic/Quran
    # specialized); without this Whisper may auto-detect the wrong language.
    kwargs = {"language": "arabic", "task": "transcribe"}
    results = []
    for pipe in pipes:
        try:
            results.append(pipe(audio, generate_kwargs=kwargs)["text"])
        except Exception as exc:  # isolate per-model failures (e.g. OOM)
            results.append(f"Error: {exc}")
    return tuple(results)


with gr.Blocks(title="Quran Whisper Models Comparison") as demo:
    gr.Markdown("""
    # Quran ASR Models Comparison
    Upload or record a short Quranic recitation and compare transcriptions side-by-side.

    Models:
    - IJyad/whisper-large-v3-Tarteel (large-v3, high accuracy)
    - deepdml/whisper-medium-ar-quran-mix-norm (medium)
    - naazimsnh02/whisper-large-v3-turbo-ar-quran (turbo, fast & accurate)
    - Habib-HF/tarbiyah-ai-whisper-medium-merged (medium, merged general + Quran)
    """)

    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Record from mic or upload audio file (WAV/MP3, preferably Quran recitation)",
    )
    btn = gr.Button("Transcribe with all 4 models")

    # 2x2 grid of output boxes, generated from MODEL_IDS so the headers can
    # never drift out of sync with the models actually loaded above.
    outputs = []
    for row_start in range(0, len(MODEL_IDS), 2):
        with gr.Row():
            for model_id in MODEL_IDS[row_start:row_start + 2]:
                with gr.Column():
                    gr.Markdown(f"### {model_id}")
                    outputs.append(
                        gr.Textbox(label="Transcription", lines=6, rtl=True)
                    )

    btn.click(transcribe, inputs=audio_input, outputs=outputs)

    gr.Markdown("""
    **Notes:**
    - Best for short Quran recitations (mic recordings are usually <30s).
    - Transcriptions are plain Arabic text (no tashkeel/diacritics in most cases).
    - GPU highly recommended — CPU will be slow.
    - These models are Quran-specialized; general Arabic speech may not work well.
    """)

demo.queue()  # Helps with concurrent users
demo.launch()