# Hugging Face Spaces page header (scrape artifact) — app status: Running
| import gradio as gr | |
| from transformers import pipeline | |
| import torch | |
# Pick compute device and dtype once at import time: half precision on GPU
# saves memory/bandwidth; CPUs get float32 (fp16 is poorly supported there).
_has_cuda = torch.cuda.is_available()
device = "cuda:0" if _has_cuda else "cpu"
dtype = torch.float16 if _has_cuda else torch.float32
# Quran-specialized Whisper checkpoints compared by the app, in display order.
_MODEL_IDS = (
    "IJyad/whisper-large-v3-Tarteel",
    "deepdml/whisper-medium-ar-quran-mix-norm",
    "naazimsnh02/whisper-large-v3-turbo-ar-quran",
    "Habib-HF/tarbiyah-ai-whisper-medium-merged",
)


def _load_asr(model_id):
    """Build an ASR pipeline for *model_id* with the shared device/dtype.

    chunk_length_s=30 enables chunked long-form decoding so audio longer
    than Whisper's 30 s receptive window is still fully transcribed.
    """
    return pipeline(
        "automatic-speech-recognition",
        model=model_id,
        torch_dtype=dtype,
        device=device,
        chunk_length_s=30,
    )


# Load all 4 models (with chunking for long audio).
# The pipe1..pipe4 names are kept so existing callers keep working.
pipe1, pipe2, pipe3, pipe4 = (_load_asr(m) for m in _MODEL_IDS)
def transcribe(audio):
    """Transcribe *audio* with all four Quran-specialized Whisper models.

    Parameters:
        audio: filepath string produced by gr.Audio(type="filepath"),
            or None when the user submitted no audio.

    Returns:
        A 4-tuple of transcription strings, one per model, in pipe1..pipe4
        order. A model that raises contributes an error message instead of
        aborting the remaining transcriptions.
    """
    if audio is None:
        return "No audio", "No audio", "No audio", "No audio"
    # Force Arabic language for consistency (these models are Arabic/Quran specialized)
    kwargs = {"language": "arabic", "task": "transcribe"}
    results = []
    for pipe in (pipe1, pipe2, pipe3, pipe4):
        try:
            results.append(pipe(audio, generate_kwargs=kwargs)["text"])
        except Exception as exc:  # best-effort: keep the other models' output
            results.append(f"Error: {exc}")
    return tuple(results)
# UI: one Blocks app — intro text, audio input, four result panes laid out
# two per row, and a footer with usage notes.
with gr.Blocks(title="Quran Whisper Models Comparison") as demo:
    gr.Markdown("""
# Quran ASR Models Comparison
Upload or record a short Quranic recitation and compare transcriptions side-by-side.
Models:
- IJyad/whisper-large-v3-Tarteel (large-v3, high accuracy)
- deepdml/whisper-medium-ar-quran-mix-norm (medium)
- naazimsnh02/whisper-large-v3-turbo-ar-quran (turbo, fast & accurate)
- Habib-HF/tarbiyah-ai-whisper-medium-merged (medium, merged general + Quran)
""")
    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Record from mic or upload audio file (WAV/MP3, preferably Quran recitation)"
    )
    btn = gr.Button("Transcribe with all 4 models")

    # Build the four output textboxes in a loop: two rows of two columns each.
    _headings = (
        "IJyad/whisper-large-v3-Tarteel",
        "deepdml/whisper-medium-ar-quran-mix-norm",
        "naazimsnh02/whisper-large-v3-turbo-ar-quran",
        "Habib-HF/tarbiyah-ai-whisper-medium-merged",
    )
    _outputs = []
    for pair_start in (0, 2):
        with gr.Row():
            for heading in _headings[pair_start:pair_start + 2]:
                with gr.Column():
                    gr.Markdown(f"### {heading}")
                    _outputs.append(gr.Textbox(label="Transcription", lines=6, rtl=True))

    btn.click(transcribe, inputs=audio_input, outputs=_outputs)
    gr.Markdown("""
**Notes:**
- Best for short Quran recitations (mic recordings are usually <30s).
- Transcriptions are plain Arabic text (no tashkeel/diacritics in most cases).
- GPU highly recommended — CPU will be slow.
- These models are Quran-specialized; general Arabic speech may not work well.
""")

demo.queue()  # Helps with concurrent users
demo.launch()