Create app.py

app.py

# Hugging Face Space: Quran ASR (Gradio)
# File: app.py
# Purpose: a simple web page that accepts an uploaded audio file or a microphone
# recording, runs the xLeonSTES/quran-to-text-base ASR model, and returns the
# diacritized (tashkeel) text.

import numpy as np
import torch
import librosa
import soundfile as sf
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import gradio as gr

# --- Configuration ---
MODEL_ID = "xLeonSTES/quran-to-text-base"
SAMPLE_RATE = 16000  # the model expects 16 kHz mono audio
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- Load model & processor once on startup ---
def load_model():
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)
    model.to(DEVICE)
    model.eval()
    return processor, model

processor, model = load_model()
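
# Optional sketch (an assumption, not in the original file): on a GPU Space the
# model could be loaded in half precision to reduce memory, e.g.
#   model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, torch_dtype=torch.float16)
# (the input features would then also need casting with .half() before generate).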

# --- Audio utility functions ---

def resample_to_16k(path_or_array):
    # Accept either a file path or an (array, sr) tuple.
    if isinstance(path_or_array, str):
        # Read the file with soundfile to preserve the original format,
        # then resample with librosa below.
        audio, sr = sf.read(path_or_array)
    else:
        # Assume a (numpy array, sample rate) tuple.
        audio, sr = path_or_array
    if audio.ndim > 1:
        # Downmix multi-channel audio to mono.
        audio = audio.mean(axis=1)
    if sr != SAMPLE_RATE:
        audio = librosa.resample(audio.astype("float32"), orig_sr=sr, target_sr=SAMPLE_RATE)
    return audio, SAMPLE_RATE
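
# Usage sketch (the file name is hypothetical):
#   audio, sr = resample_to_16k("sample.wav")
#   # audio is a mono float array, sr == 16000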

# --- Main transcription function ---

def transcribe_audio_file(audio_path):
    try:
        audio, sr = resample_to_16k(audio_path)
    except Exception:
        # Fall back to librosa, which can decode additional container formats.
        audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE)

    # Peak-normalize; the small epsilon avoids division by zero on silence.
    audio = audio / (np.max(np.abs(audio)) + 1e-9)

    # Prepare model inputs.
    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)

    # Generate token ids without tracking gradients.
    with torch.no_grad():
        generated_ids = model.generate(input_features=input_features)

    # Decode token ids to text.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription
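
# Usage sketch (assumes a local recording named sample.wav exists):
#   print(transcribe_audio_file("sample.wav"))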

# --- Gradio UI ---

with gr.Blocks(title="Quran ASR — Diacritized Transcription") as demo:
    gr.Markdown("# Quran ASR — Diacritized Transcription\nUpload a recording or record with your microphone, then press **Convert** to get the text with tashkeel.")

    with gr.Row():
        with gr.Column():
            # `source=` is the Gradio 3.x argument; Gradio 4.x renamed it to `sources=[...]`.
            audio_in = gr.Audio(source="upload", type="filepath", label="Upload audio file (mp3/wav/m4a/etc.)")
            mic_in = gr.Audio(source="microphone", type="filepath", label="Or record from microphone (browser) — optional")
            convert_btn = gr.Button("Convert")
            status = gr.Textbox(value=f"Model loaded on device: {DEVICE}", interactive=False, label="Status")

        with gr.Column():
            out_text = gr.Textbox(label="Diacritized transcription (Tashkeel)", lines=10)

    def run_pipeline(uploaded_path, mic_path):
        # Prefer the microphone recording if provided, else the uploaded file.
        if mic_path:
            path = mic_path
        elif uploaded_path:
            path = uploaded_path
        else:
            # The click handler has a single output, so return one string.
            return "No audio provided"

        # Transcribe, surfacing errors in the output box instead of crashing.
        try:
            return transcribe_audio_file(path)
        except Exception as e:
            return f"Error during transcription: {e}"

    convert_btn.click(fn=run_pipeline, inputs=[audio_in, mic_in], outputs=[out_text])

    gr.Markdown("---\n**Notes:** This Space uses the `xLeonSTES/quran-to-text-base` model. The first invocation may take longer while the model downloads (~300MB). For best results, provide clear audio sampled at 16kHz.")


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
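
# Dependency sketch (inferred from the imports above; versions are not pinned
# anywhere in the original file). A requirements.txt for this Space would need
# roughly:
#   torch, transformers, librosa, soundfile, gradio, numpy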