"""Gradio app: transcribe Quranic recitation audio to diacritized Arabic text.

Loads a seq2seq speech-recognition model once at startup, converts incoming
audio to 16 kHz mono float32, and serves a simple upload/record UI.
"""

import os

import librosa
import soundfile as sf
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

import gradio as gr

MODEL_ID = "xLeonSTES/quran-to-text-base"
SAMPLE_RATE = 16000  # model expects 16 kHz input
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_model():
    """Load the processor and model once; move the model to DEVICE in eval mode.

    Returns:
        (processor, model) tuple ready for inference.
    """
    # NOTE: no @torch.no_grad() here — loading builds no autograd graph;
    # inference below wraps generate() in torch.no_grad() itself.
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)
    model.to(DEVICE)
    model.eval()
    return processor, model


# Loaded once at import so every request reuses the same weights.
processor, model = load_model()


def resample_to_16k(path):
    """Read an audio file, downmix to mono, and resample to 16 kHz.

    Args:
        path: filesystem path to any format soundfile can read.

    Returns:
        (audio, SAMPLE_RATE) where audio is a 1-D float32 NumPy array.
    """
    audio, sr = sf.read(path)
    if audio.ndim > 1:
        # Downmix multi-channel audio to mono by averaging channels.
        audio = audio.mean(axis=1)
    # Cast unconditionally (sf.read yields float64 by default) so the
    # processor always receives float32, resampled or not.
    audio = audio.astype("float32")
    if sr != SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
    return audio, SAMPLE_RATE


def transcribe_audio(path):
    """Transcribe one audio file and return the decoded text.

    Raises:
        Whatever soundfile/librosa/torch raise on unreadable input;
        callers (run) surface these to the UI.
    """
    audio, _ = resample_to_16k(path)
    if audio.size == 0:
        # Empty/zero-length file: .max() on an empty array would raise.
        return ""
    # Peak-normalize; the epsilon guards against all-zero (silent) input.
    # abs(audio).max() stays in NumPy instead of iterating via builtin max().
    audio = audio / (abs(audio).max() + 1e-9)
    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)
    with torch.no_grad():  # inference only — no autograd graph needed
        generated_ids = model.generate(input_features)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def run(uploaded_audio, mic_audio):
    """Gradio callback: prefer the mic recording, fall back to the upload."""
    path = mic_audio or uploaded_audio
    if not path:
        return "No audio provided"
    try:
        return transcribe_audio(path)
    except Exception as e:  # surface the error in the UI rather than a 500
        return f"Error: {e}"


with gr.Blocks(title="Quran ASR") as demo:
    gr.Markdown("# Quran ASR — Diacritized Transcription\nUpload or record audio, then press Convert.")
    with gr.Row():
        with gr.Column():
            upload = gr.Audio(type="filepath", label="Upload Audio")
            mic = gr.Audio(type="filepath", label="Microphone Recording")
            btn = gr.Button("Convert")
        with gr.Column():
            out = gr.Textbox(label="Output Text", lines=10)
    btn.click(run, inputs=[upload, mic], outputs=[out])


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for testing) doesn't start a server.
    demo.launch()