"""Gradio demo for the Moul-Sout Darija ASR model.

Records or accepts an uploaded audio file, normalizes it to mono 16 kHz
(the sample rate Whisper-style ASR pipelines expect), and displays the
transcription.
"""

import tempfile

import gradio as gr
import torchaudio
from transformers import pipeline

# Load only the Moul-Sout-100 model.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="01Yassine/moulsot_v0.2_1000",
)

# Newer transformers versions reject `forced_decoder_ids` in the generation
# config; move them to `input_ids` and clear the deprecated field.
asr_pipeline.model.generation_config.input_ids = (
    asr_pipeline.model.generation_config.forced_decoder_ids
)
asr_pipeline.model.generation_config.forced_decoder_ids = None


def ensure_mono_16k(audio_path, target_sr=16000):
    """Load audio, convert it to mono at ``target_sr`` Hz, and save a temp copy.

    Args:
        audio_path: Path to the input audio file.
        target_sr: Desired sample rate in Hz (default 16000, what the
            ASR model expects).

    Returns:
        Path to a newly created temporary WAV file containing the
        mono, resampled audio.
    """
    waveform, sr = torchaudio.load(audio_path)

    # Convert multi-channel audio to mono by averaging channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the source rate differs from the target.
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(sr, target_sr)
        waveform = resampler(waveform)
        sr = target_sr

    # Use a unique temp file per call: a fixed path (e.g. /tmp/...) is not
    # portable to Windows and races when two requests arrive concurrently.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


def transcribe(audio):
    """Transcribe a recorded/uploaded audio file.

    Args:
        audio: Filepath provided by the Gradio ``Audio`` component, or
            ``None`` if nothing was recorded/uploaded.

    Returns:
        The transcription text, or a prompt asking for audio input.
    """
    if audio is None:
        return "Please record or upload an audio file."

    # Normalize to mono 16 kHz before feeding the ASR pipeline.
    processed_audio = ensure_mono_16k(audio)
    result = asr_pipeline(processed_audio)["text"]
    return result


title = "🎙️ Moul-Sout ASR 🇲🇦"
description = """
**Moul-Sout** model for Darija ASR 🇲🇦.
You can record or upload an audio sample (it will be automatically resampled to 16 kHz mono),
and view the transcription result below.
"""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio (auto 16 kHz mono)"
        )
    transcribe_btn = gr.Button("🚀 Transcribe")
    output_text = gr.Textbox(label="🟩 Transcription Output")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input],
        outputs=[output_text]
    )

# Local launch
if __name__ == "__main__":
    demo.launch()