import gradio as gr
import numpy as np
import torch
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

# Load the custom fine-tuned model
MODEL_ID = "Jacaranda-Health/ASR-STT"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)
model.generation_config.forced_decoder_ids = None
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


# Transcription function
def transcribe(audio_file):
    """
    Gradio passes `audio_file` as a (sample_rate, np.ndarray) tuple or as a
    file path, depending on the `type` argument of gr.Audio.
    """
    if isinstance(audio_file, tuple):
        sr, audio = audio_file
        # Gradio's numpy audio is int16 and possibly stereo; convert to mono float32
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / 32768.0
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
    else:
        # Fallback: load from the file path with librosa (mono float32)
        audio, sr = librosa.load(audio_file, sr=16000)

    # Resample to 16 kHz if needed (the model expects 16 kHz input)
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    inputs = processor(audio, sampling_rate=sr, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(inputs["input_features"])

    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription


# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Jacaranda Health – Live ASR Demo")
    gr.Markdown("Upload a WAV/MP3 file or record audio below, and the model will transcribe it.")

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio")
        output_text = gr.Textbox(label="Transcription")

    btn = gr.Button("Transcribe")
    btn.click(fn=transcribe, inputs=audio_input, outputs=output_text)

# Launch the app
if __name__ == "__main__":
    demo.launch()
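
# Usage sketch (the filename "app.py" is an assumption; adjust to your layout):
#   pip install gradio torch librosa transformers
#   python app.py
# Gradio prints a local URL (default http://127.0.0.1:7860); passing
# share=True to demo.launch() additionally creates a temporary public link.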