# Gradio demo app for the Jacaranda Health ASR speech-to-text model
# (deployed as a Hugging Face Space).
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
| # Load your custom tuned model | |
| MODEL_ID = "Jacaranda-Health/ASR-STT" | |
| processor = AutoProcessor.from_pretrained(MODEL_ID) | |
| model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID) | |
| model.generation_config.forced_decoder_ids = None | |
| model.eval() | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| model.to(device) | |
| # Transcription function | |
| def transcribe(audio_file): | |
| """ | |
| Gradio passes audio_file as (sr, np.ndarray) or a file path depending on config. | |
| """ | |
| if isinstance(audio_file, tuple): | |
| sr, audio = audio_file | |
| else: | |
| # Fallback: load with librosa | |
| audio, sr = librosa.load(audio_file, sr=16000) | |
| # Resample to 16k if needed | |
| if sr != 16000: | |
| audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) | |
| sr = 16000 | |
| inputs = processor(audio, sampling_rate=sr, return_tensors="pt").to(device) | |
| with torch.no_grad(): | |
| generated_ids = model.generate(inputs["input_features"]) | |
| transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] | |
| return transcription | |
| # Build Gradio UI | |
| with gr.Blocks() as demo: | |
| gr.Markdown("## ποΈ Jacaranda Health β Live ASR Demo") | |
| gr.Markdown("Upload a WAV/MP3 file or record audio below, and the model will transcribe it.") | |
| with gr.Row(): | |
| audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio") | |
| output_text = gr.Textbox(label="Transcription") | |
| btn = gr.Button("Transcribe") | |
| btn.click(fn=transcribe, inputs=audio_input, outputs=output_text) | |
| # Launch app | |
| if __name__ == "__main__": | |
| demo.launch() | |