Spaces:
Running
Running
| """ | |
| MedASR - Medical Speech Recognition API | |
| Based on Google's MedASR model for medical dictation and transcription. | |
| See: https://developers.google.com/health-ai-developer-foundations/medasr | |
| """ | |
| import gradio as gr | |
| from transformers import pipeline | |
| import librosa | |
| import numpy as np | |
| import tempfile | |
| import os | |
# MedASR checkpoint on the Hugging Face Hub.
# Access is gated: the license must be accepted at https://huggingface.co/google/medasr
# and the Space needs an HF_TOKEN secret with access to the model.
model_id = "google/medasr"

# Speech-recognition pipeline, built once at import time and used by transcribe().
pipe = pipeline(task="automatic-speech-recognition", model=model_id)
def transcribe(audio_path):
    """
    Transcribe an audio file using MedASR.

    MedASR requires mono-channel, 16kHz audio, so the file is loaded through
    librosa with resampling and down-mixing enabled before it is passed to
    the module-level ``pipe``.

    Args:
        audio_path: Path to an audio file (any format supported by librosa).
            ``None`` or an empty string means no file was supplied.

    Returns:
        str: The transcribed text on success; otherwise a human-readable
        message that starts with "Error". Failures are reported through the
        return value and are never raised to the caller.
    """
    # Reject a missing path (None) and an empty path string alike, instead of
    # letting the empty string fail later inside the audio loader.
    if not audio_path:
        return "Error: No audio file provided"

    # Single source of truth for the rate requested from librosa.
    target_sample_rate = 16000

    # The whole load + inference sequence is guarded: any failure is turned
    # into an error string, which is what this function's callers display.
    try:
        # Load and resample audio to 16kHz mono (as required by MedASR)
        speech, sample_rate = librosa.load(
            audio_path, sr=target_sample_rate, mono=True
        )

        # A file that decodes to zero samples has nothing to transcribe;
        # say so explicitly rather than surfacing an opaque model error.
        if speech.size == 0:
            return "Error: Audio file contains no samples"

        # chunk_length_s: how long in seconds MedASR batches audio
        # stride_length_s: overlap between chunks
        result = pipe(
            {"raw": speech, "sampling_rate": sample_rate},
            chunk_length_s=20,
            stride_length_s=2,
        )
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"
# Components and text for the Gradio UI, assembled before the interface itself.
_audio_input = gr.Audio(type="filepath", label="Upload Medical Audio")
_transcript_output = gr.Textbox(label="Transcription", lines=10)
_DESCRIPTION = """
Medical dictation and transcription powered by Google's MedASR model.
**Supported audio formats:** WAV, MP3, FLAC, OGG, WebM
**Best results with:** Clear speech, medical terminology
Note: Audio is automatically resampled to 16kHz mono for optimal performance.
"""

# Single-function interface: one audio upload in, one block of text out.
demo = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs=_transcript_output,
    title="MedASR - Medical Speech Recognition",
    description=_DESCRIPTION,
    # The API endpoint is given an explicit name.
    api_name="predict",
    # No bundled examples yet; list example audio files here if available.
    examples=[],
)
# Enable the request queue (for handling concurrent requests), then start the app.
demo.queue().launch()