# docasr / app.py
# libertango7's picture
# Update app.py
# 5abea11 verified
"""
MedASR - Medical Speech Recognition API
Based on Google's MedASR model for medical dictation and transcription.
See: https://developers.google.com/health-ai-developer-foundations/medasr
"""
import logging
import os
import tempfile

import gradio as gr
import librosa
import numpy as np
from transformers import pipeline
# MedASR checkpoint on the Hugging Face Hub.
# The model is gated: its license must be accepted at
# https://huggingface.co/google/medasr, and the Space needs an HF_TOKEN
# secret with access to the model.
model_id = "google/medasr"
pipe = pipeline(task="automatic-speech-recognition", model=model_id)
def transcribe(audio_path):
    """
    Transcribe an audio file with the MedASR pipeline.

    The file is decoded with librosa, down-mixed to mono and resampled to
    16 kHz, then passed to the module-level ``pipe``. Problems are reported
    as a human-readable string instead of being raised, so the caller always
    receives text.

    Args:
        audio_path: Path to the audio file (any format librosa can decode),
            or None when no file was supplied.

    Returns:
        str: The transcribed text, or a message starting with "Error" when
        no file was given, the file holds no samples, or transcription
        failed.
    """
    if audio_path is None:
        return "Error: No audio file provided"

    # A single value feeds both the decoder and the pipeline input so the
    # declared sampling rate cannot drift from the actual one.
    target_sample_rate = 16000

    try:
        # librosa resamples and down-mixes on load; the rate it returns is
        # the one we asked for, so it is not needed again.
        speech, _ = librosa.load(audio_path, sr=target_sample_rate, mono=True)

        # Fail with a clear message instead of passing a zero-length
        # waveform downstream.
        if speech.size == 0:
            return "Error: Audio file contains no samples"

        # chunk_length_s: length in seconds of each chunk the pipeline
        # processes; stride_length_s: overlap between neighbouring chunks.
        result = pipe(
            {"raw": speech, "sampling_rate": target_sample_rate},
            chunk_length_s=20,
            stride_length_s=2,
        )
        return result["text"]
    except Exception as e:
        # Boundary handler: keep the full traceback in the logs, but hand
        # the caller a short message rather than an exception.
        logging.getLogger(__name__).exception("Transcription failed")
        return f"Error during transcription: {str(e)}"
# Build the Gradio UI: one audio upload in, one transcription textbox out.
audio_input = gr.Audio(type="filepath", label="Upload Medical Audio")
transcript_output = gr.Textbox(label="Transcription", lines=10)
app_description = """
Medical dictation and transcription powered by Google's MedASR model.
**Supported audio formats:** WAV, MP3, FLAC, OGG, WebM
**Best results with:** Clear speech, medical terminology
Note: Audio is automatically resampled to 16kHz mono for optimal performance.
"""
demo = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=transcript_output,
    title="MedASR - Medical Speech Recognition",
    description=app_description,
    api_name="predict",  # explicit endpoint name for API clients
    examples=[],  # no example audio files bundled yet
)
# Enable the request queue (for concurrent requests), then start the server.
demo.queue().launch()