Spaces:

libertango7
/

docasr

Running

App Files Files Community

docasr / app.py

libertango7

Update app.py

5abea11 verified 12 days ago

raw

history blame contribute delete

2.38 kB

	"""
	MedASR - Medical Speech Recognition API
	Based on Google's MedASR model for medical dictation and transcription.
	See: https://developers.google.com/health-ai-developer-foundations/medasr
	"""
	import gradio as gr
	from transformers import pipeline
	import librosa
	import numpy as np
	import tempfile
	import os

	# Build the MedASR speech-recognition pipeline once, at import time, so that
	# every request reuses the same loaded model.
	# Note: the model's license must be accepted at https://huggingface.co/google/medasr
	# and the Space needs an HF_TOKEN secret that has access to the model.
	model_id = "google/medasr"
	pipe = pipeline(task="automatic-speech-recognition", model=model_id)


	def transcribe(audio_path):
	    """
	    Transcribe an audio file with the MedASR pipeline.

	    The file is decoded with librosa, down-mixed to mono and resampled to
	    16 kHz, then passed to the module-level ``pipe`` for chunked inference.

	    Args:
	        audio_path: Path to the audio file (any format librosa can load),
	            or None when no file was supplied.

	    Returns:
	        str: The transcribed text. When no file was given, the audio holds
	        no samples, or any step fails, a message starting with "Error" is
	        returned instead of raising.
	    """
	    if audio_path is None:
	        return "Error: No audio file provided"

	    # One definition of the rate shared by the loader and the pipeline input.
	    target_rate = 16000

	    try:
	        # librosa resamples to the requested rate, so the rate it returns
	        # carries no extra information and is discarded.
	        speech, _ = librosa.load(audio_path, sr=target_rate, mono=True)

	        # A file that decodes to zero samples has nothing to transcribe;
	        # report that plainly instead of letting the pipeline fail on it.
	        if speech.size == 0:
	            return "Error: Audio file contains no samples"

	        # chunk_length_s: length in seconds of each audio chunk
	        # stride_length_s: overlap in seconds between neighbouring chunks
	        result = pipe(
	            {"raw": speech, "sampling_rate": target_rate},
	            chunk_length_s=20,
	            stride_length_s=2,
	        )
	        return result["text"]

	    except Exception as e:
	        # UI boundary: the caller displays the returned string, so failures
	        # are reported as text rather than propagated.
	        return f"Error during transcription: {str(e)}"


	# Assemble the Gradio UI: a single audio upload feeding transcribe(), with the
	# resulting text shown in a multi-line text box.
	demo = gr.Interface(
	    title="MedASR - Medical Speech Recognition",
	    description="""
	Medical dictation and transcription powered by Google's MedASR model.

	Supported audio formats: WAV, MP3, FLAC, OGG, WebM
	Best results with: Clear speech, medical terminology

	Note: Audio is automatically resampled to 16kHz mono for optimal performance.
	""",
	    fn=transcribe,
	    # type="filepath" makes Gradio pass the upload to fn as a path on disk
	    inputs=gr.Audio(label="Upload Medical Audio", type="filepath"),
	    outputs=gr.Textbox(lines=10, label="Transcription"),
	    examples=[],  # No sample clips bundled yet; list audio files here when available
	    api_name="predict",  # Explicit endpoint name for API clients
	)

	# Turn on request queuing so concurrent requests are handled, then start the
	# server; queue() hands back the app itself, so the two calls chain.
	demo.queue().launch()