# ai-building-blocks / automatic_speech_recognition.py
# Commit b71a3ad (LiKenun): switch text-to-image and automatic speech recognition
# (ASR) back to the Hugging Face inference client; Zero GPU cannot accommodate
# the time it takes for those tasks.
from functools import partial
from os import path, unlink
import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient
from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
    """Transcribe audio to text using the Hugging Face Inference API.

    The audio is resampled to the model's expected sample rate, written to a
    temporary WAV file (the InferenceClient expects a file path or bytes), sent
    to the Inference API for transcription, and the temp file is removed.

    Args:
        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for automatic speech recognition.
        audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
            - bytes | np.ndarray: Raw audio data as bytes or numpy array

    Returns:
        String containing the transcribed text from the audio.

    Note:
        - Temp-file cleanup is best-effort: an OSError from unlink is ignored
          so a cleanup hiccup never masks the transcription result.
        - Uses the Inference API to offload model loading and inference to
          Hugging Face's infrastructure, which suits environments with limited
          GPU memory or time constraints (like Spaces with Zero GPU).
    """
    temp_file_path = None
    try:
        target_sample_rate = get_model_sample_rate(model)
        temp_file_path = save_audio_to_temp_file(target_sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model)
        return result["text"]
    finally:
        # Best-effort removal of the temp WAV; runs whether or not the API call
        # succeeded. Catch only OSError (what unlink raises) rather than a broad
        # Exception, so programming errors are not silently swallowed.
        if temp_file_path and path.exists(temp_file_path):
            try:
                unlink(temp_file_path)
            except OSError:
                pass  # Ignore clean-up errors.
def create_asr_tab(client: InferenceClient, model: str):
    """Build the Gradio UI for the automatic speech recognition tab.

    Lays out the tab's components — a URL textbox with a fetch button that
    loads audio into an audio widget, plus a transcribe button wired to the
    `automatic_speech_recognition` function — and connects their events.

    Args:
        client: Hugging Face InferenceClient instance passed through to the
            transcription function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")

    # URL fetch row: type a URL, click to load it into the audio widget.
    url_box = gr.Textbox(label="Audio URL")
    fetch_button = gr.Button("Get Audio")
    audio_widget = gr.Audio(label="Audio")

    # Transcription row: run ASR on whatever audio is loaded/recorded.
    transcribe_button = gr.Button("Transcribe")
    text_output = gr.Textbox(label="Text")

    fetch_button.click(
        fn=request_audio,
        inputs=url_box,
        outputs=audio_widget,
    )
    transcribe_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_widget,
        outputs=text_output,
    )