# ai-building-blocks / automatic_speech_recognition.py
# Commit b71a3ad (LiKenun): switch text-to-image and automatic speech recognition
# (ASR) back to the Hugging Face inference client; Zero GPU cannot accommodate
# the time it takes for those tasks.
from functools import partial
from os import path, unlink
import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient
from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
    """Transcribe audio to text using the Hugging Face Inference API.

    The audio is resampled to the model's expected sample rate, written to a
    temporary WAV file (the InferenceClient expects a file path or bytes), sent
    to the Inference API for transcription, and the temp file is removed.

    Args:
        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for automatic speech recognition.
        audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
            - bytes | np.ndarray: Raw audio data as bytes or numpy array

    Returns:
        String containing the transcribed text from the audio.

    Note:
        - Temp-file cleanup is best-effort: an OSError from unlink is ignored
          so a cleanup hiccup never masks the transcription result.
        - Uses the Inference API to offload model loading and inference to
          Hugging Face's infrastructure, which suits environments with limited
          GPU memory or time constraints (like Spaces with Zero GPU).
    """
    temp_file_path = None
    try:
        target_sample_rate = get_model_sample_rate(model)
        temp_file_path = save_audio_to_temp_file(target_sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model)
        return result["text"]
    finally:
        # Best-effort removal of the temp WAV; runs whether or not the API call
        # succeeded. Catch only OSError (what unlink raises) rather than a broad
        # Exception, so programming errors are not silently swallowed.
        if temp_file_path and path.exists(temp_file_path):
            try:
                unlink(temp_file_path)
            except OSError:
                pass  # Ignore clean-up errors.
def create_asr_tab(client: InferenceClient, model: str):
    """Build the Gradio UI for the automatic speech recognition tab.

    Lays out the tab's components — a URL textbox with a fetch button that
    loads audio into an audio widget, plus a transcribe button wired to the
    `automatic_speech_recognition` function — and connects their events.

    Args:
        client: Hugging Face InferenceClient instance passed through to the
            transcription function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")

    # URL fetch row: type a URL, click to load it into the audio widget.
    url_box = gr.Textbox(label="Audio URL")
    fetch_button = gr.Button("Get Audio")
    audio_widget = gr.Audio(label="Audio")

    # Transcription row: run ASR on whatever audio is loaded/recorded.
    transcribe_button = gr.Button("Transcribe")
    text_output = gr.Textbox(label="Text")

    fetch_button.click(
        fn=request_audio,
        inputs=url_box,
        outputs=audio_widget,
    )
    transcribe_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_widget,
        outputs=text_output,
    )