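"""Gradio tab for automatic speech recognition via the Hugging Face Inference API."""
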
from functools import partial
from os import path, unlink

import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient

from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio

def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
    """Transcribe audio to text using the Hugging Face Inference API.

    This function converts speech audio into a text transcription. The audio is
    resampled to match the model's expected sample rate, saved to a temporary
    file, and then sent to the Inference API for transcription.

    Args:
        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for automatic speech recognition.
        audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
            - bytes | np.ndarray: Raw audio data as bytes or a numpy array

    Returns:
        String containing the text transcribed from the audio.

    Note:
        - Audio is automatically resampled to match the model's expected sample rate.
        - Audio is saved as a WAV file for InferenceClient compatibility.
        - Temporary files are automatically cleaned up after transcription.
        - Uses the Inference API to offload model loading and inference to Hugging
          Face's infrastructure, which is more suitable for environments with
          limited GPU memory or time constraints (like Hugging Face Spaces with
          ZeroGPU).
    """
    temp_file_path = None
    try:
        target_sample_rate = get_model_sample_rate(model)
        temp_file_path = save_audio_to_temp_file(target_sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model)
        return result["text"]
    finally:
        if temp_file_path and path.exists(temp_file_path):
            try:
                unlink(temp_file_path)
            except Exception:
                pass  # Ignore clean-up errors.
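
# A minimal usage sketch for the function above. The HF_TOKEN environment
# variable, the Whisper model ID, and the silent one-second input are
# illustrative assumptions, not part of this module:
#
#     from os import environ
#     client = InferenceClient(token=environ.get("HF_TOKEN"))
#     sample_rate, samples = 16000, np.zeros(16000, dtype=np.float32)
#     text = automatic_speech_recognition(client, "openai/whisper-large-v3", (sample_rate, samples))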

def create_asr_tab(client: InferenceClient, model: str):
    """Create the automatic speech recognition tab in the Gradio interface.

    This function sets up all UI components for automatic speech recognition, including:
        - URL input textbox for fetching audio files from the web
        - Button to retrieve audio from the URL
        - Audio input component for uploading or recording audio
        - Transcribe button and output textbox

    Args:
        client: Hugging Face InferenceClient instance to pass to the
            automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")
    audio_transcription_url_input = gr.Textbox(label="Audio URL")
    audio_transcription_audio_request_button = gr.Button("Get Audio")
    audio_transcription_audio_input = gr.Audio(label="Audio")
    audio_transcription_audio_request_button.click(
        fn=request_audio,
        inputs=audio_transcription_url_input,
        outputs=audio_transcription_audio_input,
    )
    audio_transcription_generate_button = gr.Button("Transcribe")
    audio_transcription_output = gr.Textbox(label="Text")
    audio_transcription_generate_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_transcription_audio_input,
        outputs=audio_transcription_output,
    )
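
# A minimal sketch of how this tab might be mounted in a Blocks app. The model
# ID and the HF_TOKEN environment variable are illustrative assumptions; the
# hosting Space may wire these up differently.
if __name__ == "__main__":
    from os import environ

    demo_client = InferenceClient(token=environ.get("HF_TOKEN"))
    demo_model = "openai/whisper-large-v3"  # Assumed model ID, for illustration only.
    with gr.Blocks() as demo:
        with gr.Tab("Automatic Speech Recognition"):
            create_asr_tab(demo_client, demo_model)
    demo.launch()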