File size: 3,137 Bytes
39d9406
0fea237
55d79e2
39d9406
 
02c9b64
1c1b97a
5bebd85
 
 
 
 
 
 
 
1c1b97a
5bebd85
 
 
 
 
 
 
 
 
 
 
 
0fea237
 
1c1b97a
0fea237
1c1b97a
0fea237
 
 
 
 
 
 
39d9406
 
55d79e2
5bebd85
 
 
 
 
 
 
 
 
 
55d79e2
5bebd85
39d9406
 
 
 
 
 
 
 
 
 
 
 
55d79e2
39d9406
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from functools import partial
from huggingface_hub import InferenceClient
from os import path, unlink
import gradio as gr
from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio

def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes]) -> str:
    """Convert spoken audio into a text transcription via the HF Inference API.

    The input audio is resampled to the model's expected sample rate and
    written to a temporary WAV file (the format InferenceClient expects),
    which is removed again once the API call completes.

    Args:
        client: Hugging Face InferenceClient used to reach the Inference API.
        model: Hugging Face model ID for the automatic speech recognition task.
        audio: Pair of (sample rate in Hz, raw audio bytes), e.g. (44100, b"...").

    Returns:
        The transcribed text returned by the model.

    Note:
        - Resampling to the model's sample rate happens in the project helpers.
        - The temporary WAV file is always cleaned up, even on failure.
    """
    wav_path = None
    try:
        target_rate = get_model_sample_rate(model)
        wav_path = save_audio_to_temp_file(target_rate, audio)
        response = client.automatic_speech_recognition(wav_path, model=model)
        return response["text"]
    finally:
        # Best-effort removal of the temporary WAV file.
        if wav_path is not None and path.exists(wav_path):
            try:
                unlink(wav_path)
            except Exception:
                pass  # A failed cleanup is non-fatal; ignore it.


def create_asr_tab(client: InferenceClient, model: str):
    """Build the Gradio UI for the automatic speech recognition tab.

    Components created (in render order):
    - A textbox for an audio URL plus a button that fetches that audio.
    - An audio widget for uploading/recording or holding the fetched clip.
    - A transcribe button and a textbox that shows the resulting text.

    Args:
        client: Hugging Face InferenceClient forwarded to automatic_speech_recognition.
        model: Hugging Face model ID for the automatic speech recognition task.
    """
    gr.Markdown("Transcribe audio to text.")

    # Create all components first; creation order defines the on-screen layout.
    url_box = gr.Textbox(label="Audio URL")
    fetch_button = gr.Button("Get Audio")
    audio_widget = gr.Audio(label="Audio")
    transcribe_button = gr.Button("Transcribe")
    text_output = gr.Textbox(label="Text")

    # Wire events: fetch audio from the URL into the audio widget...
    fetch_button.click(
        fn=request_audio,
        inputs=url_box,
        outputs=audio_widget
    )
    # ...and run ASR on the current audio, showing the transcription.
    transcribe_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_widget,
        outputs=text_output
    )