| """ |
| Description: |
| This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK. |
| |
| Dependencies: |
| all the necessary dependencies are listed in requirements.txt |
| |
| Usage: |
| The demo can be run locally by installing all necessary dependencies in a Python virtual env, or it can be run in a Hugging Face Space |
| |
| Author: Lorenzo Concina |
| Date: 4/6/2025 |
| """ |
| import os |
| import torch |
| import librosa as lb |
| import gradio as gr |
| from transformers import AutoProcessor, pipeline |
| from datasets import load_dataset |
|
|
def load_fama(model_id, input_lang, task_type):
    """
    Build a speech pipeline for a FAMA model, configured for the requested task.

    Args:
        model_id: identifier of the FAMA checkpoint to load.
        input_lang: language code of the spoken audio (the demo offers "en" and "it").
        task_type: "ASR" to produce text in the input language, "ST" to
            translate between Italian and English.

    Returns:
        A transformers "automatic-speech-recognition" pipeline whose generation
        kwargs pass the output-language tag id as ``forced_bos_token_id``.

    Raises:
        ValueError: if the task/language combination is not supported.
    """
    # Resolve the output language first, so that an unsupported request fails
    # fast instead of loading a model and generating with a bogus "<lang:>" tag.
    translation_targets = {"it": "en", "en": "it"}
    if task_type == "ASR" and input_lang:
        output_lang = input_lang
    elif task_type == "ST" and input_lang in translation_targets:
        output_lang = translation_targets[input_lang]
    else:
        raise ValueError(
            "Unsupported task/language combination: task_type={!r}, input_lang={!r}".format(
                task_type, input_lang
            )
        )

    # The processor is only needed here to map the language tag to its token id.
    processor = AutoProcessor.from_pretrained(model_id)
    lang_tag = "<lang:{}>".format(output_lang)
    lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}

    # NOTE(review): trust_remote_code=True executes code shipped with the
    # checkpoint — presumably required by FAMA; only pass trusted model ids.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device=device,
        return_timestamps=False,
        generate_kwargs=generate_kwargs,
        chunk_length_s=60,
        stride_length_s=1,
    )
    return pipe
|
|
def load_audio_file(audio_path):
    """
    Load an audio file with librosa, requesting mono audio at a 16 kHz sampling rate.

    Only the samples are returned; the sampling rate reported by librosa is discarded.
    """
    loaded = lb.load(audio_path, mono=True, sr=16000)
    samples = loaded[0]
    return samples
|
|
def transcribe(audio, task_type, model_id, input_lang):
    """
    Function called by the Gradio interface. It runs model inference on an audio sample.

    Args:
        audio: value delivered by the audio component; when it is the path of an
            existing file, the file is loaded with ``load_audio_file`` first,
            otherwise the value is handed to the pipeline unchanged.
        task_type: "ASR" or "ST".
        model_id: identifier of the FAMA checkpoint to use.
        input_lang: language code of the utterance.

    Returns:
        The "text" field of the pipeline output.

    Raises:
        gr.Error: if no audio sample was provided.
    """
    # The audio component yields None when nothing was uploaded or recorded.
    # Report that to the user before paying the cost of loading a model.
    if audio is None:
        raise gr.Error("Please upload or record an audio sample first.")

    # Named asr_pipe so it does not shadow the imported transformers `pipeline`.
    asr_pipe = load_fama(model_id, input_lang, task_type)

    if isinstance(audio, str) and os.path.isfile(audio):
        model_input = load_audio_file(audio)
    else:
        model_input = audio

    result = asr_pipe(model_input)
    return result["text"]
|
|
|
|
def update_model_options(task_type):
    """
    Compute the UI updates to apply when the selected task changes.

    Returns a 3-tuple of ``gr.update`` objects, in this order:
    the model selector (new choices and selected value), the action button
    (new caption) and the output textbox (new label).
    """
    is_translation = task_type == "ST"

    # These two checkpoints are offered for both tasks; the "-asr" variants
    # are only listed when the task is not speech translation.
    available_models = ["FBK-MT/fama-small", "FBK-MT/fama-medium"]
    if not is_translation:
        available_models += ["FBK-MT/fama-small-asr", "FBK-MT/fama-medium-asr"]

    if is_translation:
        action_caption, result_caption = "Translate", "Translation"
    else:
        action_caption, result_caption = "Transcribe", "Transcription"

    model_update = gr.update(choices=available_models, value="FBK-MT/fama-small")
    button_update = gr.update(value=action_caption)
    textbox_update = gr.update(label=result_caption)
    return (model_update, button_update, textbox_update)
|
|
|
|
| |
# Language codes offered in the "Utterance Language" dropdown.
language_choices = ["en", "it"]


if __name__ == "__main__":
    # Build the demo UI: inputs (audio, language, task, model), the output box
    # and the action button, then wire the callbacks and start the server.
    with gr.Blocks() as iface:
        gr.Markdown(
            "## FAMA ASR and ST\n"
            "Simple Automatic Speech Recognition and Speech Translation demo for English and Italian "
            "powered by FAMA models, developed at FBK. "
            "More information about FAMA models can be found here: "
            "https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e"
        )

        # type="filepath" makes the callback receive a path to the audio file.
        audio_input = gr.Audio(type="filepath", label="Upload or record audio")

        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Utterance Language")
        task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")

        # Initial model list matches the default task (ASR); it is refreshed by
        # update_model_options whenever the task selection changes.
        model_input = gr.Radio(
            choices=[
                "FBK-MT/fama-small",
                "FBK-MT/fama-medium",
                "FBK-MT/fama-small-asr",
                "FBK-MT/fama-medium-asr",
            ],
            value="FBK-MT/fama-small",
            label="Select a FAMA model",
        )

        output = gr.Textbox(label="Transcription")

        transcribe_btn = gr.Button("Transcribe")

        # Changing the task updates the model choices, the button caption and
        # the output label in one go.
        task_type_input.change(
            fn=update_model_options,
            inputs=task_type_input,
            outputs=[model_input, transcribe_btn, output],
        )

        transcribe_btn.click(
            fn=transcribe,
            inputs=[audio_input, task_type_input, model_input, lang_input],
            outputs=output,
        )

        # Each step is separated by a blank line so Markdown renders it as its
        # own paragraph. The last step was previously numbered "4" twice.
        gr.Markdown(
            "### Instructions:\n\n"
            "1 - Load an audio file or record yourself talking with a microphone\n\n"
            "2 - Specify the language of the utterance (FAMA supports English and Italian)\n\n"
            "3 - Select the task to run: Speech recognition or Speech Translation.\n\n"
            "4 - Select a FAMA model among the available ones\n\n"
            "5 - Click on Transcribe/Translate"
        )

    iface.launch()
|
|