Spaces:
Runtime error
Runtime error
import logging

import gradio as gr
import nemo.collections.asr as nemo_asr
import torch
# Audio sample rate (presumably in Hz); defined here but not read by the
# code visible in this file.
SAMPLE_RATE = 16000

# Heading and sub-heading shown at the top of the demo page.
TITLE = "NeMo ASR Inference on Hugging Face"
DESCRIPTION = "Demo of all languages supported by NeMo ASR"

# Model that is pre-selected whenever the chosen language is English.
DEFAULT_EN_MODEL = "nvidia/stt_en_conformer_transducer_xlarge"

# Markdown rendered as the page header.
MARKDOWN = f"""
# {TITLE}
## {DESCRIPTION}
"""

# Extra stylesheet handed to gr.Blocks; styles the footer paragraph.
CSS = """
p.big {
font-size: 20px;
}
"""

# HTML footer with links to the NeMo ASR documentation and the GitHub repo.
ARTICLE = """
<br><br>
<p class='big' style='text-align: center'>
<a href='https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/intro.html' target='_blank'>NeMo ASR</a>
|
<a href='https://github.com/NVIDIA/NeMo#nvidia-nemo' target='_blank'>Github Repo</a>
</p>
"""

# Filled in while the Hugging Face Hub search results are scanned.
# Both start out as empty sets so duplicates collapse automatically.
SUPPORTED_LANGUAGES = set()
SUPPORTED_MODEL_NAMES = set()
# HF models
# Ask the Hugging Face Hub for every NeMo checkpoint tagged as speech
# recognition.
hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
hf_filter.task = "automatic-speech-recognition"
hf_infos = nemo_asr.models.ASRModel.search_huggingface_models(model_filter=hf_filter)

# Maps a language id to the list of model ids published for that language.
SUPPORTED_LANG_MODEL_DICT = {}

for info in hf_infos:
    model_id = info.modelId
    if model_id in SUPPORTED_MODEL_NAMES:
        # Already registered; avoid listing the same model twice.
        continue

    # The language id is taken from the second "_"-separated field of the
    # model id (e.g. "nvidia/stt_en_conformer_transducer_xlarge" -> "en").
    # Ids without such a field used to raise IndexError and abort start-up;
    # they are skipped instead because they cannot be offered under any
    # language in the UI.
    id_fields = model_id.split("_")
    if len(id_fields) < 2 or not id_fields[1]:
        continue
    lang_id = id_fields[1]

    SUPPORTED_LANGUAGES.add(lang_id)
    SUPPORTED_MODEL_NAMES.add(model_id)
    # Group on the parsed language id rather than on a substring test:
    # `lang in model_id` would, for instance, file every "citrinet" model
    # under the language "it".
    SUPPORTED_LANG_MODEL_DICT.setdefault(lang_id, []).append(model_id)

SUPPORTED_MODEL_NAMES = sorted(SUPPORTED_MODEL_NAMES)

# One callable Gradio interface per model, loaded eagerly at start-up.
model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}

# Sort model names
for lang in SUPPORTED_LANG_MODEL_DICT:
    SUPPORTED_LANG_MODEL_DICT[lang].sort()
def transcribe(microphone, audio_file, model_name):
    """Transcribe one recording with the selected model.

    Args:
        microphone: Microphone recording, or ``None`` when nothing was
            recorded. The UI in this file passes a file path.
        audio_file: Uploaded audio, or ``None`` when nothing was uploaded.
            The UI in this file passes a file path.
        model_name: Key into ``model_dict`` selecting the model to call.

    Returns:
        A string for display. It is one of:
        * an ``ERROR: ...`` message when no audio was supplied or the model
          name is unknown;
        * the transcription, preceded by a warning when both inputs were
          supplied (the microphone recording wins);
        * a "currently loading" notice when calling the model raised.
    """
    # Validate the audio inputs first so a missing recording is always
    # reported as such, regardless of which model is selected.
    if microphone is None and audio_file is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # An unknown (or None) model name used to raise KeyError before any
    # message could be produced.
    model = model_dict.get(model_name)
    if model is None:
        return f"ERROR: The model `{model_name}` is not available. Please select another model."

    warn_output = ""
    if microphone is not None and audio_file is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes priority over the uploaded file.
    audio_data = microphone if microphone is not None else audio_file

    try:
        # Use HF API for transcription
        transcriptions = model(audio_data)
    except Exception:
        # Deliberately best-effort: the user gets a friendly notice, while the
        # real cause is logged instead of being silently discarded.
        logging.getLogger(__name__).exception("Transcription with model %s failed", model_name)
        transcriptions = ""
        warn_output += "\n\n"
        warn_output += (
            f"The model `{model_name}` is currently loading and cannot be used "
            f"for transcription.\n"
            f"Please try another model or wait a few minutes."
        )

    # The model output is presumably a str; formatting instead of "+" keeps
    # this from raising TypeError should it ever be something else.
    return f"{warn_output}{transcriptions}"
def _models_for_lang(lang):
    """Return the sorted model ids registered for ``lang`` (empty if unknown)."""
    return sorted(SUPPORTED_LANG_MODEL_DICT.get(lang, []))


def _default_model_for(lang):
    """Return the model to pre-select for ``lang``, or ``None`` if it has none.

    English prefers ``DEFAULT_EN_MODEL``, but only when that model is actually
    among the available choices; otherwise the first model name is used.
    """
    names = _models_for_lang(lang)
    if lang == 'en' and DEFAULT_EN_MODEL in names:
        return DEFAULT_EN_MODEL
    return names[0] if names else None


# Start on English when English models exist. Otherwise fall back to the first
# known language (or to no selection) instead of failing with KeyError.
_language_choices = sorted(SUPPORTED_LANGUAGES)
if "en" in SUPPORTED_LANG_MODEL_DICT:
    _initial_lang = "en"
else:
    _initial_lang = _language_choices[0] if _language_choices else None

demo = gr.Blocks(title=TITLE, css=CSS)

with demo:
    header = gr.Markdown(MARKDOWN)

    # NOTE(review): the original indentation was lost; the row is assumed to
    # contain only the two audio inputs - confirm against the deployed layout.
    with gr.Row() as row:
        file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
        microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')

    lang_selector = gr.components.Dropdown(
        choices=_language_choices, value=_initial_lang, type="value", label="Languages", interactive=True,
    )
    models_in_lang = gr.components.Dropdown(
        choices=_models_for_lang(_initial_lang),
        value=_default_model_for(_initial_lang),
        label="Models",
        interactive=True,
    )

    def update_models_with_lang(lang):
        """Refresh the model dropdown so it lists the models for ``lang``."""
        return models_in_lang.update(choices=_models_for_lang(lang), value=_default_model_for(lang))

    # Changing the language repopulates the model dropdown.
    lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])

    transcript = gr.components.Label(label='Transcript')

    run = gr.components.Button('Transcribe')
    run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])

    gr.components.HTML(ARTICLE)

# Queue incoming requests with a concurrency count of one, then serve the app.
demo.queue(concurrency_count=1)
demo.launch()