| import gradio as gr |
| import librosa |
| from asr import transcribe, ASR_EXAMPLES, ASR_LANGUAGES, ASR_NOTE |
| from tts import synthesize, TTS_EXAMPLES, TTS_LANGUAGES |
| from lid import identify, LID_EXAMPLES |
| from generate import generate, GenExamples |
|
|
# Token-count limits. Nothing in the visible part of this file reads them;
# presumably they bound generation length for the chat tab -- TODO confirm
# against `generate`.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024

# NOTE: no placeholder `demo = gr.Blocks()` is created here. The name `demo`
# is bound by the `with gr.Blocks() as demo:` statement further down, so an
# instance created at this point would be discarded without ever being used.
|
|
# --- Speech-to-text tab ------------------------------------------------------
# Radio that selects which of the two audio widgets below is shown.
mms_select_source_trans = gr.Radio(
    ["Record from Mic", "Upload audio"],
    label="Audio input",
    value="Record from Mic",
)
mms_mic_source_trans = gr.Audio(source="microphone", type="filepath", label="Use mic")
# Created hidden: the radio above defaults to "Record from Mic".
mms_upload_source_trans = gr.Audio(
    source="upload", type="filepath", label="Upload file", visible=False
)

# Dropdown entries are rendered as "<key> (<value>)" from ASR_LANGUAGES.
_asr_language_choices = [f"{k} ({v})" for k, v in ASR_LANGUAGES.items()]
# The preselected value must use the same format as the entries, otherwise it
# is not a member of the choice list. Fall back to the previous literal only
# when ASR_LANGUAGES has no "eng" key.
_asr_default_language = (
    f"eng ({ASR_LANGUAGES['eng']})" if "eng" in ASR_LANGUAGES else "eng English"
)

# Inputs are passed to `transcribe` in list order:
# source selection, microphone file path, uploaded file path, language entry.
mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        mms_select_source_trans,
        mms_mic_source_trans,
        mms_upload_source_trans,
        gr.Dropdown(
            _asr_language_choices,
            label="Language",
            value=_asr_default_language,
        ),
    ],
    outputs="text",
    examples=ASR_EXAMPLES,
    title="Speech-to-text",
    description=(
        "Transcribe audio from a microphone or input file in your desired language."
    ),
    article=ASR_NOTE,
    allow_flagging="never",
)
|
|
# --- Text-to-speech tab ------------------------------------------------------
# Dropdown entries are rendered as "<key> (<value>)" from TTS_LANGUAGES.
_tts_language_choices = [f"{k} ({v})" for k, v in TTS_LANGUAGES.items()]
# The preselected value must use the same format as the entries, otherwise it
# is not a member of the choice list. Fall back to the previous literal only
# when TTS_LANGUAGES has no "eng" key.
_tts_default_language = (
    f"eng ({TTS_LANGUAGES['eng']})" if "eng" in TTS_LANGUAGES else "eng English"
)

# Inputs are passed to `synthesize` in list order: text, language entry, speed.
# Two outputs are displayed: an audio widget and a text box.
mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(
            _tts_language_choices,
            label="Language",
            value=_tts_default_language,
        ),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Text(label="Filtered text after removing OOVs"),
    ],
    examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description=("Generate audio in your desired language from input text."),
    allow_flagging="never",
)
|
|
# --- Chat tab ----------------------------------------------------------------
# Three free-text boxes, handed to `generate` in this order:
# message, chat history, system prompt.
_chat_inputs = [
    gr.Textbox(label=field_label, type="text")
    for field_label in ("Message", "Chat History", "System prompt")
]

# The value returned by `generate` is shown in a single text box.
chat_interface = gr.Interface(
    fn=generate,
    inputs=_chat_inputs,
    outputs=gr.Textbox(),
    title="Chat Interface",
    description="Interactive chat interface using Hugging Face Transformers.",
)
|
|
# --- Language identification tab ---------------------------------------------
# Radio that selects which of the two audio widgets below is shown.
mms_select_source_iden = gr.Radio(
    ["Record from Mic", "Upload audio"],
    label="Audio input",
    value="Record from Mic",
)
mms_mic_source_iden = gr.Audio(source="microphone", type="filepath", label="Use mic")
# Created hidden: the radio above defaults to "Record from Mic".
mms_upload_source_iden = gr.Audio(
    source="upload", type="filepath", label="Upload file", visible=False
)

# Inputs are passed to `identify` in list order:
# source selection, microphone file path, uploaded file path.
# The result is shown in a label widget limited to the 10 top classes.
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        mms_select_source_iden,
        mms_mic_source_iden,
        mms_upload_source_iden,
    ],
    outputs=gr.Label(num_top_classes=10),
    examples=LID_EXAMPLES,
    title="Language Identification",
    # Fixed user-facing typo: the text previously read "Identity the language".
    description=("Identify the language of input audio."),
    allow_flagging="never",
)
|
|
# Tab title paired with the interface it displays, in left-to-right tab order.
_tabs = (
    ("Speech-to-text", mms_transcribe),
    ("Text-to-speech", mms_synthesize),
    ("Language Identification", mms_identify),
    ("Chat with Llama", chat_interface),
)

# All four interfaces combined into a single tabbed view.
tabbed_interface = gr.TabbedInterface(
    [interface for _, interface in _tabs],
    [tab_title for tab_title, _ in _tabs],
)
|
|
def _source_visibility(selection):
    """Return visibility updates for a (microphone, upload) audio widget pair.

    Each widget is made visible only when *selection* equals its radio label.
    """
    return [
        gr.update(visible=selection == "Record from Mic"),
        gr.update(visible=selection == "Upload audio"),
    ]


with gr.Blocks() as demo:
    tabbed_interface.render()

    # Wire each tab's source radio to its own pair of audio inputs so that
    # changing the radio swaps which audio widget is shown.
    for selector, mic_input, upload_input in (
        (mms_select_source_trans, mms_mic_source_trans, mms_upload_source_trans),
        (mms_select_source_iden, mms_mic_source_iden, mms_upload_source_iden),
    ):
        selector.change(
            _source_visibility,
            inputs=[selector],
            outputs=[mic_input, upload_input],
            queue=False,
        )
|
|
# Enable the request queue with concurrency_count=3, then start the app.
demo.queue(concurrency_count=3).launch()
| |
|
|
|
|