| import os |
|
|
| import gradio as gr |
|
|
| from transcribe import ( |
| finish_streaming_realtime, |
| preload_streaming_assets, |
| reset_realtime_stream, |
| transcribe, |
| transcribe_streaming_realtime, |
| ) |
|
|
# Every recognition mode label offered by the UI; the offline entry comes first.
MODEL_CHOICES = [
    "Offline Model",
    "Streaming 160ms",
    "Streaming 480ms",
    "Streaming 960ms",
    "Streaming 1920ms",
]

# The streaming-only subset of the labels above, in the same order.
STREAMING_CHOICES = [mode for mode in MODEL_CHOICES if mode.startswith("Streaming")]
|
|
def predict_asr(input_audio, model_mode):
    """Recognize a single audio input and return its transcription text.

    Args:
        input_audio: Audio value handed over by the UI, or ``None`` when the
            user has not uploaded or recorded anything.
        model_mode: Mode label forwarded to ``transcribe`` as ``model_mode``.

    Returns:
        The ``"text"`` entry of the ``transcribe`` result, or a prompt asking
        for audio when ``input_audio`` is ``None``.
    """
    if input_audio is not None:
        return transcribe(input_audio, model_mode=model_mode)["text"]

    # Nothing to recognize: show the prompt instead of calling the model.
    return "Please Upload or Record Audio!"
|
|
|
|
# Sample clips used as example rows in the UI. These are bare file names with
# no directory component.
default_audio_path1, default_audio_path2 = (
    "default_sample_zh.mp3",
    "default_sample_en.mp3",
)

# Runs at import time, before the interface is built.
# NOTE(review): presumably loads the streaming models up front so the first
# streamed chunk does not pay the load cost -- confirm in the transcribe module.
preload_streaming_assets()
|
|
|
|
# Build the X-ASR interface: one tab for whole-utterance recognition and one
# for live microphone streaming.
with gr.Blocks(title="X-ASR") as gradio_app:
    gr.Markdown("# X-ASR")

    # --- Tab 1: recognize a complete uploaded or recorded clip ---------------
    with gr.Tab("Utterance-Level Speech Recognition"):
        batch_audio = gr.Audio(
            type="numpy",
            sources=["upload", "microphone"],
            label="Upload or Record",
        )
        batch_mode = gr.Dropdown(
            choices=MODEL_CHOICES,
            value="Offline Model",
            label="Model Choices",
            interactive=True,
        )
        batch_button = gr.Button("Recognize", variant="primary")
        batch_output = gr.Textbox(label="Transcription")

        # One example row per bundled sample, each paired with the offline model.
        gr.Examples(
            examples=[
                [sample_path, "Offline Model"]
                for sample_path in (default_audio_path1, default_audio_path2)
            ],
            inputs=[batch_audio, batch_mode],
            outputs=batch_output,
            fn=predict_asr,
            cache_examples=False,
        )

    # --- Tab 2: transcribe microphone audio while it is being recorded -------
    with gr.Tab("Streaming Speech Recognition"):
        stream_mode = gr.Dropdown(
            choices=STREAMING_CHOICES,
            value="Streaming 160ms",
            label="Model Choices",
            interactive=True,
        )
        stream_audio = gr.Audio(
            type="numpy",
            sources=["microphone"],
            streaming=True,
            label="Microphone",
        )
        stream_output = gr.Textbox(label="Transcription")
        stream_state = gr.State({})

        with gr.Row():
            reset_button = gr.Button("Clear")

        def _bind_reset(register):
            """Attach ``reset_realtime_stream`` to the given event registrar."""
            register(
                fn=reset_realtime_stream,
                inputs=stream_mode,
                outputs=[stream_output, stream_state],
                show_progress="hidden",
            )

        # Incoming audio chunks: update the transcript and the stream state.
        stream_audio.stream(
            fn=transcribe_streaming_realtime,
            inputs=[stream_audio, stream_state, stream_mode],
            outputs=[stream_output, stream_state],
            show_progress="hidden",
            stream_every=0.25,
            time_limit=3600,
            trigger_mode="multiple",
            concurrency_limit=1,
        )
        # A new recording starts from a reset transcript and state.
        _bind_reset(stream_audio.start_recording)
        # Stopping hands the current state to the finishing handler.
        stream_audio.stop_recording(
            fn=finish_streaming_realtime,
            inputs=[stream_state, stream_mode],
            outputs=[stream_output, stream_state],
            show_progress="hidden",
        )
        # Switching the model or pressing "Clear" resets the stream as well.
        _bind_reset(stream_mode.change)
        _bind_reset(reset_button.click)

    # The "Recognize" button runs the utterance-level recognizer.
    batch_button.click(
        fn=predict_asr,
        inputs=[batch_audio, batch_mode],
        outputs=batch_output,
    )
|
|
|
|
if __name__ == "__main__":
    # Start the app only when this file is run as a script; sharing is
    # explicitly turned off.
    launch_options = {"share": False}
    gradio_app.launch(**launch_options)
|
|