"""Gradio front end for X-ASR.

Builds a two-tab demo: utterance-level recognition of an uploaded or recorded
clip, and realtime streaming recognition from the microphone.
"""

import os

import gradio as gr

from transcribe import (
    finish_streaming_realtime,
    preload_streaming_assets,
    reset_realtime_stream,
    transcribe,
    transcribe_streaming_realtime,
)

MODEL_CHOICES = [
    "Offline Model",
    "Streaming 160ms",
    "Streaming 480ms",
    "Streaming 960ms",
    "Streaming 1920ms",
]
# The streaming tab offers every mode except the offline one.
STREAMING_CHOICES = MODEL_CHOICES[1:]


def predict_asr(input_audio, model_mode):
    """Transcribe one utterance and return the recognized text.

    Args:
        input_audio: Value of the "Upload or Record" audio component, or
            ``None`` when the user has not provided any audio.
        model_mode: Label selected in the model dropdown (one of
            ``MODEL_CHOICES``); forwarded to ``transcribe`` unchanged.

    Returns:
        A prompt asking for audio when ``input_audio`` is ``None``; otherwise
        the ``"text"`` entry of the result returned by ``transcribe``.
    """
    if input_audio is None:
        return "Please Upload or Record Audio!"
    result = transcribe(input_audio, model_mode=model_mode)
    return result["text"]


default_audio_path1 = "default_sample_zh.mp3"
default_audio_path2 = "default_sample_en.mp3"

# Only offer the bundled samples that are actually present on disk, so that a
# missing sample file cannot get in the way of building the UI.
# NOTE(review): paths are relative, i.e. resolved against the current working
# directory - confirm the app is always launched from the directory that
# holds the samples.
available_examples = [
    [sample_path, "Offline Model"]
    for sample_path in (default_audio_path1, default_audio_path2)
    if os.path.exists(sample_path)
]

# Runs at import time, before the UI is built.
preload_streaming_assets()

with gr.Blocks(
    title="X-ASR",
) as gradio_app:
    gr.Markdown("# X-ASR")

    with gr.Tab("Utterance-Level Speech Recognition"):
        batch_audio = gr.Audio(
            label="Upload or Record",
            sources=["upload", "microphone"],
            type="numpy",
        )
        batch_mode = gr.Dropdown(
            label="Model Choices",
            choices=MODEL_CHOICES,
            value="Offline Model",
            interactive=True,
        )
        batch_button = gr.Button("Recognize", variant="primary")
        batch_output = gr.Textbox(label="Transcription")

        # Skip the examples widget entirely when no sample file is available.
        if available_examples:
            gr.Examples(
                examples=available_examples,
                inputs=[batch_audio, batch_mode],
                outputs=batch_output,
                fn=predict_asr,
                cache_examples=False,
            )

    with gr.Tab("Streaming Speech Recognition"):
        stream_mode = gr.Dropdown(
            label="Model Choices",
            choices=STREAMING_CHOICES,
            value="Streaming 160ms",
            interactive=True,
        )
        stream_audio = gr.Audio(
            label="Microphone",
            sources=["microphone"],
            type="numpy",
            streaming=True,
        )
        stream_output = gr.Textbox(label="Transcription")
        # Per-session state handed to and returned by the streaming callbacks.
        stream_state = gr.State({})

        with gr.Row():
            reset_button = gr.Button("Clear")

    # --- Event wiring -----------------------------------------------------

    # Each incoming microphone chunk updates the transcript and the state.
    stream_audio.stream(
        fn=transcribe_streaming_realtime,
        inputs=[stream_audio, stream_state, stream_mode],
        outputs=[stream_output, stream_state],
        show_progress="hidden",
        stream_every=0.25,
        time_limit=3600,
        trigger_mode="multiple",
        concurrency_limit=1,
    )

    # Starting a recording, switching the model and pressing "Clear" all go
    # through the same reset callback with the same inputs and outputs.
    stream_audio.start_recording(
        fn=reset_realtime_stream,
        inputs=stream_mode,
        outputs=[stream_output, stream_state],
        show_progress="hidden",
    )
    stream_audio.stop_recording(
        fn=finish_streaming_realtime,
        inputs=[stream_state, stream_mode],
        outputs=[stream_output, stream_state],
        show_progress="hidden",
    )
    stream_mode.change(
        fn=reset_realtime_stream,
        inputs=stream_mode,
        outputs=[stream_output, stream_state],
        show_progress="hidden",
    )
    reset_button.click(
        fn=reset_realtime_stream,
        inputs=stream_mode,
        outputs=[stream_output, stream_state],
        show_progress="hidden",
    )

    batch_button.click(
        fn=predict_asr,
        inputs=[batch_audio, batch_mode],
        outputs=batch_output,
    )


if __name__ == "__main__":
    gradio_app.launch(share=False)