# X-ASR / app.py
# James
# Utterance-Level
# beae5b1
import os
import gradio as gr
from transcribe import (
finish_streaming_realtime,
preload_streaming_assets,
reset_realtime_stream,
transcribe,
transcribe_streaming_realtime,
)
# Labels offered in the streaming tab's dropdown. The selected label is passed
# verbatim to the transcribe helpers as the model mode.
# NOTE(review): the number in each label is presumably the streaming chunk
# size / latency in milliseconds — confirm in transcribe.py.
STREAMING_CHOICES = [
    "Streaming 160ms",
    "Streaming 480ms",
    "Streaming 960ms",
    "Streaming 1920ms",
]

# The utterance-level tab offers the offline model plus every streaming mode.
MODEL_CHOICES = ["Offline Model", *STREAMING_CHOICES]
def predict_asr(input_audio, model_mode):
    """Transcribe one clip and return the recognized text.

    Args:
        input_audio: Value delivered by the audio component, or ``None``
            when nothing was uploaded or recorded.
        model_mode: Model label selected in the dropdown; forwarded to
            ``transcribe`` unchanged.

    Returns:
        The ``"text"`` entry of the ``transcribe`` result, or a prompt
        asking the user to provide audio when ``input_audio`` is ``None``.
    """
    if input_audio is not None:
        return transcribe(input_audio, model_mode=model_mode)["text"]
    # Nothing to recognize: show a hint in the output box instead.
    return "Please Upload or Record Audio!"
# Sample clips offered as one-click examples on the utterance-level tab.
# The paths are relative, so they resolve against the working directory.
default_audio_path1 = "default_sample_zh.mp3"
default_audio_path2 = "default_sample_en.mp3"
# NOTE(review): disabled fallback that would create an empty placeholder file
# when a sample is missing. It refers to `default_audio_path`, a name that is
# not defined in the visible part of this file, so it cannot be re-enabled
# as written.
# if not os.path.exists(default_audio_path):
# with open(default_audio_path, "wb") as f:
# pass
# Called at import time, before the UI below is built.
# NOTE(review): presumably loads the streaming model assets up front so the
# first stream does not pay the loading cost — confirm in transcribe.py.
preload_streaming_assets()
# Build the two-tab UI. Components and event listeners are all created while
# the Blocks context is open, so everything below lives inside this `with`.
with gr.Blocks(title="X-ASR") as gradio_app:
    gr.Markdown("# X-ASR")

    # ---- Tab 1: recognize a whole clip in a single request ----
    with gr.Tab("Utterance-Level Speech Recognition"):
        batch_audio = gr.Audio(
            label="Upload or Record",
            sources=["upload", "microphone"],
            type="numpy",
        )
        batch_mode = gr.Dropdown(
            label="Model Choices",
            choices=MODEL_CHOICES,
            value="Offline Model",
            interactive=True,
        )
        batch_button = gr.Button("Recognize", variant="primary")
        batch_output = gr.Textbox(label="Transcription")

        # One example row per bundled sample clip, each preselecting the
        # offline model.
        sample_rows = [
            [sample_path, "Offline Model"]
            for sample_path in (default_audio_path1, default_audio_path2)
        ]
        gr.Examples(
            examples=sample_rows,
            inputs=[batch_audio, batch_mode],
            outputs=batch_output,
            fn=predict_asr,
            cache_examples=False,
        )

    # ---- Tab 2: live microphone recognition ----
    with gr.Tab("Streaming Speech Recognition"):
        stream_mode = gr.Dropdown(
            label="Model Choices",
            choices=STREAMING_CHOICES,
            value="Streaming 160ms",
            interactive=True,
        )
        stream_audio = gr.Audio(
            label="Microphone",
            sources=["microphone"],
            type="numpy",
            streaming=True,
        )
        stream_output = gr.Textbox(label="Transcription")
        # Starts as an empty dict; it is passed into and returned from the
        # streaming callbacks, which own its contents.
        stream_state = gr.State({})
        with gr.Row():
            reset_button = gr.Button("Clear")

    def _reset_wiring():
        """Return fresh listener kwargs for every trigger that resets the stream."""
        return {
            "fn": reset_realtime_stream,
            "inputs": stream_mode,
            "outputs": [stream_output, stream_state],
            "show_progress": "hidden",
        }

    # Listeners are registered in the same order as before: stream, start,
    # stop, mode change, clear, then the batch button.
    stream_audio.stream(
        fn=transcribe_streaming_realtime,
        inputs=[stream_audio, stream_state, stream_mode],
        outputs=[stream_output, stream_state],
        show_progress="hidden",
        stream_every=0.25,
        time_limit=3600,
        trigger_mode="multiple",
        concurrency_limit=1,
    )

    # A new recording resets the transcript and the state.
    stream_audio.start_recording(**_reset_wiring())

    # Stopping the recording hands the accumulated state to the finisher.
    stream_audio.stop_recording(
        fn=finish_streaming_realtime,
        inputs=[stream_state, stream_mode],
        outputs=[stream_output, stream_state],
        show_progress="hidden",
    )

    # Switching model or pressing "Clear" performs the same reset.
    stream_mode.change(**_reset_wiring())
    reset_button.click(**_reset_wiring())

    batch_button.click(
        fn=predict_asr,
        inputs=[batch_audio, batch_mode],
        outputs=batch_output,
    )
# Launch the app only when this file is executed as a script; share=False
# keeps Gradio from requesting a public share link.
if __name__ == "__main__":
    gradio_app.launch(share=False)