|
|
|
|
|
import gradio as gr |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
def transcribe(audio):
    """Normalize a recorded clip and return a placeholder "transcription".

    Args:
        audio: Tuple ``(sample_rate, samples)`` as delivered by ``gr.Audio``
            in numpy mode; ``samples`` may be int- or float-typed and may
            have a channel dimension.

    Returns:
        A string listing the first 10 peak-normalized samples (stand-in for
        real speech-to-text output).
    """
    sr, y = audio  # sample rate is part of the payload but unused here

    # Down-mix multi-channel audio to mono by averaging across channels.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Convert to float32 and peak-normalize into [-1.0, 1.0].
    y = y.astype(np.float32)
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:  # guard: a silent clip would otherwise divide by zero -> NaN
        y /= peak

    return "Transcribed text: " + " ".join([str(i) for i in y[:10]])
|
|
|
|
|
|
|
|
# Single-shot demo: record a whole clip, then transcribe it in one call.
demo_full_context = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="microphone"),
    outputs="text",
)
|
|
|
|
|
|
|
|
def transcribe_stream(stream, new_chunk):
    """Accumulate streamed microphone chunks and return a placeholder transcription.

    Args:
        stream: Previously accumulated float32 sample array held in Gradio
            session state, or ``None`` on the first chunk.
        new_chunk: Tuple ``(sample_rate, samples)`` for the latest chunk from
            ``gr.Audio(streaming=True)``.

    Returns:
        Tuple of (updated stream array for the state output, string listing
        the first 10 accumulated samples as stand-in transcription text).
    """
    sr, y = new_chunk  # sample rate is part of the payload but unused here

    # Down-mix multi-channel audio to mono by averaging across channels.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Convert to float32 and peak-normalize into [-1.0, 1.0].
    y = y.astype(np.float32)
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:  # guard: a silent chunk would otherwise inject NaNs into the stream
        y /= peak

    # Append the new chunk to the session's running buffer.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    return stream, "Transcribed text: " + " ".join([str(i) for i in stream[:10]])
|
|
|
|
|
|
|
|
# Live demo: chunks stream in from the microphone as they are recorded; the
# "state" input/output pair carries the accumulated samples between calls.
demo_streaming = gr.Interface(
    fn=transcribe_stream,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", "text"],
    live=True,
)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): in a plain script, launch() blocks until the server is
    # shut down, so the streaming demo below only starts after the first one
    # is stopped (e.g. Ctrl+C) — and both launches default to the same port.
    # If both demos should run simultaneously, pass prevent_thread_lock=True
    # to the first launch or combine them in one app — confirm intent.
    demo_full_context.launch(show_error=True)
    demo_streaming.launch()