Spaces:
Runtime error
Runtime error
| import torch | |
| import torchaudio | |
| import numpy as np | |
| from espnet2.bin.st_inference_streaming import Speech2TextStreaming | |
| import gradio as gr | |
| import soundfile as sf | |
| import librosa | |
# Load the custom streaming speech-translation model (ESPnet
# ``Speech2TextStreaming``) from its trained weights and training config.
# The device is chosen at start-up: the GPU when PyTorch can see one,
# otherwise the CPU, so that loading does not fail on CPU-only hosts.
model = Speech2TextStreaming(
    st_model_file="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/valid.acc.ave_10best.pth",  # path to your model weights
    st_train_config="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/config.yaml",  # path to your config file
    device="cuda" if torch.cuda.is_available() else "cpu",
    minlenratio=0.1,
    maxlenratio=0.7,
    beam_size=1,
)
# Mean absolute amplitude below which a chunk counts as silence; tune this to
# the level of the incoming audio.
silence_threshold = 0.01
# Seconds of continuous silence that must accumulate before it is acted upon.
silence_duration = 1.0


def is_silence(audio_chunk, sr, threshold=silence_threshold):
    """Tell whether a chunk of audio is quieter than ``threshold``.

    Args:
        audio_chunk: Array of audio samples.
        sr: Sample rate of the chunk. Accepted for interface compatibility;
            the computation does not use it.
        threshold: Upper bound (exclusive) on the mean absolute sample value
            for the chunk to be treated as silent.

    Returns:
        True when the mean absolute amplitude is strictly below ``threshold``.
    """
    mean_level = np.abs(audio_chunk).mean()
    return mean_level < threshold
def transcribe(state, new_chunk):
    """Decode the audio streamed so far and return the current hypothesis.

    Each call appends one incoming chunk to the buffered utterance, tracks how
    long the input has been silent, and runs the model on the whole buffer.
    Once the silence lasts ``silence_duration`` seconds the utterance is
    decoded with ``is_final=True`` and the buffer is cleared.

    Args:
        state: ``(stream, silence_time)`` where ``stream`` is the audio
            buffered for the current utterance (``None`` when empty) and
            ``silence_time`` is the trailing silence seen so far, in seconds.
        new_chunk: ``(sample_rate, samples)`` for the newest chunk, or
            ``None`` when no audio is available.

    Returns:
        ``(new_state, text)``: the state to carry into the next call and the
        text of the first hypothesis (``""`` when the model returns nothing).
    """
    target_sr = 16000  # rate every chunk is resampled to before decoding

    stream, silence_time = state
    if new_chunk is None:
        # The silence counter must stay numeric: the next call adds to it.
        return (None, 0), ""

    sr, y = new_chunk

    # Bring integer PCM onto a [-1, 1] scale so that the silence threshold is
    # compared against a known range.
    # NOTE(review): assumes integer input is full-scale PCM (e.g. int16 from
    # the Gradio microphone component) - confirm.
    full_scale = np.iinfo(y.dtype).max if np.issubdtype(y.dtype, np.integer) else None
    y = y.astype(np.float32)
    if full_scale is not None:
        y /= full_scale

    # NOTE(review): assumes multi-channel audio is shaped (samples, channels);
    # it is mixed down to mono before resampling - confirm.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Measure the duration against the rate the samples currently have;
    # after resampling, len(y) / sr would no longer be the chunk duration.
    chunk_seconds = len(y) / sr

    if sr != target_sr:
        y = librosa.resample(y=y, orig_sr=sr, target_sr=target_sr)

    # Judge silence before peak normalisation: normalising first would boost
    # a quiet chunk to full scale and hide the silence.
    chunk_is_silent = is_silence(y, target_sr)

    # Peak-normalise, but leave an all-zero (or empty) chunk untouched so that
    # no division by zero puts NaNs into the buffer.
    peak = float(np.max(np.abs(y))) if y.size else 0.0
    if peak > 0:
        y = y / peak

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # NOTE(review): decoding zeros with is_final=True looks like a way to
    # reset the streaming decoder before the whole buffer is fed again below -
    # confirm against Speech2TextStreaming.
    model(np.zeros(stream.shape), is_final=True)

    if chunk_is_silent:
        silence_time += chunk_seconds
    else:
        silence_time = 0

    if silence_time >= silence_duration:
        # Long enough pause: finalise this utterance and start afresh.
        output = model(stream, is_final=True)
        next_state = (None, 0)
    else:
        output = model(stream)
        next_state = (stream, silence_time)

    text = output[0][0] if output else ""
    return next_state, text
def clear_transcription():
    """Return a fresh session state and an empty transcript.

    Returns:
        ``((None, 0), "")``: no buffered audio, zero seconds of accumulated
        silence, and an empty string for the text output.
    """
    empty_state = (None, 0)
    empty_text = ""
    return empty_state, empty_text
# Gradio front end: a streaming microphone input, a text box showing the
# current hypothesis, and a button that resets the session.
with gr.Blocks() as demo:
    # Session state passed to and returned from every callback:
    # (buffered audio or None, seconds of accumulated silence).
    state = gr.State((None, 0))
    audio = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    text = gr.Textbox()
    clear_button = gr.Button("Clear")

    # Each streamed chunk goes through transcribe, which updates both the
    # state and the text box.
    audio.stream(
        fn=transcribe,
        inputs=[state, audio],
        outputs=[state, text],
    )
    # The button takes no inputs and overwrites the state and the text box
    # with the values returned by clear_transcription.
    clear_button.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[state, text],
    )

demo.launch()