| import gradio as gr |
| import numpy as np |
| import tensorflow as tf |
| from tensorflow import keras |
| import tensorflow_io as tfio |
| from huggingface_hub import from_pretrained_keras |
|
|
|
|
# Pretrained CTC speech-recognition model fetched from the Hugging Face Hub.
# It is loaded uncompiled because this script only runs inference.
model = from_pretrained_keras("keras-io/ctc_asr", compile=False)

# Characters the model can emit, in vocabulary order.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Character -> integer id; out-of-vocabulary input maps to the empty token.
char_to_num = keras.layers.StringLookup(
    vocabulary=characters,
    oov_token="",
)

# Integer id -> character, built from the very same vocabulary.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

# Parameters handed to `tf.signal.stft` in `tensor_to_predictions`.
frame_length = 256
frame_step = 160
fft_length = 384

# Rate that both the uploaded and the recorded audio are resampled to.
SAMPLE_RATE = 22050
|
|
|
|
def decode_batch_predictions(pred):
    """Greedy-decode a batch of CTC model outputs into strings.

    Args:
        pred: Batch of model outputs. ``pred.shape[0]`` is used as the batch
            size and ``pred.shape[1]`` as the sequence length of every item.

    Returns:
        A list holding one decoded ``str`` per decoded sequence.
    """
    batch_size = pred.shape[0]
    steps = pred.shape[1]
    # Every batch item is decoded over its full length.
    sequence_lengths = np.ones(batch_size) * steps

    decoded_paths = keras.backend.ctc_decode(
        pred, input_length=sequence_lengths, greedy=True
    )[0]
    best_path = decoded_paths[0]

    # Map ids back to characters and glue them into one UTF-8 string each.
    return [
        tf.strings.reduce_join(num_to_char(ids)).numpy().decode("utf-8")
        for ids in best_path
    ]
|
|
|
|
def load_16k_audio_wav(filename):
    """Read a WAV file and return its single-channel audio at ``SAMPLE_RATE``.

    Despite the function's name, the target rate is whatever the module-level
    ``SAMPLE_RATE`` constant holds.

    Args:
        filename: Path of the WAV file, as accepted by ``tf.io.read_file``.

    Returns:
        The tensor produced by ``tfio.audio.resample`` for the decoded audio.
    """
    raw_bytes = tf.io.read_file(filename)

    # Decode as one channel, then drop the trailing channel axis.
    samples, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono = tf.squeeze(samples, axis=-1)
    native_rate = tf.cast(native_rate, dtype=tf.int64)

    return tfio.audio.resample(mono, rate_in=native_rate, rate_out=SAMPLE_RATE)
|
|
|
|
def mic_to_tensor(recorded_audio_file):
    """Convert a microphone recording into a peak-normalised waveform tensor.

    Args:
        recorded_audio_file: A ``(sample_rate, samples)`` pair; ``samples``
            is converted to a float32 tensor.

    Returns:
        The audio resampled to ``SAMPLE_RATE`` and divided by its peak
        absolute amplitude. An all-zero recording is returned as zeros
        instead of NaNs.
    """
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    if tf.rank(audio_wav) > 1:
        # More than one dimension: average over axis 1 to get one channel.
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )

    # A silent recording has a peak of 0; a plain division would fill the
    # waveform with NaNs and poison the spectrogram. `divide_no_nan` yields
    # 0 wherever the divisor is 0 and is an ordinary division otherwise.
    peak = tf.reduce_max(tf.abs(audio_wav))
    audio_wav = tf.math.divide_no_nan(audio_wav, peak)

    return audio_wav
|
|
|
|
def tensor_to_predictions(audio_tensor):
    """Turn a waveform into spectrogram features and transcribe it.

    Args:
        audio_tensor: Waveform tensor; it is cast to float32 before use.

    Returns:
        The list returned by ``decode_batch_predictions`` for the one-item
        batch built from ``audio_tensor``.
    """
    waveform = tf.cast(audio_tensor, tf.float32)

    # Short-time Fourier transform using the module-level STFT settings.
    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )

    # Keep only the magnitude, raised to the power 0.5.
    features = tf.math.pow(tf.abs(stft), 0.5)

    # Standardise along axis 1; the epsilon guards against a zero deviation.
    mean = tf.math.reduce_mean(features, 1, keepdims=True)
    std = tf.math.reduce_std(features, 1, keepdims=True)
    features = (features - mean) / (std + 1e-10)

    # The model expects a leading batch axis.
    batch = tf.expand_dims(features, axis=0)

    raw_predictions = model.predict(batch)
    return decode_batch_predictions(raw_predictions)
|
|
|
|
def clear_inputs_and_outputs():
    """Return three ``None`` values, one per component the Clear button resets."""
    return [None] * 3
|
|
|
|
def predict(recorded_audio_file, uploaded_audio_file):
    """Transcribe the recorded audio if present, otherwise the uploaded file.

    Args:
        recorded_audio_file: Microphone recording passed on to
            ``mic_to_tensor``; takes precedence when truthy.
        uploaded_audio_file: Path of an uploaded WAV file passed on to
            ``load_16k_audio_wav``; used only when there is no recording.

    Returns:
        The first (and only) transcription from ``tensor_to_predictions``.

    Raises:
        ValueError: If neither a recording nor an uploaded file was given.
    """
    if recorded_audio_file:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    elif uploaded_audio_file:
        audio_tensor = load_16k_audio_wav(uploaded_audio_file)
    else:
        # Fail early with a clear message instead of letting the file
        # reader choke on a missing path.
        raise ValueError(
            "No audio provided: record your voice or upload a wav file."
        )

    # The batch holds a single item, so return just its transcription.
    return tensor_to_predictions(audio_tensor)[0]
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
# Build and launch the Gradio demo only when this file is run as a script.
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        # Page title and a short description of the demo.
        gr.Markdown(
            """
            <center><h1>Automatic Speech Recognition using CTC</h1></center> \
            This space is a demo of Automatic Speech Recognition using Keras trained on LJSpeech dataset.<br> \
            In this space, you can record your voice or upload a wav file and the model will predict the words spoken in English<br><br>
            """
        )
        with gr.Row():
            # Inputs: a microphone recorder, a file upload and the buttons.
            with gr.Column():
                mic_input = gr.Audio(source="microphone", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Output: the transcription shown as a label.
            with gr.Column():
                lbl_output = gr.Label(label="Text")

        # Credits and links to the original example and the model.
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: <a href="https://twitter.com/anuragcomm"> Anurag Singh</a>.<br>
                Based on the following Keras example <a href="https://keras.io/examples/audio/ctc_asr">Automatic Speech Recognition using CTC</a> by <a href="https://rbouadjenek.github.io/">Mohamed Reda Bouadjenek</a> and <a href="https://www.linkedin.com/in/parkerhuynh/">Ngoc Dung Huynh</a><br>
                Check out the model <a href="https://huggingface.co/keras-io/ctc_asr">here</a>
                """
            )

        # Clear resets both inputs and the output; the three outputs match
        # the three values returned by `clear_inputs_and_outputs`.
        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, lbl_output],
        )
        # Predict feeds both inputs to `predict` and shows its result.
        prd_btn.click(
            fn=predict,
            inputs=[mic_input, upl_input],
            outputs=[lbl_output],
        )

    demo.launch(debug=True)
|
|