import os
import io
import csv
import gradio as gr
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import matplotlib.pyplot as plt
from tensorflow import keras
from huggingface_hub import from_pretrained_keras

# Configuration
class_names = [
    "Irish",
    "Midlands",
    "Northern",
    "Scottish",
    "Southern",
    "Welsh",
    "Not a speech",
]

# Download Yamnet model from TF Hub
yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")

# Download dense model from HF Hub
model = from_pretrained_keras(
    pretrained_model_name_or_path="fbadine/uk_ireland_accent_classification"
)
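# Note: the classifier loaded above is a small dense network that maps YAMNet's
# 1024-dimensional audio embeddings (one per audio patch) to scores over the
# 7 accent classes listed in class_names.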

# Function that reads a wav audio file and resamples it to 16000 Hz
# This function is copied from the tutorial:
# https://www.tensorflow.org/tutorials/audio/transfer_learning_audio
def load_16k_audio_wav(filename):
    # Read file content
    file_content = tf.io.read_file(filename)

    # Decode audio wave
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)

    # Resample to 16k
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    return audio_wav
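
# Note: YAMNet expects a mono float32 waveform sampled at 16 kHz with values in
# [-1.0, 1.0], which is why both input paths resample the audio (and, for
# microphone input, mix down to mono and peak-normalize it).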

# Function that takes the audio captured by gr.Audio (as a (sample_rate, data) tuple)
# and returns a tensor after applying the following transformations:
# - Mix down to mono if needed
# - Resample to 16000 Hz
# - Normalize to [-1.0, 1.0]
def mic_to_tensor(recorded_audio_file):
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    audio_wav = tf.divide(audio_wav, tf.reduce_max(tf.abs(audio_wav)))

    return audio_wav

# Function that takes a tensor and applies the following:
# - Pass it through Yamnet model to get the embeddings which are the input of the dense model
# - Pass the embeddings through the dense model to get the predictions
def tensor_to_predictions(audio_tensor):
    # Get audio embeddings & scores
    scores, embeddings, mel_spectrogram = yamnet_model(audio_tensor)

    # Predict the output of the accent recognition model with embeddings as input
    predictions = model.predict(embeddings)

    return predictions, mel_spectrogram
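
# Note: YAMNet emits one embedding per audio patch, so `predictions` has shape
# (num_patches, len(class_names)); downstream code averages over axis 0 to get a
# single score per class and keeps the per-patch scores for the time-slot plot.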

# Function that is called when the user clicks the "Predict" button. It does the following:
# - Transforms the Gradio audio input into a tensor
# - Calls tensor_to_predictions() to get the predictions
# - Generates the top scoring labels
# - Generates the top scoring plot
def predict_accent(audio_input):
    # Transform input to tensor. With the default type="numpy", gr.Audio yields a
    # (sample_rate, data) tuple for both microphone and uploaded audio; a plain
    # filepath is handled as well for completeness.
    if isinstance(audio_input, tuple):
        audio_tensor = mic_to_tensor(audio_input)
    else:
        audio_tensor = load_16k_audio_wav(audio_input)

    # Model Inference
    predictions, mel_spectrogram = tensor_to_predictions(audio_tensor)

    # Get the inferred class
    inferred_class = class_names[predictions.mean(axis=0).argmax()]

    # Generate Output 1 - Accents
    top_scoring_labels_output = {
        class_names[i]: float(predictions.mean(axis=0)[i])
        for i in range(len(class_names))
    }

    # Generate Output 2 - Prediction per time slot plot
    top_scoring_plot_output = generate_top_scoring_plot(predictions)

    return [top_scoring_labels_output, top_scoring_plot_output]

# Clears all inputs and outputs when the user clicks the "Clear" button
def clear_inputs_and_outputs():
    # One None per output component: src_input, lbl_output and plt_output
    return [None, None, None]

# Function that generates the top scoring plot
# This function is copied from the tutorial and adjusted to our needs
# https://keras.io/examples/audio/uk_ireland_accent_recognition/
def generate_top_scoring_plot(predictions):
    # Plot and label the model output scores for the top-scoring classes
    mean_predictions = np.mean(predictions, axis=0)
    top_class_indices = np.argsort(mean_predictions)[::-1]

    fig = plt.figure(figsize=(10, 2))
    plt.imshow(
        predictions[:, top_class_indices].T,
        aspect="auto",
        interpolation="nearest",
        cmap="gray_r",
    )

    # patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS
    # values from the model documentation
    patch_padding = (0.025 / 2) / 0.01
    plt.xlim([-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5])

    # Label the top_N classes
    yticks = range(0, len(class_names), 1)
    plt.yticks(yticks, [class_names[top_class_indices[x]] for x in yticks])
    _ = plt.ylim(-0.5 + np.array([len(class_names), 0]))

    return fig

# Main block that builds and launches the Gradio demo
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        gr.Markdown(
            """
            <center><h1>English speaker accent recognition using Transfer Learning</h1></center> \
            This space is a demo of an English (specifically UK & Ireland) accent classification model using Keras.<br> \
            In this space, you can record your voice or upload a wav file and the model will predict the English accent spoken in the audio.<br><br>
            """
        )
        with gr.Row():
            # Input
            with gr.Column():
                src_input = gr.Audio(sources=["microphone", "upload"])

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Top Predictions")

                with gr.Group():
                    gr.Markdown("<center>Prediction per time slot</center>")
                    plt_output = gr.Plot(
                        label="Prediction per time slot", show_label=False
                    )

        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: <a href="https://twitter.com/fadibadine"> Fadi Badine</a>.<br>
                Based on the following Keras example <a href="https://keras.io/examples/audio/uk_ireland_accent_recognition"> English speaker accent recognition using Transfer Learning</a> by Fadi Badine<br>
                Check out the model <a href="https://huggingface.co/keras-io/english-speaker-accent-recognition-using-transfer-learning">here</a>
                """
            )

        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[src_input, lbl_output, plt_output],
        )
        prd_btn.click(
            fn=predict_accent,
            inputs=[src_input],
            outputs=[lbl_output, plt_output],
        )

    demo.launch(debug=True, share=True)
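
# To try this demo locally (a sketch, assuming this file is saved as app.py and the
# packages imported above are installed, e.g. via
# `pip install gradio tensorflow tensorflow_hub tensorflow_io matplotlib huggingface_hub`):
#     python app.py
# then open the local URL that Gradio prints (share=True also prints a public link).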