import os
import io
import csv
import gradio as gr
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import matplotlib.pyplot as plt
from tensorflow import keras
from huggingface_hub import from_pretrained_keras

# Configuration
class_names = [
    "Irish",
    "Midlands",
    "Northern",
    "Scottish",
    "Southern",
    "Welsh",
    "Not a speech",
]

# Download Yamnet model from TF Hub
yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")

# Download dense model from HF Hub
model = from_pretrained_keras(
    pretrained_model_name_or_path="fbadine/uk_ireland_accent_classification"
)
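# Note: the classifier loaded above is a small dense network that maps YAMNet's
# 1024-dimensional audio embeddings (one per audio patch) to scores over the
# 7 accent classes listed in class_names.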

# Function that reads a wav audio file and resamples it to 16000 Hz
# This function is copied from the tutorial:
# https://www.tensorflow.org/tutorials/audio/transfer_learning_audio
def load_16k_audio_wav(filename):
    # Read file content
    file_content = tf.io.read_file(filename)

    # Decode audio wave
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)

    # Resample to 16k
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    return audio_wav
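
# Note: YAMNet expects a mono float32 waveform sampled at 16 kHz with values in
# [-1.0, 1.0], which is why both input paths resample the audio (and, for
# microphone input, mix down to mono and peak-normalize it).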

# Function that takes the audio captured by gr.Audio (as a (sample_rate, data) tuple)
# and returns a tensor after applying the following transformations:
# - Mix down to mono if needed
# - Resample to 16000 Hz
# - Normalize to [-1.0, 1.0]
def mic_to_tensor(recorded_audio_file):
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    audio_wav = tf.divide(audio_wav, tf.reduce_max(tf.abs(audio_wav)))

    return audio_wav

# Function that takes a tensor and applies the following:
# - Pass it through Yamnet model to get the embeddings which are the input of the dense model
# - Pass the embeddings through the dense model to get the predictions
def tensor_to_predictions(audio_tensor):
    # Get audio embeddings & scores
    scores, embeddings, mel_spectrogram = yamnet_model(audio_tensor)

    # Predict the output of the accent recognition model with embeddings as input
    predictions = model.predict(embeddings)

    return predictions, mel_spectrogram
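
# Note: YAMNet emits one embedding per audio patch, so `predictions` has shape
# (num_patches, len(class_names)); downstream code averages over axis 0 to get a
# single score per class and keeps the per-patch scores for the time-slot plot.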

# Function that is called when the user clicks the "Predict" button. It does the following:
# - Transforms the Gradio audio input into a tensor
# - Calls tensor_to_predictions() to get the predictions
# - Generates the top scoring labels
# - Generates the top scoring plot
def predict_accent(audio_input):
    # Transform input to tensor. With the default type="numpy", gr.Audio yields a
    # (sample_rate, data) tuple for both microphone and uploaded audio; a plain
    # filepath is handled as well for completeness.
    if isinstance(audio_input, tuple):
        audio_tensor = mic_to_tensor(audio_input)
    else:
        audio_tensor = load_16k_audio_wav(audio_input)

    # Model Inference
    predictions, mel_spectrogram = tensor_to_predictions(audio_tensor)

    # Get the inferred class
    inferred_class = class_names[predictions.mean(axis=0).argmax()]

    # Generate Output 1 - Accents
    top_scoring_labels_output = {
        class_names[i]: float(predictions.mean(axis=0)[i])
        for i in range(len(class_names))
    }

    # Generate Output 2 - Prediction per time slot plot
    top_scoring_plot_output = generate_top_scoring_plot(predictions)

    return [top_scoring_labels_output, top_scoring_plot_output]

# Clears all inputs and outputs when the user clicks the "Clear" button
def clear_inputs_and_outputs():
    # One None per output component: src_input, lbl_output and plt_output
    return [None, None, None]

# Function that generates the top scoring plot
# This function is copied from the tutorial and adjusted to our needs
# https://keras.io/examples/audio/uk_ireland_accent_recognition/
def generate_top_scoring_plot(predictions):
    # Plot and label the model output scores for the top-scoring classes
    mean_predictions = np.mean(predictions, axis=0)
    top_class_indices = np.argsort(mean_predictions)[::-1]

    fig = plt.figure(figsize=(10, 2))
    plt.imshow(
        predictions[:, top_class_indices].T,
        aspect="auto",
        interpolation="nearest",
        cmap="gray_r",
    )

    # patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS
    # values from the model documentation
    patch_padding = (0.025 / 2) / 0.01
    plt.xlim([-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5])

    # Label the top_N classes
    yticks = range(0, len(class_names), 1)
    plt.yticks(yticks, [class_names[top_class_indices[x]] for x in yticks])
    _ = plt.ylim(-0.5 + np.array([len(class_names), 0]))

    return fig

# Main block that builds and launches the Gradio demo
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        gr.Markdown(
            """
            <center><h1>English speaker accent recognition using Transfer Learning</h1></center> \
            This space is a demo of an English (specifically UK & Ireland) accent classification model using Keras.<br> \
            In this space, you can record your voice or upload a wav file and the model will predict the English accent spoken in the audio.<br><br>
            """
        )
        with gr.Row():
            # Input
            with gr.Column():
                src_input = gr.Audio(sources=["microphone", "upload"])

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Top Predictions")

                with gr.Group():
                    gr.Markdown("<center>Prediction per time slot</center>")
                    plt_output = gr.Plot(
                        label="Prediction per time slot", show_label=False
                    )

        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: <a href="https://twitter.com/fadibadine"> Fadi Badine</a>.<br>
                Based on the following Keras example <a href="https://keras.io/examples/audio/uk_ireland_accent_recognition"> English speaker accent recognition using Transfer Learning</a> by Fadi Badine<br>
                Check out the model <a href="https://huggingface.co/keras-io/english-speaker-accent-recognition-using-transfer-learning">here</a>
                """
            )

        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[src_input, lbl_output, plt_output],
        )
        prd_btn.click(
            fn=predict_accent,
            inputs=[src_input],
            outputs=[lbl_output, plt_output],
        )

    demo.launch(debug=True, share=True)
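
# To try this demo locally (a sketch, assuming this file is saved as app.py and the
# packages imported above are installed, e.g. via
# `pip install gradio tensorflow tensorflow_hub tensorflow_io matplotlib huggingface_hub`):
#     python app.py
# then open the local URL that Gradio prints (share=True also prints a public link).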