import numpy as np import streamlit as st from tensorflow.keras.models import model_from_json from st_audiorec import st_audiorec import io import librosa import scipy from skimage.transform import resize import tensorflow as tf import tensorflow_io as tfio import warnings warnings.filterwarnings("ignore") st.title("Audio Classification using CNNs") @st.cache_resource def load_model(model_path_with_model, model_name): # Change made of \\ due to file not found error with open(f"{model_path_with_model}/{model_name}.json", "r") as json_file: model_json = json_file.read() model = model_from_json(model_json) model.load_weights(f"{model_path_with_model}/{model_name}.weights.h5") return model models_path = "vastai/working/models" waveform_gender = load_model(models_path, "waveform_gender_model") waveform_digit = load_model(models_path, "waveform_num_model") spectrogram_gender = load_model(models_path, "spectrogram_gender_model") spectrogram_digit = load_model(models_path, "spectrogram_num_model") st.write("Models loaded successfully.") st.write("Kindly record your voice by clicking on the below button:") try: wav_audio_data = st_audiorec() if wav_audio_data is not None: st.success("Your voice has been successfully recorded.") status = st.empty() status.write("Predictions are being made, kindly be patient.") audio_file_like_obj = io.BytesIO(wav_audio_data) audio_recording, sampling_rate = librosa.load( audio_file_like_obj, sr=8000, mono=True ) audio = np.empty((8000,), dtype=np.float32) if len(audio_recording) < 8000: audio = np.pad(audio_recording, (0, 8000 - len(audio_recording))) elif len(audio_recording) > 8000: audio = audio_recording[:8000] st.write( "This recording will be an audio input for the models to perform the classification task of speaker's gender and the digit spoken." " This recognition is based on the two ways an audio signal can be represented in." ) st.write("Waveform based predictions are being made and thus, are as follows:") predicted_gender = np.argmax( waveform_gender.predict(audio.reshape(1, 8000, 1)), axis=1 ) predicted_digit = np.argmax( waveform_digit.predict(audio.reshape(1, 8000, 1)), axis=1 ) gender = "INF" if predicted_gender == 0: gender = "Male" elif predicted_gender == 1: gender = "Female" st.write(f"Gender is {gender}.") st.write(f"Spoken digit is {predicted_digit[0]}.") st.write( "Spectrogram based predictions are being made and thus, are as follows:" ) f, t, Zxx = scipy.signal.stft( audio, 8000, nperseg=455, noverlap=393, window="hann" ) Zxx_mag = np.abs(Zxx) spect_final = np.atleast_3d(Zxx_mag) spect_mel = tfio.audio.melscale( spect_final, rate=8000, mels=128, fmin=0, fmax=4000 ) spect_log_mel = tf.math.log(spect_mel + 1e-6) spect_dB_scaled = tfio.audio.dbscale(spect_log_mel, top_db=80) resized = resize(spect_dB_scaled.numpy(), (128, 128, 1), preserve_range=True) resized = np.clip(resized, -80, 0) resized = (resized + 80) / 80.0 resized_spect = tf.reshape( tf.convert_to_tensor(resized, dtype=tf.float32), (1, 128, 128, 1) ) spect_predicted_gender = np.argmax( spectrogram_gender.predict(resized_spect), axis=1, ) spect_predicted_digit = np.argmax( spectrogram_digit.predict(resized_spect), axis=1, ) sgender = "INF" if spect_predicted_gender == 0: sgender = "Male" elif spect_predicted_gender == 1: sgender = "Female" st.write(f"Gender is {sgender}.") st.write(f"Spoken digit is {spect_predicted_digit[0]}.") status.empty() # else: # st.error("Failed to record your audio. Kindly record again. THANKS!") except Exception as e: print(f"Error occurred: {e}")