Spaces:

Khalida1w
/

denoising

Running

App Files Files Community

khalida1wwin committed on Dec 30, 2022

Commit

33192bb

1 Parent(s): 81763a1

Add new files

Browse files

Files changed (1) hide show

app.py +240 -7

app.py CHANGED Viewed

@@ -1,9 +1,242 @@
 import gradio as gr
-gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech",
-                  description="TTS using FastSpeech2",
-                  outputs = 'audio',
-                  title="Text to Speech (TTS)",
-                  examples=[["The quick brown fox jumps over the lazy dog."]],
-                  article = "Author: <a href=\"https://huggingface.co/rowel\">Rowel Atienza</a>",
-                  ).launch()

+import librosa
+import tensorflow as tf
+from tensorflow.keras.models import model_from_json
+import soundfile as sf
+import numpy as np
+import os
+import scipy
+from scipy.io import wavfile
 import gradio as gr
+def audio_to_audio_frame_stack(sound_data, frame_length, hop_length_frame):
+    """Split an audio signal into a stack of fixed-length frames.
+
+    Windows of ``frame_length`` samples are taken every ``hop_length_frame``
+    samples along axis 0; a trailing remainder shorter than ``frame_length``
+    is dropped.
+
+    Args:
+        sound_data: numpy array of samples (assumed 1-D; confirm with callers).
+        frame_length: number of samples per frame.
+        hop_length_frame: step between the starts of consecutive frames.
+
+    Returns:
+        The frames stacked with np.vstack, i.e. (nb_frame, frame_length) for
+        a 1-D input.
+
+    Raises:
+        ValueError: if the signal is too short to hold a single frame.
+    """
+    sequence_sample_length = sound_data.shape[0]
+    frame_starts = range(0, sequence_sample_length - frame_length + 1, hop_length_frame)
+    sound_data_list = [sound_data[start:start + frame_length] for start in frame_starts]
+    if not sound_data_list:
+        # np.vstack([]) would raise an opaque "need at least one array" error.
+        raise ValueError(
+            f"audio has {sequence_sample_length} samples, fewer than one frame of {frame_length}")
+    return np.vstack(sound_data_list)
+def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame, min_duration):
+    """Load audio files from a directory and stack their frames into one matrix.
+
+    Each file is loaded with librosa at ``sample_rate`` and cut into frames by
+    ``audio_to_audio_frame_stack``. Files shorter than ``min_duration`` are
+    reported on stdout and skipped.
+
+    Args:
+        audio_dir: directory containing the audio files.
+        list_audio_files: file names inside ``audio_dir``.
+        sample_rate: rate passed to ``librosa.load`` as ``sr``.
+        frame_length: number of samples per frame.
+        hop_length_frame: step between the starts of consecutive frames.
+        min_duration: minimum duration a file must have to be used, compared
+            against ``librosa.get_duration``.
+
+    Returns:
+        The frames of all accepted files stacked with np.vstack.
+
+    Raises:
+        ValueError: if no file reaches ``min_duration``.
+    """
+    list_sound_array = []
+    for file_name in list_audio_files:
+        audio_path = os.path.join(audio_dir, file_name)
+        y, sr = librosa.load(audio_path, sr=sample_rate)
+        total_duration = librosa.get_duration(y=y, sr=sr)
+        if total_duration >= min_duration:
+            list_sound_array.append(audio_to_audio_frame_stack(
+                y, frame_length, hop_length_frame))
+        else:
+            print(
+                f"The following file {audio_path} is below the min duration")
+    if not list_sound_array:
+        # Fail with a readable message instead of np.vstack's empty-list error.
+        raise ValueError(
+            f"no audio file in {audio_dir} reaches the minimum duration of {min_duration}")
+    return np.vstack(list_sound_array)
+def blend_noise_randomly(voice, noise, nb_samples, frame_length):
+    """Build ``nb_samples`` random voice/noise mixtures.
+
+    For every output row one voice frame and one noise frame are picked at
+    random, the noise is scaled by a factor drawn uniformly from [0.2, 0.8),
+    and the two are added.
+
+    Returns:
+        Tuple (prod_voice, prod_noise, prod_noisy_voice), each an array of
+        shape (nb_samples, frame_length).
+    """
+    out_shape = (nb_samples, frame_length)
+    prod_voice = np.zeros(out_shape)
+    prod_noise = np.zeros(out_shape)
+    prod_noisy_voice = np.zeros(out_shape)
+    nb_voice_frames = voice.shape[0]
+    nb_noise_frames = noise.shape[0]
+    for row in range(nb_samples):
+        # Draw order (voice index, noise index, level) is kept so seeded runs match.
+        voice_idx = np.random.randint(0, nb_voice_frames)
+        noise_idx = np.random.randint(0, nb_noise_frames)
+        noise_gain = np.random.uniform(0.2, 0.8)
+        prod_voice[row, :] = voice[voice_idx, :]
+        prod_noise[row, :] = noise_gain * noise[noise_idx, :]
+        # Sum the stored rows so the addition happens in the output dtype.
+        prod_noisy_voice[row, :] = prod_voice[row, :] + prod_noise[row, :]
+    return prod_voice, prod_noise, prod_noisy_voice
+def audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, audio):
+    """Convert an audio signal to a dB magnitude spectrogram plus its phase.
+
+    The signal goes through ``librosa.stft``, is split with
+    ``librosa.magphase``, and the magnitude is converted to dB relative to
+    its own maximum (``ref=np.max``).
+
+    Returns:
+        Tuple (magnitude_db, phase).
+    """
+    spectrum = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
+    magnitude, phase = librosa.magphase(spectrum)
+    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)
+    return magnitude_db, phase
+def numpy_audio_to_matrix_spectrogram(numpy_audio, dim_square_spec, n_fft, hop_length_fft):
+    """Compute dB magnitude and phase spectrograms for every frame.
+
+    Args:
+        numpy_audio: array of frames, one per row.
+        dim_square_spec: side length of each (square) spectrogram.
+        n_fft: FFT size forwarded to the STFT.
+        hop_length_fft: STFT hop length.
+
+    Returns:
+        Tuple (m_mag_db, m_phase), both of shape
+        (nb_frame, dim_square_spec, dim_square_spec); the phase array is complex.
+    """
+    nb_audio = numpy_audio.shape[0]
+    spec_shape = (nb_audio, dim_square_spec, dim_square_spec)
+    m_mag_db = np.zeros(spec_shape)
+    m_phase = np.zeros(spec_shape, dtype=complex)
+    for frame_idx in range(nb_audio):
+        frame_mag_db, frame_phase = audio_to_magnitude_db_and_phase(
+            n_fft, hop_length_fft, numpy_audio[frame_idx])
+        m_mag_db[frame_idx, :, :] = frame_mag_db
+        m_phase[frame_idx, :, :] = frame_phase
+    return m_mag_db, m_phase
+def magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, stftaudio_magnitude_db, stftaudio_phase):
+    """Rebuild an audio frame from a dB magnitude spectrogram and its phase.
+
+    The dB magnitude is converted back to amplitude (``ref=1.0``), multiplied
+    by the phase to recover the complex STFT, and inverted with an output
+    length of ``frame_length`` samples.
+    """
+    magnitude = librosa.db_to_amplitude(stftaudio_magnitude_db, ref=1.0)
+    complex_stft = magnitude * stftaudio_phase
+    return librosa.core.istft(complex_stft, hop_length=hop_length_fft, length=frame_length)
+def matrix_spectrogram_to_numpy_audio(m_mag_db, m_phase, frame_length, hop_length_fft):
+    """Convert stacks of dB magnitude and phase spectrograms back to audio frames.
+
+    Each (magnitude, phase) pair along axis 0 is inverted with
+    ``magnitude_db_and_phase_to_audio`` and the frames are stacked with np.vstack.
+    """
+    nb_spec = m_mag_db.shape[0]
+    frames = [
+        magnitude_db_and_phase_to_audio(
+            frame_length, hop_length_fft, m_mag_db[spec_idx], m_phase[spec_idx])
+        for spec_idx in range(nb_spec)
+    ]
+    return np.vstack(frames)
+def scaled_in(matrix_spec):
+    """Apply the global input scaling ``(x + 46) / 50`` to noisy-voice spectrograms.
+
+    Per the original note, this is meant to bring values to about -1..1.
+    """
+    return (matrix_spec + 46) / 50
+def scaled_ou(matrix_spec):
+    """Apply the global output scaling ``(x - 6) / 82`` to noise-model spectrograms.
+
+    Per the original note, this is meant to bring values to about -1..1.
+    """
+    return (matrix_spec - 6) / 82
+def inv_scaled_in(matrix_spec):
+    """Undo the input scaling for noisy-voice spectrograms: ``x * 50 - 46``."""
+    return matrix_spec * 50 - 46
+def inv_scaled_ou(matrix_spec):
+    """Undo the output scaling for noise-model spectrograms: ``x * 82 + 6``."""
+    return matrix_spec * 82 + 6
+def prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction, audio_input_prediction,
+audio_output_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft):
+    """Denoise audio files with a saved Keras model and write the result to disk.
+
+    The model architecture is read from ``<name_model>.json`` and its weights
+    from ``<name_model>.h5``, both inside ``weights_path``. The input files are
+    framed, converted to dB spectrograms, and passed through the model, whose
+    output is treated as a noise spectrogram and subtracted from the noisy
+    one. The remaining spectrograms are inverted back to audio, concatenated,
+    and saved as a single file.
+
+    Args:
+        weights_path: directory holding the model's .json and .h5 files.
+        name_model: base name (no extension) of the model files.
+        audio_dir_prediction: directory containing the input audio files.
+        dir_save_prediction: directory the denoised file is written to.
+        audio_input_prediction: list of input file names.
+        audio_output_prediction: file name of the denoised output.
+        sample_rate: rate used to load the inputs and to write the output.
+        min_duration: minimum duration an input file must have to be used.
+        frame_length: number of samples per frame.
+        hop_length_frame: step between the starts of consecutive frames.
+        n_fft: FFT size; the spectrogram side is ``int(n_fft / 2) + 1``.
+        hop_length_fft: STFT hop length.
+    """
+    # Load the architecture; the context manager closes the file even if read() fails.
+    with open(os.path.join(weights_path, name_model + '.json'), 'r') as json_file:
+        loaded_model_json = json_file.read()
+    loaded_model = model_from_json(loaded_model_json)
+    # Load the trained weights into the rebuilt model.
+    loaded_model.load_weights(os.path.join(weights_path, name_model + '.h5'))
+    print("Loaded model from disk")
+    # Load the input files and cut them into frames.
+    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction, sample_rate,
+                                 frame_length, hop_length_frame, min_duration)
+    # Side length of the square spectrogram.
+    dim_square_spec = int(n_fft / 2) + 1
+    print(dim_square_spec)
+    # dB magnitude and phase of every frame.
+    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
+        audio, dim_square_spec, n_fft, hop_length_fft)
+    # Global scaling of the network input.
+    X_in = scaled_in(m_amp_db_audio)
+    # Add a trailing channel axis for the network.
+    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)
+    X_pred = loaded_model.predict(X_in)
+    # Bring the predicted noise model back to the dB scale.
+    inv_sca_X_pred = inv_scaled_ou(X_pred)
+    # Remove the predicted noise from the noisy spectrogram.
+    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]
+    print(X_denoise.shape)
+    print(m_pha_audio.shape)
+    print(frame_length)
+    print(hop_length_fft)
+    # Reconstruct audio from the denoised magnitude and the original phase.
+    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(X_denoise, m_pha_audio, frame_length, hop_length_fft)
+    nb_samples = audio_denoise_recons.shape[0]
+    # Concatenate all frames into one signal; a fixed gain of 10 is applied.
+    # NOTE(review): the rationale for the factor 10 is not visible here.
+    denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length) * 10
+    # os.path.join works whether or not the directory ends with a separator.
+    output_path = os.path.join(dir_save_prediction, audio_output_prediction)
+    print(output_path)
+    sf.write(output_path, denoise_long[0, :], sample_rate)
+def denoise_audio(audioName):
+    """Denoise ``<audioName>.wav`` from the prod directory and return the output path.
+
+    The input file is read once with ``wavfile.read`` to compute its duration,
+    which is passed to ``prediction`` as the minimum accepted duration. The
+    denoised audio is written by ``prediction`` as ``test<audioName>.wav`` in
+    the same directory.
+
+    Args:
+        audioName: base name (no extension) of a .wav file in the prod directory.
+
+    Returns:
+        Full path of the denoised file. The bare file name would only resolve
+        if the working directory happened to be the prod directory.
+    """
+    prod_dir = "/content/drive/MyDrive/projects/resume projects/denoising2/prod/"
+    input_file_name = audioName + ".wav"
+    audio_output_prediction = "test" + audioName + ".wav"
+    # Duration of the source file at its native sample rate, in seconds.
+    source_rate, data = wavfile.read(prod_dir + input_file_name)
+    t = len(data) / source_rate
+    print("t:", t)
+    name_model = "model_unet"
+    sample_rate = 8000
+    frame_length = 8064
+    hop_length_frame = 8064
+    n_fft = 255
+    hop_length_fft = 63
+    # The prod directory holds the weights, the input, and the output.
+    prediction(prod_dir, name_model, prod_dir, prod_dir, [input_file_name],
+               audio_output_prediction, sample_rate, t, frame_length, hop_length_frame, n_fft, hop_length_fft)
+    output_path = os.path.join(prod_dir, audio_output_prediction)
+    print(output_path)
+    return output_path
+# Example inputs shown below the interface.
+examples = [
+    [os.path.abspath("3.wav")],
+    [os.path.abspath("2.wav")]
+]
+# Gradio UI: one audio input, one audio output, backed by denoise_audio.
+iface = gr.Interface(fn = denoise_audio,
+                     inputs = 'audio',
+                     outputs = 'audio',
+                     verbose = True,
+                     title = 'audio to denoised Audio Application',
+                     description = 'A simple application to denoise audio speech using UNet deep learning model. Upload your own audio, or click one of the examples to load them.',
+                     article =
+                        '''<div>
+                            <p style="text-align: center"> All you need to do is to upload the audio file and hit submit, then wait for compiling. After that click on Play/Pause for listening to the audio. The audio is saved in a wav format.</p>
+                        </div>''',
+                     examples=examples
+                    )
+iface.launch()