Update app.py
app.py
CHANGED
@@ -9,44 +9,32 @@ from scipy.io import wavfile
import gradio as gr

def audio_to_audio_frame_stack(sound_data, frame_length, hop_length_frame):
    """This function takes an audio signal and splits it into several frames,
    returning a numpy matrix of size (nb_frame, frame_length)."""
    sequence_sample_length = sound_data.shape[0]
    sound_data_list = [
        sound_data[start:start + frame_length]
        for start in range(0, sequence_sample_length - frame_length + 1, hop_length_frame)
    ]
    sound_data_array = np.vstack(sound_data_list)
    return sound_data_array
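
# A minimal shape check (illustrative): with frame_length = hop_length_frame = 8064,
#   >>> audio_to_audio_frame_stack(np.zeros(16128), 8064, 8064).shape
#   (2, 8064)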

def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame, min_duration):
    """This function takes audio files in a directory and merges them
    into a numpy matrix of size (nb_frame, frame_length) for a sliding window of size hop_length_frame."""
    list_sound_array = []
    for file in list_audio_files:
        y, sr = librosa.load(os.path.join(audio_dir, file), sr=sample_rate)
        total_duration = librosa.get_duration(y=y, sr=sr)

        if total_duration >= min_duration:
            list_sound_array.append(audio_to_audio_frame_stack(y, frame_length, hop_length_frame))
        else:
            print(f"The following file {os.path.join(audio_dir, file)} is below the min duration")
    return np.vstack(list_sound_array) if len(list_sound_array) > 0 else np.array([])

def blend_noise_randomly(voice, noise, nb_samples, frame_length):
    """This function randomly blends voice frames with noise frames."""
    prod_voice = np.zeros((nb_samples, frame_length))
    prod_noise = np.zeros((nb_samples, frame_length))
    prod_noisy_voice = np.zeros((nb_samples, frame_length))

@@ -61,188 +49,196 @@ def blend_noise_randomly(voice, noise, nb_samples, frame_length):

    return prod_voice, prod_noise, prod_noisy_voice

def audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, audio):
    """Convert audio into a spectrogram, returning the magnitude in dB and the phase."""
    stftaudio = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
    stftaudio_magnitude, stftaudio_phase = librosa.magphase(stftaudio)
    stftaudio_magnitude_db = librosa.amplitude_to_db(stftaudio_magnitude, ref=np.max)
    return stftaudio_magnitude_db, stftaudio_phase
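
# Note: librosa.magphase factors an STFT as S = magnitude * phase, where phase
# holds unit-magnitude complex values; magnitude and phase can therefore be
# processed separately and recombined by multiplication later.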

def numpy_audio_to_matrix_spectrogram(numpy_audio, dim_square_spec, n_fft, hop_length_fft):
    """Takes a numpy array of shape (nb_frame, frame_length) and returns
    the matrix spectrogram for amplitude in dB and phase (each of shape (nb_frame, dim_square_spec, dim_square_spec))."""
    nb_audio = numpy_audio.shape[0]
    m_mag_db = np.zeros((nb_audio, dim_square_spec, dim_square_spec))
    m_phase = np.zeros((nb_audio, dim_square_spec, dim_square_spec), dtype=complex)

    for i in range(nb_audio):
        m_mag_db[i, :, :], m_phase[i, :, :] = audio_to_magnitude_db_and_phase(
            n_fft, hop_length_fft, numpy_audio[i])
    return m_mag_db, m_phase

def magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, stftaudio_magnitude_db, stftaudio_phase):
    """Reverts a dB spectrogram to audio."""
    stftaudio_magnitude_rev = librosa.db_to_amplitude(stftaudio_magnitude_db, ref=1.0)
    audio_reverse_stft = stftaudio_magnitude_rev * stftaudio_phase
    audio_reconstruct = librosa.istft(audio_reverse_stft, hop_length=hop_length_fft, length=frame_length)
    return audio_reconstruct
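
# Note: length=frame_length makes librosa.istft trim or zero-pad the inverse
# transform to exactly frame_length samples, so every reconstructed frame has
# the same length and the np.vstack in matrix_spectrogram_to_numpy_audio stays valid.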

def matrix_spectrogram_to_numpy_audio(m_mag_db, m_phase, frame_length, hop_length_fft):
    """Reverts matrix spectrograms to a stacked numpy audio array."""
    list_audio = []
    nb_spec = m_mag_db.shape[0]

    for i in range(nb_spec):
        audio_reconstruct = magnitude_db_and_phase_to_audio(
            frame_length, hop_length_fft, m_mag_db[i], m_phase[i])
        list_audio.append(audio_reconstruct)
    return np.vstack(list_audio)

def scaled_in(matrix_spec):
    """Global scaling applied to noisy voice spectrograms (scale between -1 and 1)."""
    matrix_spec = (matrix_spec + 46) / 50
    return matrix_spec

def scaled_ou(matrix_spec):
    """Global scaling applied to noise model spectrograms (scale between -1 and 1)."""
    matrix_spec = (matrix_spec - 6) / 82
    return matrix_spec

def inv_scaled_in(matrix_spec):
    """Inverse global scaling applied to noisy voice spectrograms."""
    matrix_spec = matrix_spec * 50 - 46
    return matrix_spec

def inv_scaled_ou(matrix_spec):
    """Inverse global scaling applied to noise model spectrograms."""
    matrix_spec = matrix_spec * 82 + 6
    return matrix_spec
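
# Sanity check: each inverse undoes its forward scaler exactly, e.g.
# inv_scaled_in(scaled_in(x)) == ((x + 46) / 50) * 50 - 46 == x, and likewise
# inv_scaled_ou(scaled_ou(x)) == x.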

def prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction, audio_input_prediction,
               audio_output_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft):
    """Use pretrained weights to denoise a noisy voice audio, and save the result."""
    # Load model from JSON + weights
    json_file = open(os.path.join(weights_path, name_model + '.json'), 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(os.path.join(weights_path, name_model + '.h5'))
    print("Loaded model from disk")

    # Convert audio file(s) to numpy frames
    audio = audio_files_to_numpy(
        audio_dir_prediction,
        audio_input_prediction,
        sample_rate,
        frame_length,
        hop_length_frame,
        min_duration
    )

    if audio.size == 0:
        print("No valid audio frames found, skipping prediction.")
        return

    dim_square_spec = int(n_fft / 2) + 1
    # Create amplitude (dB) and phase
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(audio, dim_square_spec, n_fft, hop_length_fft)
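    # With the defaults used below (n_fft = 255), dim_square_spec = int(255 / 2) + 1 = 128,
    # so each 8064-sample frame maps to a square 128 x 128 (frequency x time)
    # spectrogram, matching the input size the UNet expects.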

    # Global scaling to get distribution -1 to 1
    X_in = scaled_in(m_amp_db_audio)
    # Reshape for model prediction
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)

    # Predict using loaded network
    X_pred = loaded_model.predict(X_in)
    # Rescale back the predicted noise
    inv_sca_X_pred = inv_scaled_ou(X_pred)

    # Remove noise model from noisy speech
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]

    # Reconstruct audio
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(X_denoise, m_pha_audio, frame_length, hop_length_fft)

    # Combine all frames into a single 1D array, scaled up
    nb_samples = audio_denoise_recons.shape[0]
    denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length) * 10

    # Save to disk
    sf.write(audio_output_prediction, denoise_long[0, :], sample_rate)
    print(f"Saved denoised audio to: {audio_output_prediction}")

def denoise_audio(audio_input):
    """
    Gradio callback function to denoise audio.
    `audio_input` can be None, a dict {"name", "sample_rate", "data"}, or a tuple (sr, data).
    """
    # 1) Handle None
    if audio_input is None:
        print("No audio was provided.")
        return None

    # 2) Handle dict vs tuple
    if isinstance(audio_input, dict):
        sr = audio_input["sample_rate"]
        data = audio_input["data"]
    else:
        sr, data = audio_input

    # Write out to a temp file
    temp_wav = "temp.wav"
    sf.write(temp_wav, data, sr)

    # Compute duration
    len_data = len(data)
    t = len_data / sr  # duration in seconds
    print("t:", t)

    # Paths & config
    weights_path = os.path.abspath("./")
    name_model = "model_unet"
    audio_dir_prediction = os.path.abspath("./")
    dir_save_prediction = os.path.abspath("./")
    audio_output_prediction = "test.wav"
    audio_input_prediction = [temp_wav]
    sample_rate = 8000  # model was trained at 8 kHz
    min_duration = t
    frame_length = 8064
    hop_length_frame = 8064
    n_fft = 255
    hop_length_fft = 63

    # Run prediction (denoising)
    prediction(weights_path, name_model,
               audio_dir_prediction,
               dir_save_prediction,
               audio_input_prediction,
               audio_output_prediction,
               sample_rate,
               min_duration,
               frame_length,
               hop_length_frame,
               n_fft,
               hop_length_fft)

    # Return the path to the denoised file so Gradio can play it
    return os.path.abspath(audio_output_prediction)

# Example pre-loaded sample files
examples = [
    [os.path.abspath("crowdNoise.wav")],
    [os.path.abspath("CrowdNoise2.wav")],
    [os.path.abspath("whiteNoise.wav")]
]

iface = gr.Interface(
    fn=denoise_audio,
    inputs="audio",
    outputs="audio",
    title="Audio to Denoised Audio Application",
    description=(
        "A simple application to denoise audio speech using a UNet model. "
        "Upload your own audio or click one of the examples to load it."
    ),
    article="""
    <div style="text-align: center">
    <p>All you need to do is to upload or record an audio file and hit 'Submit'.
    After processing, you can click 'Play' to hear the denoised audio.
    The audio is saved in WAV format.</p>
    </div>
    """,
    examples=examples
)

iface.launch()
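
# Local sanity check (a sketch, not part of the app): generate a noisy tone,
# run it through the callback, and listen to the result. Assumes model_unet.json
# and model_unet.h5 sit next to app.py, as the paths above expect.
#
#   import numpy as np
#   sr = 8000
#   tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(3 * sr) / sr)
#   noisy = tone + 0.1 * np.random.randn(3 * sr)
#   out_path = denoise_audio((sr, noisy.astype(np.float32)))
#   print(out_path)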