Commit e06adea · Parent: 385b498
Upload AudioPreprocessor.py

Files changed: AudioPreprocessor.py (+166 -0)

AudioPreprocessor.py (ADDED)
import librosa
import librosa.core as lb
import librosa.display as lbd
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pyloudnorm as pyln
import torch
from torchaudio.transforms import Resample


class AudioPreprocessor:

    def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False, device="cpu"):
        """
        The parameters are by default set up to do well
        on a 16kHz signal. A different sampling rate may
        require a different hop_length and n_fft (e.g.
        doubling the sampling rate --> doubling hop_length
        and doubling n_fft)
        """
        self.cut_silence = cut_silence
        self.device = device
        self.sr = input_sr
        self.new_sr = output_sr
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.mel_buckets = melspec_buckets
        self.meter = pyln.Meter(input_sr)
        self.final_sr = input_sr
        if cut_silence:
            torch.hub._validate_not_a_forked_repo = lambda a, b, c: True  # torch 1.9 has a bug in hub loading, this is a workaround
            # careful: assumes 16kHz or 8kHz audio
            self.silero_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                                      model='silero_vad',
                                                      force_reload=False,
                                                      onnx=False,
                                                      verbose=False)
            (self.get_speech_timestamps,
             self.save_audio,
             self.read_audio,
             self.VADIterator,
             self.collect_chunks) = utils
            self.silero_model = self.silero_model.to(self.device)
        if output_sr is not None and output_sr != input_sr:
            self.resample = Resample(orig_freq=input_sr, new_freq=output_sr).to(self.device)
            self.final_sr = output_sr
        else:
            self.resample = lambda x: x

    def cut_silence_from_audio(self, audio):
        """
        https://github.com/snakers4/silero-vad
        """
        return self.collect_chunks(self.get_speech_timestamps(audio, self.silero_model, sampling_rate=self.final_sr), audio)

    def to_mono(self, x):
        """
        make sure we deal with a 1D array
        """
        if len(x.shape) == 2:
            return lb.to_mono(numpy.transpose(x))
        else:
            return x

    def normalize_loudness(self, audio):
        """
        normalize the amplitudes according to
        their decibels, so this should turn any
        signal with different magnitudes into
        the same magnitude by analysing loudness
        """
        loudness = self.meter.integrated_loudness(audio)
        loud_normed = pyln.normalize.loudness(audio, loudness, -30.0)
        peak = numpy.amax(numpy.abs(loud_normed))
        peak_normed = numpy.divide(loud_normed, peak)
        return peak_normed

    def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10):
        """
        Compute log-Mel filterbank

        one day this could be replaced by torchaudio's internal log10(melspec(audio)), but
        for some reason it gives slightly different results, so in order not to break backwards
        compatibility, this is kept for now. If there is ever a reason to completely re-train
        all models, this would be a good opportunity to make the switch.
        """
        if isinstance(audio, torch.Tensor):
            audio = audio.numpy()
        # get amplitude spectrogram
        x_stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length, win_length=None, window="hann", pad_mode="reflect")
        spc = np.abs(x_stft).T
        # get mel basis
        fmin = 0 if fmin is None else fmin
        fmax = sampling_rate / 2 if fmax is None else fmax
        # keyword arguments keep this call working across librosa versions
        mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=self.n_fft, n_mels=self.mel_buckets, fmin=fmin, fmax=fmax)
        # apply log and return
        return torch.Tensor(np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))).transpose(0, 1)

    def normalize_audio(self, audio):
        """
        one function to apply them all in an
        order that makes sense.
        """
        audio = self.to_mono(audio)
        audio = self.normalize_loudness(audio)
        audio = torch.Tensor(audio).to(self.device)
        audio = self.resample(audio)
        if self.cut_silence:
            audio = self.cut_silence_from_audio(audio)
        return audio.to("cpu")

    def visualize_cleaning(self, unclean_audio):
        """
        displays the Mel spectrogram of the unclean audio
        and then the Mel spectrogram of the cleaned version.
        """
        fig, ax = plt.subplots(nrows=2, ncols=1)
        unclean_audio_mono = self.to_mono(unclean_audio)
        unclean_spec = self.audio_to_mel_spec_tensor(unclean_audio_mono, normalize=False).numpy()
        clean_spec = self.audio_to_mel_spec_tensor(unclean_audio_mono, normalize=True).numpy()
        lbd.specshow(unclean_spec, sr=self.sr, cmap='GnBu', y_axis='mel', ax=ax[0], x_axis='time')
        ax[0].set(title='Uncleaned Audio')
        ax[0].label_outer()
        if self.new_sr is not None:
            lbd.specshow(clean_spec, sr=self.new_sr, cmap='GnBu', y_axis='mel', ax=ax[1], x_axis='time')
        else:
            lbd.specshow(clean_spec, sr=self.sr, cmap='GnBu', y_axis='mel', ax=ax[1], x_axis='time')
        ax[1].set(title='Cleaned Audio')
        ax[1].label_outer()
        plt.show()

    def audio_to_wave_tensor(self, audio, normalize=True):
        if normalize:
            return self.normalize_audio(audio)
        else:
            if isinstance(audio, torch.Tensor):
                return audio
            else:
                return torch.Tensor(audio)

    def audio_to_mel_spec_tensor(self, audio, normalize=True, explicit_sampling_rate=None):
        """
        explicit_sampling_rate is for when
        normalization has already been applied
        and that included resampling. There is no
        other way to detect the current sr of the
        incoming audio.
        """
        if explicit_sampling_rate is None:
            if normalize:
                audio = self.normalize_audio(audio)
                return self.logmelfilterbank(audio=audio, sampling_rate=self.final_sr)
            return self.logmelfilterbank(audio=audio, sampling_rate=self.sr)
        if normalize:
            audio = self.normalize_audio(audio)
        return self.logmelfilterbank(audio=audio, sampling_rate=explicit_sampling_rate)


if __name__ == '__main__':
    import soundfile

    wav, sr = soundfile.read("../audios/test.wav")
    ap = AudioPreprocessor(input_sr=sr, output_sr=16000)
    ap.visualize_cleaning(wav)
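For context, here is a minimal usage sketch of the class added in this commit: it reads a wav file, builds a preprocessor that resamples to 16kHz, and extracts a normalized log-Mel spectrogram tensor. The file path and the alternative 32kHz parameter values are assumptions for illustration, not part of the commit.

import soundfile

# hypothetical input file; replace with any local wav file
wav, sr = soundfile.read("example.wav")

# the defaults (hop_length=256, n_fft=1024, 80 mel buckets) target a 16kHz signal
ap = AudioPreprocessor(input_sr=sr, output_sr=16000, cut_silence=False)

# mono mixdown -> loudness/peak normalization -> resampling -> log-Mel extraction
melspec = ap.audio_to_mel_spec_tensor(wav, normalize=True)  # shape: (80, frames)

# per the __init__ docstring, scaling the sampling rate scales hop_length and n_fft with it,
# e.g. a 32kHz target might use hop_length=512 and n_fft=2048 (assumed values):
# ap_32k = AudioPreprocessor(input_sr=sr, output_sr=32000, hop_length=512, n_fft=2048)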