| """Audio utils""" |
| import numpy as np |
| import matplotlib.pyplot as plt |
|
|
|
|
| def load_audio(audio_path: str, sr: int = None, max_duration: int = 10., start: int = 0, stop: int = None): |
| """Loads audio and pads/trims it to max_duration""" |
| import librosa |
| data, sr = librosa.load(audio_path, sr=sr) |
| |
| if stop is not None: |
| start = int(start * sr) |
| stop = int(stop * sr) |
| data = data[start:stop] |
| |
| |
| if len(data.shape) > 1: |
| data = np.mean(data, axis=1) |
| |
| n_frames = int(max_duration * sr) |
| if len(data) > n_frames: |
| data = data[:n_frames] |
| elif len(data) < n_frames: |
| data = np.pad(data, (0, n_frames - len(data)), "constant") |
| return data, sr |
| |
|
|
| |
| |
| |
| |
|
|
|
|
| def compute_spec_freq_mean(S_db: np.ndarray, eps=1e-5): |
| |
| S_db_normalized = (S_db - S_db.mean(axis=1)[:, None]) / (S_db.std(axis=1)[:, None] + eps) |
| S_db_over_time = S_db_normalized.sum(axis=0) |
| return S_db_over_time |
|
|
|
|
| def process_audiofile(audio_path, functions=["load_audio", "compute_spectrogram", "compute_spec_freq_mean"]): |
| """Processes audio file with a list of functions""" |
| data, sr = load_audio(audio_path) |
| for function in functions: |
| if function == "load_audio": |
| pass |
| elif function == "compute_spectrogram": |
| data = compute_spectrogram(data, sr) |
| elif function == "compute_spec_freq_mean": |
| data = compute_spec_freq_mean(data) |
| else: |
| raise ValueError(f"Unknown function {function}") |
| return data |
|
|
|
|
|
|
| """PyDub's silence detection is based on the energy of the audio signal.""" |
| import numpy as np |
|
|
|
|
| def sigmoid(x): |
| return 1 / (1 + np.exp(-x)) |
|
|
|
|
| class SilenceDetector: |
| |
|
|
| def __init__(self, silence_thresh=-36) -> None: |
| self.silence_thresh = silence_thresh |
| |
| def __call__(self, audio_path: str, start=None, end=None): |
| |
| import pydub |
| from pydub.utils import db_to_float |
| |
| try: |
| waveform = pydub.AudioSegment.from_file(audio_path) |
| except: |
| print("Error loading audio file: ", audio_path) |
| return 100.0 |
| |
| start_ms = int(start * 1000) if start else 0 |
| end_ms = int(end * 1000) if end else len(waveform) |
| waveform = waveform[start_ms:end_ms] |
| |
| |
| silence_thresh = db_to_float(self.silence_thresh) * waveform.max_possible_amplitude |
| |
| if waveform.rms == 0: |
| return 100.0 |
|
|
| silence_prob = sigmoid((silence_thresh - waveform.rms) / waveform.rms) |
|
|
| |
| return np.round(100 * silence_prob, 2) |
|
|
|
|
| def frequency_bin_to_value(bin_index, sr, n_fft): |
| return int(bin_index * sr / n_fft) |
|
|
|
|
| def time_bin_to_value(bin_index, hop_length, sr): |
| return (bin_index) * (hop_length / sr) |
|
|
|
|
| def add_time_annotations(ax, nt_bins, hop_length, sr, skip=50): |
| |
| t_bins = np.arange(nt_bins) |
| t_vals = np.round(np.array([time_bin_to_value(tb, hop_length, sr) for tb in t_bins]), 1) |
| try: |
| ax.set_xticks(t_bins[::skip], t_vals[::skip]) |
| except: |
| pass |
| ax.set_xlabel("Time (s)") |
|
|
|
|
| def add_freq_annotations(ax, nf_bins, sr, n_fft, skip=50): |
| f_bins = np.arange(nf_bins) |
| f_vals = np.array([frequency_bin_to_value(fb, sr, n_fft) for fb in f_bins]) |
| try: |
| ax.set_yticks(f_bins[::skip], f_vals[::skip]) |
| except: |
| pass |
| |
| |
| ax.set_ylabel("Frequency (Hz)") |
|
|
|
|
| def show_single_spectrogram( |
| spec, |
| sr, |
| n_fft, |
| hop_length, |
| ax=None, |
| fig=None, |
| figsize=(10, 2), |
| cmap="viridis", |
| colorbar=True, |
| show=True, |
| format='%+2.0f dB', |
| xlabel='Time (s)', |
| ylabel="Frequency (Hz)", |
| title=None, |
| show_dom_freq=False, |
| ): |
|
|
| if ax is None: |
| fig, ax = plt.subplots(1, 1, figsize=figsize) |
| axim = ax.imshow(spec, origin="lower", cmap=cmap) |
|
|
| |
| nf_bins, nt_bins = spec.shape |
|
|
| if "frequency" in ylabel.lower(): |
| |
| add_freq_annotations(ax, nf_bins, sr, n_fft) |
|
|
| |
| add_time_annotations(ax, nt_bins, hop_length, sr) |
|
|
| ax.set_title(title) |
| ax.set_xlabel(xlabel) |
| ax.set_ylabel(ylabel) |
|
|
| if colorbar: |
| fig.colorbar(axim, ax=ax, orientation='vertical', fraction=0.01, format=format) |
|
|
| if show_dom_freq: |
| fmax = spec.argmax(axis=0) |
| ax.scatter(np.arange(spec.shape[1]), fmax, color="white", s=0.2) |
|
|
| if show: |
| plt.show() |
|
|
|
|
| def compute_spectrogram(y, n_fft, hop_length, margin, n_mels=None): |
| import librosa |
|
|
| |
| D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length) |
|
|
| |
| S, _ = librosa.decompose.hpss(D, margin=margin) |
|
|
| |
| S = librosa.amplitude_to_db(np.abs(S), ref=np.max) |
|
|
| if n_mels is not None: |
| S = librosa.feature.melspectrogram(S=S, n_mels=n_mels) |
|
|
| return S |
|
|
|
|
| def show_spectrogram(S, sr, n_fft=512, hop_length=256, figsize=(10, 3), n_mels=None, ax=None, show=True): |
| import librosa |
| if ax is None: |
| fig, ax = plt.subplots(1, 1, figsize=figsize) |
| y_axis = "mel" if n_mels is not None else "linear" |
| librosa.display.specshow( |
| S, |
| sr=sr, |
| hop_length=hop_length, |
| n_fft=n_fft, |
| y_axis=y_axis, |
| x_axis='time', |
| ax=ax, |
| ) |
| ax.set_title("LogSpectrogram" if n_mels is None else "LogMelSpectrogram") |
| if show: |
| plt.show() |
|
|
|
|
| def show_frame_and_spectrogram(frame, S, sr, figsize=(12, 4), show=True, axes=None, **spec_args): |
| if axes is None: |
| fig, axes = plt.subplots(1, 2, figsize=figsize, gridspec_kw={"width_ratios": [0.2, 0.8]}) |
| ax = axes[0] |
| ax.imshow(frame) |
| ax.set_xticks([]) |
| ax.set_yticks([]) |
|
|
| ax = axes[1] |
| show_spectrogram(S=S, sr=sr, ax=ax, show=False, **spec_args) |
|
|
| plt.tight_layout() |
|
|
| if show: |
| plt.show() |