import os

import librosa
import numpy as np
import python_speech_features
import textgrids
from tqdm import tqdm


def readLabels(path, sample_rate):
    """
    Reads a Praat TextGrid annotation and expands it into per-sample labels.

    Args:
        path (str): Path to the TextGrid file; it must contain a 'silences' tier.
        sample_rate (int): Sample rate used to convert interval durations to samples.

    Returns:
        np.ndarray: 1D array with one label per sample: 0 for intervals marked
            "-" or " " (silence), 1 otherwise (speech).
    """
    labeled_list = []
    grid = textgrids.TextGrid(path)

    for interval in grid['silences']:
        if interval.text in ("-", " "):
            label = 0
        else:
            label = 1

        # Convert the interval duration (seconds) into a number of samples.
        dur_samples = int(np.round(interval.dur * sample_rate))
        labeled_list.extend([label] * dur_samples)

    return np.array(labeled_list)

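# Illustrative note (hypothetical tier contents): for a 'silences' tier holding the
# intervals [0.0, 0.5) marked "-" and [0.5, 1.0) marked "x", readLabels with
# sample_rate=16000 returns 8000 zeros followed by 8000 ones.
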
def load_files(audio_path, audio_extension=".wav"):
    """
    Recursively loads audio files from a specified directory.

    Args:
        audio_path (str): The root directory to search for audio files.
        audio_extension (str, optional): The audio file extension to filter
            for (default is ".wav").

    Returns:
        list: A sorted list of full paths to the found audio files.

    Raises:
        FileNotFoundError: If the specified audio_path does not exist.
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio path '{audio_path}' not found.")

    audio_files = []

    for root, _, files in os.walk(audio_path):
        for file in files:
            if file.endswith(audio_extension):
                audio_files.append(os.path.join(root, file))

    return sorted(audio_files)


def max_signal_length(audio_files):
    """
    Determines the maximum signal length among a list of audio files.

    Args:
        audio_files (list): A list of paths to audio files.

    Returns:
        int: The maximum signal length found among the audio files.

    Raises:
        ValueError: If the input list is empty.
        IOError: If any audio file cannot be loaded.
    """
    if not audio_files:
        raise ValueError("Audio file list cannot be empty")

    max_length = 0
    for audio_file in audio_files:
        try:
            signal, _ = librosa.load(audio_file)
            max_length = max(max_length, len(signal))
        except Exception as e:
            raise IOError(f"Error loading audio file '{audio_file}': {e}")

    return max_length


def object_padding(array, length):
    """
    Pads or truncates a 1D time series to the given length.

    Args:
        array (np.ndarray): Time series to process.
        length (int): Desired length to pad/truncate the series to.

    Returns:
        np.ndarray: The processed time series of exactly `length` samples.
    """
    if len(array) < length:
        padding_length = length - len(array)
        # Zero-pad at the end to reach the requested length.
        array = np.pad(array, (0, padding_length), mode="constant")
    else:
        # Truncate to the requested length.
        array = array[:length]
    return np.array(array)

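# Illustrative behaviour of object_padding:
#   object_padding(np.array([1, 2, 3]), 5)     ->  array([1, 2, 3, 0, 0])  # zero-padded
#   object_padding(np.array([1, 2, 3, 4]), 2)  ->  array([1, 2])           # truncated
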
def fbank_features_extraction(audio_files, max_length, preemphasis_coef=0.97,
                              window_length=0.025, window_step=0.01,
                              window_function=np.hamming, num_nfft=551,
                              num_features=40):
    """
    Extracts log Mel-filterbank (fbank) features from a list of audio files.

    Args:
        audio_files (list): List of paths to audio files.
        max_length (int): Desired length to pad/truncate signals to.
        preemphasis_coef (float): Pre-emphasis filter coefficient (default: 0.97).
        window_length (float): Length of the analysis window in seconds (default: 0.025).
        window_step (float): Step between successive windows in seconds (default: 0.01).
        window_function (callable): Window function to apply (default: np.hamming).
        num_nfft (int): Number of FFT points (default: 551).
        num_features (int): Number of Mel filters (default: 40).

    Returns:
        np.ndarray: 3D array of shape (num_files, num_frames, num_features + 1),
            where the extra first column of each frame is its log energy.
    """
    fbank_features = list()
    for i in tqdm(range(len(audio_files))):
        signal, sample_rate = librosa.load(audio_files[i])
        signal = object_padding(signal, max_length)

        features_fbank, feature_energy = python_speech_features.base.fbank(signal=signal,
                                                                           samplerate=sample_rate,
                                                                           winlen=window_length,
                                                                           winstep=window_step,
                                                                           nfilt=num_features,
                                                                           nfft=num_nfft,
                                                                           lowfreq=0,
                                                                           highfreq=None,
                                                                           preemph=preemphasis_coef,
                                                                           winfunc=window_function)

        # Log-compress the filterbank energies and the per-frame energy.
        features_logfbank = np.log(features_fbank)
        feature_logenergy = np.log(feature_energy)

        # Prepend the log energy as the first column of each frame's feature vector.
        features = np.hstack((feature_logenergy.reshape(feature_logenergy.shape[0], 1), features_logfbank))

        fbank_features.append(features)

    return np.array(fbank_features)

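# Note on shapes (a sketch, assuming librosa's default 22 050 Hz sample rate): every
# padded signal yields the same number of frames, so stacking gives a 3D array of
# shape (num_files, num_frames, num_features + 1); with the defaults each frame
# vector has 41 values (log energy followed by 40 log Mel-filterbank energies).
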
def supervised_features_extraction(audio_files, annotation_files, max_length,
                                   preemphasis_coef=0.97, window_length=0.025,
                                   window_step=0.01, window_function=np.hamming,
                                   num_nfft=551, num_features=40):
    """
    Extracts log Mel-filterbank (fbank) features together with per-frame labels.

    Args:
        audio_files (list): List of paths to audio files.
        annotation_files (list): List of paths to TextGrid annotation files,
            aligned index-by-index with audio_files.
        max_length (int): Desired length to pad/truncate signals to.
        preemphasis_coef (float): Pre-emphasis filter coefficient (default: 0.97).
        window_length (float): Length of the analysis window in seconds (default: 0.025).
        window_step (float): Step between successive windows in seconds (default: 0.01).
        window_function (callable): Window function to apply (default: np.hamming).
        num_nfft (int): Number of FFT points (default: 551).
        num_features (int): Number of Mel filters (default: 40).

    Returns:
        tuple:
            np.ndarray: 3D array of shape (num_files, num_frames, num_features + 1),
                where the extra first column of each frame is its log energy.
            np.ndarray: 2D array of shape (num_files, num_frames) with one 0/1
                label per frame (majority vote of the per-sample labels).
    """
    fbank_features = list()
    labels = list()
    for i in tqdm(range(len(audio_files))):
        signal, sample_rate = librosa.load(audio_files[i])
        signal = object_padding(signal, max_length)
        truth_labels = readLabels(path=annotation_files[i], sample_rate=sample_rate)
        truth_labels = object_padding(truth_labels, max_length)

        features_fbank, feature_energy = python_speech_features.base.fbank(signal=signal,
                                                                           samplerate=sample_rate,
                                                                           winlen=window_length,
                                                                           winstep=window_step,
                                                                           nfilt=num_features,
                                                                           nfft=num_nfft,
                                                                           lowfreq=0,
                                                                           highfreq=None,
                                                                           preemph=preemphasis_coef,
                                                                           winfunc=window_function)

        features_logfbank = np.log(features_fbank)
        feature_logenergy = np.log(feature_energy)

        # Prepend the log energy as the first column of each frame's feature vector.
        features = np.hstack((feature_logenergy.reshape(feature_logenergy.shape[0], 1), features_logfbank))

        # Frame the per-sample labels with the same window length and step as the
        # features, then take a majority vote over the samples inside each frame.
        temp_label = python_speech_features.sigproc.framesig(sig=truth_labels,
                                                             frame_len=window_length * sample_rate,
                                                             frame_step=window_step * sample_rate,
                                                             winfunc=np.ones)
        label = np.array([1 if np.sum(temp_label[j]) > temp_label.shape[1] / 2 else 0
                          for j in range(temp_label.shape[0])])

        fbank_features.append(features)
        labels.append(label)

    return np.array(fbank_features), np.array(labels)
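

# A minimal end-to-end sketch of how these helpers fit together. The directory
# paths and the ".TextGrid" pairing below are assumptions for illustration; adapt
# them to the actual dataset layout, keeping audio and annotation lists aligned.
if __name__ == "__main__":
    audio_files = load_files("data/audio", ".wav")                    # hypothetical path
    annotation_files = load_files("data/annotations", ".TextGrid")    # hypothetical path

    # Pad/truncate every signal to the longest one in the corpus.
    max_length = max_signal_length(audio_files)

    # Unsupervised variant: features only.
    features = fbank_features_extraction(audio_files, max_length)

    # Supervised variant: features plus one speech/silence label per frame, assuming
    # audio_files[i] and annotation_files[i] refer to the same recording.
    features, labels = supervised_features_extraction(audio_files, annotation_files, max_length)

    print(features.shape, labels.shape)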