| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """Compute input examples for VGGish from audio waveform.""" |
|
|
| |
| import torch |
|
|
| import numpy as np |
| import resampy |
|
|
| from dataloader.audio.preprocess_vgg import mel_features |
| from dataloader.audio.preprocess_vgg import vggish_params |
|
|
| import soundfile as sf |
|
|
|
|
def waveform_to_examples(data, sample_rate, return_tensor=True):
    """Converts an audio waveform into a batch of log mel examples for VGGish.

    Args:
        data: np.array holding the waveform, either one-dimensional (mono) or
            two-dimensional with shape [num_samples, num_channels].
            Multi-channel input is downmixed to mono by averaging over
            axis 1. Each sample is generally expected to lie in the range
            [-1.0, +1.0], although this is not required.
        sample_rate: Sample rate of data. The waveform is resampled when this
            differs from vggish_params.SAMPLE_RATE.
        return_tensor: If true (the default), return a PyTorch tensor ready
            for VGGish; otherwise return the framed examples as produced by
            mel_features.frame.

    Returns:
        A sequence of examples, each of which contains a patch of log mel
        spectrogram covering num_frames frames of audio and num_bands mel
        frequency bands, where the frame length is
        vggish_params.STFT_HOP_LENGTH_SECONDS.

        With return_tensor false, this is the 3-D result of
        mel_features.frame, of shape [num_examples, num_frames, num_bands].
        With return_tensor true, it is a float32 torch tensor with a
        singleton axis inserted at position 1, i.e. of shape
        [num_examples, 1, num_frames, num_bands].
    """
    # Downmix to mono. Channels are expected on axis 1, which is the layout
    # wavfile_to_examples() passes in from soundfile.
    if data.ndim > 1:
        data = np.mean(data, axis=1)

    # Resample to the rate assumed by VGGish.
    target_rate = vggish_params.SAMPLE_RATE
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, target_rate)

    # Compute the log mel spectrogram of the (mono, resampled) waveform.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=target_rate,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Convert the example window/hop durations from seconds into a number of
    # spectrogram frames, then slice the spectrogram into examples.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))
    examples = mel_features.frame(
        log_mel,
        window_length=window_frames,
        hop_length=hop_frames)

    if not return_tensor:
        return examples

    # Insert a singleton axis at position 1 and cast to float32.
    return torch.tensor(examples, requires_grad=True)[:, None, :, :].float()
|
|
|
|
def wavfile_to_examples(wav_file, return_tensor=True):
    """Convenience wrapper around waveform_to_examples() for a common WAV format.

    Args:
        wav_file: String path to a file, or a file-like object. The file
            is assumed to contain WAV audio data with signed 16-bit PCM
            samples; it is read with soundfile as int16.
        return_tensor: Return data as a PyTorch tensor ready for VGGish.
            Forwarded unchanged to waveform_to_examples().

    Returns:
        See waveform_to_examples.
    """
    pcm, sr = sf.read(wav_file, dtype='int16')
    # Sanity check only: int16 was requested explicitly above, and the
    # scaling below depends on it.
    assert pcm.dtype == np.int16, 'Bad sample type: %r' % pcm.dtype
    # Scale int16 PCM to floats in [-1.0, +1.0).
    samples = pcm / 32768.0
    return waveform_to_examples(samples, sr, return_tensor)
|
|