/* * Copyright (c) 2022 EdgeImpulse Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an "AS * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language * governing permissions and limitations under the License. * * SPDX-License-Identifier: Apache-2.0 */ #ifndef _EIDSP_SPEECHPY_PROCESSING_H_ #define _EIDSP_SPEECHPY_PROCESSING_H_ #include "../numpy.hpp" namespace ei { namespace speechpy { // one stack frame returned by stack_frames typedef struct ei_stack_frames_info { signal_t *signal; ei_vector frame_ixs; int frame_length; } stack_frames_info_t; namespace processing { /** * Lazy Preemphasising on the signal. * @param signal: The input signal. * @param shift (int): The shift step. * @param cof (float): The preemphasising coefficient. 0 equals to no filtering. */ class preemphasis { public: preemphasis(ei_signal_t *signal, int shift, float cof, bool rescale) : _signal(signal), _shift(shift), _cof(cof), _rescale(rescale) { _prev_buffer = (float*)ei_dsp_calloc(shift * sizeof(float), 1); _end_of_signal_buffer = (float*)ei_dsp_calloc(shift * sizeof(float), 1); _next_offset_should_be = 0; if (shift < 0) { _shift = signal->total_length + shift; } if (!_prev_buffer || !_end_of_signal_buffer) return; // we need to get the shift bytes from the end of the buffer... signal->get_data(signal->total_length - shift, shift, _end_of_signal_buffer); } /** * Get preemphasized data from the underlying audio buffer... * This retrieves data from the signal then preemphasizes it. * @param offset Offset in the audio signal * @param length Length of the audio signal */ int get_data(size_t offset, size_t length, float *out_buffer) { if (!_prev_buffer || !_end_of_signal_buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } if (offset + length > _signal->total_length) { EIDSP_ERR(EIDSP_OUT_OF_BOUNDS); } int ret; if (static_cast(offset) - _shift >= 0) { ret = _signal->get_data(offset - _shift, _shift, _prev_buffer); if (ret != 0) { EIDSP_ERR(ret); } } // else we'll use the end_of_signal_buffer; so no need to check ret = _signal->get_data(offset, length, out_buffer); if (ret != 0) { EIDSP_ERR(ret); } // it might be that everything is already normalized here... bool all_between_min_1_and_1 = true; // now we have the signal and we can preemphasize for (size_t ix = 0; ix < length; ix++) { float now = out_buffer[ix]; // under shift? read from end if (offset + ix < static_cast(_shift)) { out_buffer[ix] = now - (_cof * _end_of_signal_buffer[offset + ix]); } // otherwise read from history buffer else { out_buffer[ix] = now - (_cof * _prev_buffer[0]); } if (_rescale && all_between_min_1_and_1) { if (out_buffer[ix] < -1.0f || out_buffer[ix] > 1.0f) { all_between_min_1_and_1 = false; } } // roll through and overwrite last element if (_shift != 1) { numpy::roll(_prev_buffer, _shift, -1); } _prev_buffer[_shift - 1] = now; } _next_offset_should_be += length; // rescale from [-1 .. 1] ? if (_rescale && !all_between_min_1_and_1) { matrix_t scale_matrix(length, 1, out_buffer); ret = numpy::scale(&scale_matrix, 1.0f / 32768.0f); if (ret != 0) { EIDSP_ERR(ret); } } return EIDSP_OK; } ~preemphasis() { if (_prev_buffer) { ei_dsp_free(_prev_buffer, _shift * sizeof(float)); } if (_end_of_signal_buffer) { ei_dsp_free(_end_of_signal_buffer, _shift * sizeof(float)); } } private: ei_signal_t *_signal; int _shift; float _cof; float *_prev_buffer; float *_end_of_signal_buffer; size_t _next_offset_should_be; bool _rescale; }; } namespace processing { /** * Preemphasising on the signal. This modifies the signal in place! * For memory consumption reasons you **probably** want the preemphasis class, * which lazily loads the signal in. * @param signal (array): The input signal. * @param shift (int): The shift step. * @param cof (float): The preemphasising coefficient. 0 equals to no filtering. * @returns 0 when successful */ __attribute__((unused)) static int preemphasis(float *signal, size_t signal_size, int shift = 1, float cof = 0.98f) { if (shift < 0) { shift = signal_size + shift; } // so we need to keep some history float *prev_buffer = (float*)ei_dsp_calloc(shift * sizeof(float), 1); // signal - cof * xt::roll(signal, shift) for (size_t ix = 0; ix < signal_size; ix++) { float now = signal[ix]; // under shift? read from end if (ix < static_cast(shift)) { signal[ix] = now - (cof * signal[signal_size - shift + ix]); } // otherwise read from history buffer else { signal[ix] = now - (cof * prev_buffer[0]); } // roll through and overwrite last element numpy::roll(prev_buffer, shift, -1); prev_buffer[shift - 1] = now; } ei_dsp_free(prev_buffer, shift * sizeof(float)); return EIDSP_OK; } /** * frame_length is a float and can thus be off by a little bit, e.g. * frame_length = 0.018f actually can yield 0.018000011f * thus screwing up our frame calculations here... */ static float ceil_unless_very_close_to_floor(float v) { if (v > floor(v) && v - floor(v) < 0.001f) { v = (floor(v)); } else { v = (ceil(v)); } return v; } /** * Calculate the length of a signal that will be sused for the settings provided. * @param signal_size: The number of frames in the signal * @param sampling_frequency (int): The sampling frequency of the signal. * @param frame_length (float): The length of the frame in second. * @param frame_stride (float): The stride between frames. * @returns Number of frames required, or a negative number if an error occured */ static int calculate_signal_used( size_t signal_size, uint32_t sampling_frequency, float frame_length, float frame_stride, bool zero_padding, uint16_t version) { int frame_sample_length; int length; if (version == 1) { frame_sample_length = static_cast(round(static_cast(sampling_frequency) * frame_length)); frame_stride = round(static_cast(sampling_frequency) * frame_stride); length = frame_sample_length; } else { frame_sample_length = static_cast(ceil_unless_very_close_to_floor(static_cast(sampling_frequency) * frame_length)); float frame_stride_arg = frame_stride; frame_stride = ceil_unless_very_close_to_floor(static_cast(sampling_frequency) * frame_stride_arg); length = (frame_sample_length - (int)frame_stride); } volatile int numframes; volatile int len_sig; if (zero_padding) { // Calculation of number of frames numframes = static_cast( ceil(static_cast(signal_size - length) / frame_stride)); // Zero padding len_sig = static_cast(static_cast(numframes) * frame_stride) + frame_sample_length; } else { numframes = static_cast( floor(static_cast(signal_size - length) / frame_stride)); len_sig = static_cast( (static_cast(numframes - 1) * frame_stride + frame_sample_length)); } return len_sig; } /** * Frame a signal into overlapping frames. * @param info This is both the base object and where we'll store our results. * @param sampling_frequency (int): The sampling frequency of the signal. * @param frame_length (float): The length of the frame in second. * @param frame_stride (float): The stride between frames. * @param zero_padding (bool): If the samples is not a multiple of * frame_length(number of frames sample), zero padding will * be done for generating last frame. * @returns EIDSP_OK if OK */ static int stack_frames(stack_frames_info_t *info, float sampling_frequency, float frame_length, float frame_stride, bool zero_padding, uint16_t version) { if (!info->signal || !info->signal->get_data || info->signal->total_length == 0) { EIDSP_ERR(EIDSP_SIGNAL_SIZE_MISMATCH); } size_t length_signal = info->signal->total_length; int frame_sample_length; int length; if (version == 1) { frame_sample_length = static_cast(round(static_cast(sampling_frequency) * frame_length)); frame_stride = round(static_cast(sampling_frequency) * frame_stride); length = frame_sample_length; } else { frame_sample_length = static_cast(ceil_unless_very_close_to_floor(static_cast(sampling_frequency) * frame_length)); float frame_stride_arg = frame_stride; frame_stride = ceil_unless_very_close_to_floor(static_cast(sampling_frequency) * frame_stride_arg); length = (frame_sample_length - (int)frame_stride); } volatile int numframes; volatile int len_sig; if (zero_padding) { // Calculation of number of frames numframes = static_cast( ceil(static_cast(length_signal - length) / frame_stride)); // Zero padding len_sig = static_cast(static_cast(numframes) * frame_stride) + frame_sample_length; info->signal->total_length = static_cast(len_sig); } else { numframes = static_cast( floor(static_cast(length_signal - length) / frame_stride)); len_sig = static_cast( (static_cast(numframes - 1) * frame_stride + frame_sample_length)); info->signal->total_length = static_cast(len_sig); } info->frame_ixs.clear(); int frame_count = 0; for (size_t ix = 0; ix < static_cast(len_sig); ix += static_cast(frame_stride)) { if (++frame_count > numframes) break; info->frame_ixs.push_back(ix); } info->frame_length = frame_sample_length; return EIDSP_OK; } /** * Calculate the number of stack frames for the settings provided. * This is needed to allocate the right buffer size for the output of f.e. the MFE * blocks. * @param signal_size: The number of frames in the signal * @param sampling_frequency (int): The sampling frequency of the signal. * @param frame_length (float): The length of the frame in second. * @param frame_stride (float): The stride between frames. * @param zero_padding (bool): If the samples is not a multiple of * frame_length(number of frames sample), zero padding will * be done for generating last frame. * @returns Number of frames required, or a negative number if an error occured */ static int32_t calculate_no_of_stack_frames( size_t signal_size, uint32_t sampling_frequency, float frame_length, float frame_stride, bool zero_padding, uint16_t version) { int frame_sample_length; int length; if (version == 1) { frame_sample_length = static_cast(round(static_cast(sampling_frequency) * frame_length)); frame_stride = round(static_cast(sampling_frequency) * frame_stride); length = frame_sample_length; } else { frame_sample_length = static_cast(ceil_unless_very_close_to_floor(static_cast(sampling_frequency) * frame_length)); float frame_stride_arg = frame_stride; frame_stride = ceil_unless_very_close_to_floor(static_cast(sampling_frequency) * frame_stride_arg); length = (frame_sample_length - (int)frame_stride); } volatile int numframes; if (zero_padding) { // Calculation of number of frames numframes = static_cast( ceil(static_cast(signal_size - length) / frame_stride)); } else { numframes = static_cast( floor(static_cast(signal_size - length) / frame_stride)); } return numframes; } /** * This function performs local cepstral mean and * variance normalization on a sliding window. The code assumes that * there is one observation per row. * @param features_matrix input feature matrix, will be modified in place * @param win_size The size of sliding window for local normalization. * Default=301 which is around 3s if 100 Hz rate is * considered(== 10ms frame stide) * @param variance_normalization If the variance normilization should * be performed or not. * @param scale Scale output to 0..1 * @returns 0 if OK */ static int cmvnw(matrix_t *features_matrix, uint16_t win_size = 301, bool variance_normalization = false, bool scale = false) { if (win_size == 0) { return EIDSP_OK; } uint16_t pad_size = (win_size - 1) / 2; int ret; float *features_buffer_ptr; // mean & variance normalization EI_DSP_MATRIX(vec_pad, features_matrix->rows + (pad_size * 2), features_matrix->cols); if (!vec_pad.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } ret = numpy::pad_1d_symmetric(features_matrix, &vec_pad, pad_size, pad_size); if (ret != EIDSP_OK) { EIDSP_ERR(ret); } EI_DSP_MATRIX(mean_matrix, vec_pad.cols, 1); if (!mean_matrix.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } EI_DSP_MATRIX(window_variance, vec_pad.cols, 1); if (!window_variance.buffer) { return EIDSP_OUT_OF_MEM; } for (size_t ix = 0; ix < features_matrix->rows; ix++) { // create a slice on the vec_pad EI_DSP_MATRIX_B(window, win_size, vec_pad.cols, vec_pad.buffer + (ix * vec_pad.cols)); if (!window.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } ret = numpy::mean_axis0(&window, &mean_matrix); if (ret != EIDSP_OK) { EIDSP_ERR(ret); } // subtract the mean for the features for (size_t fm_col = 0; fm_col < features_matrix->cols; fm_col++) { features_matrix->buffer[(ix * features_matrix->cols) + fm_col] = features_matrix->buffer[(ix * features_matrix->cols) + fm_col] - mean_matrix.buffer[fm_col]; } } ret = numpy::pad_1d_symmetric(features_matrix, &vec_pad, pad_size, pad_size); if (ret != EIDSP_OK) { EIDSP_ERR(ret); } for (size_t ix = 0; ix < features_matrix->rows; ix++) { // create a slice on the vec_pad EI_DSP_MATRIX_B(window, win_size, vec_pad.cols, vec_pad.buffer + (ix * vec_pad.cols)); if (!window.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } if (variance_normalization == true) { ret = numpy::std_axis0(&window, &window_variance); if (ret != EIDSP_OK) { EIDSP_ERR(ret); } features_buffer_ptr = &features_matrix->buffer[ix * vec_pad.cols]; for (size_t col = 0; col < vec_pad.cols; col++) { *(features_buffer_ptr) = (*(features_buffer_ptr)) / (window_variance.buffer[col] + 1e-10); features_buffer_ptr++; } } } if (scale) { ret = numpy::normalize(features_matrix); if (ret != EIDSP_OK) { EIDSP_ERR(ret); } } return EIDSP_OK; } /** * Perform normalization for MFE frames, this converts the signal to dB, * then add a hard filter, and quantize / dequantize the output * @param features_matrix input feature matrix, will be modified in place */ static int mfe_normalization(matrix_t *features_matrix, int noise_floor_db) { const float noise = static_cast(noise_floor_db * -1); const float noise_scale = 1.0f / (static_cast(noise_floor_db * -1) + 12.0f); for (size_t ix = 0; ix < features_matrix->rows * features_matrix->cols; ix++) { float f = features_matrix->buffer[ix]; if (f < 1e-30) { f = 1e-30; } f = numpy::log10(f); f *= 10.0f; // scale by 10 f += noise; f *= noise_scale; // clip again /* Here is the python code we're duplicating: # Quantize to 8 bits and dequantize back to float32 mfe = np.uint8(np.around(mfe * 2**8)) # clip to 2**8 mfe = np.clip(mfe, 0, 255) mfe = np.float32(mfe / 2**8) */ f = roundf(f*256)/256; if (f < 0.0f) f = 0.0f; else if (f > 1.0f) f = 1.0f; features_matrix->buffer[ix] = f; } return EIDSP_OK; } /** * Perform normalization for spectrogram frames, this converts the signal to dB, * then add a hard filter * @param features_matrix input feature matrix, will be modified in place */ static int spectrogram_normalization(matrix_t *features_matrix, int noise_floor_db) { const float noise = static_cast(noise_floor_db * -1); const float noise_scale = 1.0f / (static_cast(noise_floor_db * -1) + 12.0f); for (size_t ix = 0; ix < features_matrix->rows * features_matrix->cols; ix++) { float f = features_matrix->buffer[ix]; if (f < 1e-30) { f = 1e-30; } f = numpy::log10(f); f *= 10.0f; // scale by 10 f += noise; f *= noise_scale; // clip again if (f < 0.0f) f = 0.0f; else if (f > 1.0f) f = 1.0f; features_matrix->buffer[ix] = f; } return EIDSP_OK; } }; } // namespace speechpy } // namespace ei #endif // _EIDSP_SPEECHPY_PROCESSING_H_