/* * Copyright (c) 2022 EdgeImpulse Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an "AS * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language * governing permissions and limitations under the License. * * SPDX-License-Identifier: Apache-2.0 */ #ifndef _EDGE_IMPULSE_RUN_DSP_H_ #define _EDGE_IMPULSE_RUN_DSP_H_ #include "edge-impulse-sdk/classifier/ei_model_types.h" #include "edge-impulse-sdk/dsp/spectral/spectral.hpp" #include "edge-impulse-sdk/dsp/speechpy/speechpy.hpp" #include "edge-impulse-sdk/classifier/ei_signal_with_range.h" #include "model-parameters/model_metadata.h" #if defined(__cplusplus) && EI_C_LINKAGE == 1 extern "C" { extern void ei_printf(const char *format, ...); } #else extern void ei_printf(const char *format, ...); #endif #ifdef __cplusplus namespace { #endif // __cplusplus using namespace ei; #if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE) float ei_dsp_image_buffer[EI_DSP_IMAGE_BUFFER_STATIC_SIZE]; #endif // this is the frame we work on... allocate it statically so we share between invocations static float *ei_dsp_cont_current_frame = nullptr; static size_t ei_dsp_cont_current_frame_size = 0; static int ei_dsp_cont_current_frame_ix = 0; __attribute__((unused)) int extract_spectral_analysis_features( signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) { ei_dsp_config_spectral_analysis_t *config = (ei_dsp_config_spectral_analysis_t *)config_ptr; // input matrix from the raw signal matrix_t input_matrix(signal->total_length / config->axes, config->axes); if (!input_matrix.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } signal->get_data(0, signal->total_length, input_matrix.buffer); #if EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_WAVELET || EI_DSP_PARAMS_ALL if (strcmp(config->analysis_type, "Wavelet") == 0) { return spectral::wavelet::extract_wavelet_features(&input_matrix, output_matrix, config, frequency); } #endif #if EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_FFT || EI_DSP_PARAMS_ALL if (strcmp(config->analysis_type, "FFT") == 0) { if (config->implementation_version == 1) { return spectral::feature::extract_spectral_analysis_features_v1( &input_matrix, output_matrix, config, frequency); } else if (config->implementation_version == 4) { return spectral::feature::extract_spectral_analysis_features_v4( &input_matrix, output_matrix, config, frequency); } else { return spectral::feature::extract_spectral_analysis_features_v2( &input_matrix, output_matrix, config, frequency); } } #endif #if !EI_DSP_PARAMS_GENERATED || EI_DSP_PARAMS_ALL || !(EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_FFT || EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_WAVELET) if (config->implementation_version == 1) { return spectral::feature::extract_spectral_analysis_features_v1( &input_matrix, output_matrix, config, frequency); } if (config->implementation_version == 2) { return spectral::feature::extract_spectral_analysis_features_v2( &input_matrix, output_matrix, config, frequency); } #endif return EIDSP_NOT_SUPPORTED; } __attribute__((unused)) int extract_raw_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) { ei_dsp_config_raw_t config = *((ei_dsp_config_raw_t*)config_ptr); // Because of rounding errors during re-sampling the output size of the block might be // smaller than the input of the block. Make sure we don't write outside of the bounds // of the array: // https://forum.edgeimpulse.com/t/using-custom-sensors-on-raspberry-pi-4/3506/7 size_t els_to_copy = signal->total_length; if (els_to_copy > output_matrix->rows * output_matrix->cols) { els_to_copy = output_matrix->rows * output_matrix->cols; } signal->get_data(0, els_to_copy, output_matrix->buffer); // scale the signal int ret = numpy::scale(output_matrix, config.scale_axes); if (ret != EIDSP_OK) { EIDSP_ERR(ret); } return EIDSP_OK; } __attribute__((unused)) int extract_flatten_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) { ei_dsp_config_flatten_t config = *((ei_dsp_config_flatten_t*)config_ptr); uint32_t expected_matrix_size = 0; if (config.average) expected_matrix_size += config.axes; if (config.minimum) expected_matrix_size += config.axes; if (config.maximum) expected_matrix_size += config.axes; if (config.rms) expected_matrix_size += config.axes; if (config.stdev) expected_matrix_size += config.axes; if (config.skewness) expected_matrix_size += config.axes; if (config.kurtosis) expected_matrix_size += config.axes; if (output_matrix->rows * output_matrix->cols != expected_matrix_size) { EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } int ret; // input matrix from the raw signal matrix_t input_matrix(signal->total_length / config.axes, config.axes); if (!input_matrix.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } signal->get_data(0, signal->total_length, input_matrix.buffer); // scale the signal ret = numpy::scale(&input_matrix, config.scale_axes); if (ret != EIDSP_OK) { ei_printf("ERR: Failed to scale signal (%d)\n", ret); EIDSP_ERR(ret); } // transpose the matrix so we have one row per axis (nifty!) ret = numpy::transpose(&input_matrix); if (ret != EIDSP_OK) { ei_printf("ERR: Failed to transpose matrix (%d)\n", ret); EIDSP_ERR(ret); } size_t out_matrix_ix = 0; for (size_t row = 0; row < input_matrix.rows; row++) { matrix_t row_matrix(1, input_matrix.cols, input_matrix.buffer + (row * input_matrix.cols)); if (config.average) { float fbuffer; matrix_t out_matrix(1, 1, &fbuffer); numpy::mean(&row_matrix, &out_matrix); output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0]; } if (config.minimum) { float fbuffer; matrix_t out_matrix(1, 1, &fbuffer); numpy::min(&row_matrix, &out_matrix); output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0]; } if (config.maximum) { float fbuffer; matrix_t out_matrix(1, 1, &fbuffer); numpy::max(&row_matrix, &out_matrix); output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0]; } if (config.rms) { float fbuffer; matrix_t out_matrix(1, 1, &fbuffer); numpy::rms(&row_matrix, &out_matrix); output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0]; } if (config.stdev) { float fbuffer; matrix_t out_matrix(1, 1, &fbuffer); numpy::stdev(&row_matrix, &out_matrix); output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0]; } if (config.skewness) { float fbuffer; matrix_t out_matrix(1, 1, &fbuffer); numpy::skew(&row_matrix, &out_matrix); output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0]; } if (config.kurtosis) { float fbuffer; matrix_t out_matrix(1, 1, &fbuffer); numpy::kurtosis(&row_matrix, &out_matrix); output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0]; } } // flatten again output_matrix->cols = output_matrix->rows * output_matrix->cols; output_matrix->rows = 1; return EIDSP_OK; } static class speechpy::processing::preemphasis *preemphasis; static int preemphasized_audio_signal_get_data(size_t offset, size_t length, float *out_ptr) { return preemphasis->get_data(offset, length, out_ptr); } __attribute__((unused)) int extract_mfcc_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) { ei_dsp_config_mfcc_t config = *((ei_dsp_config_mfcc_t*)config_ptr); if (config.axes != 1) { EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } if((config.implementation_version == 0) || (config.implementation_version > 4)) { EIDSP_ERR(EIDSP_BLOCK_VERSION_INCORRECT); } if (signal->total_length == 0) { EIDSP_ERR(EIDSP_PARAMETER_INVALID); } const uint32_t frequency = static_cast(sampling_frequency); // preemphasis class to preprocess the audio... class speechpy::processing::preemphasis pre(signal, config.pre_shift, config.pre_cof, false); preemphasis = ⪯ signal_t preemphasized_audio_signal; preemphasized_audio_signal.total_length = signal->total_length; preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data; // calculate the size of the MFCC matrix matrix_size_t out_matrix_size = speechpy::feature::calculate_mfcc_buffer_size( signal->total_length, frequency, config.frame_length, config.frame_stride, config.num_cepstral, config.implementation_version); /* Only throw size mismatch error calculated buffer doesn't fit for continuous inferencing */ if (out_matrix_size.rows * out_matrix_size.cols > output_matrix->rows * output_matrix->cols) { ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols); ei_printf("calculated size = %dx%d\n", (int)out_matrix_size.rows, (int)out_matrix_size.cols); EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } output_matrix->rows = out_matrix_size.rows; output_matrix->cols = out_matrix_size.cols; // and run the MFCC extraction int ret = speechpy::feature::mfcc(output_matrix, &preemphasized_audio_signal, frequency, config.frame_length, config.frame_stride, config.num_cepstral, config.num_filters, config.fft_length, config.low_frequency, config.high_frequency, true, config.implementation_version); if (ret != EIDSP_OK) { ei_printf("ERR: MFCC failed (%d)\n", ret); EIDSP_ERR(ret); } // cepstral mean and variance normalization ret = speechpy::processing::cmvnw(output_matrix, config.win_size, true, false); if (ret != EIDSP_OK) { ei_printf("ERR: cmvnw failed (%d)\n", ret); EIDSP_ERR(ret); } output_matrix->cols = out_matrix_size.rows * out_matrix_size.cols; output_matrix->rows = 1; return EIDSP_OK; } static int extract_mfcc_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfcc_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out, int implementation_version) { uint32_t frequency = (uint32_t)sampling_frequency; int x; // calculate the size of the spectrogram matrix matrix_size_t out_matrix_size = speechpy::feature::calculate_mfcc_buffer_size( signal->total_length, frequency, config->frame_length, config->frame_stride, config->num_cepstral, implementation_version); // we roll the output matrix back so we have room at the end... x = numpy::roll(output_matrix->buffer, output_matrix->rows * output_matrix->cols, -(out_matrix_size.rows * out_matrix_size.cols)); if (x != EIDSP_OK) { EIDSP_ERR(x); } // slice in the output matrix to write to // the offset in the classification matrix here is always at the end size_t output_matrix_offset = (output_matrix->rows * output_matrix->cols) - (out_matrix_size.rows * out_matrix_size.cols); matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset); // and run the MFCC extraction x = speechpy::feature::mfcc(&output_matrix_slice, signal, frequency, config->frame_length, config->frame_stride, config->num_cepstral, config->num_filters, config->fft_length, config->low_frequency, config->high_frequency, true, implementation_version); if (x != EIDSP_OK) { ei_printf("ERR: MFCC failed (%d)\n", x); EIDSP_ERR(x); } matrix_size_out->rows += out_matrix_size.rows; if (out_matrix_size.cols > 0) { matrix_size_out->cols = out_matrix_size.cols; } return EIDSP_OK; } __attribute__((unused)) int extract_mfcc_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) { #if defined(__cplusplus) && EI_C_LINKAGE == 1 ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n"); EIDSP_ERR(EIDSP_NOT_SUPPORTED); #else ei_dsp_config_mfcc_t config = *((ei_dsp_config_mfcc_t*)config_ptr); if (config.axes != 1) { EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } if((config.implementation_version == 0) || (config.implementation_version > 4)) { EIDSP_ERR(EIDSP_BLOCK_VERSION_INCORRECT); } if (signal->total_length == 0) { EIDSP_ERR(EIDSP_PARAMETER_INVALID); } const uint32_t frequency = static_cast(sampling_frequency); // preemphasis class to preprocess the audio... class speechpy::processing::preemphasis pre(signal, config.pre_shift, config.pre_cof, false); preemphasis = ⪯ signal_t preemphasized_audio_signal; preemphasized_audio_signal.total_length = signal->total_length; preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data; // Go from the time (e.g. 0.25 seconds to number of frames based on freq) const size_t frame_length_values = frequency * config.frame_length; const size_t frame_stride_values = frequency * config.frame_stride; const int frame_overlap_values = static_cast(frame_length_values) - static_cast(frame_stride_values); if (frame_overlap_values < 0) { ei_printf("ERR: frame_length ("); ei_printf_float(config.frame_length); ei_printf(") cannot be lower than frame_stride ("); ei_printf_float(config.frame_stride); ei_printf(") for continuous classification\n"); EIDSP_ERR(EIDSP_PARAMETER_INVALID); } int x; // have current frame, but wrong size? then free if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) { ei_free(ei_dsp_cont_current_frame); ei_dsp_cont_current_frame = nullptr; } int implementation_version = config.implementation_version; // this is the offset in the signal from which we'll work size_t offset_in_signal = 0; if (!ei_dsp_cont_current_frame) { ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1); if (!ei_dsp_cont_current_frame) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } ei_dsp_cont_current_frame_size = frame_length_values; ei_dsp_cont_current_frame_ix = 0; } if ((frame_length_values) > preemphasized_audio_signal.total_length + ei_dsp_cont_current_frame_ix) { ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n", (int)frame_length_values, (int)preemphasized_audio_signal.total_length + ei_dsp_cont_current_frame_ix); EIDSP_ERR(EIDSP_PARAMETER_INVALID); } matrix_size_out->rows = 0; matrix_size_out->cols = 0; // for continuous use v2 stack frame calculations if (implementation_version == 1) { implementation_version = 2; } if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) { ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size (ix=%d size=%d)\n", ei_dsp_cont_current_frame_ix, (int)ei_dsp_cont_current_frame_size); EIDSP_ERR(EIDSP_PARAMETER_INVALID); } // if we still have some code from previous run while (ei_dsp_cont_current_frame_ix > 0) { // then from the current frame we need to read `frame_length_values - ei_dsp_cont_current_frame_ix` // starting at offset 0 x = preemphasized_audio_signal.get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix); if (x != EIDSP_OK) { EIDSP_ERR(x); } // now ei_dsp_cont_current_frame is complete signal_t frame_signal; x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal); if (x != EIDSP_OK) { EIDSP_ERR(x); } x = extract_mfcc_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out, implementation_version); if (x != EIDSP_OK) { EIDSP_ERR(x); } // if there's overlap between frames we roll through if (frame_stride_values > 0) { numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -frame_stride_values); } ei_dsp_cont_current_frame_ix -= frame_stride_values; } if (ei_dsp_cont_current_frame_ix < 0) { offset_in_signal = -ei_dsp_cont_current_frame_ix; ei_dsp_cont_current_frame_ix = 0; } if (offset_in_signal >= signal->total_length) { offset_in_signal -= signal->total_length; return EIDSP_OK; } // now... we need to discard part of the signal... SignalWithRange signal_with_range(&preemphasized_audio_signal, offset_in_signal, signal->total_length); signal_t *range_signal = signal_with_range.get_signal(); size_t range_signal_orig_length = range_signal->total_length; // then we'll just go through normal processing of the signal: x = extract_mfcc_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out, implementation_version); if (x != EIDSP_OK) { EIDSP_ERR(x); } // Make sure v1 model are reset to the original length; range_signal->total_length = range_signal_orig_length; // update offset int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency, config.frame_length, config.frame_stride, false, implementation_version); offset_in_signal += length_of_signal_used; // see what's left? int bytes_left_end_of_frame = signal->total_length - offset_in_signal; bytes_left_end_of_frame += frame_overlap_values; if (bytes_left_end_of_frame > 0) { // then read that into the ei_dsp_cont_current_frame buffer x = preemphasized_audio_signal.get_data( (preemphasized_audio_signal.total_length - bytes_left_end_of_frame), bytes_left_end_of_frame, ei_dsp_cont_current_frame); if (x != EIDSP_OK) { EIDSP_ERR(x); } } ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame; preemphasis = nullptr; return EIDSP_OK; #endif } __attribute__((unused)) int extract_spectrogram_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) { ei_dsp_config_spectrogram_t config = *((ei_dsp_config_spectrogram_t*)config_ptr); if (config.axes != 1) { EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } if (signal->total_length == 0) { EIDSP_ERR(EIDSP_PARAMETER_INVALID); } const uint32_t frequency = static_cast(sampling_frequency); // calculate the size of the MFE matrix matrix_size_t out_matrix_size = speechpy::feature::calculate_mfe_buffer_size( signal->total_length, frequency, config.frame_length, config.frame_stride, config.fft_length / 2 + 1, config.implementation_version); /* Only throw size mismatch error calculated buffer doesn't fit for continuous inferencing */ if (out_matrix_size.rows * out_matrix_size.cols > output_matrix->rows * output_matrix->cols) { ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols); ei_printf("calculated size = %dx%d\n", (int)out_matrix_size.rows, (int)out_matrix_size.cols); EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } output_matrix->rows = out_matrix_size.rows; output_matrix->cols = out_matrix_size.cols; int ret = speechpy::feature::spectrogram(output_matrix, signal, sampling_frequency, config.frame_length, config.frame_stride, config.fft_length, config.implementation_version); if (ret != EIDSP_OK) { ei_printf("ERR: Spectrogram failed (%d)\n", ret); EIDSP_ERR(ret); } if (config.implementation_version < 3) { ret = numpy::normalize(output_matrix); if (ret != EIDSP_OK) { EIDSP_ERR(ret); } } else { // normalization ret = speechpy::processing::spectrogram_normalization(output_matrix, config.noise_floor_db); if (ret != EIDSP_OK) { ei_printf("ERR: normalization failed (%d)\n", ret); EIDSP_ERR(ret); } } output_matrix->cols = out_matrix_size.rows * out_matrix_size.cols; output_matrix->rows = 1; return EIDSP_OK; } static int extract_spectrogram_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_spectrogram_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) { uint32_t frequency = (uint32_t)sampling_frequency; int x; // calculate the size of the spectrogram matrix matrix_size_t out_matrix_size = speechpy::feature::calculate_mfe_buffer_size( signal->total_length, frequency, config->frame_length, config->frame_stride, config->fft_length / 2 + 1, config->implementation_version); // we roll the output matrix back so we have room at the end... x = numpy::roll(output_matrix->buffer, output_matrix->rows * output_matrix->cols, -(out_matrix_size.rows * out_matrix_size.cols)); if (x != EIDSP_OK) { if (preemphasis) { delete preemphasis; } EIDSP_ERR(x); } // slice in the output matrix to write to // the offset in the classification matrix here is always at the end size_t output_matrix_offset = (output_matrix->rows * output_matrix->cols) - (out_matrix_size.rows * out_matrix_size.cols); matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset); // and run the spectrogram extraction int ret = speechpy::feature::spectrogram(&output_matrix_slice, signal, frequency, config->frame_length, config->frame_stride, config->fft_length, config->implementation_version); if (ret != EIDSP_OK) { ei_printf("ERR: Spectrogram failed (%d)\n", ret); EIDSP_ERR(ret); } matrix_size_out->rows += out_matrix_size.rows; if (out_matrix_size.cols > 0) { matrix_size_out->cols = out_matrix_size.cols; } return EIDSP_OK; } __attribute__((unused)) int extract_spectrogram_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) { #if defined(__cplusplus) && EI_C_LINKAGE == 1 ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n"); EIDSP_ERR(EIDSP_NOT_SUPPORTED); #else ei_dsp_config_spectrogram_t config = *((ei_dsp_config_spectrogram_t*)config_ptr); static bool first_run = false; if (config.axes != 1) { EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } if (signal->total_length == 0) { EIDSP_ERR(EIDSP_PARAMETER_INVALID); } const uint32_t frequency = static_cast(sampling_frequency); /* Fake an extra frame_length for stack frames calculations. There, 1 frame_length is always subtracted and there for never used. But skip the first slice to fit the feature_matrix buffer */ if(config.implementation_version < 2) { if (first_run == true) { signal->total_length += (size_t)(config.frame_length * (float)frequency); } first_run = true; } // Go from the time (e.g. 0.25 seconds to number of frames based on freq) const size_t frame_length_values = frequency * config.frame_length; const size_t frame_stride_values = frequency * config.frame_stride; const int frame_overlap_values = static_cast(frame_length_values) - static_cast(frame_stride_values); if (frame_overlap_values < 0) { ei_printf("ERR: frame_length ("); ei_printf_float(config.frame_length); ei_printf(") cannot be lower than frame_stride ("); ei_printf_float(config.frame_stride); ei_printf(") for continuous classification\n"); EIDSP_ERR(EIDSP_PARAMETER_INVALID); } if (frame_length_values > signal->total_length) { ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n", (int)frame_length_values, (int)signal->total_length); EIDSP_ERR(EIDSP_PARAMETER_INVALID); } int x; // have current frame, but wrong size? then free if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) { ei_free(ei_dsp_cont_current_frame); ei_dsp_cont_current_frame = nullptr; } if (!ei_dsp_cont_current_frame) { ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1); if (!ei_dsp_cont_current_frame) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } ei_dsp_cont_current_frame_size = frame_length_values; ei_dsp_cont_current_frame_ix = 0; } matrix_size_out->rows = 0; matrix_size_out->cols = 0; // this is the offset in the signal from which we'll work size_t offset_in_signal = 0; if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) { ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size\n"); EIDSP_ERR(EIDSP_PARAMETER_INVALID); } // if we still have some code from previous run while (ei_dsp_cont_current_frame_ix > 0) { // then from the current frame we need to read `frame_length_values - ei_dsp_cont_current_frame_ix` // starting at offset 0 x = signal->get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix); if (x != EIDSP_OK) { EIDSP_ERR(x); } // now ei_dsp_cont_current_frame is complete signal_t frame_signal; x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal); if (x != EIDSP_OK) { EIDSP_ERR(x); } x = extract_spectrogram_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out); if (x != EIDSP_OK) { EIDSP_ERR(x); } // if there's overlap between frames we roll through if (frame_stride_values > 0) { numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -frame_stride_values); } ei_dsp_cont_current_frame_ix -= frame_stride_values; } if (ei_dsp_cont_current_frame_ix < 0) { offset_in_signal = -ei_dsp_cont_current_frame_ix; ei_dsp_cont_current_frame_ix = 0; } if (offset_in_signal >= signal->total_length) { offset_in_signal -= signal->total_length; return EIDSP_OK; } // now... we need to discard part of the signal... SignalWithRange signal_with_range(signal, offset_in_signal, signal->total_length); signal_t *range_signal = signal_with_range.get_signal(); size_t range_signal_orig_length = range_signal->total_length; // then we'll just go through normal processing of the signal: x = extract_spectrogram_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out); if (x != EIDSP_OK) { EIDSP_ERR(x); } // update offset int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency, config.frame_length, config.frame_stride, false, config.implementation_version); offset_in_signal += length_of_signal_used; // not sure why this is being manipulated... range_signal->total_length = range_signal_orig_length; // see what's left? int bytes_left_end_of_frame = signal->total_length - offset_in_signal; bytes_left_end_of_frame += frame_overlap_values; if (bytes_left_end_of_frame > 0) { // then read that into the ei_dsp_cont_current_frame buffer x = signal->get_data( (signal->total_length - bytes_left_end_of_frame), bytes_left_end_of_frame, ei_dsp_cont_current_frame); if (x != EIDSP_OK) { EIDSP_ERR(x); } } ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame; if (config.implementation_version < 2) { if (first_run == true) { signal->total_length -= (size_t)(config.frame_length * (float)frequency); } } return EIDSP_OK; #endif } __attribute__((unused)) int extract_mfe_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) { ei_dsp_config_mfe_t config = *((ei_dsp_config_mfe_t*)config_ptr); if (config.axes != 1) { EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } if (signal->total_length == 0) { EIDSP_ERR(EIDSP_PARAMETER_INVALID); } const uint32_t frequency = static_cast(sampling_frequency); signal_t preemphasized_audio_signal; // before version 3 we did not have preemphasis if (config.implementation_version < 3) { preemphasis = nullptr; preemphasized_audio_signal.total_length = signal->total_length; preemphasized_audio_signal.get_data = signal->get_data; } else { // preemphasis class to preprocess the audio... class speechpy::processing::preemphasis *pre = new class speechpy::processing::preemphasis(signal, 1, 0.98f, true); preemphasis = pre; preemphasized_audio_signal.total_length = signal->total_length; preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data; } // calculate the size of the MFE matrix matrix_size_t out_matrix_size = speechpy::feature::calculate_mfe_buffer_size( preemphasized_audio_signal.total_length, frequency, config.frame_length, config.frame_stride, config.num_filters, config.implementation_version); /* Only throw size mismatch error calculated buffer doesn't fit for continuous inferencing */ if (out_matrix_size.rows * out_matrix_size.cols > output_matrix->rows * output_matrix->cols) { ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols); ei_printf("calculated size = %dx%d\n", (int)out_matrix_size.rows, (int)out_matrix_size.cols); if (preemphasis) { delete preemphasis; } EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } output_matrix->rows = out_matrix_size.rows; output_matrix->cols = out_matrix_size.cols; int ret; // This probably seems incorrect, but the mfe func can actually handle all versions // There's a subtle issue with cmvn and v2, not worth tracking down // So for v2 and v1, we'll just use the old code // (the new mfe does away with the intermediate filterbank matrix) if (config.implementation_version > 2) { ret = speechpy::feature::mfe(output_matrix, nullptr, &preemphasized_audio_signal, frequency, config.frame_length, config.frame_stride, config.num_filters, config.fft_length, config.low_frequency, config.high_frequency, config.implementation_version); } else { ret = speechpy::feature::mfe_v3(output_matrix, nullptr, &preemphasized_audio_signal, frequency, config.frame_length, config.frame_stride, config.num_filters, config.fft_length, config.low_frequency, config.high_frequency, config.implementation_version); } if (preemphasis) { delete preemphasis; } if (ret != EIDSP_OK) { ei_printf("ERR: MFE failed (%d)\n", ret); EIDSP_ERR(ret); } if (config.implementation_version < 3) { // cepstral mean and variance normalization ret = speechpy::processing::cmvnw(output_matrix, config.win_size, false, true); if (ret != EIDSP_OK) { ei_printf("ERR: cmvnw failed (%d)\n", ret); EIDSP_ERR(ret); } } else { // normalization ret = speechpy::processing::mfe_normalization(output_matrix, config.noise_floor_db); if (ret != EIDSP_OK) { ei_printf("ERR: normalization failed (%d)\n", ret); EIDSP_ERR(ret); } } output_matrix->cols = out_matrix_size.rows * out_matrix_size.cols; output_matrix->rows = 1; return EIDSP_OK; } static int extract_mfe_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfe_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) { uint32_t frequency = (uint32_t)sampling_frequency; int x; // calculate the size of the spectrogram matrix matrix_size_t out_matrix_size = speechpy::feature::calculate_mfe_buffer_size( signal->total_length, frequency, config->frame_length, config->frame_stride, config->num_filters, config->implementation_version); // we roll the output matrix back so we have room at the end... x = numpy::roll(output_matrix->buffer, output_matrix->rows * output_matrix->cols, -(out_matrix_size.rows * out_matrix_size.cols)); if (x != EIDSP_OK) { EIDSP_ERR(x); } // slice in the output matrix to write to // the offset in the classification matrix here is always at the end size_t output_matrix_offset = (output_matrix->rows * output_matrix->cols) - (out_matrix_size.rows * out_matrix_size.cols); matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset); // and run the MFE extraction // This probably seems incorrect, but the mfe func can actually handle all versions // There's a subtle issue with cmvn and v2, not worth tracking down // So for v2 and v1, we'll just use the old code // (the new mfe does away with the intermediate filterbank matrix) if (config->implementation_version > 2) { x = speechpy::feature::mfe(&output_matrix_slice, nullptr, signal, frequency, config->frame_length, config->frame_stride, config->num_filters, config->fft_length, config->low_frequency, config->high_frequency, config->implementation_version); } else { x = speechpy::feature::mfe_v3(&output_matrix_slice, nullptr, signal, frequency, config->frame_length, config->frame_stride, config->num_filters, config->fft_length, config->low_frequency, config->high_frequency, config->implementation_version); } if (x != EIDSP_OK) { ei_printf("ERR: MFE failed (%d)\n", x); EIDSP_ERR(x); } matrix_size_out->rows += out_matrix_size.rows; if (out_matrix_size.cols > 0) { matrix_size_out->cols = out_matrix_size.cols; } return EIDSP_OK; } __attribute__((unused)) int extract_mfe_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) { #if defined(__cplusplus) && EI_C_LINKAGE == 1 ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n"); EIDSP_ERR(EIDSP_NOT_SUPPORTED); #else ei_dsp_config_mfe_t config = *((ei_dsp_config_mfe_t*)config_ptr); // signal is already the right size, // output matrix is not the right size, but we can start writing at offset 0 and then it's OK too static bool first_run = false; if (config.axes != 1) { EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH); } if (signal->total_length == 0) { EIDSP_ERR(EIDSP_PARAMETER_INVALID); } const uint32_t frequency = static_cast(sampling_frequency); // Fake an extra frame_length for stack frames calculations. There, 1 frame_length is always // subtracted and there for never used. But skip the first slice to fit the feature_matrix // buffer if (config.implementation_version == 1) { if (first_run == true) { signal->total_length += (size_t)(config.frame_length * (float)frequency); } first_run = true; } // ok all setup, let's construct the signal (with preemphasis for impl version >3) signal_t preemphasized_audio_signal; // before version 3 we did not have preemphasis if (config.implementation_version < 3) { preemphasis = nullptr; preemphasized_audio_signal.total_length = signal->total_length; preemphasized_audio_signal.get_data = signal->get_data; } else { // preemphasis class to preprocess the audio... class speechpy::processing::preemphasis *pre = new class speechpy::processing::preemphasis(signal, 1, 0.98f, true); preemphasis = pre; preemphasized_audio_signal.total_length = signal->total_length; preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data; } // Go from the time (e.g. 0.25 seconds to number of frames based on freq) const size_t frame_length_values = frequency * config.frame_length; const size_t frame_stride_values = frequency * config.frame_stride; const int frame_overlap_values = static_cast(frame_length_values) - static_cast(frame_stride_values); if (frame_overlap_values < 0) { ei_printf("ERR: frame_length ("); ei_printf_float(config.frame_length); ei_printf(") cannot be lower than frame_stride ("); ei_printf_float(config.frame_stride); ei_printf(") for continuous classification\n"); if (preemphasis) { delete preemphasis; } EIDSP_ERR(EIDSP_PARAMETER_INVALID); } if (frame_length_values > preemphasized_audio_signal.total_length) { ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n", (int)frame_length_values, (int)preemphasized_audio_signal.total_length); if (preemphasis) { delete preemphasis; } EIDSP_ERR(EIDSP_PARAMETER_INVALID); } int x; // have current frame, but wrong size? then free if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) { ei_free(ei_dsp_cont_current_frame); ei_dsp_cont_current_frame = nullptr; } if (!ei_dsp_cont_current_frame) { ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1); if (!ei_dsp_cont_current_frame) { if (preemphasis) { delete preemphasis; } EIDSP_ERR(EIDSP_OUT_OF_MEM); } ei_dsp_cont_current_frame_size = frame_length_values; ei_dsp_cont_current_frame_ix = 0; } matrix_size_out->rows = 0; matrix_size_out->cols = 0; // this is the offset in the signal from which we'll work size_t offset_in_signal = 0; if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) { ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size\n"); if (preemphasis) { delete preemphasis; } EIDSP_ERR(EIDSP_PARAMETER_INVALID); } // if we still have some code from previous run while (ei_dsp_cont_current_frame_ix > 0) { // then from the current frame we need to read `frame_length_values - ei_dsp_cont_current_frame_ix` // starting at offset 0 x = preemphasized_audio_signal.get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix); if (x != EIDSP_OK) { if (preemphasis) { delete preemphasis; } EIDSP_ERR(x); } // now ei_dsp_cont_current_frame is complete signal_t frame_signal; x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal); if (x != EIDSP_OK) { if (preemphasis) { delete preemphasis; } EIDSP_ERR(x); } x = extract_mfe_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out); if (x != EIDSP_OK) { if (preemphasis) { delete preemphasis; } EIDSP_ERR(x); } // if there's overlap between frames we roll through if (frame_stride_values > 0) { numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -frame_stride_values); } ei_dsp_cont_current_frame_ix -= frame_stride_values; } if (ei_dsp_cont_current_frame_ix < 0) { offset_in_signal = -ei_dsp_cont_current_frame_ix; ei_dsp_cont_current_frame_ix = 0; } if (offset_in_signal >= signal->total_length) { if (preemphasis) { delete preemphasis; } offset_in_signal -= signal->total_length; return EIDSP_OK; } // now... we need to discard part of the signal... SignalWithRange signal_with_range(&preemphasized_audio_signal, offset_in_signal, signal->total_length); signal_t *range_signal = signal_with_range.get_signal(); size_t range_signal_orig_length = range_signal->total_length; // then we'll just go through normal processing of the signal: x = extract_mfe_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out); if (x != EIDSP_OK) { if (preemphasis) { delete preemphasis; } EIDSP_ERR(x); } // update offset int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency, config.frame_length, config.frame_stride, false, config.implementation_version); offset_in_signal += length_of_signal_used; // not sure why this is being manipulated... range_signal->total_length = range_signal_orig_length; // see what's left? int bytes_left_end_of_frame = signal->total_length - offset_in_signal; bytes_left_end_of_frame += frame_overlap_values; if (bytes_left_end_of_frame > 0) { // then read that into the ei_dsp_cont_current_frame buffer x = preemphasized_audio_signal.get_data( (preemphasized_audio_signal.total_length - bytes_left_end_of_frame), bytes_left_end_of_frame, ei_dsp_cont_current_frame); if (x != EIDSP_OK) { if (preemphasis) { delete preemphasis; } EIDSP_ERR(x); } } ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame; if (config.implementation_version == 1) { if (first_run == true) { signal->total_length -= (size_t)(config.frame_length * (float)frequency); } } if (preemphasis) { delete preemphasis; } return EIDSP_OK; #endif } __attribute__((unused)) int extract_image_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) { ei_dsp_config_image_t config = *((ei_dsp_config_image_t*)config_ptr); int16_t channel_count = strcmp(config.channels, "Grayscale") == 0 ? 1 : 3; size_t output_ix = 0; #if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE) const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE; #else const size_t page_size = 1024; #endif // buffered read from the signal size_t bytes_left = signal->total_length; for (size_t ix = 0; ix < signal->total_length; ix += page_size) { size_t elements_to_read = bytes_left > page_size ? page_size : bytes_left; #if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE) matrix_t input_matrix(elements_to_read, config.axes, ei_dsp_image_buffer); #else matrix_t input_matrix(elements_to_read, config.axes); #endif if (!input_matrix.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } signal->get_data(ix, elements_to_read, input_matrix.buffer); for (size_t jx = 0; jx < elements_to_read; jx++) { uint32_t pixel = static_cast(input_matrix.buffer[jx]); // rgb to 0..1 float r = static_cast(pixel >> 16 & 0xff) / 255.0f; float g = static_cast(pixel >> 8 & 0xff) / 255.0f; float b = static_cast(pixel & 0xff) / 255.0f; if (channel_count == 3) { output_matrix->buffer[output_ix++] = r; output_matrix->buffer[output_ix++] = g; output_matrix->buffer[output_ix++] = b; } else { // ITU-R 601-2 luma transform // see: https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert float v = (0.299f * r) + (0.587f * g) + (0.114f * b); output_matrix->buffer[output_ix++] = v; } } bytes_left -= elements_to_read; } return EIDSP_OK; } #if (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_DRPAI) __attribute__((unused)) int extract_drpai_features_quantized(signal_t *signal, matrix_u8_t *output_matrix, void *config_ptr, const float frequency) { ei_dsp_config_image_t config = *((ei_dsp_config_image_t*)config_ptr); int16_t channel_count = strcmp(config.channels, "Grayscale") == 0 ? 1 : 3; size_t output_ix = 0; #if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE) const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE; #else const size_t page_size = 1024; #endif // buffered read from the signal size_t bytes_left = signal->total_length; for (size_t ix = 0; ix < signal->total_length; ix += page_size) { size_t elements_to_read = bytes_left > page_size ? page_size : bytes_left; #if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE) matrix_t input_matrix(elements_to_read, config.axes, ei_dsp_image_buffer); #else matrix_t input_matrix(elements_to_read, config.axes); #endif if (!input_matrix.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } signal->get_data(ix, elements_to_read, input_matrix.buffer); for (size_t jx = 0; jx < elements_to_read; jx++) { uint32_t pixel = static_cast(input_matrix.buffer[jx]); if (channel_count == 3) { uint8_t r = static_cast(pixel >> 16 & 0xff); uint8_t g = static_cast(pixel >> 8 & 0xff); uint8_t b = static_cast(pixel & 0xff); output_matrix->buffer[output_ix++] = r; output_matrix->buffer[output_ix++] = g; output_matrix->buffer[output_ix++] = b; } else { //NOTE: not implementing greyscale yet } } bytes_left -= elements_to_read; } return EIDSP_OK; } #endif //(EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_DRPAI) #if (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE != EI_CLASSIFIER_DRPAI) __attribute__((unused)) int extract_image_features_quantized(signal_t *signal, matrix_i8_t *output_matrix, void *config_ptr, float scale, float zero_point, const float frequency, int image_scaling) { ei_dsp_config_image_t config = *((ei_dsp_config_image_t*)config_ptr); int16_t channel_count = strcmp(config.channels, "Grayscale") == 0 ? 1 : 3; size_t output_ix = 0; const int32_t iRedToGray = (int32_t)(0.299f * 65536.0f); const int32_t iGreenToGray = (int32_t)(0.587f * 65536.0f); const int32_t iBlueToGray = (int32_t)(0.114f * 65536.0f); static const float torch_mean[] = { 0.485, 0.456, 0.406 }; static const float torch_std[] = { 0.229, 0.224, 0.225 }; #if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE) const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE; #else const size_t page_size = 1024; #endif // buffered read from the signal size_t bytes_left = signal->total_length; for (size_t ix = 0; ix < signal->total_length; ix += page_size) { size_t elements_to_read = bytes_left > page_size ? page_size : bytes_left; #if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE) matrix_t input_matrix(elements_to_read, config.axes, ei_dsp_image_buffer); #else matrix_t input_matrix(elements_to_read, config.axes); #endif if (!input_matrix.buffer) { EIDSP_ERR(EIDSP_OUT_OF_MEM); } signal->get_data(ix, elements_to_read, input_matrix.buffer); for (size_t jx = 0; jx < elements_to_read; jx++) { uint32_t pixel = static_cast(input_matrix.buffer[jx]); if (channel_count == 3) { // fast code path if (scale == 0.003921568859368563f && zero_point == -128 && image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) { int32_t r = static_cast(pixel >> 16 & 0xff); int32_t g = static_cast(pixel >> 8 & 0xff); int32_t b = static_cast(pixel & 0xff); output_matrix->buffer[output_ix++] = static_cast(r + zero_point); output_matrix->buffer[output_ix++] = static_cast(g + zero_point); output_matrix->buffer[output_ix++] = static_cast(b + zero_point); } // slow code path else { float r = static_cast(pixel >> 16 & 0xff); float g = static_cast(pixel >> 8 & 0xff); float b = static_cast(pixel & 0xff); if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) { r /= 255.0f; g /= 255.0f; b /= 255.0f; } else if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_TORCH) { r /= 255.0f; g /= 255.0f; b /= 255.0f; r = (r - torch_mean[0]) / torch_std[0]; g = (g - torch_mean[1]) / torch_std[1]; b = (b - torch_mean[2]) / torch_std[2]; } output_matrix->buffer[output_ix++] = static_cast(round(r / scale) + zero_point); output_matrix->buffer[output_ix++] = static_cast(round(g / scale) + zero_point); output_matrix->buffer[output_ix++] = static_cast(round(b / scale) + zero_point); } } else { // fast code path if (scale == 0.003921568859368563f && zero_point == -128 && image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) { int32_t r = static_cast(pixel >> 16 & 0xff); int32_t g = static_cast(pixel >> 8 & 0xff); int32_t b = static_cast(pixel & 0xff); // ITU-R 601-2 luma transform // see: https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert int32_t gray = (iRedToGray * r) + (iGreenToGray * g) + (iBlueToGray * b); gray >>= 16; // scale down to int8_t gray += zero_point; if (gray < - 128) gray = -128; else if (gray > 127) gray = 127; output_matrix->buffer[output_ix++] = static_cast(gray); } // slow code path else { float r = static_cast(pixel >> 16 & 0xff); float g = static_cast(pixel >> 8 & 0xff); float b = static_cast(pixel & 0xff); if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) { r /= 255.0f; g /= 255.0f; b /= 255.0f; } else if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_TORCH) { r /= 255.0f; g /= 255.0f; b /= 255.0f; r = (r - torch_mean[0]) / torch_std[0]; g = (g - torch_mean[1]) / torch_std[1]; b = (b - torch_mean[2]) / torch_std[2]; } // ITU-R 601-2 luma transform // see: https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert float v = (0.299f * r) + (0.587f * g) + (0.114f * b); output_matrix->buffer[output_ix++] = static_cast(round(v / scale) + zero_point); } } } bytes_left -= elements_to_read; } return EIDSP_OK; } #endif // (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE != EI_CLASSIFIER_DRPAI) /** * Clear all state regarding continuous audio. Invoke this function after continuous audio loop ends. */ __attribute__((unused)) int ei_dsp_clear_continuous_audio_state() { if (ei_dsp_cont_current_frame) { ei_free(ei_dsp_cont_current_frame); } ei_dsp_cont_current_frame = nullptr; ei_dsp_cont_current_frame_size = 0; ei_dsp_cont_current_frame_ix = 0; return EIDSP_OK; } /** * @brief Calculates the cepstral mean and variable normalization. * * @param matrix Source and destination matrix * @param config_ptr ei_dsp_config_mfcc_t struct pointer */ __attribute__((unused)) void calc_cepstral_mean_and_var_normalization_mfcc(ei_matrix *matrix, void *config_ptr) { ei_dsp_config_mfcc_t *config = (ei_dsp_config_mfcc_t *)config_ptr; uint32_t original_matrix_size = matrix->rows * matrix->cols; /* Modify rows and colums ration for matrix normalization */ matrix->rows = original_matrix_size / config->num_cepstral; matrix->cols = config->num_cepstral; // cepstral mean and variance normalization int ret = speechpy::processing::cmvnw(matrix, config->win_size, true, false); if (ret != EIDSP_OK) { ei_printf("ERR: cmvnw failed (%d)\n", ret); return; } /* Reset rows and columns ratio */ matrix->rows = 1; matrix->cols = original_matrix_size; } /** * @brief Calculates the cepstral mean and variable normalization. * * @param matrix Source and destination matrix * @param config_ptr ei_dsp_config_mfe_t struct pointer */ __attribute__((unused)) void calc_cepstral_mean_and_var_normalization_mfe(ei_matrix *matrix, void *config_ptr) { ei_dsp_config_mfe_t *config = (ei_dsp_config_mfe_t *)config_ptr; uint32_t original_matrix_size = matrix->rows * matrix->cols; /* Modify rows and colums ration for matrix normalization */ matrix->rows = (original_matrix_size) / config->num_filters; matrix->cols = config->num_filters; if (config->implementation_version < 3) { // cepstral mean and variance normalization int ret = speechpy::processing::cmvnw(matrix, config->win_size, false, true); if (ret != EIDSP_OK) { ei_printf("ERR: cmvnw failed (%d)\n", ret); return; } } else { // normalization int ret = speechpy::processing::mfe_normalization(matrix, config->noise_floor_db); if (ret != EIDSP_OK) { ei_printf("ERR: normalization failed (%d)\n", ret); return; } } /* Reset rows and columns ratio */ matrix->rows = 1; matrix->cols = (original_matrix_size); } /** * @brief Calculates the cepstral mean and variable normalization. * * @param matrix Source and destination matrix * @param config_ptr ei_dsp_config_spectrogram_t struct pointer */ __attribute__((unused)) void calc_cepstral_mean_and_var_normalization_spectrogram(ei_matrix *matrix, void *config_ptr) { ei_dsp_config_spectrogram_t *config = (ei_dsp_config_spectrogram_t *)config_ptr; uint32_t original_matrix_size = matrix->rows * matrix->cols; /* Modify rows and colums ration for matrix normalization */ matrix->cols = config->fft_length / 2 + 1; matrix->rows = (original_matrix_size) / matrix->cols; if (config->implementation_version < 3) { int ret = numpy::normalize(matrix); if (ret != EIDSP_OK) { ei_printf("ERR: normalization failed (%d)\n", ret); return; } } else { // normalization int ret = speechpy::processing::spectrogram_normalization(matrix, config->noise_floor_db); if (ret != EIDSP_OK) { ei_printf("ERR: normalization failed (%d)\n", ret); return; } } /* Reset rows and columns ratio */ matrix->rows = 1; matrix->cols = (original_matrix_size); } #ifdef __cplusplus } #endif // __cplusplus #endif // _EDGE_IMPULSE_RUN_DSP_H_