|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef _EDGE_IMPULSE_RUN_DSP_H_ |
|
|
#define _EDGE_IMPULSE_RUN_DSP_H_ |
|
|
|
|
|
#include "edge-impulse-sdk/classifier/ei_model_types.h" |
|
|
#include "edge-impulse-sdk/dsp/spectral/spectral.hpp" |
|
|
#include "edge-impulse-sdk/dsp/speechpy/speechpy.hpp" |
|
|
#include "edge-impulse-sdk/classifier/ei_signal_with_range.h" |
|
|
#include "model-parameters/model_metadata.h" |
|
|
|
|
|
#if defined(__cplusplus) && EI_C_LINKAGE == 1 |
|
|
extern "C" { |
|
|
extern void ei_printf(const char *format, ...); |
|
|
} |
|
|
#else |
|
|
extern void ei_printf(const char *format, ...); |
|
|
#endif |
|
|
|
|
|
#ifdef __cplusplus |
|
|
namespace { |
|
|
#endif |
|
|
|
|
|
using namespace ei; |
|
|
|
|
|
#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE) |
|
|
float ei_dsp_image_buffer[EI_DSP_IMAGE_BUFFER_STATIC_SIZE]; |
|
|
#endif |
|
|
|
|
|
|
|
|
static float *ei_dsp_cont_current_frame = nullptr; |
|
|
static size_t ei_dsp_cont_current_frame_size = 0; |
|
|
static int ei_dsp_cont_current_frame_ix = 0; |
|
|
|
|
|
/**
 * @brief Run the spectral-analysis DSP block (FFT or Wavelet) over a multi-axis signal.
 *
 * De-interleaves the raw signal into one column per axis, then dispatches to the
 * wavelet or FFT feature extractor depending on `config->analysis_type` and the
 * block's `implementation_version`. Which branches are compiled in is controlled
 * by the EI_DSP_PARAMS_* macros generated with the model parameters.
 *
 * @param signal        Interleaved input signal (config->axes channels).
 * @param output_matrix Destination matrix for the extracted features.
 * @param config_ptr    Pointer to an ei_dsp_config_spectral_analysis_t.
 * @param frequency     Sampling frequency of the signal in Hz.
 * @return EIDSP_OK on success, EIDSP_NOT_SUPPORTED if no compiled-in branch
 *         matches the configuration, or another EIDSP_* error code.
 */
__attribute__((unused)) int extract_spectral_analysis_features(
    signal_t *signal,
    matrix_t *output_matrix,
    void *config_ptr,
    const float frequency)
{
    ei_dsp_config_spectral_analysis_t *config = (ei_dsp_config_spectral_analysis_t *)config_ptr;

    // one row per sample frame, one column per axis
    matrix_t input_matrix(signal->total_length / config->axes, config->axes);
    if (!input_matrix.buffer) {
        EIDSP_ERR(EIDSP_OUT_OF_MEM);
    }

    // pull the whole signal into RAM (return value intentionally unchecked here,
    // matching the other extractors in this file)
    signal->get_data(0, signal->total_length, input_matrix.buffer);

#if EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_WAVELET || EI_DSP_PARAMS_ALL
    if (strcmp(config->analysis_type, "Wavelet") == 0) {
        return spectral::wavelet::extract_wavelet_features(&input_matrix, output_matrix, config, frequency);
    }
#endif

#if EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_FFT || EI_DSP_PARAMS_ALL
    if (strcmp(config->analysis_type, "FFT") == 0) {
        // v1 and v4 have dedicated implementations; everything else (v2/v3)
        // goes through the v2 extractor
        if (config->implementation_version == 1) {
            return spectral::feature::extract_spectral_analysis_features_v1(
                &input_matrix,
                output_matrix,
                config,
                frequency);
        } else if (config->implementation_version == 4) {
            return spectral::feature::extract_spectral_analysis_features_v4(
                &input_matrix,
                output_matrix,
                config,
                frequency);
        } else {
            return spectral::feature::extract_spectral_analysis_features_v2(
                &input_matrix,
                output_matrix,
                config,
                frequency);
        }
    }
#endif

// Fallback for older model metadata that predates the generated
// EI_DSP_PARAMS_* flags (or when neither analysis type was compiled in):
// dispatch on implementation_version alone.
#if !EI_DSP_PARAMS_GENERATED || EI_DSP_PARAMS_ALL || !(EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_FFT || EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_WAVELET)
    if (config->implementation_version == 1) {
        return spectral::feature::extract_spectral_analysis_features_v1(
            &input_matrix,
            output_matrix,
            config,
            frequency);
    }
    if (config->implementation_version == 2) {
        return spectral::feature::extract_spectral_analysis_features_v2(
            &input_matrix,
            output_matrix,
            config,
            frequency);
    }
#endif
    // no compiled-in implementation matched the configuration
    return EIDSP_NOT_SUPPORTED;
}
|
|
|
|
|
/**
 * @brief Copy the raw signal into the output matrix and scale it in place.
 *
 * Copies at most rows*cols elements from the signal (truncating a longer
 * signal to the output capacity), then multiplies every element by the
 * block's `scale_axes` factor.
 *
 * @param signal        Input signal.
 * @param output_matrix Pre-allocated destination matrix.
 * @param config_ptr    Pointer to an ei_dsp_config_raw_t.
 * @param frequency     Sampling frequency in Hz (unused by this block).
 * @return EIDSP_OK on success, or an EIDSP_* error code from scaling.
 */
__attribute__((unused)) int extract_raw_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) {
    ei_dsp_config_raw_t config = *((ei_dsp_config_raw_t*)config_ptr);

    // never read more elements than the output matrix can hold
    const size_t capacity = output_matrix->rows * output_matrix->cols;
    size_t element_count = signal->total_length;
    if (element_count > capacity) {
        element_count = capacity;
    }

    signal->get_data(0, element_count, output_matrix->buffer);

    // apply the configured per-element scaling factor
    const int scale_res = numpy::scale(output_matrix, config.scale_axes);
    if (scale_res != EIDSP_OK) {
        EIDSP_ERR(scale_res);
    }

    return EIDSP_OK;
}
|
|
|
|
|
/**
 * @brief Compute per-axis summary statistics ("flatten" block) over a signal.
 *
 * For every axis, the enabled statistics are emitted in this fixed order:
 * average, minimum, maximum, rms, stdev, skewness, kurtosis. The output is
 * flattened to a 1 x N row vector. The expected output size is validated
 * against the enabled statistics before any work is done.
 *
 * @param signal        Interleaved input signal (config.axes channels).
 * @param output_matrix Destination matrix; rows*cols must equal the number of
 *                      enabled statistics times the number of axes.
 * @param config_ptr    Pointer to an ei_dsp_config_flatten_t.
 * @param frequency     Sampling frequency in Hz (unused by this block).
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
__attribute__((unused)) int extract_flatten_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) {
    ei_dsp_config_flatten_t config = *((ei_dsp_config_flatten_t*)config_ptr);

    // one output value per enabled statistic, per axis
    uint32_t expected_matrix_size = 0;
    if (config.average) expected_matrix_size += config.axes;
    if (config.minimum) expected_matrix_size += config.axes;
    if (config.maximum) expected_matrix_size += config.axes;
    if (config.rms) expected_matrix_size += config.axes;
    if (config.stdev) expected_matrix_size += config.axes;
    if (config.skewness) expected_matrix_size += config.axes;
    if (config.kurtosis) expected_matrix_size += config.axes;

    if (output_matrix->rows * output_matrix->cols != expected_matrix_size) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    int ret;

    // de-interleave into one column per axis
    matrix_t input_matrix(signal->total_length / config.axes, config.axes);
    if (!input_matrix.buffer) {
        EIDSP_ERR(EIDSP_OUT_OF_MEM);
    }
    signal->get_data(0, signal->total_length, input_matrix.buffer);

    // scale the input in place before computing statistics
    ret = numpy::scale(&input_matrix, config.scale_axes);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: Failed to scale signal (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    // transpose so each axis becomes one contiguous row (rows/cols swap too)
    ret = numpy::transpose(&input_matrix);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: Failed to transpose matrix (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    size_t out_matrix_ix = 0;

    // per axis: compute each enabled statistic into a 1x1 stack-backed matrix
    // and append it to the output. The order of these blocks defines the
    // output feature layout — do not reorder.
    for (size_t row = 0; row < input_matrix.rows; row++) {
        // non-owning view over this axis' samples
        matrix_t row_matrix(1, input_matrix.cols, input_matrix.buffer + (row * input_matrix.cols));

        if (config.average) {
            float fbuffer;
            matrix_t out_matrix(1, 1, &fbuffer);
            numpy::mean(&row_matrix, &out_matrix);
            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
        }

        if (config.minimum) {
            float fbuffer;
            matrix_t out_matrix(1, 1, &fbuffer);
            numpy::min(&row_matrix, &out_matrix);
            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
        }

        if (config.maximum) {
            float fbuffer;
            matrix_t out_matrix(1, 1, &fbuffer);
            numpy::max(&row_matrix, &out_matrix);
            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
        }

        if (config.rms) {
            float fbuffer;
            matrix_t out_matrix(1, 1, &fbuffer);
            numpy::rms(&row_matrix, &out_matrix);
            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
        }

        if (config.stdev) {
            float fbuffer;
            matrix_t out_matrix(1, 1, &fbuffer);
            numpy::stdev(&row_matrix, &out_matrix);
            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
        }

        if (config.skewness) {
            float fbuffer;
            matrix_t out_matrix(1, 1, &fbuffer);
            numpy::skew(&row_matrix, &out_matrix);
            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
        }

        if (config.kurtosis) {
            float fbuffer;
            matrix_t out_matrix(1, 1, &fbuffer);
            numpy::kurtosis(&row_matrix, &out_matrix);
            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
        }
    }

    // flatten to a single row vector
    output_matrix->cols = output_matrix->rows * output_matrix->cols;
    output_matrix->rows = 1;

    return EIDSP_OK;
}
|
|
|
|
|
// Active preemphasis filter used by preemphasized_audio_signal_get_data().
// The MFCC/MFE extractors point this at their filter instance before handing
// out a wrapped signal_t, and the per-slice MFCC extractor resets it to
// nullptr when done. NOTE(review): some paths leave this pointing at freed or
// out-of-scope memory after an extractor returns — callers must set it before
// any read goes through the trampoline below.
static class speechpy::processing::preemphasis *preemphasis;

// signal_t::get_data trampoline: routes reads through the global preemphasis
// filter (plain function pointer required, so no captures/state allowed here).
static int preemphasized_audio_signal_get_data(size_t offset, size_t length, float *out_ptr) {
    return preemphasis->get_data(offset, length, out_ptr);
}
|
|
|
|
|
/**
 * @brief Compute MFCC features over a complete audio signal.
 *
 * Wraps the signal in a preemphasis filter, extracts MFCC coefficients with
 * speechpy, applies windowed cepstral mean/variance normalization (cmvnw),
 * and flattens the result into a 1 x N row vector in `output_matrix`.
 *
 * @param signal             Raw audio input (single axis only).
 * @param output_matrix      Destination matrix; must be at least as large as
 *                           the calculated MFCC output.
 * @param config_ptr         Pointer to an ei_dsp_config_mfcc_t.
 * @param sampling_frequency Sampling frequency of the signal in Hz.
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
__attribute__((unused)) int extract_mfcc_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) {
    ei_dsp_config_mfcc_t config = *((ei_dsp_config_mfcc_t*)config_ptr);

    // MFCC only supports a single audio axis
    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    // only implementation versions 1..4 are known
    if((config.implementation_version == 0) || (config.implementation_version > 4)) {
        EIDSP_ERR(EIDSP_BLOCK_VERSION_INCORRECT);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // stack-allocated preemphasis filter; exposed to the get_data trampoline
    // via the global `preemphasis` pointer
    class speechpy::processing::preemphasis pre(signal, config.pre_shift, config.pre_cof, false);
    preemphasis = &pre;

    signal_t preemphasized_audio_signal;
    preemphasized_audio_signal.total_length = signal->total_length;
    preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data;

    // verify the caller's buffer can hold the calculated MFCC output
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfcc_buffer_size(
            signal->total_length, frequency, config.frame_length, config.frame_stride, config.num_cepstral, config.implementation_version);

    if (out_matrix_size.rows * out_matrix_size.cols > output_matrix->rows * output_matrix->cols) {
        ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols);
        ei_printf("calculated size = %dx%d\n", (int)out_matrix_size.rows, (int)out_matrix_size.cols);
        preemphasis = nullptr; // don't leave a dangling pointer to the stack object
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    output_matrix->rows = out_matrix_size.rows;
    output_matrix->cols = out_matrix_size.cols;

    int ret = speechpy::feature::mfcc(output_matrix, &preemphasized_audio_signal,
        frequency, config.frame_length, config.frame_stride, config.num_cepstral, config.num_filters, config.fft_length,
        config.low_frequency, config.high_frequency, true, config.implementation_version);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: MFCC failed (%d)\n", ret);
        preemphasis = nullptr;
        EIDSP_ERR(ret);
    }

    // windowed cepstral mean/variance normalization
    ret = speechpy::processing::cmvnw(output_matrix, config.win_size, true, false);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: cmvnw failed (%d)\n", ret);
        preemphasis = nullptr;
        EIDSP_ERR(ret);
    }

    // flatten to a single row vector
    output_matrix->cols = out_matrix_size.rows * out_matrix_size.cols;
    output_matrix->rows = 1;

    // BUGFIX: `preemphasis` pointed at the stack-local `pre`, which dies when
    // this function returns. Reset the global on every exit path so later
    // callers (e.g. the spectrogram slice path, which deletes a non-null
    // `preemphasis` on error) never touch a dangling pointer. The per-slice
    // MFCC extractor already does this.
    preemphasis = nullptr;

    return EIDSP_OK;
}
|
|
|
|
|
|
|
|
/**
 * @brief Run MFCC over one slice and append the result to a rolling output matrix.
 *
 * Shifts the existing contents of `output_matrix` left by the size of this
 * slice's output (oldest features fall off the front), then computes the MFCC
 * for `signal` directly into the freed space at the end of the buffer.
 * `matrix_size_out` accumulates the emitted row count across slices.
 *
 * @param signal                 Audio for this slice (already preemphasized by the caller).
 * @param output_matrix          Rolling feature buffer shared across slices.
 * @param config                 MFCC block configuration.
 * @param sampling_frequency     Sampling frequency in Hz.
 * @param matrix_size_out        In/out: accumulated output dimensions.
 * @param implementation_version Resolved implementation version for this run.
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
static int extract_mfcc_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfcc_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out, int implementation_version) {
    uint32_t frequency = (uint32_t)sampling_frequency;

    int x;

    // how much output this slice will produce
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfcc_buffer_size(
            signal->total_length, frequency, config->frame_length, config->frame_stride, config->num_cepstral,
            implementation_version);

    // roll the existing features left to make room at the tail
    x = numpy::roll(output_matrix->buffer, output_matrix->rows * output_matrix->cols,
        -(out_matrix_size.rows * out_matrix_size.cols));
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // non-owning view over the freed tail of the output buffer
    size_t output_matrix_offset = (output_matrix->rows * output_matrix->cols) -
        (out_matrix_size.rows * out_matrix_size.cols);

    matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset);

    // compute this slice's MFCC directly into the tail
    x = speechpy::feature::mfcc(&output_matrix_slice, signal,
        frequency, config->frame_length, config->frame_stride, config->num_cepstral, config->num_filters, config->fft_length,
        config->low_frequency, config->high_frequency, true, implementation_version);
    if (x != EIDSP_OK) {
        ei_printf("ERR: MFCC failed (%d)\n", x);
        EIDSP_ERR(x);
    }

    // accumulate emitted rows; cols is constant per config, so just record it
    matrix_size_out->rows += out_matrix_size.rows;
    if (out_matrix_size.cols > 0) {
        matrix_size_out->cols = out_matrix_size.cols;
    }

    return EIDSP_OK;
}
|
|
|
|
|
/**
 * @brief Compute MFCC features for continuous (sliced) classification.
 *
 * Maintains carry-over state between invocations in the globals
 * ei_dsp_cont_current_frame{,_size,_ix}: samples at the end of one slice that
 * belong to a window overlapping the next slice are stashed and completed on
 * the next call. Each completed window is pushed through
 * extract_mfcc_run_slice(), which rolls the shared output matrix.
 *
 * Not supported when compiled with C linkage (needs the C++ machinery above).
 *
 * @param signal             Audio for this slice.
 * @param output_matrix      Rolling feature matrix shared across calls.
 * @param config_ptr         Pointer to an ei_dsp_config_mfcc_t.
 * @param sampling_frequency Sampling frequency in Hz.
 * @param matrix_size_out    Out: dimensions of the features emitted this call.
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
__attribute__((unused)) int extract_mfcc_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) {
#if defined(__cplusplus) && EI_C_LINKAGE == 1
    ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n");
    EIDSP_ERR(EIDSP_NOT_SUPPORTED);
#else

    ei_dsp_config_mfcc_t config = *((ei_dsp_config_mfcc_t*)config_ptr);

    // MFCC only supports a single audio axis
    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    // only implementation versions 1..4 are known
    if((config.implementation_version == 0) || (config.implementation_version > 4)) {
        EIDSP_ERR(EIDSP_BLOCK_VERSION_INCORRECT);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // preemphasis filter exposed through the global trampoline; reset to
    // nullptr before returning (see end of function)
    class speechpy::processing::preemphasis pre(signal, config.pre_shift, config.pre_cof, false);
    preemphasis = &pre;

    signal_t preemphasized_audio_signal;
    preemphasized_audio_signal.total_length = signal->total_length;
    preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data;

    // window geometry in samples; overlap is what carries over between slices
    const size_t frame_length_values = frequency * config.frame_length;
    const size_t frame_stride_values = frequency * config.frame_stride;
    const int frame_overlap_values = static_cast<int>(frame_length_values) - static_cast<int>(frame_stride_values);

    if (frame_overlap_values < 0) {
        ei_printf("ERR: frame_length (");
        ei_printf_float(config.frame_length);
        ei_printf(") cannot be lower than frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") for continuous classification\n");
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    int x;

    // window size changed (e.g. different model) -> drop the old carry buffer
    if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) {
        ei_free(ei_dsp_cont_current_frame);
        ei_dsp_cont_current_frame = nullptr;
    }

    int implementation_version = config.implementation_version;

    size_t offset_in_signal = 0;

    if (!ei_dsp_cont_current_frame) {
        ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1);
        if (!ei_dsp_cont_current_frame) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        ei_dsp_cont_current_frame_size = frame_length_values;
        ei_dsp_cont_current_frame_ix = 0;
    }

    // this slice plus the carried-over samples must cover at least one window
    if ((frame_length_values) > preemphasized_audio_signal.total_length + ei_dsp_cont_current_frame_ix) {
        ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n",
            (int)frame_length_values, (int)preemphasized_audio_signal.total_length + ei_dsp_cont_current_frame_ix);
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    matrix_size_out->rows = 0;
    matrix_size_out->cols = 0;

    // v1 had framing quirks in continuous mode; run it as v2
    if (implementation_version == 1) {
        implementation_version = 2;
    }

    // sanity-check the carried index before using it as a buffer offset
    if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) {
        ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size (ix=%d size=%d)\n",
            ei_dsp_cont_current_frame_ix, (int)ei_dsp_cont_current_frame_size);
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // complete and process every window that started in the previous slice
    while (ei_dsp_cont_current_frame_ix > 0) {
        // top up the carry buffer with the head of the new slice
        x = preemphasized_audio_signal.get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        // wrap the completed window in a signal_t and extract its features
        signal_t frame_signal;
        x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        x = extract_mfcc_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out, implementation_version);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        // advance the carry buffer by one stride
        if (frame_stride_values > 0) {
            numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -frame_stride_values);
        }

        ei_dsp_cont_current_frame_ix -= frame_stride_values;
    }

    // a negative index means part of the new slice was already consumed above
    if (ei_dsp_cont_current_frame_ix < 0) {
        offset_in_signal = -ei_dsp_cont_current_frame_ix;
        ei_dsp_cont_current_frame_ix = 0;
    }

    // the whole slice was consumed by carried windows; nothing more to do
    if (offset_in_signal >= signal->total_length) {
        offset_in_signal -= signal->total_length;
        return EIDSP_OK;
    }

    // process the remainder of this slice in one go
    SignalWithRange signal_with_range(&preemphasized_audio_signal, offset_in_signal, signal->total_length);

    signal_t *range_signal = signal_with_range.get_signal();
    size_t range_signal_orig_length = range_signal->total_length;

    x = extract_mfcc_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out, implementation_version);
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // run_slice may have shortened the view; restore before measuring usage
    range_signal->total_length = range_signal_orig_length;

    // figure out how many samples the framing actually consumed
    int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency,
        config.frame_length, config.frame_stride, false, implementation_version);
    offset_in_signal += length_of_signal_used;

    // stash the unconsumed tail (plus the overlap) for the next invocation
    int bytes_left_end_of_frame = signal->total_length - offset_in_signal;
    bytes_left_end_of_frame += frame_overlap_values;

    if (bytes_left_end_of_frame > 0) {
        x = preemphasized_audio_signal.get_data(
            (preemphasized_audio_signal.total_length - bytes_left_end_of_frame),
            bytes_left_end_of_frame,
            ei_dsp_cont_current_frame);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }
    }

    ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame;

    // `pre` goes out of scope here: clear the global so nothing dangles
    preemphasis = nullptr;

    return EIDSP_OK;
#endif
}
|
|
|
|
|
/**
 * @brief Compute spectrogram features over a complete audio signal.
 *
 * Frames the signal, computes the power spectrum per frame via speechpy,
 * normalizes (whole-matrix normalization before v3, noise-floor based
 * normalization from v3 on), and flattens the result into a 1 x N row vector.
 *
 * @param signal             Raw audio input (single axis only).
 * @param output_matrix      Destination matrix; must be at least as large as
 *                           the calculated spectrogram output.
 * @param config_ptr         Pointer to an ei_dsp_config_spectrogram_t.
 * @param sampling_frequency Sampling frequency of the signal in Hz.
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
__attribute__((unused)) int extract_spectrogram_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) {
    ei_dsp_config_spectrogram_t config = *((ei_dsp_config_spectrogram_t*)config_ptr);

    // spectrogram only supports a single audio axis
    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // spectrogram output has fft_length/2 + 1 bins per frame (one-sided spectrum)
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfe_buffer_size(
            signal->total_length, frequency, config.frame_length, config.frame_stride, config.fft_length / 2 + 1,
            config.implementation_version);

    if (out_matrix_size.rows * out_matrix_size.cols > output_matrix->rows * output_matrix->cols) {
        ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols);
        ei_printf("calculated size = %dx%d\n", (int)out_matrix_size.rows, (int)out_matrix_size.cols);
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    output_matrix->rows = out_matrix_size.rows;
    output_matrix->cols = out_matrix_size.cols;

    int ret = speechpy::feature::spectrogram(output_matrix, signal,
        sampling_frequency, config.frame_length, config.frame_stride, config.fft_length, config.implementation_version);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: Spectrogram failed (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    if (config.implementation_version < 3) {
        // legacy: scale the whole matrix to 0..1
        ret = numpy::normalize(output_matrix);
        if (ret != EIDSP_OK) {
            EIDSP_ERR(ret);
        }
    }
    else {
        // v3+: dB conversion with a configurable noise floor
        ret = speechpy::processing::spectrogram_normalization(output_matrix, config.noise_floor_db);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: normalization failed (%d)\n", ret);
            EIDSP_ERR(ret);
        }
    }

    // flatten to a single row vector
    output_matrix->cols = out_matrix_size.rows * out_matrix_size.cols;
    output_matrix->rows = 1;

    return EIDSP_OK;
}
|
|
|
|
|
|
|
|
/**
 * @brief Run the spectrogram over one slice and append it to a rolling output matrix.
 *
 * Shifts the existing contents of `output_matrix` left by the size of this
 * slice's output, then computes the spectrogram for `signal` directly into the
 * freed space at the end of the buffer. `matrix_size_out` accumulates the
 * emitted row count across slices.
 *
 * @param signal             Audio for this slice.
 * @param output_matrix      Rolling feature buffer shared across slices.
 * @param config             Spectrogram block configuration.
 * @param sampling_frequency Sampling frequency in Hz.
 * @param matrix_size_out    In/out: accumulated output dimensions.
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
static int extract_spectrogram_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_spectrogram_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) {
    uint32_t frequency = (uint32_t)sampling_frequency;

    int x;

    // how much output this slice will produce (fft_length/2 + 1 bins per frame)
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfe_buffer_size(
            signal->total_length, frequency, config->frame_length, config->frame_stride, config->fft_length / 2 + 1,
            config->implementation_version);

    // roll the existing features left to make room at the tail.
    // BUGFIX: the previous error path deleted the global `preemphasis` here.
    // The spectrogram pipeline never owns (or even sets) that pointer — it may
    // reference a stack object from the MFCC path or memory the MFE path
    // already freed — so deleting it risked a double free / UB. The sibling
    // MFCC/MFE run_slice helpers correctly just propagate the error.
    x = numpy::roll(output_matrix->buffer, output_matrix->rows * output_matrix->cols,
        -(out_matrix_size.rows * out_matrix_size.cols));
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // non-owning view over the freed tail of the output buffer
    size_t output_matrix_offset = (output_matrix->rows * output_matrix->cols) -
        (out_matrix_size.rows * out_matrix_size.cols);

    matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset);

    // compute this slice's spectrogram directly into the tail
    int ret = speechpy::feature::spectrogram(&output_matrix_slice, signal,
        frequency, config->frame_length, config->frame_stride, config->fft_length, config->implementation_version);

    if (ret != EIDSP_OK) {
        ei_printf("ERR: Spectrogram failed (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    // accumulate emitted rows; cols is constant per config, so just record it
    matrix_size_out->rows += out_matrix_size.rows;
    if (out_matrix_size.cols > 0) {
        matrix_size_out->cols = out_matrix_size.cols;
    }

    return EIDSP_OK;
}
|
|
|
|
|
/**
 * @brief Compute spectrogram features for continuous (sliced) classification.
 *
 * Maintains carry-over state between invocations in the globals
 * ei_dsp_cont_current_frame{,_size,_ix}: samples at the end of one slice that
 * belong to a window overlapping the next slice are stashed and completed on
 * the next call. Each completed window goes through
 * extract_spectrogram_run_slice(), which rolls the shared output matrix.
 *
 * Not supported when compiled with C linkage.
 *
 * @param signal             Audio for this slice. NOTE: for implementation
 *                           version < 2 this function temporarily mutates
 *                           signal->total_length (restored before returning).
 * @param output_matrix      Rolling feature matrix shared across calls.
 * @param config_ptr         Pointer to an ei_dsp_config_spectrogram_t.
 * @param sampling_frequency Sampling frequency in Hz.
 * @param matrix_size_out    Out: dimensions of the features emitted this call.
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
__attribute__((unused)) int extract_spectrogram_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) {
#if defined(__cplusplus) && EI_C_LINKAGE == 1
    ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n");
    EIDSP_ERR(EIDSP_NOT_SUPPORTED);
#else

    ei_dsp_config_spectrogram_t config = *((ei_dsp_config_spectrogram_t*)config_ptr);

    // tracks whether this extractor has run before (v1 framing compensation)
    static bool first_run = false;

    // spectrogram only supports a single audio axis
    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // v1 framing compensation: from the second run on, pretend the slice is one
    // frame longer so the carried overlap is accounted for (undone at the end)
    if(config.implementation_version < 2) {
        if (first_run == true) {
            signal->total_length += (size_t)(config.frame_length * (float)frequency);
        }

        first_run = true;
    }

    // window geometry in samples; overlap is what carries over between slices
    const size_t frame_length_values = frequency * config.frame_length;
    const size_t frame_stride_values = frequency * config.frame_stride;
    const int frame_overlap_values = static_cast<int>(frame_length_values) - static_cast<int>(frame_stride_values);

    if (frame_overlap_values < 0) {
        ei_printf("ERR: frame_length (");
        ei_printf_float(config.frame_length);
        ei_printf(") cannot be lower than frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") for continuous classification\n");
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    if (frame_length_values > signal->total_length) {
        ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n",
            (int)frame_length_values, (int)signal->total_length);
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    int x;

    // window size changed (e.g. different model) -> drop the old carry buffer
    if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) {
        ei_free(ei_dsp_cont_current_frame);
        ei_dsp_cont_current_frame = nullptr;
    }

    if (!ei_dsp_cont_current_frame) {
        ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1);
        if (!ei_dsp_cont_current_frame) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        ei_dsp_cont_current_frame_size = frame_length_values;
        ei_dsp_cont_current_frame_ix = 0;
    }

    matrix_size_out->rows = 0;
    matrix_size_out->cols = 0;

    size_t offset_in_signal = 0;

    // sanity-check the carried index before using it as a buffer offset
    if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) {
        ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size\n");
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // complete and process every window that started in the previous slice
    while (ei_dsp_cont_current_frame_ix > 0) {
        // top up the carry buffer with the head of the new slice
        x = signal->get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        // wrap the completed window in a signal_t and extract its features
        signal_t frame_signal;
        x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        x = extract_spectrogram_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        // advance the carry buffer by one stride
        if (frame_stride_values > 0) {
            numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -frame_stride_values);
        }

        ei_dsp_cont_current_frame_ix -= frame_stride_values;
    }

    // a negative index means part of the new slice was already consumed above
    if (ei_dsp_cont_current_frame_ix < 0) {
        offset_in_signal = -ei_dsp_cont_current_frame_ix;
        ei_dsp_cont_current_frame_ix = 0;
    }

    // the whole slice was consumed by carried windows; nothing more to do
    if (offset_in_signal >= signal->total_length) {
        offset_in_signal -= signal->total_length;
        return EIDSP_OK;
    }

    // process the remainder of this slice in one go
    SignalWithRange signal_with_range(signal, offset_in_signal, signal->total_length);

    signal_t *range_signal = signal_with_range.get_signal();
    size_t range_signal_orig_length = range_signal->total_length;

    x = extract_spectrogram_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out);
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // figure out how many samples the framing actually consumed
    int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency,
        config.frame_length, config.frame_stride, false, config.implementation_version);
    offset_in_signal += length_of_signal_used;

    // run_slice may have shortened the view; restore it
    range_signal->total_length = range_signal_orig_length;

    // stash the unconsumed tail (plus the overlap) for the next invocation
    int bytes_left_end_of_frame = signal->total_length - offset_in_signal;
    bytes_left_end_of_frame += frame_overlap_values;

    if (bytes_left_end_of_frame > 0) {
        x = signal->get_data(
            (signal->total_length - bytes_left_end_of_frame),
            bytes_left_end_of_frame,
            ei_dsp_cont_current_frame);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }
    }

    ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame;

    // undo the v1 framing compensation applied at the top
    if (config.implementation_version < 2) {
        if (first_run == true) {
            signal->total_length -= (size_t)(config.frame_length * (float)frequency);
        }
    }

    return EIDSP_OK;
#endif
}
|
|
|
|
|
|
|
|
/**
 * @brief Compute MFE (mel filterbank energy) features over a complete audio signal.
 *
 * For implementation version >= 3 the signal is first run through a
 * heap-allocated preemphasis filter; earlier versions read the raw signal.
 * Features are normalized (cmvnw before v3, noise-floor normalization from v3)
 * and flattened into a 1 x N row vector.
 *
 * @param signal             Raw audio input (single axis only).
 * @param output_matrix      Destination matrix; must be at least as large as
 *                           the calculated MFE output.
 * @param config_ptr         Pointer to an ei_dsp_config_mfe_t.
 * @param sampling_frequency Sampling frequency of the signal in Hz.
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
__attribute__((unused)) int extract_mfe_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) {
    ei_dsp_config_mfe_t config = *((ei_dsp_config_mfe_t*)config_ptr);

    // MFE only supports a single audio axis
    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    signal_t preemphasized_audio_signal;

    if (config.implementation_version < 3) {
        // pre-v3: no preemphasis, read the raw signal directly
        preemphasis = nullptr;

        preemphasized_audio_signal.total_length = signal->total_length;
        preemphasized_audio_signal.get_data = signal->get_data;
    }
    else {
        // v3+: fixed preemphasis (shift 1, coefficient 0.98); owned here,
        // freed below before returning
        class speechpy::processing::preemphasis *pre = new class speechpy::processing::preemphasis(signal, 1, 0.98f, true);
        preemphasis = pre;

        preemphasized_audio_signal.total_length = signal->total_length;
        preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data;
    }

    // verify the caller's buffer can hold the calculated MFE output
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfe_buffer_size(
            preemphasized_audio_signal.total_length, frequency, config.frame_length, config.frame_stride, config.num_filters,
            config.implementation_version);

    if (out_matrix_size.rows * out_matrix_size.cols > output_matrix->rows * output_matrix->cols) {
        ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols);
        ei_printf("calculated size = %dx%d\n", (int)out_matrix_size.rows, (int)out_matrix_size.cols);
        if (preemphasis) {
            delete preemphasis;
            // BUGFIX: clear the global after freeing so no dangling pointer is
            // left behind (other code paths delete a non-null `preemphasis`)
            preemphasis = nullptr;
        }
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    output_matrix->rows = out_matrix_size.rows;
    output_matrix->cols = out_matrix_size.cols;

    int ret;

    // NOTE(review): naming is counter-intuitive here — `mfe` appears to be the
    // newer implementation used for version > 2, while `mfe_v3` is the legacy
    // path kept for v1/v2; mirrors extract_mfe_run_slice below.
    if (config.implementation_version > 2) {
        ret = speechpy::feature::mfe(output_matrix, nullptr, &preemphasized_audio_signal,
            frequency, config.frame_length, config.frame_stride, config.num_filters, config.fft_length,
            config.low_frequency, config.high_frequency, config.implementation_version);
    } else {
        ret = speechpy::feature::mfe_v3(output_matrix, nullptr, &preemphasized_audio_signal,
            frequency, config.frame_length, config.frame_stride, config.num_filters, config.fft_length,
            config.low_frequency, config.high_frequency, config.implementation_version);
    }

    if (preemphasis) {
        delete preemphasis;
        // BUGFIX: as above — reset the global so it never dangles after free
        preemphasis = nullptr;
    }
    if (ret != EIDSP_OK) {
        ei_printf("ERR: MFE failed (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    if (config.implementation_version < 3) {
        // legacy: windowed cepstral mean/variance normalization
        ret = speechpy::processing::cmvnw(output_matrix, config.win_size, false, true);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: cmvnw failed (%d)\n", ret);
            EIDSP_ERR(ret);
        }
    }
    else {
        // v3+: dB conversion with a configurable noise floor
        ret = speechpy::processing::mfe_normalization(output_matrix, config.noise_floor_db);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: normalization failed (%d)\n", ret);
            EIDSP_ERR(ret);
        }
    }

    // flatten to a single row vector
    output_matrix->cols = out_matrix_size.rows * out_matrix_size.cols;
    output_matrix->rows = 1;

    return EIDSP_OK;
}
|
|
|
|
|
/**
 * @brief Run MFE over one slice and append the result to a rolling output matrix.
 *
 * Shifts the existing contents of `output_matrix` left by the size of this
 * slice's output, then computes the MFE features for `signal` directly into
 * the freed space at the end of the buffer. `matrix_size_out` accumulates the
 * emitted row count across slices.
 *
 * @param signal             Audio for this slice (preemphasized by the caller
 *                           when the implementation version requires it).
 * @param output_matrix      Rolling feature buffer shared across slices.
 * @param config             MFE block configuration.
 * @param sampling_frequency Sampling frequency in Hz.
 * @param matrix_size_out    In/out: accumulated output dimensions.
 * @return EIDSP_OK on success, or an EIDSP_* error code.
 */
static int extract_mfe_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfe_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) {
    uint32_t frequency = (uint32_t)sampling_frequency;

    int x;

    // how much output this slice will produce
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfe_buffer_size(
            signal->total_length, frequency, config->frame_length, config->frame_stride, config->num_filters,
            config->implementation_version);

    // roll the existing features left to make room at the tail
    x = numpy::roll(output_matrix->buffer, output_matrix->rows * output_matrix->cols,
        -(out_matrix_size.rows * out_matrix_size.cols));
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // non-owning view over the freed tail of the output buffer
    size_t output_matrix_offset = (output_matrix->rows * output_matrix->cols) -
        (out_matrix_size.rows * out_matrix_size.cols);

    matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset);

    // NOTE(review): naming is counter-intuitive — `mfe` appears to be the newer
    // implementation used for version > 2, while `mfe_v3` is the legacy path
    // kept for v1/v2; mirrors extract_mfe_features above.
    if (config->implementation_version > 2) {
        x = speechpy::feature::mfe(&output_matrix_slice, nullptr, signal,
            frequency, config->frame_length, config->frame_stride, config->num_filters, config->fft_length,
            config->low_frequency, config->high_frequency, config->implementation_version);
    } else {
        x = speechpy::feature::mfe_v3(&output_matrix_slice, nullptr, signal,
            frequency, config->frame_length, config->frame_stride, config->num_filters, config->fft_length,
            config->low_frequency, config->high_frequency, config->implementation_version);
    }
    if (x != EIDSP_OK) {
        ei_printf("ERR: MFE failed (%d)\n", x);
        EIDSP_ERR(x);
    }

    // accumulate emitted rows; cols is constant per config, so just record it
    matrix_size_out->rows += out_matrix_size.rows;
    if (out_matrix_size.cols > 0) {
        matrix_size_out->cols = out_matrix_size.cols;
    }

    return EIDSP_OK;
}
|
|
|
|
|
/**
 * Extract MFE features in continuous ("per slice") mode.
 *
 * Audio arrives in consecutive slices; state is carried between calls in a
 * rolling frame buffer (the file-scope `ei_dsp_cont_current_frame*` globals
 * declared earlier in this file) so that frames straddling slice boundaries
 * are still processed. NOTE(review): `preemphasis` is also a file-scope
 * pointer declared earlier in this file.
 *
 * @param signal             Audio slice to process (single axis only)
 * @param output_matrix      Rolling output matrix; rows are appended at the
 *                           end by extract_mfe_run_slice
 * @param config_ptr         Pointer to ei_dsp_config_mfe_t (copied locally)
 * @param sampling_frequency Sampling rate in Hz
 * @param matrix_size_out    Receives the rows/cols produced by this call
 * @returns EIDSP_OK on success, an EIDSP error code otherwise
 */
__attribute__((unused)) int extract_mfe_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) {
#if defined(__cplusplus) && EI_C_LINKAGE == 1
    ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n");
    EIDSP_ERR(EIDSP_NOT_SUPPORTED);
#else

    ei_dsp_config_mfe_t config = *((ei_dsp_config_mfe_t*)config_ptr);

    // implementation_version 1 extends the signal on every run after the
    // first (see below); this flag survives across calls
    static bool first_run = false;

    // continuous mode only supports a single audio axis
    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // v1: on every run after the first, pretend the signal is one frame
    // longer (presumably the caller's buffer holds that history — TODO
    // confirm); the length is restored near the end of this function
    if (config.implementation_version == 1) {
        if (first_run == true) {
            signal->total_length += (size_t)(config.frame_length * (float)frequency);
        }

        first_run = true;
    }

    // v3+ reads the audio through a pre-emphasis filter; older versions
    // read the raw signal directly
    signal_t preemphasized_audio_signal;

    if (config.implementation_version < 3) {
        preemphasis = nullptr;
        preemphasized_audio_signal.total_length = signal->total_length;
        preemphasized_audio_signal.get_data = signal->get_data;
    }
    else {
        // pre-emphasis filter with coefficient 0.98; deleted on every exit
        // path below
        class speechpy::processing::preemphasis *pre = new class speechpy::processing::preemphasis(signal, 1, 0.98f, true);
        preemphasis = pre;
        preemphasized_audio_signal.total_length = signal->total_length;
        preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data;
    }

    // frame geometry in samples
    const size_t frame_length_values = frequency * config.frame_length;
    const size_t frame_stride_values = frequency * config.frame_stride;
    const int frame_overlap_values = static_cast<int>(frame_length_values) - static_cast<int>(frame_stride_values);

    // negative overlap would leave gaps between frames, which streaming
    // cannot handle
    if (frame_overlap_values < 0) {
        ei_printf("ERR: frame_length (");
        ei_printf_float(config.frame_length);
        ei_printf(") cannot be lower than frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") for continuous classification\n");

        if (preemphasis) {
            delete preemphasis;
        }
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    if (frame_length_values > preemphasized_audio_signal.total_length) {
        ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n",
            (int)frame_length_values, (int)preemphasized_audio_signal.total_length);
        if (preemphasis) {
            delete preemphasis;
        }
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    int x;

    // frame size changed since the last call (e.g. different model/config):
    // drop the cached rolling frame
    if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) {
        ei_free(ei_dsp_cont_current_frame);
        ei_dsp_cont_current_frame = nullptr;
    }

    // (re)allocate the rolling frame buffer, zero-initialized
    if (!ei_dsp_cont_current_frame) {
        ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1);
        if (!ei_dsp_cont_current_frame) {
            if (preemphasis) {
                delete preemphasis;
            }
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        ei_dsp_cont_current_frame_size = frame_length_values;
        ei_dsp_cont_current_frame_ix = 0;
    }

    matrix_size_out->rows = 0;
    matrix_size_out->cols = 0;

    // how many samples of the *new* slice have been consumed so far
    size_t offset_in_signal = 0;

    // sanity check on carried-over state
    if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) {
        ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size\n");
        if (preemphasis) {
            delete preemphasis;
        }
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // first: finish any partially-filled frame carried over from the
    // previous slice, running MFE on each completed frame
    while (ei_dsp_cont_current_frame_ix > 0) {
        // top up the carried-over samples with data from the new slice
        x = preemphasized_audio_signal.get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix);
        if (x != EIDSP_OK) {
            if (preemphasis) {
                delete preemphasis;
            }
            EIDSP_ERR(x);
        }

        // wrap the completed frame in a signal and run MFE on it
        signal_t frame_signal;
        x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal);
        if (x != EIDSP_OK) {
            if (preemphasis) {
                delete preemphasis;
            }
            EIDSP_ERR(x);
        }

        x = extract_mfe_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out);
        if (x != EIDSP_OK) {
            if (preemphasis) {
                delete preemphasis;
            }
            EIDSP_ERR(x);
        }

        // slide the frame window forward by one stride
        if (frame_stride_values > 0) {
            numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -frame_stride_values);
        }

        // may go negative: that means we consumed samples from the new slice
        ei_dsp_cont_current_frame_ix -= frame_stride_values;
    }

    // a negative index records how much of the new slice was already used
    if (ei_dsp_cont_current_frame_ix < 0) {
        offset_in_signal = -ei_dsp_cont_current_frame_ix;
        ei_dsp_cont_current_frame_ix = 0;
    }

    // carried-over frames already consumed the whole slice: nothing more to do
    if (offset_in_signal >= signal->total_length) {
        if (preemphasis) {
            delete preemphasis;
        }
        offset_in_signal -= signal->total_length;
        return EIDSP_OK;
    }

    // then: run MFE over the remainder of the slice
    SignalWithRange signal_with_range(&preemphasized_audio_signal, offset_in_signal, signal->total_length);

    signal_t *range_signal = signal_with_range.get_signal();
    size_t range_signal_orig_length = range_signal->total_length;

    x = extract_mfe_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out);
    if (x != EIDSP_OK) {
        if (preemphasis) {
            delete preemphasis;
        }
        EIDSP_ERR(x);
    }

    // how many samples MFE actually consumed from the range
    int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency,
        config.frame_length, config.frame_stride, false, config.implementation_version);
    offset_in_signal += length_of_signal_used;

    // restore the range length (extract_mfe_run_slice may have changed it —
    // TODO confirm against that function)
    range_signal->total_length = range_signal_orig_length;

    // stash the unconsumed tail (plus the frame overlap) into the rolling
    // frame buffer for the next call
    int bytes_left_end_of_frame = signal->total_length - offset_in_signal;
    bytes_left_end_of_frame += frame_overlap_values;

    if (bytes_left_end_of_frame > 0) {
        // copy the tail of the (pre-emphasized) slice to the frame buffer
        x = preemphasized_audio_signal.get_data(
            (preemphasized_audio_signal.total_length - bytes_left_end_of_frame),
            bytes_left_end_of_frame,
            ei_dsp_cont_current_frame);
        if (x != EIDSP_OK) {
            if (preemphasis) {
                delete preemphasis;
            }
            EIDSP_ERR(x);
        }
    }

    ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame;

    // undo the v1 signal-length extension applied at the top
    if (config.implementation_version == 1) {
        if (first_run == true) {
            signal->total_length -= (size_t)(config.frame_length * (float)frequency);
        }
    }

    if (preemphasis) {
        delete preemphasis;
    }

    return EIDSP_OK;
#endif
}
|
|
|
|
|
/**
 * Decode packed 0xRRGGBB pixels from `signal` into normalized float features.
 *
 * RGB mode emits three values per pixel (each channel / 255); grayscale mode
 * emits one value per pixel using the ITU-R 601-2 luma transform. The signal
 * is read page by page to bound peak memory usage.
 *
 * @param signal        Pixel stream; each float holds one packed RGB pixel
 * @param output_matrix Destination for the normalized features
 * @param config_ptr    Pointer to ei_dsp_config_image_t (copied locally)
 * @param frequency     Unused for image blocks
 * @returns EIDSP_OK on success, an EIDSP error code otherwise
 */
__attribute__((unused)) int extract_image_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) {
    ei_dsp_config_image_t cfg = *((ei_dsp_config_image_t*)config_ptr);

    const bool grayscale = strcmp(cfg.channels, "Grayscale") == 0;

    size_t out_pos = 0;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
    const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE;
#else
    const size_t page_size = 1024;
#endif

    // process the signal one page at a time
    size_t remaining = signal->total_length;
    for (size_t pos = 0; pos < signal->total_length; pos += page_size) {
        const size_t chunk = remaining < page_size ? remaining : page_size;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
        matrix_t page(chunk, cfg.axes, ei_dsp_image_buffer);
#else
        matrix_t page(chunk, cfg.axes);
#endif
        if (!page.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        signal->get_data(pos, chunk, page.buffer);

        for (size_t px = 0; px < chunk; px++) {
            const uint32_t pixel = static_cast<uint32_t>(page.buffer[px]);

            // unpack the 8-bit channels and scale each to [0, 1]
            const float r = static_cast<float>(pixel >> 16 & 0xff) / 255.0f;
            const float g = static_cast<float>(pixel >> 8 & 0xff) / 255.0f;
            const float b = static_cast<float>(pixel & 0xff) / 255.0f;

            if (grayscale) {
                // ITU-R 601-2 luma transform
                output_matrix->buffer[out_pos++] = (0.299f * r) + (0.587f * g) + (0.114f * b);
            }
            else {
                output_matrix->buffer[out_pos++] = r;
                output_matrix->buffer[out_pos++] = g;
                output_matrix->buffer[out_pos++] = b;
            }
        }

        remaining -= chunk;
    }

    return EIDSP_OK;
}
|
|
|
|
|
#if (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_DRPAI) |
|
|
|
|
|
/**
 * Extract raw uint8 RGB features for the DRP-AI accelerator.
 *
 * Each packed 0xRRGGBB pixel from `signal` is split into three uint8 channel
 * values written to `output_matrix` unchanged (DRP-AI takes unnormalized
 * bytes). Reads the signal page by page to bound peak memory usage.
 *
 * @param signal        Pixel stream; each float holds one packed RGB pixel
 * @param output_matrix Destination uint8 matrix
 * @param config_ptr    Pointer to ei_dsp_config_image_t (copied locally)
 * @param frequency     Unused for image blocks
 * @returns EIDSP_OK on success, an EIDSP error code otherwise
 */
__attribute__((unused)) int extract_drpai_features_quantized(signal_t *signal, matrix_u8_t *output_matrix, void *config_ptr, const float frequency) {
    ei_dsp_config_image_t cfg = *((ei_dsp_config_image_t*)config_ptr);

    const bool grayscale = strcmp(cfg.channels, "Grayscale") == 0;

    size_t out_pos = 0;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
    const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE;
#else
    const size_t page_size = 1024;
#endif

    // process the signal one page at a time
    size_t remaining = signal->total_length;
    for (size_t pos = 0; pos < signal->total_length; pos += page_size) {
        const size_t chunk = remaining < page_size ? remaining : page_size;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
        matrix_t page(chunk, cfg.axes, ei_dsp_image_buffer);
#else
        matrix_t page(chunk, cfg.axes);
#endif
        if (!page.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        signal->get_data(pos, chunk, page.buffer);

        for (size_t px = 0; px < chunk; px++) {
            const uint32_t pixel = static_cast<uint32_t>(page.buffer[px]);

            if (!grayscale) {
                // raw 8-bit channels, no normalization
                output_matrix->buffer[out_pos++] = static_cast<uint8_t>(pixel >> 16 & 0xff);
                output_matrix->buffer[out_pos++] = static_cast<uint8_t>(pixel >> 8 & 0xff);
                output_matrix->buffer[out_pos++] = static_cast<uint8_t>(pixel & 0xff);
            }
            else {
                // grayscale intentionally emits nothing, matching the
                // original — looks unimplemented for DRP-AI; TODO confirm
            }
        }

        remaining -= chunk;
    }

    return EIDSP_OK;
}
|
|
|
|
|
#endif |
|
|
|
|
|
#if (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE != EI_CLASSIFIER_DRPAI) |
|
|
|
|
|
/**
 * Decode packed 0xRRGGBB pixels into quantized int8 features.
 *
 * Applies the selected image scaling (none or Torch mean/std normalization),
 * then quantizes with `value / scale + zero_point`. A fixed-point fast path
 * is taken when the quantization parameters are exactly scale = 1/255 and
 * zero_point = -128 with no extra scaling (i.e. a plain uint8 -> int8 shift).
 *
 * @param signal        Pixel stream; each float holds one packed RGB pixel
 * @param output_matrix Destination int8 matrix
 * @param config_ptr    Pointer to ei_dsp_config_image_t (copied locally)
 * @param scale         Quantization scale of the model input tensor
 * @param zero_point    Quantization zero point of the model input tensor
 * @param frequency     Unused for image blocks
 * @param image_scaling One of the EI_CLASSIFIER_IMAGE_SCALING_* constants
 * @returns EIDSP_OK on success, an EIDSP error code otherwise
 */
__attribute__((unused)) int extract_image_features_quantized(signal_t *signal, matrix_i8_t *output_matrix, void *config_ptr, float scale, float zero_point, const float frequency,
    int image_scaling) {
    ei_dsp_config_image_t config = *((ei_dsp_config_image_t*)config_ptr);

    int16_t channel_count = strcmp(config.channels, "Grayscale") == 0 ? 1 : 3;

    size_t output_ix = 0;

    // ITU-R 601-2 luma weights in 16.16 fixed point for the integer fast path
    const int32_t iRedToGray = (int32_t)(0.299f * 65536.0f);
    const int32_t iGreenToGray = (int32_t)(0.587f * 65536.0f);
    const int32_t iBlueToGray = (int32_t)(0.114f * 65536.0f);

    // torchvision's standard ImageNet normalization constants
    static const float torch_mean[] = { 0.485, 0.456, 0.406 };
    static const float torch_std[] = { 0.229, 0.224, 0.225 };

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
    const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE;
#else
    const size_t page_size = 1024;
#endif

    // read the signal page by page to bound peak memory usage
    size_t bytes_left = signal->total_length;
    for (size_t ix = 0; ix < signal->total_length; ix += page_size) {
        size_t elements_to_read = bytes_left > page_size ? page_size : bytes_left;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
        matrix_t input_matrix(elements_to_read, config.axes, ei_dsp_image_buffer);
#else
        matrix_t input_matrix(elements_to_read, config.axes);
#endif
        if (!input_matrix.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        signal->get_data(ix, elements_to_read, input_matrix.buffer);

        for (size_t jx = 0; jx < elements_to_read; jx++) {
            uint32_t pixel = static_cast<uint32_t>(input_matrix.buffer[jx]);

            if (channel_count == 3) {

                // fast path: scale == 1/255 and zero_point == -128 means the
                // quantized value is just the raw channel shifted by -128
                if (scale == 0.003921568859368563f && zero_point == -128 && image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) {
                    int32_t r = static_cast<int32_t>(pixel >> 16 & 0xff);
                    int32_t g = static_cast<int32_t>(pixel >> 8 & 0xff);
                    int32_t b = static_cast<int32_t>(pixel & 0xff);

                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(r + zero_point);
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(g + zero_point);
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(b + zero_point);
                }

                // general path: scale to [0,1], optionally Torch-normalize,
                // then quantize
                else {
                    float r = static_cast<float>(pixel >> 16 & 0xff);
                    float g = static_cast<float>(pixel >> 8 & 0xff);
                    float b = static_cast<float>(pixel & 0xff);

                    if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) {
                        r /= 255.0f;
                        g /= 255.0f;
                        b /= 255.0f;
                    }
                    else if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_TORCH) {
                        r /= 255.0f;
                        g /= 255.0f;
                        b /= 255.0f;

                        // per-channel (x - mean) / std
                        r = (r - torch_mean[0]) / torch_std[0];
                        g = (g - torch_mean[1]) / torch_std[1];
                        b = (b - torch_mean[2]) / torch_std[2];
                    }

                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(round(r / scale) + zero_point);
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(round(g / scale) + zero_point);
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(round(b / scale) + zero_point);
                }
            }
            else {

                // grayscale fast path: integer luma in 16.16 fixed point,
                // then shift by zero_point and clamp to int8 range
                if (scale == 0.003921568859368563f && zero_point == -128 && image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) {
                    int32_t r = static_cast<int32_t>(pixel >> 16 & 0xff);
                    int32_t g = static_cast<int32_t>(pixel >> 8 & 0xff);
                    int32_t b = static_cast<int32_t>(pixel & 0xff);

                    int32_t gray = (iRedToGray * r) + (iGreenToGray * g) + (iBlueToGray * b);
                    gray >>= 16;  // back from 16.16 fixed point
                    gray += zero_point;
                    if (gray < - 128) gray = -128;
                    else if (gray > 127) gray = 127;
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(gray);
                }

                // grayscale general path
                else {
                    float r = static_cast<float>(pixel >> 16 & 0xff);
                    float g = static_cast<float>(pixel >> 8 & 0xff);
                    float b = static_cast<float>(pixel & 0xff);

                    if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) {
                        r /= 255.0f;
                        g /= 255.0f;
                        b /= 255.0f;
                    }
                    else if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_TORCH) {
                        r /= 255.0f;
                        g /= 255.0f;
                        b /= 255.0f;

                        r = (r - torch_mean[0]) / torch_std[0];
                        g = (g - torch_mean[1]) / torch_std[1];
                        b = (b - torch_mean[2]) / torch_std[2];
                    }

                    // ITU-R 601-2 luma transform
                    float v = (0.299f * r) + (0.587f * g) + (0.114f * b);
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(round(v / scale) + zero_point);
                }
            }
        }

        bytes_left -= elements_to_read;

    }
    return EIDSP_OK;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__attribute__((unused)) int ei_dsp_clear_continuous_audio_state() { |
|
|
if (ei_dsp_cont_current_frame) { |
|
|
ei_free(ei_dsp_cont_current_frame); |
|
|
} |
|
|
|
|
|
ei_dsp_cont_current_frame = nullptr; |
|
|
ei_dsp_cont_current_frame_size = 0; |
|
|
ei_dsp_cont_current_frame_ix = 0; |
|
|
|
|
|
return EIDSP_OK; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Apply windowed cepstral mean/variance normalization (CMVN) to a flattened
 * MFCC feature matrix, in place.
 *
 * The 1 x N buffer is temporarily viewed as (frames x num_cepstral) for the
 * normalization, then flattened back to a single row.
 *
 * @param matrix     Flattened MFCC features (modified in place)
 * @param config_ptr Pointer to ei_dsp_config_mfcc_t
 */
__attribute__((unused)) void calc_cepstral_mean_and_var_normalization_mfcc(ei_matrix *matrix, void *config_ptr)
{
    ei_dsp_config_mfcc_t *config = (ei_dsp_config_mfcc_t *)config_ptr;

    const uint32_t flat_size = matrix->rows * matrix->cols;

    // view the flat buffer as (frames x num_cepstral)
    matrix->cols = config->num_cepstral;
    matrix->rows = flat_size / config->num_cepstral;

    const int ret = speechpy::processing::cmvnw(matrix, config->win_size, true, false);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: cmvnw failed (%d)\n", ret);
        return;
    }

    // flatten back to a single row
    matrix->rows = 1;
    matrix->cols = flat_size;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a flattened MFE feature matrix in place.
 *
 * The 1 x N buffer is temporarily viewed as (frames x num_filters); versions
 * before 3 use windowed CMVN, version 3+ uses noise-floor normalization.
 * Afterwards the buffer is flattened back to a single row.
 *
 * @param matrix     Flattened MFE features (modified in place)
 * @param config_ptr Pointer to ei_dsp_config_mfe_t
 */
__attribute__((unused)) void calc_cepstral_mean_and_var_normalization_mfe(ei_matrix *matrix, void *config_ptr)
{
    ei_dsp_config_mfe_t *config = (ei_dsp_config_mfe_t *)config_ptr;

    const uint32_t flat_size = matrix->rows * matrix->cols;

    // view the flat buffer as (frames x num_filters)
    matrix->cols = config->num_filters;
    matrix->rows = flat_size / config->num_filters;

    if (config->implementation_version < 3) {
        // legacy path: windowed cepstral mean normalization
        const int ret = speechpy::processing::cmvnw(matrix, config->win_size, false, true);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: cmvnw failed (%d)\n", ret);
            return;
        }
    }
    else {
        // v3+: scale against the configured noise floor
        const int ret = speechpy::processing::mfe_normalization(matrix, config->noise_floor_db);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: normalization failed (%d)\n", ret);
            return;
        }
    }

    // flatten back to a single row
    matrix->rows = 1;
    matrix->cols = flat_size;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a flattened spectrogram feature matrix in place.
 *
 * The 1 x N buffer is temporarily viewed as (frames x bins) with
 * fft_length / 2 + 1 bins per frame; versions before 3 use plain
 * normalization, version 3+ uses noise-floor normalization. Afterwards the
 * buffer is flattened back to a single row.
 *
 * @param matrix     Flattened spectrogram features (modified in place)
 * @param config_ptr Pointer to ei_dsp_config_spectrogram_t
 */
__attribute__((unused)) void calc_cepstral_mean_and_var_normalization_spectrogram(ei_matrix *matrix, void *config_ptr)
{
    ei_dsp_config_spectrogram_t *config = (ei_dsp_config_spectrogram_t *)config_ptr;

    const uint32_t flat_size = matrix->rows * matrix->cols;

    // view the flat buffer as (frames x bins): one row per FFT frame
    matrix->cols = config->fft_length / 2 + 1;
    matrix->rows = flat_size / matrix->cols;

    if (config->implementation_version < 3) {
        const int ret = numpy::normalize(matrix);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: normalization failed (%d)\n", ret);
            return;
        }
    }
    else {
        // v3+: scale against the configured noise floor
        const int ret = speechpy::processing::spectrogram_normalization(matrix, config->noise_floor_db);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: normalization failed (%d)\n", ret);
            return;
        }
    }

    // flatten back to a single row
    matrix->rows = 1;
    matrix->cols = flat_size;
}
|
|
|
|
|
#ifdef __cplusplus |
|
|
} |
|
|
#endif |
|
|
|
|
|
#endif |
|
|
|