File size: 58,259 Bytes

b7b614e

/*
 * Copyright (c) 2022 EdgeImpulse Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS
 * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#ifndef _EDGE_IMPULSE_RUN_DSP_H_
#define _EDGE_IMPULSE_RUN_DSP_H_

#include "edge-impulse-sdk/classifier/ei_model_types.h"
#include "edge-impulse-sdk/dsp/spectral/spectral.hpp"
#include "edge-impulse-sdk/dsp/speechpy/speechpy.hpp"
#include "edge-impulse-sdk/classifier/ei_signal_with_range.h"
#include "model-parameters/model_metadata.h"

// printf-style output function used for all diagnostics in this file. It is only declared
// here (the definition lives elsewhere); when compiling as C++ with EI_C_LINKAGE == 1 the
// declaration is given C linkage.
#if defined(__cplusplus) && EI_C_LINKAGE == 1
extern "C" {
    extern void ei_printf(const char *format, ...);
}
#else
extern void ei_printf(const char *format, ...);
#endif

#ifdef __cplusplus
// Everything that follows is placed in an unnamed namespace (its closing brace is further
// down the file).
namespace {
#endif // __cplusplus

// NOTE(review): file-scope using-directive inside a header; the unqualified speechpy:: /
// numpy:: / spectral:: names below presumably resolve through it - confirm before removing.
using namespace ei;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
// Statically sized float buffer, only present when EI_DSP_IMAGE_BUFFER_STATIC_SIZE is defined.
float ei_dsp_image_buffer[EI_DSP_IMAGE_BUFFER_STATIC_SIZE];
#endif

// this is the frame we work on... allocate it statically so we share between invocations
// State shared by the MFCC and spectrogram per-slice extractors below:
//  - ei_dsp_cont_current_frame:      buffer obtained from ei_calloc that carries the samples
//                                    left over at the end of one slice into the next call
//  - ei_dsp_cont_current_frame_size: number of floats that buffer was allocated for
//  - ei_dsp_cont_current_frame_ix:   number of samples currently stored in the buffer
static float *ei_dsp_cont_current_frame = nullptr;
static size_t ei_dsp_cont_current_frame_size = 0;
static int ei_dsp_cont_current_frame_ix = 0;

/**
 * Spectral-analysis DSP block: copies the raw signal into a (samples x axes) matrix and
 * dispatches to the wavelet / FFT feature extractor selected by the block configuration.
 *
 * @param signal        Raw input signal; only the first (total_length / axes) * axes samples
 *                      are used
 * @param output_matrix Matrix the selected extractor writes its features to
 * @param config_ptr    Pointer to an ei_dsp_config_spectral_analysis_t
 * @param frequency     Sampling frequency, forwarded unchanged to the extractor
 * @returns The return code of the selected extractor. EIDSP_PARAMETER_INVALID when the
 *          configuration has no axes, EIDSP_OUT_OF_MEM when the input matrix could not be
 *          allocated, the signal's own error code when reading it fails, and
 *          EIDSP_NOT_SUPPORTED when none of the compiled-in extractors matches.
 */
__attribute__((unused)) int extract_spectral_analysis_features(
    signal_t *signal,
    matrix_t *output_matrix,
    void *config_ptr,
    const float frequency)
{
    ei_dsp_config_spectral_analysis_t *config = (ei_dsp_config_spectral_analysis_t *)config_ptr;

    // without at least one axis the division below is undefined
    if (config->axes < 1) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // input matrix from the raw signal
    matrix_t input_matrix(signal->total_length / config->axes, config->axes);
    if (!input_matrix.buffer) {
        EIDSP_ERR(EIDSP_OUT_OF_MEM);
    }

    // total_length / axes rounds down, so never read more samples than the matrix can hold
    const size_t els_to_copy = static_cast<size_t>(input_matrix.rows) * input_matrix.cols;
    int ret = signal->get_data(0, els_to_copy, input_matrix.buffer);
    if (ret != EIDSP_OK) {
        EIDSP_ERR(ret);
    }

#if EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_WAVELET || EI_DSP_PARAMS_ALL
    if (strcmp(config->analysis_type, "Wavelet") == 0) {
        return spectral::wavelet::extract_wavelet_features(&input_matrix, output_matrix, config, frequency);
    }
#endif

#if EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_FFT || EI_DSP_PARAMS_ALL
    if (strcmp(config->analysis_type, "FFT") == 0) {
        // versions 1 and 4 have dedicated implementations, every other version runs v2
        if (config->implementation_version == 1) {
            return spectral::feature::extract_spectral_analysis_features_v1(
                &input_matrix,
                output_matrix,
                config,
                frequency);
        } else if (config->implementation_version == 4) {
            return spectral::feature::extract_spectral_analysis_features_v4(
                &input_matrix,
                output_matrix,
                config,
                frequency);
        } else {
            return spectral::feature::extract_spectral_analysis_features_v2(
                &input_matrix,
                output_matrix,
                config,
                frequency);
        }
    }
#endif

    // fallback when the analysis type did not select an extractor above: pick on version only
#if !EI_DSP_PARAMS_GENERATED || EI_DSP_PARAMS_ALL || !(EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_FFT || EI_DSP_PARAMS_SPECTRAL_ANALYSIS_ANALYSIS_TYPE_WAVELET)
    if (config->implementation_version == 1) {
        return spectral::feature::extract_spectral_analysis_features_v1(
            &input_matrix,
            output_matrix,
            config,
            frequency);
    }
    if (config->implementation_version == 2) {
        return spectral::feature::extract_spectral_analysis_features_v2(
            &input_matrix,
            output_matrix,
            config,
            frequency);
    }
#endif
    return EIDSP_NOT_SUPPORTED;
}

/**
 * Raw DSP block: copies the signal straight into the output matrix and multiplies it by
 * the configured scale factor.
 *
 * @param signal        Input signal
 * @param output_matrix Destination; at most rows * cols samples are written to its buffer
 * @param config_ptr    Pointer to an ei_dsp_config_raw_t (only scale_axes is read)
 * @param frequency     Unused by this block
 * @returns EIDSP_OK on success, otherwise the error code of the failing read or scale step
 */
__attribute__((unused)) int extract_raw_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) {
    ei_dsp_config_raw_t config = *((ei_dsp_config_raw_t*)config_ptr);

    // Because of rounding errors during re-sampling the output size of the block might be
    // smaller than the input of the block. Make sure we don't write outside of the bounds
    // of the array:
    // https://forum.edgeimpulse.com/t/using-custom-sensors-on-raspberry-pi-4/3506/7
    const size_t output_capacity = output_matrix->rows * output_matrix->cols;
    size_t els_to_copy = signal->total_length;
    if (els_to_copy > output_capacity) {
        els_to_copy = output_capacity;
    }

    // a failed read must not be scaled and reported as valid features
    int ret = signal->get_data(0, els_to_copy, output_matrix->buffer);
    if (ret != EIDSP_OK) {
        EIDSP_ERR(ret);
    }

    // scale the signal
    ret = numpy::scale(output_matrix, config.scale_axes);
    if (ret != EIDSP_OK) {
        EIDSP_ERR(ret);
    }

    return EIDSP_OK;
}

/**
 * Flatten DSP block: reduces every axis of the signal to the statistics enabled in the
 * configuration (average, minimum, maximum, rms, stdev, skewness, kurtosis - in that order,
 * grouped per axis).
 *
 * @param signal        Raw input signal; only the first (total_length / axes) * axes samples
 *                      are used
 * @param output_matrix Must hold exactly (enabled statistics * axes) elements; it is reshaped
 *                      to a single row on success
 * @param config_ptr    Pointer to an ei_dsp_config_flatten_t
 * @param frequency     Unused by this block
 * @returns EIDSP_OK on success. EIDSP_PARAMETER_INVALID when the configuration has no axes,
 *          EIDSP_MATRIX_SIZE_MISMATCH when output_matrix has the wrong number of elements,
 *          EIDSP_OUT_OF_MEM when the input matrix could not be allocated, otherwise the
 *          error code of the failing read / scale / transpose / statistics step.
 */
__attribute__((unused)) int extract_flatten_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) {
    ei_dsp_config_flatten_t config = *((ei_dsp_config_flatten_t*)config_ptr);

    // without at least one axis the division below is undefined
    if (config.axes < 1) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    uint32_t expected_matrix_size = 0;
    if (config.average) expected_matrix_size += config.axes;
    if (config.minimum) expected_matrix_size += config.axes;
    if (config.maximum) expected_matrix_size += config.axes;
    if (config.rms) expected_matrix_size += config.axes;
    if (config.stdev) expected_matrix_size += config.axes;
    if (config.skewness) expected_matrix_size += config.axes;
    if (config.kurtosis) expected_matrix_size += config.axes;

    if (output_matrix->rows * output_matrix->cols != expected_matrix_size) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    int ret;

    // input matrix from the raw signal
    matrix_t input_matrix(signal->total_length / config.axes, config.axes);
    if (!input_matrix.buffer) {
        EIDSP_ERR(EIDSP_OUT_OF_MEM);
    }

    // total_length / axes rounds down, so never read more samples than the matrix can hold
    const size_t els_to_copy = static_cast<size_t>(input_matrix.rows) * input_matrix.cols;
    ret = signal->get_data(0, els_to_copy, input_matrix.buffer);
    if (ret != EIDSP_OK) {
        EIDSP_ERR(ret);
    }

    // scale the signal
    ret = numpy::scale(&input_matrix, config.scale_axes);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: Failed to scale signal (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    // transpose the matrix so we have one row per axis (nifty!)
    ret = numpy::transpose(&input_matrix);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: Failed to transpose matrix (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    size_t out_matrix_ix = 0;

    // every statistic is computed into this 1x1 matrix and then appended to the output
    float stat = 0.0f;
    matrix_t stat_matrix(1, 1, &stat);

    for (size_t row = 0; row < input_matrix.rows; row++) {
        matrix_t row_matrix(1, input_matrix.cols, input_matrix.buffer + (row * input_matrix.cols));

        if (config.average) {
            ret = numpy::mean(&row_matrix, &stat_matrix);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
            output_matrix->buffer[out_matrix_ix++] = stat;
        }

        if (config.minimum) {
            ret = numpy::min(&row_matrix, &stat_matrix);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
            output_matrix->buffer[out_matrix_ix++] = stat;
        }

        if (config.maximum) {
            ret = numpy::max(&row_matrix, &stat_matrix);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
            output_matrix->buffer[out_matrix_ix++] = stat;
        }

        if (config.rms) {
            ret = numpy::rms(&row_matrix, &stat_matrix);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
            output_matrix->buffer[out_matrix_ix++] = stat;
        }

        if (config.stdev) {
            ret = numpy::stdev(&row_matrix, &stat_matrix);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
            output_matrix->buffer[out_matrix_ix++] = stat;
        }

        if (config.skewness) {
            ret = numpy::skew(&row_matrix, &stat_matrix);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
            output_matrix->buffer[out_matrix_ix++] = stat;
        }

        if (config.kurtosis) {
            ret = numpy::kurtosis(&row_matrix, &stat_matrix);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
            output_matrix->buffer[out_matrix_ix++] = stat;
        }
    }

    // flatten again
    output_matrix->cols = output_matrix->rows * output_matrix->cols;
    output_matrix->rows = 1;

    return EIDSP_OK;
}

static class speechpy::processing::preemphasis *preemphasis;
static int preemphasized_audio_signal_get_data(size_t offset, size_t length, float *out_ptr) {
    return preemphasis->get_data(offset, length, out_ptr);
}

/**
 * MFCC DSP block for a complete window of audio: pre-emphasis, MFCC extraction and
 * windowed cepstral mean/variance normalization.
 *
 * @param signal             Single-axis audio signal, must not be empty
 * @param output_matrix      Receives the features; must be able to hold the calculated MFCC
 *                           matrix and is reshaped to a single row on success
 * @param config_ptr         Pointer to an ei_dsp_config_mfcc_t
 * @param sampling_frequency Sampling frequency of the signal (truncated to an integer)
 * @returns EIDSP_OK on success. EIDSP_MATRIX_SIZE_MISMATCH when axes != 1 or the output
 *          matrix is too small, EIDSP_BLOCK_VERSION_INCORRECT for implementation versions
 *          outside 1..4, EIDSP_PARAMETER_INVALID for an empty signal, otherwise the error
 *          code of the failing MFCC / normalization step.
 */
__attribute__((unused)) int extract_mfcc_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) {
    ei_dsp_config_mfcc_t config = *((ei_dsp_config_mfcc_t*)config_ptr);

    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if((config.implementation_version == 0) || (config.implementation_version > 4)) {
        EIDSP_ERR(EIDSP_BLOCK_VERSION_INCORRECT);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // preemphasis class to preprocess the audio...
    class speechpy::processing::preemphasis pre(signal, config.pre_shift, config.pre_cof, false);
    preemphasis = &pre;

    // `pre` lives on this stack frame, so the file-level pointer has to be cleared on every
    // way out of this function - otherwise it keeps pointing at a destroyed object
    struct preemphasis_reset_t {
        ~preemphasis_reset_t() { preemphasis = nullptr; }
    } preemphasis_reset;

    signal_t preemphasized_audio_signal;
    preemphasized_audio_signal.total_length = signal->total_length;
    preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data;

    // calculate the size of the MFCC matrix
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfcc_buffer_size(
            signal->total_length, frequency, config.frame_length, config.frame_stride, config.num_cepstral, config.implementation_version);
    /* Only report a size mismatch when the calculated matrix does not fit; the output buffer
       is allowed to be larger (continuous inferencing) */
    if (out_matrix_size.rows * out_matrix_size.cols > output_matrix->rows * output_matrix->cols) {
        ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols);
        ei_printf("calculated size = %dx%d\n", (int)out_matrix_size.rows, (int)out_matrix_size.cols);
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    output_matrix->rows = out_matrix_size.rows;
    output_matrix->cols = out_matrix_size.cols;

    // and run the MFCC extraction
    int ret = speechpy::feature::mfcc(output_matrix, &preemphasized_audio_signal,
        frequency, config.frame_length, config.frame_stride, config.num_cepstral, config.num_filters, config.fft_length,
        config.low_frequency, config.high_frequency, true, config.implementation_version);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: MFCC failed (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    // cepstral mean and variance normalization
    ret = speechpy::processing::cmvnw(output_matrix, config.win_size, true, false);
    if (ret != EIDSP_OK) {
        ei_printf("ERR: cmvnw failed (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    // hand the features back as one flat row
    output_matrix->cols = out_matrix_size.rows * out_matrix_size.cols;
    output_matrix->rows = 1;

    return EIDSP_OK;
}


/**
 * Computes the MFCC rows for `signal` and appends them to the end of `output_matrix`,
 * rolling the existing content towards the front to make room.
 *
 * @param signal                 Audio to process
 * @param output_matrix          Feature matrix the new rows are appended to
 * @param config                 MFCC block configuration
 * @param sampling_frequency     Sampling frequency of the signal (truncated to an integer)
 * @param matrix_size_out        Running total: rows is increased by the number of rows
 *                               produced, cols is set to the row width when it is non-zero
 * @param implementation_version Implementation version to run (overrides the one in config)
 * @returns EIDSP_OK on success, EIDSP_MATRIX_SIZE_MISMATCH when the new rows do not fit in
 *          output_matrix, otherwise the error code of the failing roll / MFCC step
 */
static int extract_mfcc_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfcc_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out, int implementation_version) {
    uint32_t frequency = (uint32_t)sampling_frequency;

    // calculate the size of the MFCC matrix for this slice
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfcc_buffer_size(
            signal->total_length, frequency, config->frame_length, config->frame_stride, config->num_cepstral,
            implementation_version);

    const size_t output_matrix_els = output_matrix->rows * output_matrix->cols;
    const size_t slice_els = out_matrix_size.rows * out_matrix_size.cols;

    // the slice is written at the tail of the output buffer; if it were larger than the
    // buffer the offset below would wrap around and the write would land out of bounds
    if (slice_els > output_matrix_els) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    // we roll the output matrix back so we have room at the end...
    int x = numpy::roll(output_matrix->buffer, output_matrix_els, -static_cast<int>(slice_els));
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // slice in the output matrix to write to
    // the offset in the classification matrix here is always at the end
    const size_t output_matrix_offset = output_matrix_els - slice_els;

    matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset);

    // and run the MFCC extraction
    x = speechpy::feature::mfcc(&output_matrix_slice, signal,
        frequency, config->frame_length, config->frame_stride, config->num_cepstral, config->num_filters, config->fft_length,
        config->low_frequency, config->high_frequency, true, implementation_version);
    if (x != EIDSP_OK) {
        ei_printf("ERR: MFCC failed (%d)\n", x);
        EIDSP_ERR(x);
    }

    matrix_size_out->rows += out_matrix_size.rows;
    if (out_matrix_size.cols > 0) {
        matrix_size_out->cols = out_matrix_size.cols;
    }

    return EIDSP_OK;
}

/**
 * MFCC DSP block for continuous classification: processes one slice of audio and appends
 * the resulting rows to `output_matrix`.
 *
 * Samples at the end of a slice that do not fill a whole frame are kept in the file-level
 * ei_dsp_cont_current_frame buffer and completed with the start of the next slice, so the
 * function carries state between calls.
 *
 * @param signal             Single-axis audio slice, must not be empty
 * @param output_matrix      Feature matrix the new rows are appended to
 * @param config_ptr         Pointer to an ei_dsp_config_mfcc_t
 * @param sampling_frequency Sampling frequency of the signal (truncated to an integer)
 * @param matrix_size_out    Receives the number of rows produced by this call and their width
 * @returns EIDSP_OK on success. EIDSP_NOT_SUPPORTED when built with EI_C_LINKAGE == 1,
 *          EIDSP_MATRIX_SIZE_MISMATCH when axes != 1, EIDSP_BLOCK_VERSION_INCORRECT for
 *          implementation versions outside 1..4, EIDSP_PARAMETER_INVALID for an empty signal
 *          or an unusable frame_length / frame_stride, EIDSP_OUT_OF_MEM when the carry-over
 *          buffer cannot be allocated, otherwise the error code of the failing step.
 */
__attribute__((unused)) int extract_mfcc_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) {
#if defined(__cplusplus) && EI_C_LINKAGE == 1
    ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n");
    EIDSP_ERR(EIDSP_NOT_SUPPORTED);
#else

    ei_dsp_config_mfcc_t config = *((ei_dsp_config_mfcc_t*)config_ptr);

    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if((config.implementation_version == 0) || (config.implementation_version > 4)) {
        EIDSP_ERR(EIDSP_BLOCK_VERSION_INCORRECT);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // preemphasis class to preprocess the audio...
    class speechpy::processing::preemphasis pre(signal, config.pre_shift, config.pre_cof, false);
    preemphasis = &pre;

    // `pre` lives on this stack frame, so the file-level pointer has to be cleared on every
    // way out of this function (early return and error paths included)
    struct preemphasis_reset_t {
        ~preemphasis_reset_t() { preemphasis = nullptr; }
    } preemphasis_reset;

    signal_t preemphasized_audio_signal;
    preemphasized_audio_signal.total_length = signal->total_length;
    preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data;

    // Go from the time (e.g. 0.25 seconds) to a number of samples based on the frequency
    const size_t frame_length_values = frequency * config.frame_length;
    const size_t frame_stride_values = frequency * config.frame_stride;
    const int frame_overlap_values = static_cast<int>(frame_length_values) - static_cast<int>(frame_stride_values);

    if (frame_overlap_values < 0) {
        ei_printf("ERR: frame_length (");
        ei_printf_float(config.frame_length);
        ei_printf(") cannot be lower than frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") for continuous classification\n");
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // a stride of zero samples would never advance the carry-over loop below
    if (frame_stride_values == 0) {
        ei_printf("ERR: frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") is shorter than one sample\n");
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    int x;

    // have current frame, but wrong size? then free
    if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) {
        ei_free(ei_dsp_cont_current_frame);
        ei_dsp_cont_current_frame = nullptr;
    }

    int implementation_version = config.implementation_version;

    // this is the offset in the signal from which we'll work
    size_t offset_in_signal = 0;

    if (!ei_dsp_cont_current_frame) {
        ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1);
        if (!ei_dsp_cont_current_frame) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        ei_dsp_cont_current_frame_size = frame_length_values;
        ei_dsp_cont_current_frame_ix = 0;
    }


    if ((frame_length_values) > preemphasized_audio_signal.total_length  + ei_dsp_cont_current_frame_ix) {
        ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n",
            (int)frame_length_values, (int)preemphasized_audio_signal.total_length  + ei_dsp_cont_current_frame_ix);
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    matrix_size_out->rows = 0;
    matrix_size_out->cols = 0;

    // for continuous use v2 stack frame calculations
    if (implementation_version == 1) {
        implementation_version = 2;
    }

    if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) {
        ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size (ix=%d size=%d)\n",
            ei_dsp_cont_current_frame_ix, (int)ei_dsp_cont_current_frame_size);
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // if we still have some samples from the previous run
    while (ei_dsp_cont_current_frame_ix > 0) {
        // then from the current frame we need to read `frame_length_values - ei_dsp_cont_current_frame_ix`
        // starting at offset 0
        x = preemphasized_audio_signal.get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        // now ei_dsp_cont_current_frame is complete
        signal_t frame_signal;
        x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        x = extract_mfcc_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out, implementation_version);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        // move one stride forward: drop the oldest samples from the frame
        x = numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -static_cast<int>(frame_stride_values));
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        ei_dsp_cont_current_frame_ix -= static_cast<int>(frame_stride_values);
    }

    // a negative index means the last stride already reached into the new slice
    if (ei_dsp_cont_current_frame_ix < 0) {
        offset_in_signal = -ei_dsp_cont_current_frame_ix;
        ei_dsp_cont_current_frame_ix = 0;
    }

    // nothing of this slice is left to process.
    // NOTE(review): the part of the offset that exceeds this slice is not carried over to
    // the next call (the index was reset to 0 above) - confirm this is intended
    if (offset_in_signal >= signal->total_length) {
        return EIDSP_OK;
    }

    // now... we need to discard part of the signal...
    SignalWithRange signal_with_range(&preemphasized_audio_signal, offset_in_signal, signal->total_length);

    signal_t *range_signal = signal_with_range.get_signal();
    size_t range_signal_orig_length = range_signal->total_length;

    // then we'll just go through normal processing of the signal:
    x = extract_mfcc_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out, implementation_version);
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // Make sure v1 models are reset to the original length
    range_signal->total_length = range_signal_orig_length;

    // update offset
    int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency,
        config.frame_length, config.frame_stride, false, implementation_version);
    offset_in_signal += length_of_signal_used;

    // see what's left? (counted in samples, despite the name)
    int bytes_left_end_of_frame = signal->total_length - offset_in_signal;
    bytes_left_end_of_frame += frame_overlap_values;

    if (bytes_left_end_of_frame > 0) {
        // then read that into the ei_dsp_cont_current_frame buffer
        x = preemphasized_audio_signal.get_data(
            (preemphasized_audio_signal.total_length - bytes_left_end_of_frame),
            bytes_left_end_of_frame,
            ei_dsp_cont_current_frame);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }
    }

    ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame;

    return EIDSP_OK;
#endif
}

/**
 * Spectrogram DSP block for a complete window of audio.
 *
 * @param signal             Single-axis audio signal, must not be empty
 * @param output_matrix      Receives the features; must be able to hold the calculated
 *                           spectrogram and is reshaped to a single row on success
 * @param config_ptr         Pointer to an ei_dsp_config_spectrogram_t
 * @param sampling_frequency Sampling frequency of the signal
 * @returns EIDSP_OK on success. EIDSP_MATRIX_SIZE_MISMATCH when axes != 1 or the output
 *          matrix is too small, EIDSP_PARAMETER_INVALID for an empty signal, otherwise the
 *          error code of the failing spectrogram / normalization step.
 */
__attribute__((unused)) int extract_spectrogram_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) {
    ei_dsp_config_spectrogram_t cfg = *((ei_dsp_config_spectrogram_t*)config_ptr);

    // only mono audio is supported
    if (cfg.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    // nothing to analyse
    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t sample_rate = static_cast<uint32_t>(sampling_frequency);

    // dimensions of the spectrogram: one column per FFT bin (fft_length / 2 + 1)
    matrix_size_t spectrogram_size =
        speechpy::feature::calculate_mfe_buffer_size(
            signal->total_length, sample_rate, cfg.frame_length, cfg.frame_stride, cfg.fft_length / 2 + 1,
            cfg.implementation_version);

    const auto needed_els = spectrogram_size.rows * spectrogram_size.cols;
    const auto available_els = output_matrix->rows * output_matrix->cols;

    /* A larger output buffer is fine (continuous inferencing); only a too small one is an error */
    if (needed_els > available_els) {
        ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols);
        ei_printf("calculated size = %dx%d\n", (int)spectrogram_size.rows, (int)spectrogram_size.cols);
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    // shape the output like the spectrogram while it is being computed
    output_matrix->rows = spectrogram_size.rows;
    output_matrix->cols = spectrogram_size.cols;

    int status = speechpy::feature::spectrogram(output_matrix, signal,
        sampling_frequency, cfg.frame_length, cfg.frame_stride, cfg.fft_length, cfg.implementation_version);
    if (status != EIDSP_OK) {
        ei_printf("ERR: Spectrogram failed (%d)\n", status);
        EIDSP_ERR(status);
    }

    if (cfg.implementation_version >= 3) {
        // from version 3 on: normalization against the configured noise floor
        status = speechpy::processing::spectrogram_normalization(output_matrix, cfg.noise_floor_db);
        if (status != EIDSP_OK) {
            ei_printf("ERR: normalization failed (%d)\n", status);
            EIDSP_ERR(status);
        }
    }
    else {
        // older versions: plain normalization of the matrix
        status = numpy::normalize(output_matrix);
        if (status != EIDSP_OK) {
            EIDSP_ERR(status);
        }
    }

    // hand the features back as one flat row
    output_matrix->rows = 1;
    output_matrix->cols = needed_els;

    return EIDSP_OK;
}


/**
 * Computes the spectrogram rows for `signal` and appends them to the end of
 * `output_matrix`, rolling the existing content towards the front to make room.
 *
 * @param signal             Audio to process
 * @param output_matrix      Feature matrix the new rows are appended to
 * @param config             Spectrogram block configuration
 * @param sampling_frequency Sampling frequency of the signal (truncated to an integer)
 * @param matrix_size_out    Running total: rows is increased by the number of rows produced,
 *                           cols is set to the row width when it is non-zero
 * @returns EIDSP_OK on success, EIDSP_MATRIX_SIZE_MISMATCH when the new rows do not fit in
 *          output_matrix, otherwise the error code of the failing roll / spectrogram step
 */
static int extract_spectrogram_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_spectrogram_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) {
    uint32_t frequency = (uint32_t)sampling_frequency;

    // calculate the size of the spectrogram matrix
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfe_buffer_size(
            signal->total_length, frequency, config->frame_length, config->frame_stride, config->fft_length / 2 + 1,
            config->implementation_version);

    const size_t output_matrix_els = output_matrix->rows * output_matrix->cols;
    const size_t slice_els = out_matrix_size.rows * out_matrix_size.cols;

    // the slice is written at the tail of the output buffer; if it were larger than the
    // buffer the offset below would wrap around and the write would land out of bounds
    if (slice_els > output_matrix_els) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    // we roll the output matrix back so we have room at the end...
    // (the spectrogram path never installs a pre-emphasis filter, so there is nothing of
    // ours to release when this fails)
    int x = numpy::roll(output_matrix->buffer, output_matrix_els, -static_cast<int>(slice_els));
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // slice in the output matrix to write to
    // the offset in the classification matrix here is always at the end
    const size_t output_matrix_offset = output_matrix_els - slice_els;

    matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset);

    // and run the spectrogram extraction
    x = speechpy::feature::spectrogram(&output_matrix_slice, signal,
        frequency, config->frame_length, config->frame_stride, config->fft_length, config->implementation_version);
    if (x != EIDSP_OK) {
        ei_printf("ERR: Spectrogram failed (%d)\n", x);
        EIDSP_ERR(x);
    }

    matrix_size_out->rows += out_matrix_size.rows;
    if (out_matrix_size.cols > 0) {
        matrix_size_out->cols = out_matrix_size.cols;
    }

    return EIDSP_OK;
}

/**
 * Spectrogram DSP block for continuous classification: processes one slice of audio and
 * appends the resulting rows to `output_matrix`.
 *
 * Samples at the end of a slice that do not fill a whole frame are kept in the file-level
 * ei_dsp_cont_current_frame buffer and completed with the start of the next slice, so the
 * function carries state between calls. For implementation versions below 2,
 * signal->total_length is temporarily enlarged by one frame length on every call except the
 * first one; it is put back to the caller's value before returning.
 *
 * @param signal             Single-axis audio slice, must not be empty
 * @param output_matrix      Feature matrix the new rows are appended to
 * @param config_ptr         Pointer to an ei_dsp_config_spectrogram_t
 * @param sampling_frequency Sampling frequency of the signal (truncated to an integer)
 * @param matrix_size_out    Receives the number of rows produced by this call and their width
 * @returns EIDSP_OK on success. EIDSP_NOT_SUPPORTED when built with EI_C_LINKAGE == 1,
 *          EIDSP_MATRIX_SIZE_MISMATCH when axes != 1, EIDSP_PARAMETER_INVALID for an empty
 *          signal or an unusable frame_length / frame_stride, EIDSP_OUT_OF_MEM when the
 *          carry-over buffer cannot be allocated, otherwise the error code of the failing step.
 */
__attribute__((unused)) int extract_spectrogram_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) {
#if defined(__cplusplus) && EI_C_LINKAGE == 1
    ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n");
    EIDSP_ERR(EIDSP_NOT_SUPPORTED);
#else

    ei_dsp_config_spectrogram_t config = *((ei_dsp_config_spectrogram_t*)config_ptr);

    // despite the name: false during the first call, true for every call after it
    static bool first_run = false;

    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // takes back exactly what was added to signal->total_length below, on every way out of
    // this function (early return and error paths included)
    struct signal_length_restore_t {
        signal_t *sig;
        size_t extra;
        ~signal_length_restore_t() { sig->total_length -= extra; }
    } length_restore = { signal, 0 };

    /* Fake an extra frame_length for stack frames calculations. There, 1 frame_length is always
    subtracted and therefore never used. But skip the first slice to fit the feature_matrix
    buffer */
    if(config.implementation_version < 2) {

        if (first_run == true) {
            length_restore.extra = (size_t)(config.frame_length * (float)frequency);
            signal->total_length += length_restore.extra;
        }

        first_run = true;
    }

    // Go from the time (e.g. 0.25 seconds) to a number of samples based on the frequency
    const size_t frame_length_values = frequency * config.frame_length;
    const size_t frame_stride_values = frequency * config.frame_stride;
    const int frame_overlap_values = static_cast<int>(frame_length_values) - static_cast<int>(frame_stride_values);

    if (frame_overlap_values < 0) {
        ei_printf("ERR: frame_length (");
        ei_printf_float(config.frame_length);
        ei_printf(") cannot be lower than frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") for continuous classification\n");
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // a stride of zero samples would never advance the carry-over loop below
    if (frame_stride_values == 0) {
        ei_printf("ERR: frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") is shorter than one sample\n");
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    if (frame_length_values > signal->total_length) {
        ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n",
            (int)frame_length_values, (int)signal->total_length);
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    int x;

    // have current frame, but wrong size? then free
    if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) {
        ei_free(ei_dsp_cont_current_frame);
        ei_dsp_cont_current_frame = nullptr;
    }

    if (!ei_dsp_cont_current_frame) {
        ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1);
        if (!ei_dsp_cont_current_frame) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        ei_dsp_cont_current_frame_size = frame_length_values;
        ei_dsp_cont_current_frame_ix = 0;
    }

    matrix_size_out->rows = 0;
    matrix_size_out->cols = 0;

    // this is the offset in the signal from which we'll work
    size_t offset_in_signal = 0;

    if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) {
        ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size\n");
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // if we still have some samples from the previous run
    while (ei_dsp_cont_current_frame_ix > 0) {
        // then from the current frame we need to read `frame_length_values - ei_dsp_cont_current_frame_ix`
        // starting at offset 0
        x = signal->get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        // now ei_dsp_cont_current_frame is complete
        signal_t frame_signal;
        x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        x = extract_spectrogram_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        // move one stride forward: drop the oldest samples from the frame
        x = numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -static_cast<int>(frame_stride_values));
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }

        ei_dsp_cont_current_frame_ix -= static_cast<int>(frame_stride_values);
    }

    // a negative index means the last stride already reached into the new slice
    if (ei_dsp_cont_current_frame_ix < 0) {
        offset_in_signal = -ei_dsp_cont_current_frame_ix;
        ei_dsp_cont_current_frame_ix = 0;
    }

    // nothing of this slice is left to process.
    // NOTE(review): the part of the offset that exceeds this slice is not carried over to
    // the next call (the index was reset to 0 above) - confirm this is intended
    if (offset_in_signal >= signal->total_length) {
        return EIDSP_OK;
    }

    // now... we need to discard part of the signal...
    SignalWithRange signal_with_range(signal, offset_in_signal, signal->total_length);

    signal_t *range_signal = signal_with_range.get_signal();
    size_t range_signal_orig_length = range_signal->total_length;

    // then we'll just go through normal processing of the signal:
    x = extract_spectrogram_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out);
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // update offset
    int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency,
        config.frame_length, config.frame_stride, false, config.implementation_version);
    offset_in_signal += length_of_signal_used;

    // put the length of the range signal back to what it was before processing
    range_signal->total_length = range_signal_orig_length;

    // see what's left? (counted in samples, despite the name)
    int bytes_left_end_of_frame = signal->total_length - offset_in_signal;
    bytes_left_end_of_frame += frame_overlap_values;

    if (bytes_left_end_of_frame > 0) {
        // then read that into the ei_dsp_cont_current_frame buffer
        x = signal->get_data(
            (signal->total_length - bytes_left_end_of_frame),
            bytes_left_end_of_frame,
            ei_dsp_cont_current_frame);
        if (x != EIDSP_OK) {
            EIDSP_ERR(x);
        }
    }

    ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame;

    return EIDSP_OK;
#endif
}


/**
 * MFE DSP block for a complete window of audio.
 *
 * From implementation version 3 on the audio is pre-emphasized first and the result is
 * normalized against the configured noise floor; older versions use the unfiltered signal
 * and windowed cepstral mean/variance normalization.
 *
 * @param signal             Single-axis audio signal, must not be empty
 * @param output_matrix      Receives the features; must be able to hold the calculated MFE
 *                           matrix and is reshaped to a single row on success
 * @param config_ptr         Pointer to an ei_dsp_config_mfe_t
 * @param sampling_frequency Sampling frequency of the signal (truncated to an integer)
 * @returns EIDSP_OK on success. EIDSP_MATRIX_SIZE_MISMATCH when axes != 1 or the output
 *          matrix is too small, EIDSP_PARAMETER_INVALID for an empty signal, otherwise the
 *          error code of the failing MFE / normalization step.
 */
__attribute__((unused)) int extract_mfe_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency) {
    ei_dsp_config_mfe_t config = *((ei_dsp_config_mfe_t*)config_ptr);

    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    signal_t preemphasized_audio_signal;
    preemphasized_audio_signal.total_length = signal->total_length;

    // before version 3 we did not have preemphasis
    if (config.implementation_version < 3) {
        preemphasis = nullptr;

        preemphasized_audio_signal.get_data = signal->get_data;
    }
    else {
        // preemphasis class to preprocess the audio...
        preemphasis = new class speechpy::processing::preemphasis(signal, 1, 0.98f, true);

        preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data;
    }

    // calculate the size of the MFE matrix
    matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfe_buffer_size(
            preemphasized_audio_signal.total_length, frequency, config.frame_length, config.frame_stride, config.num_filters,
            config.implementation_version);
    /* Only report a size mismatch when the calculated matrix does not fit; the output buffer
       is allowed to be larger (continuous inferencing) */
    if (out_matrix_size.rows * out_matrix_size.cols > output_matrix->rows * output_matrix->cols) {
        ei_printf("out_matrix = %dx%d\n", (int)output_matrix->rows, (int)output_matrix->cols);
        ei_printf("calculated size = %dx%d\n", (int)out_matrix_size.rows, (int)out_matrix_size.cols);
        // release the filter and clear the file-level pointer so it cannot be deleted twice
        // (deleting a null pointer is a no-op)
        delete preemphasis;
        preemphasis = nullptr;
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    output_matrix->rows = out_matrix_size.rows;
    output_matrix->cols = out_matrix_size.cols;

    int ret;
    // This probably seems incorrect, but the mfe func can actually handle all versions
    // There's a subtle issue with cmvn and v2, not worth tracking down
    // So for v2 and v1, we'll just use the old code
    // (the new mfe does away with the intermediate filterbank matrix)
    if (config.implementation_version > 2) {
        ret = speechpy::feature::mfe(output_matrix, nullptr, &preemphasized_audio_signal,
            frequency, config.frame_length, config.frame_stride, config.num_filters, config.fft_length,
            config.low_frequency, config.high_frequency, config.implementation_version);
    } else {
        ret = speechpy::feature::mfe_v3(output_matrix, nullptr, &preemphasized_audio_signal,
            frequency, config.frame_length, config.frame_stride, config.num_filters, config.fft_length,
            config.low_frequency, config.high_frequency, config.implementation_version);
    }

    // the filter is not needed past this point; clear the file-level pointer together with
    // the object so nothing can use or delete it again
    delete preemphasis;
    preemphasis = nullptr;

    if (ret != EIDSP_OK) {
        ei_printf("ERR: MFE failed (%d)\n", ret);
        EIDSP_ERR(ret);
    }

    if (config.implementation_version < 3) {
        // cepstral mean and variance normalization
        ret = speechpy::processing::cmvnw(output_matrix, config.win_size, false, true);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: cmvnw failed (%d)\n", ret);
            EIDSP_ERR(ret);
        }
    }
    else {
        // normalization
        ret = speechpy::processing::mfe_normalization(output_matrix, config.noise_floor_db);
        if (ret != EIDSP_OK) {
            ei_printf("ERR: normalization failed (%d)\n", ret);
            EIDSP_ERR(ret);
        }
    }

    // hand the features back as one flat row
    output_matrix->cols = out_matrix_size.rows * out_matrix_size.cols;
    output_matrix->rows = 1;

    return EIDSP_OK;
}

/**
 * @brief      Run MFE extraction on one piece of audio and append the result to the
 *             end of the output matrix.
 *
 * The current contents of `output_matrix` are rolled back by the number of values the
 * new piece produces, and the new features are then written into the space at the end
 * of the buffer.
 *
 * @param      signal              Audio to extract features from
 * @param      output_matrix       Feature buffer that is rolled and written to
 *                                 (treated as rows * cols contiguous values)
 * @param      config              MFE block configuration
 * @param[in]  sampling_frequency  Sampling frequency (truncated to an integer)
 * @param      matrix_size_out     Accumulator: `rows` is incremented by the rows written,
 *                                 `cols` is set to the column count when it is non-zero
 *
 * @return     EIDSP_OK on success, EIDSP_MATRIX_SIZE_MISMATCH if the new features do not
 *             fit in `output_matrix`, otherwise the error returned by roll / mfe
 */
static int extract_mfe_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfe_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) {
    const uint32_t frequency = (uint32_t)sampling_frequency;

    // calculate the size of the spectrogram matrix
    const matrix_size_t out_matrix_size =
        speechpy::feature::calculate_mfe_buffer_size(
            signal->total_length, frequency, config->frame_length, config->frame_stride, config->num_filters,
            config->implementation_version);

    const size_t total_values = static_cast<size_t>(output_matrix->rows) * output_matrix->cols;
    const size_t slice_values = static_cast<size_t>(out_matrix_size.rows) * out_matrix_size.cols;

    // The write offset below is `total_values - slice_values`; if the slice were larger than
    // the output buffer that subtraction would wrap around and we would write out of bounds.
    if (slice_values > total_values) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    // we roll the output matrix back so we have room at the end...
    int x = numpy::roll(output_matrix->buffer, total_values, -static_cast<int>(slice_values));
    if (x != EIDSP_OK) {
        EIDSP_ERR(x);
    }

    // slice in the output matrix to write to
    // the offset in the classification matrix here is always at the end
    const size_t output_matrix_offset = total_values - slice_values;

    matrix_t output_matrix_slice(out_matrix_size.rows, out_matrix_size.cols, output_matrix->buffer + output_matrix_offset);

    // and run the MFE extraction
    // This probably seems incorrect, but the mfe func can actually handle all versions
    // There's a subtle issue with cmvn and v2, not worth tracking down
    // So for v2 and v1, we'll just use the old code
    // (the new mfe does away with the intermediate filterbank matrix)
    if (config->implementation_version > 2) {
        x = speechpy::feature::mfe(&output_matrix_slice, nullptr, signal,
            frequency, config->frame_length, config->frame_stride, config->num_filters, config->fft_length,
            config->low_frequency, config->high_frequency, config->implementation_version);
    } else {
        x = speechpy::feature::mfe_v3(&output_matrix_slice, nullptr, signal,
            frequency, config->frame_length, config->frame_stride, config->num_filters, config->fft_length,
            config->low_frequency, config->high_frequency, config->implementation_version);
    }
    if (x != EIDSP_OK) {
        ei_printf("ERR: MFE failed (%d)\n", x);
        EIDSP_ERR(x);
    }

    // report what was added to the caller's running total
    matrix_size_out->rows += out_matrix_size.rows;
    if (out_matrix_size.cols > 0) {
        matrix_size_out->cols = out_matrix_size.cols;
    }

    return EIDSP_OK;
}

/**
 * @brief      Extract MFE features from one slice of audio for continuous classification.
 *
 * Samples at the end of a slice that do not fill a whole frame are carried over to the next
 * call in the file-level `ei_dsp_cont_current_frame` buffer. Each call therefore:
 *   1. completes the pending frame(s) using samples from the start of this slice,
 *   2. processes the remainder of the slice in one go,
 *   3. stores the trailing samples (plus the frame overlap) for the next call.
 * New features are appended to `output_matrix` through extract_mfe_run_slice().
 * Use ei_dsp_clear_continuous_audio_state() to free the carry-over buffer.
 *
 * For implementation version 1 the signal length is temporarily enlarged by one frame length
 * (on every call except the first); `signal->total_length` is restored before returning.
 *
 * @param      signal              Audio slice (single axis)
 * @param      output_matrix       Feature buffer the new features are appended to
 * @param      config_ptr          Pointer to an ei_dsp_config_mfe_t
 * @param[in]  sampling_frequency  Sampling frequency
 * @param      matrix_size_out     Receives the rows / cols written during this call
 *
 * @return     EIDSP_OK on success;
 *             EIDSP_NOT_SUPPORTED when built with EI_C_LINKAGE == 1;
 *             EIDSP_MATRIX_SIZE_MISMATCH when `axes` is not 1;
 *             EIDSP_PARAMETER_INVALID for an empty signal, a frame stride of zero samples,
 *             a stride larger than the frame length, a frame longer than the signal, or
 *             inconsistent carry-over state;
 *             EIDSP_OUT_OF_MEM when the carry-over buffer cannot be allocated;
 *             otherwise the error reported by the failing DSP call.
 */
__attribute__((unused)) int extract_mfe_per_slice_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float sampling_frequency, matrix_size_t *matrix_size_out) {
#if defined(__cplusplus) && EI_C_LINKAGE == 1
    ei_printf("ERR: Continuous audio is not supported when EI_C_LINKAGE is defined\n");
    EIDSP_ERR(EIDSP_NOT_SUPPORTED);
#else

    ei_dsp_config_mfe_t config = *((ei_dsp_config_mfe_t*)config_ptr);

    // signal is already the right size,
    // output matrix is not the right size, but we can start writing at offset 0 and then it's OK too

    static bool first_run = false;

    if (config.axes != 1) {
        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
    }

    if (signal->total_length == 0) {
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    const uint32_t frequency = static_cast<uint32_t>(sampling_frequency);

    // Fake an extra frame_length for stack frames calculations. There, 1 frame_length is always
    // subtracted and there for never used. But skip the first slice to fit the feature_matrix
    // buffer.
    // Remember exactly how much was added so that precisely that amount is taken off again on
    // every exit path (nothing is added on the first slice, so nothing may be subtracted then).
    size_t padded_values = 0;
    if (config.implementation_version == 1) {
        if (first_run == true) {
            padded_values = (size_t)(config.frame_length * (float)frequency);
            signal->total_length += padded_values;
        }

        first_run = true;
    }

    // Undo everything this call set up: free the preemphasis helper (and clear the shared
    // pointer so it does not dangle) and remove the v1 padding from the caller's signal.
    auto release = [&]() {
        if (preemphasis) {
            delete preemphasis;
            preemphasis = nullptr;
        }
        signal->total_length -= padded_values;
    };

    // ok all setup, let's construct the signal (with preemphasis for impl version >= 3)
    signal_t preemphasized_audio_signal;

    // before version 3 we did not have preemphasis
    if (config.implementation_version < 3) {
        preemphasis = nullptr;
        preemphasized_audio_signal.total_length = signal->total_length;
        preemphasized_audio_signal.get_data = signal->get_data;
    }
    else {
        // preemphasis class to preprocess the audio...
        class speechpy::processing::preemphasis *pre = new class speechpy::processing::preemphasis(signal, 1, 0.98f, true);
        preemphasis = pre;
        preemphasized_audio_signal.total_length = signal->total_length;
        preemphasized_audio_signal.get_data = &preemphasized_audio_signal_get_data;
    }

    // Go from the time (e.g. 0.25 seconds to number of frames based on freq)
    const size_t frame_length_values = frequency * config.frame_length;
    const size_t frame_stride_values = frequency * config.frame_stride;
    const int frame_overlap_values = static_cast<int>(frame_length_values) - static_cast<int>(frame_stride_values);

    // A stride of zero samples can never make progress: the carry-over loop below decrements
    // the frame index by the stride and would spin forever.
    if (frame_stride_values == 0) {
        ei_printf("ERR: frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") is too small for continuous classification\n");

        release();
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    if (frame_overlap_values < 0) {
        ei_printf("ERR: frame_length (");
        ei_printf_float(config.frame_length);
        ei_printf(") cannot be lower than frame_stride (");
        ei_printf_float(config.frame_stride);
        ei_printf(") for continuous classification\n");

        release();
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    if (frame_length_values > preemphasized_audio_signal.total_length) {
        ei_printf("ERR: frame_length (%d) cannot be larger than signal's total length (%d) for continuous classification\n",
            (int)frame_length_values, (int)preemphasized_audio_signal.total_length);
        release();
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    int x;

    // have current frame, but wrong size? then free
    if (ei_dsp_cont_current_frame && ei_dsp_cont_current_frame_size != frame_length_values) {
        ei_free(ei_dsp_cont_current_frame);
        ei_dsp_cont_current_frame = nullptr;
    }

    if (!ei_dsp_cont_current_frame) {
        ei_dsp_cont_current_frame = (float*)ei_calloc(frame_length_values * sizeof(float), 1);
        if (!ei_dsp_cont_current_frame) {
            release();
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        ei_dsp_cont_current_frame_size = frame_length_values;
        ei_dsp_cont_current_frame_ix = 0;
    }

    matrix_size_out->rows = 0;
    matrix_size_out->cols = 0;

    // this is the offset in the signal from which we'll work
    size_t offset_in_signal = 0;

    if (ei_dsp_cont_current_frame_ix > (int)ei_dsp_cont_current_frame_size) {
        ei_printf("ERR: ei_dsp_cont_current_frame_ix is larger than frame size\n");
        release();
        EIDSP_ERR(EIDSP_PARAMETER_INVALID);
    }

    // if we still have some samples left over from the previous run
    while (ei_dsp_cont_current_frame_ix > 0) {
        // then from the current frame we need to read `frame_length_values - ei_dsp_cont_current_frame_ix`
        // starting at offset 0
        x = preemphasized_audio_signal.get_data(0, frame_length_values - ei_dsp_cont_current_frame_ix, ei_dsp_cont_current_frame + ei_dsp_cont_current_frame_ix);
        if (x != EIDSP_OK) {
            release();
            EIDSP_ERR(x);
        }

        // now ei_dsp_cont_current_frame is complete
        signal_t frame_signal;
        x = numpy::signal_from_buffer(ei_dsp_cont_current_frame, frame_length_values, &frame_signal);
        if (x != EIDSP_OK) {
            release();
            EIDSP_ERR(x);
        }

        x = extract_mfe_run_slice(&frame_signal, output_matrix, &config, sampling_frequency, matrix_size_out);
        if (x != EIDSP_OK) {
            release();
            EIDSP_ERR(x);
        }

        // move on by one stride (the stride is known to be non-zero here); a failed roll would
        // silently leave stale samples in the frame, so treat it as an error
        x = numpy::roll(ei_dsp_cont_current_frame, frame_length_values, -static_cast<int>(frame_stride_values));
        if (x != EIDSP_OK) {
            release();
            EIDSP_ERR(x);
        }

        ei_dsp_cont_current_frame_ix -= static_cast<int>(frame_stride_values);
    }

    // a negative index means the next frame starts that many samples into this slice
    if (ei_dsp_cont_current_frame_ix < 0) {
        offset_in_signal = -ei_dsp_cont_current_frame_ix;
        ei_dsp_cont_current_frame_ix = 0;
    }

    // nothing of this slice left to process
    if (offset_in_signal >= signal->total_length) {
        release();
        return EIDSP_OK;
    }

    // now... we need to discard part of the signal...
    SignalWithRange signal_with_range(&preemphasized_audio_signal, offset_in_signal, signal->total_length);

    signal_t *range_signal = signal_with_range.get_signal();
    size_t range_signal_orig_length = range_signal->total_length;

    // then we'll just go through normal processing of the signal:
    x = extract_mfe_run_slice(range_signal, output_matrix, &config, sampling_frequency, matrix_size_out);
    if (x != EIDSP_OK) {
        release();
        EIDSP_ERR(x);
    }

    // update offset
    int length_of_signal_used = speechpy::processing::calculate_signal_used(range_signal->total_length, sampling_frequency,
        config.frame_length, config.frame_stride, false, config.implementation_version);
    offset_in_signal += length_of_signal_used;

    // not sure why this is being manipulated...
    range_signal->total_length = range_signal_orig_length;

    // see what's left? (counted in samples; includes the overlap the next frame needs)
    int bytes_left_end_of_frame = signal->total_length - offset_in_signal;
    bytes_left_end_of_frame += frame_overlap_values;

    if (bytes_left_end_of_frame > 0) {
        // then read that into the ei_dsp_cont_current_frame buffer
        x = preemphasized_audio_signal.get_data(
            (preemphasized_audio_signal.total_length - bytes_left_end_of_frame),
            bytes_left_end_of_frame,
            ei_dsp_cont_current_frame);
        if (x != EIDSP_OK) {
            release();
            EIDSP_ERR(x);
        }
    }

    ei_dsp_cont_current_frame_ix = bytes_left_end_of_frame;

    release();

    return EIDSP_OK;
#endif
}

/**
 * @brief      Convert a signal of packed RGB pixels into float features in the range 0..1.
 *
 * Every signal value is cast to a 32-bit unsigned integer and read as 0xRRGGBB. Depending on
 * `config.channels` either three values (R, G, B) or a single luma value ("Grayscale") is
 * written per pixel. The signal is read in pages so only a small input buffer is needed.
 *
 * @param      signal         Source of the pixel data
 * @param      output_matrix  Destination buffer; values are written contiguously from index 0
 *                            (the caller must provide enough room, this is not checked here)
 * @param      config_ptr     Pointer to an ei_dsp_config_image_t
 * @param[in]  frequency      Unused
 *
 * @return     EIDSP_OK on success, EIDSP_OUT_OF_MEM if the page buffer cannot be allocated,
 *             otherwise the error returned by `signal->get_data`
 */
__attribute__((unused)) int extract_image_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) {
    ei_dsp_config_image_t config = *((ei_dsp_config_image_t*)config_ptr);

    const bool is_grayscale = strcmp(config.channels, "Grayscale") == 0;

    size_t output_ix = 0;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
    const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE;
#else
    const size_t page_size = 1024;
#endif

    // buffered read from the signal
    size_t elements_left = signal->total_length;
    for (size_t ix = 0; ix < signal->total_length; ix += page_size) {
        const size_t elements_to_read = elements_left > page_size ? page_size : elements_left;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
        matrix_t input_matrix(elements_to_read, config.axes, ei_dsp_image_buffer);
#else
        matrix_t input_matrix(elements_to_read, config.axes);
#endif
        if (!input_matrix.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        // do not convert whatever happens to be in the buffer if the read failed
        const int read_result = signal->get_data(ix, elements_to_read, input_matrix.buffer);
        if (read_result != EIDSP_OK) {
            EIDSP_ERR(read_result);
        }

        for (size_t jx = 0; jx < elements_to_read; jx++) {
            const uint32_t pixel = static_cast<uint32_t>(input_matrix.buffer[jx]);

            // rgb to 0..1
            const float r = static_cast<float>(pixel >> 16 & 0xff) / 255.0f;
            const float g = static_cast<float>(pixel >> 8 & 0xff) / 255.0f;
            const float b = static_cast<float>(pixel & 0xff) / 255.0f;

            if (is_grayscale) {
                // ITU-R 601-2 luma transform
                // see: https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert
                output_matrix->buffer[output_ix++] = (0.299f * r) + (0.587f * g) + (0.114f * b);
            }
            else {
                output_matrix->buffer[output_ix++] = r;
                output_matrix->buffer[output_ix++] = g;
                output_matrix->buffer[output_ix++] = b;
            }
        }

        elements_left -= elements_to_read;
    }

    return EIDSP_OK;
}

#if (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_DRPAI)

/**
 * @brief      Convert a signal of packed RGB pixels into unsigned 8-bit features.
 *
 * Every signal value is cast to a 32-bit unsigned integer and read as 0xRRGGBB; the three
 * channel bytes are written to the output unchanged, in R, G, B order.
 *
 * NOTE: grayscale is not implemented. When `config.channels` is "Grayscale" the signal is
 * still read, but nothing is written to `output_matrix`.
 *
 * @param      signal         Source of the pixel data
 * @param      output_matrix  Destination buffer; values are written contiguously from index 0
 *                            (the caller must provide enough room, this is not checked here)
 * @param      config_ptr     Pointer to an ei_dsp_config_image_t
 * @param[in]  frequency      Unused
 *
 * @return     EIDSP_OK on success, EIDSP_OUT_OF_MEM if the page buffer cannot be allocated,
 *             otherwise the error returned by `signal->get_data`
 */
__attribute__((unused)) int extract_drpai_features_quantized(signal_t *signal, matrix_u8_t *output_matrix, void *config_ptr, const float frequency) {
    ei_dsp_config_image_t config = *((ei_dsp_config_image_t*)config_ptr);

    const bool is_rgb = strcmp(config.channels, "Grayscale") != 0;

    size_t output_ix = 0;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
    const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE;
#else
    const size_t page_size = 1024;
#endif

    // buffered read from the signal
    size_t elements_left = signal->total_length;
    for (size_t ix = 0; ix < signal->total_length; ix += page_size) {
        const size_t elements_to_read = elements_left > page_size ? page_size : elements_left;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
        matrix_t input_matrix(elements_to_read, config.axes, ei_dsp_image_buffer);
#else
        matrix_t input_matrix(elements_to_read, config.axes);
#endif
        if (!input_matrix.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        // do not convert whatever happens to be in the buffer if the read failed
        const int read_result = signal->get_data(ix, elements_to_read, input_matrix.buffer);
        if (read_result != EIDSP_OK) {
            EIDSP_ERR(read_result);
        }

        for (size_t jx = 0; jx < elements_to_read; jx++) {
            const uint32_t pixel = static_cast<uint32_t>(input_matrix.buffer[jx]);

            if (is_rgb) {
                output_matrix->buffer[output_ix++] = static_cast<uint8_t>(pixel >> 16 & 0xff);
                output_matrix->buffer[output_ix++] = static_cast<uint8_t>(pixel >> 8 & 0xff);
                output_matrix->buffer[output_ix++] = static_cast<uint8_t>(pixel & 0xff);
            }
            else {
                //NOTE: not implementing greyscale yet
            }
        }

        elements_left -= elements_to_read;
    }

    return EIDSP_OK;
}

#endif //(EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_DRPAI)

#if (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE != EI_CLASSIFIER_DRPAI)

/**
 * @brief      Convert a signal of packed RGB pixels directly into quantized int8 features.
 *
 * Every signal value is cast to a 32-bit unsigned integer and read as 0xRRGGBB. Depending on
 * `config.channels` either three values (R, G, B) or one luma value ("Grayscale") is written
 * per pixel, quantized as `round(value / scale) + zero_point`.
 *
 * Two code paths exist:
 *  - fast path: when scale is 1/255, zero_point is -128 and no image scaling is requested the
 *    quantized value is simply `channel - 128`, computed with integer arithmetic;
 *  - slow path: channels are normalized in floating point (0..1, plus mean/std normalization
 *    for EI_CLASSIFIER_IMAGE_SCALING_TORCH) and then quantized. The result is saturated to the
 *    int8 range so out-of-range values clip instead of overflowing the cast.
 *
 * @param      signal         Source of the pixel data
 * @param      output_matrix  Destination buffer; values are written contiguously from index 0
 *                            (the caller must provide enough room, this is not checked here)
 * @param      config_ptr     Pointer to an ei_dsp_config_image_t
 * @param[in]  scale          Quantization scale
 * @param[in]  zero_point     Quantization zero point
 * @param[in]  frequency      Unused
 * @param[in]  image_scaling  One of the EI_CLASSIFIER_IMAGE_SCALING_* values; for values other
 *                            than NONE and TORCH the raw 0..255 channel values are quantized
 *
 * @return     EIDSP_OK on success, EIDSP_OUT_OF_MEM if the page buffer cannot be allocated,
 *             otherwise the error returned by `signal->get_data`
 */
__attribute__((unused)) int extract_image_features_quantized(signal_t *signal, matrix_i8_t *output_matrix, void *config_ptr, float scale, float zero_point, const float frequency,
                                                             int image_scaling) {
    ei_dsp_config_image_t config = *((ei_dsp_config_image_t*)config_ptr);

    const bool is_rgb = strcmp(config.channels, "Grayscale") != 0;

    size_t output_ix = 0;

    // ITU-R 601-2 luma weights in 16.16 fixed point (used by the grayscale fast path)
    const int32_t iRedToGray = (int32_t)(0.299f * 65536.0f);
    const int32_t iGreenToGray = (int32_t)(0.587f * 65536.0f);
    const int32_t iBlueToGray = (int32_t)(0.114f * 65536.0f);

    static const float torch_mean[] = { 0.485f, 0.456f, 0.406f };
    static const float torch_std[] = { 0.229f, 0.224f, 0.225f };

    // the quantization parameters do not change per pixel, so decide on the path once
    const bool use_fast_path = scale == 0.003921568859368563f && zero_point == -128 &&
        image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE;

    // Quantize a float and saturate to the int8 range. Converting an out-of-range float to an
    // integer type is undefined, so clip first (the negated comparison also catches NaN).
    auto quantize = [scale, zero_point](float value) -> int8_t {
        const float quantized = round(value / scale) + zero_point;
        if (!(quantized > -128.0f)) {
            return static_cast<int8_t>(-128);
        }
        if (quantized > 127.0f) {
            return static_cast<int8_t>(127);
        }
        return static_cast<int8_t>(quantized);
    };

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
    const size_t page_size = EI_DSP_IMAGE_BUFFER_STATIC_SIZE;
#else
    const size_t page_size = 1024;
#endif

    // buffered read from the signal
    size_t elements_left = signal->total_length;
    for (size_t ix = 0; ix < signal->total_length; ix += page_size) {
        const size_t elements_to_read = elements_left > page_size ? page_size : elements_left;

#if defined(EI_DSP_IMAGE_BUFFER_STATIC_SIZE)
        matrix_t input_matrix(elements_to_read, config.axes, ei_dsp_image_buffer);
#else
        matrix_t input_matrix(elements_to_read, config.axes);
#endif
        if (!input_matrix.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        // do not convert whatever happens to be in the buffer if the read failed
        const int read_result = signal->get_data(ix, elements_to_read, input_matrix.buffer);
        if (read_result != EIDSP_OK) {
            EIDSP_ERR(read_result);
        }

        for (size_t jx = 0; jx < elements_to_read; jx++) {
            const uint32_t pixel = static_cast<uint32_t>(input_matrix.buffer[jx]);

            if (use_fast_path) {
                const int32_t r = static_cast<int32_t>(pixel >> 16 & 0xff);
                const int32_t g = static_cast<int32_t>(pixel >> 8 & 0xff);
                const int32_t b = static_cast<int32_t>(pixel & 0xff);

                if (is_rgb) {
                    // 0..255 shifted by the zero point (-128) always fits in int8
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(r + zero_point);
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(g + zero_point);
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(b + zero_point);
                }
                else {
                    // ITU-R 601-2 luma transform
                    // see: https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert
                    int32_t gray = (iRedToGray * r) + (iGreenToGray * g) + (iBlueToGray * b);
                    gray >>= 16; // scale down to int8_t
                    gray += zero_point;
                    if (gray < - 128) gray = -128;
                    else if (gray > 127) gray = 127;
                    output_matrix->buffer[output_ix++] = static_cast<int8_t>(gray);
                }
                continue;
            }

            // slow code path
            float r = static_cast<float>(pixel >> 16 & 0xff);
            float g = static_cast<float>(pixel >> 8 & 0xff);
            float b = static_cast<float>(pixel & 0xff);

            if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_NONE) {
                r /= 255.0f;
                g /= 255.0f;
                b /= 255.0f;
            }
            else if (image_scaling == EI_CLASSIFIER_IMAGE_SCALING_TORCH) {
                r /= 255.0f;
                g /= 255.0f;
                b /= 255.0f;

                r = (r - torch_mean[0]) / torch_std[0];
                g = (g - torch_mean[1]) / torch_std[1];
                b = (b - torch_mean[2]) / torch_std[2];
            }

            if (is_rgb) {
                output_matrix->buffer[output_ix++] = quantize(r);
                output_matrix->buffer[output_ix++] = quantize(g);
                output_matrix->buffer[output_ix++] = quantize(b);
            }
            else {
                // ITU-R 601-2 luma transform
                // see: https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert
                const float v = (0.299f * r) + (0.587f * g) + (0.114f * b);
                output_matrix->buffer[output_ix++] = quantize(v);
            }
        }

        elements_left -= elements_to_read;
    }

    return EIDSP_OK;
}
#endif // (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1) && (EI_CLASSIFIER_INFERENCING_ENGINE != EI_CLASSIFIER_DRPAI)

/**
 * Reset everything that is remembered between slices of continuous audio: the carry-over
 * frame buffer is released (when one exists) and its size and write index are zeroed.
 * Invoke this function after the continuous audio loop ends.
 *
 * @return     Always EIDSP_OK
 */
__attribute__((unused)) int ei_dsp_clear_continuous_audio_state() {
    // give back the carry-over buffer, if one was allocated
    if (ei_dsp_cont_current_frame != nullptr) {
        ei_free(ei_dsp_cont_current_frame);
        ei_dsp_cont_current_frame = nullptr;
    }

    // and forget the bookkeeping that belonged to it
    ei_dsp_cont_current_frame_ix = 0;
    ei_dsp_cont_current_frame_size = 0;

    return EIDSP_OK;
}

/**
 * @brief      Applies cepstral mean and variance normalization (cmvnw) to MFCC features.
 *
 * The matrix is temporarily viewed as (values / num_cepstral) x num_cepstral for the
 * normalization and is set to 1 x values afterwards, whether or not normalization succeeded.
 * Failures are reported through ei_printf only.
 *
 * @param      matrix      Source and destination matrix
 * @param      config_ptr  ei_dsp_config_mfcc_t struct pointer
 */
__attribute__((unused)) void calc_cepstral_mean_and_var_normalization_mfcc(ei_matrix *matrix, void *config_ptr)
{
    ei_dsp_config_mfcc_t *config = (ei_dsp_config_mfcc_t *)config_ptr;

    const uint32_t original_matrix_size = matrix->rows * matrix->cols;

    // the row count below is derived by dividing by num_cepstral
    if (config->num_cepstral == 0) {
        ei_printf("ERR: num_cepstral cannot be 0\n");
        return;
    }

    /* Modify rows and columns ratio for matrix normalization */
    matrix->rows = original_matrix_size / config->num_cepstral;
    matrix->cols = config->num_cepstral;

    // cepstral mean and variance normalization
    const int ret = speechpy::processing::cmvnw(matrix, config->win_size, true, false);

    /* Reset rows and columns ratio (also on failure, so the caller never sees the temporary shape) */
    matrix->rows = 1;
    matrix->cols = original_matrix_size;

    if (ret != EIDSP_OK) {
        ei_printf("ERR: cmvnw failed (%d)\n", ret);
    }
}

/**
 * @brief      Normalizes MFE features.
 *
 * Implementation versions below 3 use cepstral mean and variance normalization (cmvnw);
 * newer versions use mfe_normalization with the configured noise floor.
 * The matrix is temporarily viewed as (values / num_filters) x num_filters for the
 * normalization and is set to 1 x values afterwards, whether or not normalization succeeded.
 * Failures are reported through ei_printf only.
 *
 * @param      matrix      Source and destination matrix
 * @param      config_ptr  ei_dsp_config_mfe_t struct pointer
 */
__attribute__((unused)) void calc_cepstral_mean_and_var_normalization_mfe(ei_matrix *matrix, void *config_ptr)
{
    ei_dsp_config_mfe_t *config = (ei_dsp_config_mfe_t *)config_ptr;

    const uint32_t original_matrix_size = matrix->rows * matrix->cols;

    // the row count below is derived by dividing by num_filters
    if (config->num_filters == 0) {
        ei_printf("ERR: num_filters cannot be 0\n");
        return;
    }

    /* Modify rows and columns ratio for matrix normalization */
    matrix->rows = (original_matrix_size) / config->num_filters;
    matrix->cols = config->num_filters;

    int ret;
    const bool uses_cmvnw = config->implementation_version < 3;
    if (uses_cmvnw) {
        // cepstral mean and variance normalization
        ret = speechpy::processing::cmvnw(matrix, config->win_size, false, true);
    }
    else {
        // normalization
        ret = speechpy::processing::mfe_normalization(matrix, config->noise_floor_db);
    }

    /* Reset rows and columns ratio (also on failure, so the caller never sees the temporary shape) */
    matrix->rows = 1;
    matrix->cols = (original_matrix_size);

    if (ret != EIDSP_OK) {
        if (uses_cmvnw) {
            ei_printf("ERR: cmvnw failed (%d)\n", ret);
        }
        else {
            ei_printf("ERR: normalization failed (%d)\n", ret);
        }
    }
}

/**
 * @brief      Normalizes spectrogram features.
 *
 * Implementation versions below 3 use numpy::normalize; newer versions use
 * spectrogram_normalization with the configured noise floor.
 * The matrix is temporarily viewed with (fft_length / 2 + 1) columns for the normalization
 * and is set to 1 x values afterwards, whether or not normalization succeeded.
 * Failures are reported through ei_printf only.
 *
 * @param      matrix      Source and destination matrix
 * @param      config_ptr  ei_dsp_config_spectrogram_t struct pointer
 */
__attribute__((unused)) void calc_cepstral_mean_and_var_normalization_spectrogram(ei_matrix *matrix, void *config_ptr)
{
    ei_dsp_config_spectrogram_t *config = (ei_dsp_config_spectrogram_t *)config_ptr;

    const uint32_t original_matrix_size = matrix->rows * matrix->cols;

    /* Modify rows and columns ratio for matrix normalization */
    matrix->cols = config->fft_length / 2 + 1;
    matrix->rows = (original_matrix_size) / matrix->cols;

    int ret;
    if (config->implementation_version < 3) {
        ret = numpy::normalize(matrix);
    }
    else {
        // normalization
        ret = speechpy::processing::spectrogram_normalization(matrix, config->noise_floor_db);
    }

    /* Reset rows and columns ratio (also on failure, so the caller never sees the temporary shape) */
    matrix->rows = 1;
    matrix->cols = (original_matrix_size);

    if (ret != EIDSP_OK) {
        ei_printf("ERR: normalization failed (%d)\n", ret);
    }
}

#ifdef __cplusplus
}
#endif // __cplusplus

#endif // _EDGE_IMPULSE_RUN_DSP_H_