// File size: 93,447 Bytes

// b7b614e

/*
 * Copyright (c) 2022 EdgeImpulse Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS
 * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#ifndef _EIDSP_NUMPY_H_
#define _EIDSP_NUMPY_H_

// It's valid to include the SDK without a model, but there's information that we need
// in model_metadata.h (like the FFT tables used).
// If the compiler does not support the __has_include directive we'll assume that the
// file exists. The fallback has to be a function-like macro: it is invoked below as
// __has_include("..."), and an object-like `1` would expand to `1("...")`, which is
// not a valid #if expression.
#ifndef __has_include
#define __has_include(x) 1
#endif // __has_include

#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <cfloat>
#include "ei_vector.h"
#include <algorithm>
#include "numpy_types.h"
#include "config.hpp"
#include "returntypes.hpp"
#include "memory.hpp"
#include "ei_utils.h"
#include "dct/fast-dct-fft.h"
#include "kissfft/kiss_fftr.h"
#if __has_include("model-parameters/model_metadata.h")
#include "model-parameters/model_metadata.h"
#endif
#if EIDSP_USE_CMSIS_DSP
#include "edge-impulse-sdk/CMSIS/DSP/Include/arm_math.h"
#include "edge-impulse-sdk/CMSIS/DSP/Include/arm_const_structs.h"
#endif

// For the following CMSIS includes, we want to use the C fallback, so include whether or not we set the CMSIS flag
#include "edge-impulse-sdk/CMSIS/DSP/Include/dsp/statistics_functions.h"

#ifdef __MBED__
#include "mbed.h"
#else
#include <functional>
#endif // __MBED__

// Largest value that fits in a uint16_t. The CMSIS-DSP code paths in this file compare
// matrix dimensions against it before narrowing them with static_cast<uint16_t>.
#define EI_MAX_UINT16 65535

namespace ei {

// Shorthand aliases for the ei_vector instantiations holding float and int elements.
using fvec = ei_vector<float>;
using ivec = ei_vector<int>;

// clang-format off
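// Lookup table that maps a quantized uint8_t code to its float value between 0.0f and 1.0f.
// dequantize_zero_one() indexes it directly and quantize_zero_one() binary-searches it, so the
// entries have to stay in ascending order. The trailing 1.0f entries pad the end of the table.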
static constexpr float quantized_values_one_zero[] = { (0.0f / 1.0f), (1.0f / 100.0f), (2.0f / 100.0f), (3.0f / 100.0f), (4.0f / 100.0f), (1.0f / 22.0f), (1.0f / 21.0f), (1.0f / 20.0f), (1.0f / 19.0f), (1.0f / 18.0f), (1.0f / 17.0f), (6.0f / 100.0f), (1.0f / 16.0f), (1.0f / 15.0f), (7.0f / 100.0f), (1.0f / 14.0f), (1.0f / 13.0f), (8.0f / 100.0f), (1.0f / 12.0f), (9.0f / 100.0f), (1.0f / 11.0f), (2.0f / 21.0f), (1.0f / 10.0f), (2.0f / 19.0f), (11.0f / 100.0f), (1.0f / 9.0f), (2.0f / 17.0f), (12.0f / 100.0f), (1.0f / 8.0f), (13.0f / 100.0f), (2.0f / 15.0f), (3.0f / 22.0f), (14.0f / 100.0f), (1.0f / 7.0f), (3.0f / 20.0f), (2.0f / 13.0f), (3.0f / 19.0f), (16.0f / 100.0f), (1.0f / 6.0f), (17.0f / 100.0f), (3.0f / 17.0f), (18.0f / 100.0f), (2.0f / 11.0f), (3.0f / 16.0f), (19.0f / 100.0f), (4.0f / 21.0f), (1.0f / 5.0f), (21.0f / 100.0f), (4.0f / 19.0f), (3.0f / 14.0f), (22.0f / 100.0f), (2.0f / 9.0f), (5.0f / 22.0f), (23.0f / 100.0f), (3.0f / 13.0f), (4.0f / 17.0f), (5.0f / 21.0f), (24.0f / 100.0f), (1.0f / 4.0f), (26.0f / 100.0f), (5.0f / 19.0f), (4.0f / 15.0f), (27.0f / 100.0f), (3.0f / 11.0f), (5.0f / 18.0f), (28.0f / 100.0f), (2.0f / 7.0f), (29.0f / 100.0f), (5.0f / 17.0f), (3.0f / 10.0f), (4.0f / 13.0f), (31.0f / 100.0f), (5.0f / 16.0f), (6.0f / 19.0f), (7.0f / 22.0f), (32.0f / 100.0f), (33.0f / 100.0f), (1.0f / 3.0f), (34.0f / 100.0f), (7.0f / 20.0f), (6.0f / 17.0f), (5.0f / 14.0f), (36.0f / 100.0f), (4.0f / 11.0f), (7.0f / 19.0f), (37.0f / 100.0f), (3.0f / 8.0f), (38.0f / 100.0f), (8.0f / 21.0f), (5.0f / 13.0f), (7.0f / 18.0f), (39.0f / 100.0f), (2.0f / 5.0f), (9.0f / 22.0f), (41.0f / 100.0f), (7.0f / 17.0f), (5.0f / 12.0f), (42.0f / 100.0f), (8.0f / 19.0f), (3.0f / 7.0f), (43.0f / 100.0f), (7.0f / 16.0f), (44.0f / 100.0f), (4.0f / 9.0f), (9.0f / 20.0f), (5.0f / 11.0f), (46.0f / 100.0f), (6.0f / 13.0f), (7.0f / 15.0f), (47.0f / 100.0f), (8.0f / 17.0f), (9.0f / 19.0f), (10.0f / 21.0f), (48.0f / 100.0f), (49.0f / 100.0f), (1.0f / 2.0f), (51.0f / 100.0f), (52.0f / 
100.0f), (11.0f / 21.0f), (10.0f / 19.0f), (9.0f / 17.0f), (53.0f / 100.0f), (8.0f / 15.0f), (7.0f / 13.0f), (54.0f / 100.0f), (6.0f / 11.0f), (11.0f / 20.0f), (5.0f / 9.0f), (56.0f / 100.0f), (9.0f / 16.0f), (57.0f / 100.0f), (4.0f / 7.0f), (11.0f / 19.0f), (58.0f / 100.0f), (7.0f / 12.0f), (10.0f / 17.0f), (59.0f / 100.0f), (13.0f / 22.0f), (3.0f / 5.0f), (61.0f / 100.0f), (11.0f / 18.0f), (8.0f / 13.0f), (13.0f / 21.0f), (62.0f / 100.0f), (5.0f / 8.0f), (63.0f / 100.0f), (12.0f / 19.0f), (7.0f / 11.0f), (64.0f / 100.0f), (9.0f / 14.0f), (11.0f / 17.0f), (13.0f / 20.0f), (66.0f / 100.0f), (2.0f / 3.0f), (67.0f / 100.0f), (68.0f / 100.0f), (15.0f / 22.0f), (13.0f / 19.0f), (11.0f / 16.0f), (69.0f / 100.0f), (9.0f / 13.0f), (7.0f / 10.0f), (12.0f / 17.0f), (71.0f / 100.0f), (5.0f / 7.0f), (72.0f / 100.0f), (13.0f / 18.0f), (8.0f / 11.0f), (73.0f / 100.0f), (11.0f / 15.0f), (14.0f / 19.0f), (74.0f / 100.0f), (3.0f / 4.0f), (76.0f / 100.0f), (16.0f / 21.0f), (13.0f / 17.0f), (10.0f / 13.0f), (77.0f / 100.0f), (17.0f / 22.0f), (7.0f / 9.0f), (78.0f / 100.0f), (11.0f / 14.0f), (15.0f / 19.0f), (79.0f / 100.0f), (4.0f / 5.0f), (17.0f / 21.0f), (81.0f / 100.0f), (13.0f / 16.0f), (9.0f / 11.0f), (82.0f / 100.0f), (14.0f / 17.0f), (83.0f / 100.0f), (5.0f / 6.0f), (84.0f / 100.0f), (16.0f / 19.0f), (11.0f / 13.0f), (17.0f / 20.0f), (6.0f / 7.0f), (86.0f / 100.0f), (19.0f / 22.0f), (13.0f / 15.0f), (87.0f / 100.0f), (7.0f / 8.0f), (88.0f / 100.0f), (15.0f / 17.0f), (8.0f / 9.0f), (89.0f / 100.0f), (17.0f / 19.0f), (9.0f / 10.0f), (19.0f / 21.0f), (10.0f / 11.0f), (91.0f / 100.0f), (11.0f / 12.0f), (92.0f / 100.0f), (12.0f / 13.0f), (13.0f / 14.0f), (93.0f / 100.0f), (14.0f / 15.0f), (15.0f / 16.0f), (94.0f / 100.0f), (16.0f / 17.0f), (17.0f / 18.0f), (18.0f / 19.0f), (19.0f / 20.0f), (20.0f / 21.0f), (21.0f / 22.0f), (96.0f / 100.0f), (97.0f / 100.0f), (98.0f / 100.0f), (99.0f / 100.0f), (1.0f / 1.0f) ,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
// clang-format on

class numpy {
public:

    /**
     * Square root of a single float.
     * Uses arm_sqrt_f32 when EIDSP_USE_CMSIS_DSP is enabled and sqrtf otherwise.
     * @param x Input value
     * @returns The square root of x
     */
    static float sqrt(float x) {
#if !EIDSP_USE_CMSIS_DSP
        return sqrtf(x);
#else
        float root;
        arm_sqrt_f32(x, &root);
        return root;
#endif
    }

    /**
     * Roll array elements along a given axis.
     * Elements that roll beyond the last position are re-introduced at the first.
     * The rotation is done in place with std::rotate, so no temporary buffer is
     * allocated. Shifts of any magnitude are accepted and wrap around the array size.
     * @param input_array Array to roll (modified in place)
     * @param input_array_size Number of elements in input_array
     * @param shift The number of places by which elements are shifted
     *              (positive moves elements towards higher indices, negative towards lower).
     * @returns EIDSP_OK if OK
     */
    static int roll(float *input_array, size_t input_array_size, int shift) {
        if (input_array_size == 0) {
            return EIDSP_OK;
        }

        // reduce the shift to [0, input_array_size) so the pointer arithmetic below stays
        // inside the array, also when |shift| is larger than the array itself
        const ptrdiff_t size = static_cast<ptrdiff_t>(input_array_size);
        ptrdiff_t places = static_cast<ptrdiff_t>(shift) % size;
        if (places < 0) {
            places += size;
        }

        if (places == 0) {
            return EIDSP_OK;
        }

        // the last `places` elements become the first ones
        std::rotate(input_array, input_array + (size - places), input_array + size);

        return EIDSP_OK;
    }

    /**
     * Roll array elements along a given axis.
     * Elements that roll beyond the last position are re-introduced at the first.
     * The rotation is done in place with std::rotate, so no temporary buffer is
     * allocated. Shifts of any magnitude are accepted and wrap around the array size.
     * @param input_array Array to roll (modified in place)
     * @param input_array_size Number of elements in input_array
     * @param shift The number of places by which elements are shifted
     *              (positive moves elements towards higher indices, negative towards lower).
     * @returns EIDSP_OK if OK
     */
    static int roll(int *input_array, size_t input_array_size, int shift) {
        if (input_array_size == 0) {
            return EIDSP_OK;
        }

        // reduce the shift to [0, input_array_size) so the pointer arithmetic below stays
        // inside the array, also when |shift| is larger than the array itself
        const ptrdiff_t size = static_cast<ptrdiff_t>(input_array_size);
        ptrdiff_t places = static_cast<ptrdiff_t>(shift) % size;
        if (places < 0) {
            places += size;
        }

        if (places == 0) {
            return EIDSP_OK;
        }

        // the last `places` elements become the first ones
        std::rotate(input_array, input_array + (size - places), input_array + size);

        return EIDSP_OK;
    }

    /**
     * Roll array elements along a given axis.
     * Elements that roll beyond the last position are re-introduced at the first.
     * The rotation is done in place with std::rotate, so no temporary buffer is
     * allocated. Shifts of any magnitude are accepted and wrap around the array size.
     * @param input_array Array to roll (modified in place)
     * @param input_array_size Number of elements in input_array
     * @param shift The number of places by which elements are shifted
     *              (positive moves elements towards higher indices, negative towards lower).
     * @returns EIDSP_OK if OK
     */
    static int roll(int16_t *input_array, size_t input_array_size, int shift) {
        if (input_array_size == 0) {
            return EIDSP_OK;
        }

        // reduce the shift to [0, input_array_size) so the pointer arithmetic below stays
        // inside the array, also when |shift| is larger than the array itself
        const ptrdiff_t size = static_cast<ptrdiff_t>(input_array_size);
        ptrdiff_t places = static_cast<ptrdiff_t>(shift) % size;
        if (places < 0) {
            places += size;
        }

        if (places == 0) {
            return EIDSP_OK;
        }

        // the last `places` elements become the first ones
        std::rotate(input_array, input_array + (size - places), input_array + size);

        return EIDSP_OK;
    }

    /**
     * Sum all elements of an array, accumulating front to back in a float.
     * @param input_array Array to sum
     * @param input_array_size Number of elements in input_array
     * @returns The sum of the elements (0.0f for an empty array)
     */
    static float sum(float *input_array, size_t input_array_size) {
        float total = 0.0f;
        const float *const end = input_array + input_array_size;
        for (const float *it = input_array; it != end; ++it) {
            total += *it;
        }
        return total;
    }

    /**
     * Matrix product of an MxN and an NxK matrix.
     * @param matrix1 Left operand (MxN)
     * @param matrix2 Right operand (NxK)
     * @param out_matrix Receives the product (MxK)
     * @returns EIDSP_OK if OK
     */
    static int dot(matrix_t *matrix1, matrix_t *matrix2, matrix_t *out_matrix) {
        // the inner dimensions have to agree
        if (matrix2->rows != matrix1->cols) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        // the output takes its row count from matrix1 and its column count from matrix2
        if (out_matrix->rows != matrix1->rows || out_matrix->cols != matrix2->cols) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

#if EIDSP_USE_CMSIS_DSP
        // the dimensions are narrowed to uint16_t below, refuse anything that does not fit
        const bool fits_uint16 =
            matrix1->rows <= EI_MAX_UINT16 && matrix1->cols <= EI_MAX_UINT16 &&
            matrix2->rows <= EI_MAX_UINT16 && matrix2->cols <= EI_MAX_UINT16 &&
            out_matrix->rows <= EI_MAX_UINT16 && out_matrix->cols <= EI_MAX_UINT16;
        if (!fits_uint16) {
            return EIDSP_NARROWING;
        }

        const arm_matrix_instance_f32 m1 = {
            static_cast<uint16_t>(matrix1->rows),
            static_cast<uint16_t>(matrix1->cols),
            matrix1->buffer
        };
        const arm_matrix_instance_f32 m2 = {
            static_cast<uint16_t>(matrix2->rows),
            static_cast<uint16_t>(matrix2->cols),
            matrix2->buffer
        };
        arm_matrix_instance_f32 mo = {
            static_cast<uint16_t>(out_matrix->rows),
            static_cast<uint16_t>(out_matrix->cols),
            out_matrix->buffer
        };
        int status = arm_mat_mult_f32(&m1, &m2, &mo);
        if (status != ARM_MATH_SUCCESS) {
            EIDSP_ERR(status);
        }
#else
        // clear the output, then compute it one matrix1 row at a time
        memset(out_matrix->buffer, 0, out_matrix->rows * out_matrix->cols * sizeof(float));

        for (size_t row_ix = 0; row_ix < matrix1->rows; row_ix++) {
            float *current_row = matrix1->buffer + (row_ix * matrix1->cols);
            dot_by_row(row_ix, current_row, matrix1->cols, matrix2, out_matrix);
        }
#endif

        return EIDSP_OK;
    }

    /**
     * Matrix product of an MxN float matrix and an NxK quantized matrix.
     * @param matrix1 Left operand (MxN)
     * @param matrix2 Right operand, quantized (NxK)
     * @param out_matrix Receives the product (MxK)
     * @returns EIDSP_OK if OK
     */
    static int dot(matrix_t *matrix1,
                    quantized_matrix_t *matrix2,
                    matrix_t *out_matrix)
    {
        // the inner dimensions have to agree
        if (matrix2->rows != matrix1->cols) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        // the output takes its row count from matrix1 and its column count from matrix2
        if (out_matrix->rows != matrix1->rows || out_matrix->cols != matrix2->cols) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        // clear the output, then compute it one matrix1 row at a time
        memset(out_matrix->buffer, 0, out_matrix->rows * out_matrix->cols * sizeof(float));

        for (size_t row_ix = 0; row_ix < matrix1->rows; row_ix++) {
            float *current_row = matrix1->buffer + (row_ix * matrix1->cols);
            dot_by_row(row_ix, current_row, matrix1->cols, matrix2, out_matrix);
        }

        return EIDSP_OK;
    }

    /**
     * Multiply two matrices lazily per row in matrix 1 (MxN * NxK matrix)
     * Row i of out_matrix is overwritten with the product of `row` and matrix2, in both
     * the CMSIS-DSP and the plain C++ code path, so the result does not depend on what
     * the output buffer held before the call.
     * @param i matrix1 row index (selects the row of out_matrix that is written)
     * @param row matrix1 row
     * @param matrix1_cols matrix1 row size (1xN)
     * @param matrix2 Pointer to matrix2 (NxK)
     * @param out_matrix Pointer to out matrix (MxK)
     * @returns EIDSP_OK if OK
     */
    static  int dot_by_row(int i, float *row, uint32_t matrix1_cols, matrix_t *matrix2, matrix_t *out_matrix) {
        if (matrix1_cols != matrix2->rows) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

#if EIDSP_USE_CMSIS_DSP
        // the dimensions are narrowed to uint16_t below, refuse anything that does not fit
        if (matrix1_cols > EI_MAX_UINT16 || matrix2->rows > EI_MAX_UINT16 || matrix2->cols > EI_MAX_UINT16 ||
            out_matrix->cols > EI_MAX_UINT16) {
            return EIDSP_NARROWING;
        }

        const arm_matrix_instance_f32 m1 = { 1, static_cast<uint16_t>(matrix1_cols), row };
        const arm_matrix_instance_f32 m2 = { static_cast<uint16_t>(matrix2->rows), static_cast<uint16_t>(matrix2->cols), matrix2->buffer };
        arm_matrix_instance_f32 mo = { 1, static_cast<uint16_t>(out_matrix->cols), out_matrix->buffer + (i * out_matrix->cols) };
        int status = arm_mat_mult_f32(&m1, &m2, &mo);
        if (status != ARM_MATH_SUCCESS) {
            EIDSP_ERR(status);
        }
#else
        for (size_t j = 0; j < matrix2->cols; j++) {
            float tmp = 0.0f;
            for (size_t k = 0; k < matrix1_cols; k++) {
                tmp += row[k] * matrix2->buffer[k * matrix2->cols + j];
            }
            // assign rather than accumulate: this matches the CMSIS-DSP branch above and
            // the quantized overload, and does not require a pre-zeroed output
            out_matrix->buffer[i * matrix2->cols + j] = tmp;
        }
#endif

        return EIDSP_OK;
    }

    /**
     * Multiply two matrices lazily per row in matrix 1 (MxN * NxK matrix)
     * matrix2 holds quantized codes; each non-zero code is mapped to its float value
     * through quantized_values_one_zero before it is multiplied. Row i of out_matrix
     * is overwritten with the result.
     * @param i matrix1 row index (selects the row of out_matrix that is written)
     * @param row matrix1 row
     * @param matrix1_cols matrix1 row size
     * @param matrix2 Pointer to quantized matrix2 (NxK)
     * @param out_matrix Pointer to out matrix (MxK)
     * @returns EIDSP_OK if OK
     */
    static  int dot_by_row(int i, float *row, size_t matrix1_cols,
        quantized_matrix_t *matrix2, matrix_t *out_matrix)
    {
        if (matrix1_cols != matrix2->rows) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        // size_t counters: a uint16_t counter wraps around at 65535 and would never reach
        // the loop bound for larger matrices
        for (size_t j = 0; j < matrix2->cols; j++) {
            float tmp = 0.0f;
            for (size_t k = 0; k < matrix1_cols; k++) {
                uint8_t u8 = matrix2->buffer[k * matrix2->cols + j];
                if (u8) { // this matrix appears to be very sparsely populated
                    tmp += row[k] * quantized_values_one_zero[u8];
                }
            }
            out_matrix->buffer[i * matrix2->cols + j] = tmp;
        }

        return EIDSP_OK;
    }

    /**
     * Transpose a matrix in place (from MxN to NxM) by following permutation cycles.
     * Apart from one bool flag per element no copy of the matrix is made.
     * @param matrix Matrix to transpose; buffer contents and rows/cols are updated
     */
    static void transpose_in_place(matrix_t *matrix) {
        // A matrix with at most one row or one column (this includes 1x1 and empty
        // matrices) has the same memory layout as its transpose, only the dimensions
        // change. Returning here also keeps the cycle walk below, which unconditionally
        // starts at index 1, away from buffers that hold fewer than two elements.
        if (matrix->rows <= 1 || matrix->cols <= 1) {
            std::swap(matrix->rows, matrix->cols);
            return;
        }

        size_t size = matrix->cols * matrix->rows - 1; // index of the last element
        float temp; // temp for swap
        size_t next; // next item to swap
        size_t cycleBegin; // index of start of cycle
        size_t i; // location in matrix
        size_t all_done_mark = 1; // every index below this one is known to be done
        ei_vector<bool> done(size+1,false);

        i = 1; // Note that matrix[0] and last element of matrix won't move
        while (1)
        {
            // carry the value at the start of the cycle along until the walk returns to it
            cycleBegin = i;
            temp = matrix->buffer[i];
            do
            {
                size_t col = i % matrix->cols;
                size_t row = i / matrix->cols;
                // swap row and col to make new idx, b/c we want to know where in the transposed matrix
                next = col*matrix->rows + row;
                float temp2 = matrix->buffer[next];
                matrix->buffer[next] = temp;
                temp = temp2;
                done[next] = true;
                i = next;
            }
            while (i != cycleBegin);

            // start next cycle by find next not done
            for (i = all_done_mark; done[i]; i++) {
                all_done_mark++; // move the high water mark so we don't look again
                if(i>=size) { goto LOOP_END; }
            }
        }
        LOOP_END:
        // finally, swap the row and column dimensions
        std::swap(matrix->rows, matrix->cols);
    }

    /**
     * Transpose an array, source is destination (from MxN to NxM)
     * Note: this temporary allocates a copy of the matrix on the heap.
     * @param matrix Matrix to transpose; buffer contents and rows/cols are updated
     * @deprecated You probably want to use transpose_in_place
     * @returns EIDSP_OK if OK
     */
    static int transpose(matrix_t *matrix) {
        int r = transpose(matrix->buffer, matrix->cols, matrix->rows);
        if (r != 0) {
            return r;
        }

        // swap the dimensions in their own type; going through uint16_t temporaries
        // would truncate dimensions above 65535
        std::swap(matrix->rows, matrix->cols);

        return EIDSP_OK;
    }

    /**
     * Transpose an array, source is destination.
     * The result is built in a temporary matrix and copied back over the input.
     * @param matrix Buffer holding a (columns x rows) matrix on entry and the
     *               (rows x columns) transpose on return
     * @param rows Number of rows of the transposed result
     * @param columns Number of columns of the transposed result
     * @deprecated You probably want to use transpose_in_place
     * @returns EIDSP_OK if OK
     */
    static int transpose(float *matrix, int rows, int columns) {
        EI_DSP_MATRIX(temp_matrix, rows, columns);
        if (temp_matrix.buffer == nullptr) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

#if EIDSP_USE_CMSIS_DSP
        // the dimensions are narrowed to uint16_t below, refuse anything that does not fit
        const bool fits_uint16 = rows <= EI_MAX_UINT16 && columns <= EI_MAX_UINT16;
        if (!fits_uint16) {
            return EIDSP_NARROWING;
        }

        const arm_matrix_instance_f32 i_m = { static_cast<uint16_t>(columns), static_cast<uint16_t>(rows), matrix };
        arm_matrix_instance_f32 o_m = { static_cast<uint16_t>(rows), static_cast<uint16_t>(columns), temp_matrix.buffer };
        arm_status status = arm_mat_trans_f32(&i_m, &o_m);
        if (status != ARM_MATH_SUCCESS) {
            return status;
        }
#else
        // fill the temporary front to back; element (out_row, out_col) of the result
        // is element (out_col, out_row) of the input
        float *dst = temp_matrix.buffer;
        for (int out_row = 0; out_row < rows; out_row++) {
            for (int out_col = 0; out_col < columns; out_col++) {
                *dst++ = matrix[out_col * rows + out_row];
            }
        }
#endif

        // overwrite the input with the transposed copy
        memcpy(matrix, temp_matrix.buffer, rows * columns * sizeof(float));

        return EIDSP_OK;
    }

    /**
     * Transpose a quantized matrix, source is destination (from MxN to NxM)
     * Note: this temporary allocates a copy of the matrix on the heap.
     * @param matrix Matrix to transpose; buffer contents and rows/cols are updated
     * @returns EIDSP_OK if OK
     */
    static int transpose(quantized_matrix_t *matrix) {
        int r = transpose(matrix->buffer, matrix->cols, matrix->rows);
        if (r != 0) {
            return r;
        }

        // swap the dimensions in their own type instead of going through uint16_t
        // temporaries, which would truncate any dimension above 65535
        std::swap(matrix->rows, matrix->cols);

        return EIDSP_OK;
    }

    /**
     * Transpose a quantized array, source is destination.
     * The result is built in a temporary matrix and copied back over the input.
     * @param matrix Buffer holding a (columns x rows) matrix on entry and the
     *               (rows x columns) transpose on return
     * @param rows Number of rows of the transposed result
     * @param columns Number of columns of the transposed result
     * @returns EIDSP_OK if OK
     */
    static int transpose(uint8_t *matrix, int rows, int columns) {
        // the macro wants a dequantization function, it is not called in here
        EI_DSP_QUANTIZED_MATRIX(temp_matrix, rows, columns, &dequantize_zero_one);
        if (temp_matrix.buffer == nullptr) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        // fill the temporary front to back; element (out_row, out_col) of the result
        // is element (out_col, out_row) of the input
        uint8_t *dst = temp_matrix.buffer;
        for (int out_row = 0; out_row < rows; out_row++) {
            for (int out_col = 0; out_col < columns; out_col++) {
                *dst++ = matrix[out_col * rows + out_row];
            }
        }

        // overwrite the input with the transposed copy
        memcpy(matrix, temp_matrix.buffer, rows * columns * sizeof(uint8_t));

        return EIDSP_OK;
    }

    /**
     * Return the Discrete Cosine Transform of arbitrary type sequence 2.
     * The transform is computed in place.
     * @param input Input array (of size N), overwritten with the result
     * @param N number of items in input and output array
     * @param normalization DCT_NORMALIZATION_ORTHO additionally scales bin 0 by
     *                      sqrt(1/(4N)) and every other bin by sqrt(1/(2N));
     *                      DCT_NORMALIZATION_NONE (default) applies no extra scaling
     * @returns EIDSP_OK if OK
     */
    static int dct2(float *input, size_t N, DCT_NORMALIZATION_MODE normalization = DCT_NORMALIZATION_NONE) {
        if (N == 0) {
            return EIDSP_OK;
        }

        int ret = ei::dct::transform(input, N);
        if (ret != EIDSP_OK) {
            EIDSP_ERR(ret);
        }

        // for some reason the output is 2x too low...
        for (size_t ix = 0; ix < N; ix++) {
            input[ix] *= 2;
        }

        if (normalization == DCT_NORMALIZATION_ORTHO) {
            // the factor for bins 1..N-1 does not depend on the bin, so take the
            // square root once instead of once per element
            const float scale_first = sqrt(1.0f / static_cast<float>(4 * N));
            const float scale_rest = sqrt(1.0f / static_cast<float>(2 * N));

            input[0] = input[0] * scale_first;
            for (size_t ix = 1; ix < N; ix++) {
                input[ix] = input[ix] * scale_rest;
            }
        }

        return EIDSP_OK;
    }

    /**
     * Discrete Cosine Transform of arbitrary type sequence 2, applied to every row
     * of a matrix in place. Stops at the first row that fails.
     * @param matrix Matrix whose rows are transformed
     * @param normalization Normalization mode forwarded to the per-row transform
     * @returns EIDSP_OK if OK, otherwise the error of the first failing row
     */
    static int dct2(matrix_t *matrix, DCT_NORMALIZATION_MODE normalization = DCT_NORMALIZATION_NONE) {
        float *row_data = matrix->buffer;
        for (size_t row = 0; row < matrix->rows; row++, row_data += matrix->cols) {
            const int r = dct2(row_data, matrix->cols, normalization);
            if (r != EIDSP_OK) {
                return r;
            }
        }

        return EIDSP_OK;
    }

    /**
     * Quantize a float value between zero and one
     * Looks the value up in quantized_values_one_zero and returns the index of the
     * matching entry, or of the nearest entry when there is no exact match. Values
     * outside the range of the table are clamped to its first / last index.
     * @param value Float value
     * @returns Index into quantized_values_one_zero (see dequantize_zero_one)
     */
    static uint8_t quantize_zero_one(float value) {
        const size_t length = sizeof(quantized_values_one_zero) / sizeof(float);

        // look in the table
        for (size_t ix = 0; ix < length; ix++) {
            if (quantized_values_one_zero[ix] == value) return static_cast<uint8_t>(ix);
        }

        // no match?

        // out of range: clamp to the first / last *index*, not to the table value stored there
        if (value < quantized_values_one_zero[0]) {
            return 0;
        }
        if (value > quantized_values_one_zero[length - 1]) {
            return static_cast<uint8_t>(length - 1);
        }

        // binary search for the two neighbouring entries
        int lo = 0;
        int hi = length - 1;

        while (lo <= hi) {
            int mid = (hi + lo) / 2;

            if (value < quantized_values_one_zero[mid]) {
                hi = mid - 1;
            } else if (value > quantized_values_one_zero[mid]) {
                lo = mid + 1;
            } else {
                return static_cast<uint8_t>(mid);
            }
        }

        // lo == hi + 1, pick whichever neighbour is closer
        return (quantized_values_one_zero[lo] - value) < (value - quantized_values_one_zero[hi]) ?
            static_cast<uint8_t>(lo) :
            static_cast<uint8_t>(hi);
    }

    /**
     * Dequantize a value between zero and one
     * @param value Quantized code, used as index into quantized_values_one_zero
     * @returns The float stored in the table for that code
     */
    static float dequantize_zero_one(uint8_t value) {
        const float dequantized = quantized_values_one_zero[value];
        return dequantized;
    }

    /**
     * Pad an array.
     * Pads with the reflection of the vector mirrored along the edge of the array.
     * When more rows are requested than the input has, the reflection bounces back
     * and forth between the first and the last input row.
     * @param input Input matrix (MxN)
     * @param output Output matrix of size (M+pad_before+pad_after x N)
     * @param pad_before Number of rows to pad before
     * @param pad_after Number of rows to pad after
     * @returns 0 if OK
     */
    static int pad_1d_symmetric(matrix_t *input, matrix_t *output, uint16_t pad_before, uint16_t pad_after) {
        if (output->cols != input->cols) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        if (output->rows != input->rows + pad_before + pad_after) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        if (input->rows == 0) {
            EIDSP_ERR(EIDSP_INPUT_MATRIX_EMPTY);
        }

        const size_t row_bytes = input->cols * sizeof(float);

        // leading padding: filled from the input edge outwards, reading input rows
        // 0, 1, 2, ... and reversing direction at either end of the input
        uint32_t src_before = 0;
        bool before_ascending = true;

        for (int32_t dst_row = pad_before - 1; dst_row >= 0; dst_row--) {
            memcpy(output->buffer + (input->cols * dst_row),
                input->buffer + (src_before * input->cols),
                row_bytes);

            if (before_ascending) {
                if (src_before == input->rows - 1) {
                    before_ascending = false;
                }
                else {
                    src_before++;
                }
            }
            else {
                if (src_before == 0) {
                    before_ascending = true;
                }
                else {
                    src_before--;
                }
            }
        }

        // the input itself goes right behind the leading padding
        memcpy(output->buffer + (input->cols * pad_before),
            input->buffer,
            input->rows * input->cols * sizeof(float));

        // trailing padding: reads input rows M-1, M-2, ... and reverses direction
        // at either end of the input
        int32_t src_after = input->rows - 1;
        bool after_ascending = false;

        for (int32_t ix = 0; ix < pad_after; ix++) {
            memcpy(output->buffer + (input->cols * (ix + pad_before + input->rows)),
                input->buffer + (src_after * input->cols),
                row_bytes);

            if (after_ascending) {
                if (src_after == static_cast<int32_t>(input->rows) - 1) {
                    after_ascending = false;
                }
                else {
                    src_after++;
                }
            }
            else {
                if (src_after == 0) {
                    after_ascending = true;
                }
                else {
                    src_after--;
                }
            }
        }

        return EIDSP_OK;
    }

    /**
     * Multiply every element of a matrix by a constant, in place
     * @param matrix Matrix to scale
     * @param scale Factor applied to every element
     * @returns 0 if OK
     */
    static int scale(matrix_t *matrix, float scale) {
        // multiplying by one changes nothing
        if (scale == 1.0f) {
            return EIDSP_OK;
        }

#if EIDSP_USE_CMSIS_DSP
        // the dimensions are narrowed to uint16_t below, refuse anything that does not fit
        const bool fits_uint16 = matrix->rows <= EI_MAX_UINT16 && matrix->cols <= EI_MAX_UINT16;
        if (!fits_uint16) {
            return EIDSP_NARROWING;
        }

        // input and output instance share the same buffer, so the scaling is in place
        const arm_matrix_instance_f32 mi = {
            static_cast<uint16_t>(matrix->rows),
            static_cast<uint16_t>(matrix->cols),
            matrix->buffer
        };
        arm_matrix_instance_f32 mo = {
            static_cast<uint16_t>(matrix->rows),
            static_cast<uint16_t>(matrix->cols),
            matrix->buffer
        };
        int status = arm_mat_scale_f32(&mi, scale, &mo);
        if (status != ARM_MATH_SUCCESS) {
            return status;
        }
#else
        const auto element_count = matrix->rows * matrix->cols;
        for (size_t ix = 0; ix < element_count; ix++) {
            matrix->buffer[ix] = matrix->buffer[ix] * scale;
        }
#endif
        return EIDSP_OK;
    }


    /**
     * Scale a matrix in place, with a separate factor for every row
     * @param matrix Input matrix (MxN)
     * @param scale_matrix Scale matrix (Mx1), entry r is the factor for row r
     * @returns 0 if OK
     */
    static int scale(matrix_t *matrix, matrix_t *scale_matrix) {
        // one factor per row, stored as a single column
        if (scale_matrix->rows != matrix->rows || scale_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < matrix->rows; row++) {
            // 1xN matrix on top of the current row of the input buffer
            float *row_data = matrix->buffer + (row * matrix->cols);
            EI_DSP_MATRIX_B(row_view, 1, matrix->cols, row_data);

            const float factor = scale_matrix->buffer[row];
            int ret = scale(&row_view, factor);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
        }

        return EIDSP_OK;
    }

    /**
     * Add a constant to every element of a matrix, in place
     * @param matrix Matrix to modify
     * @param addition Value added to every element
     * @returns 0 if OK
     */
    static int add(matrix_t *matrix, float addition) {
        const auto element_count = matrix->rows * matrix->cols;
        for (uint32_t ix = 0; ix < element_count; ix++) {
            matrix->buffer[ix] = matrix->buffer[ix] + addition;
        }
        return EIDSP_OK;
    }

    /**
     * Add on a matrix in place, with a separate addend for every row
     * @param matrix Input matrix (MxN)
     * @param add_matrix Addend matrix (Mx1), entry r is added to every element of row r
     * @returns 0 if OK
     */
    static int add(matrix_t *matrix, matrix_t *add_matrix) {
        // one addend per row, stored as a single column
        if (add_matrix->rows != matrix->rows || add_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < matrix->rows; row++) {
            // 1xN matrix on top of the current row of the input buffer
            float *row_data = matrix->buffer + (row * matrix->cols);
            EI_DSP_MATRIX_B(row_view, 1, matrix->cols, row_data);

            const float addend = add_matrix->buffer[row];
            int ret = add(&row_view, addend);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
        }

        return EIDSP_OK;
    }

    /**
     * Subtract a constant from every element of a matrix, in place
     * @param matrix Matrix to modify
     * @param subtraction Value subtracted from every element
     * @returns 0 if OK
     */
    static int subtract(matrix_t *matrix, float subtraction) {
        const auto element_count = matrix->rows * matrix->cols;
        for (uint32_t ix = 0; ix < element_count; ix++) {
            matrix->buffer[ix] = matrix->buffer[ix] - subtraction;
        }
        return EIDSP_OK;
    }

    /**
     * Subtract from a matrix in place, with a separate subtrahend for every row
     * @param matrix Input matrix (MxN)
     * @param subtract_matrix Subtrahend matrix (Mx1), entry r is subtracted from
     *                        every element of row r
     * @returns 0 if OK
     */
    static int subtract(matrix_t *matrix, matrix_t *subtract_matrix) {
        // one subtrahend per row, stored as a single column
        if (subtract_matrix->rows != matrix->rows || subtract_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < matrix->rows; row++) {
            // 1xN matrix on top of the current row of the input buffer
            float *row_data = matrix->buffer + (row * matrix->cols);
            EI_DSP_MATRIX_B(row_view, 1, matrix->cols, row_data);

            const float subtrahend = subtract_matrix->buffer[row];
            int ret = subtract(&row_view, subtrahend);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
        }

        return EIDSP_OK;
    }

    /**
     * Root mean square of every row of a matrix
     * @param matrix Matrix of size (MxN)
     * @param output_matrix Matrix of size (Mx1), entry r receives the RMS of row r
     * @returns 0 if OK
     */
    static int rms(matrix_t *matrix, matrix_t *output_matrix) {
        // one result per input row, stored as a single column
        if (output_matrix->rows != matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < matrix->rows; row++) {
            float *row_data = matrix->buffer + (row * matrix->cols);
#if EIDSP_USE_CMSIS_DSP
            float rms_result;
            arm_rms_f32(row_data, matrix->cols, &rms_result);
            output_matrix->buffer[row] = rms_result;
#else
            // mean of the squares first, then the square root of that
            float sum_of_squares = 0.0f;
            for (size_t col = 0; col < matrix->cols; col++) {
                const float v = row_data[col];
                sum_of_squares += v * v;
            }
            output_matrix->buffer[row] = sqrt(sum_of_squares / static_cast<float>(matrix->cols));
#endif
        }

        return EIDSP_OK;
    }

    /**
     * Mean of every row of a matrix
     * @param input_matrix Input matrix (MxN)
     * @param output_matrix Output matrix (Mx1), entry r receives the mean of row r
     * @returns 0 if OK
     */
    static int mean(matrix_t *input_matrix, matrix_t *output_matrix) {
        // one result per input row, stored as a single column
        if (output_matrix->rows != input_matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < input_matrix->rows; row++) {
            float *row_data = input_matrix->buffer + (row * input_matrix->cols);
#if EIDSP_USE_CMSIS_DSP
            float row_mean;
            arm_mean_f32(row_data, input_matrix->cols, &row_mean);
            output_matrix->buffer[row] = row_mean;
#else
            float row_total = 0.0f;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                row_total += row_data[col];
            }

            output_matrix->buffer[row] = row_total / input_matrix->cols;
#endif
        }

        return EIDSP_OK;
    }

    /**
     * Calculate the mean over a matrix on axis 0
     * @param input_matrix Input matrix (MxN)
     * @param output_matrix Output matrix (Nx1)
     * @returns 0 if OK
     */
    static int mean_axis0(matrix_t *input_matrix, matrix_t *output_matrix) {
        // The result is a column vector: one mean for every input column.
        if (input_matrix->cols != output_matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        // CMSIS-DSP is intentionally not used here. Gathering a column into
        // sequential memory so SIMD could compute the mean would cost more than
        // this plain loop; the alternative (two transposes) is slower as well
        // on a "big" ARM platform.
        for (size_t col = 0; col < input_matrix->cols; col++) {
            float column_total = 0.0f;

            // walk down the column: consecutive rows are `cols` elements apart
            size_t ix = col;
            for (size_t row = 0; row < input_matrix->rows; row++) {
                column_total += input_matrix->buffer[ix];
                ix += input_matrix->cols;
            }

            output_matrix->buffer[col] = column_total / input_matrix->rows;
        }

        return EIDSP_OK;
    }

    /**
     * Calculate the standard deviation over a matrix on axis 0
     * @param input_matrix Input matrix (MxN)
     * @param output_matrix Output matrix (Nx1)
     * @returns 0 if OK
     */
    static int std_axis0(matrix_t *input_matrix, matrix_t *output_matrix) {
#if EIDSP_USE_CMSIS_DSP
        // the CMSIS build delegates entirely to the dedicated implementation
        return std_axis0_CMSIS(input_matrix, output_matrix);
#else

        // The result is a column vector: one standard deviation per input column.
        if (input_matrix->cols != output_matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t col = 0; col < input_matrix->cols; col++) {
            // pass 1: mean of the column
            float column_total = 0.0f;
            for (size_t row = 0; row < input_matrix->rows; row++) {
                column_total += input_matrix->buffer[(row * input_matrix->cols) + col];
            }
            const float column_mean = column_total / input_matrix->rows;

            // pass 2: sum of squared deviations from that mean
            float squared_dev = 0.0f;
            for (size_t row = 0; row < input_matrix->rows; row++) {
                const float delta = input_matrix->buffer[(row * input_matrix->cols) + col] - column_mean;
                squared_dev += delta * delta;
            }

            // population standard deviation (divides by the number of rows)
            output_matrix->buffer[col] = sqrt(squared_dev / input_matrix->rows);
        }

        return EIDSP_OK;
#endif
    }

    /**
     * Get the minimum value in a matrix per row
     * @param input_matrix Input matrix (MxN)
     * @param output_matrix Output matrix (Mx1)
     */
    static int min(matrix_t *input_matrix, matrix_t *output_matrix) {
        // The result is a column vector: one minimum for every input row.
        if (input_matrix->rows != output_matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < input_matrix->rows; row++) {
            // first element of the current row in the row-major buffer
            float *row_data = input_matrix->buffer + (row * input_matrix->cols);
#if EIDSP_USE_CMSIS_DSP
            float row_min;
            uint32_t min_ix; // position of the minimum; required by the API, not used
            arm_min_f32(row_data, input_matrix->cols, &row_min, &min_ix);
            output_matrix->buffer[row] = row_min;
#else
            // start from the largest finite float so any smaller sample replaces it
            float row_min = FLT_MAX;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                const float sample = row_data[col];
                row_min = (sample < row_min) ? sample : row_min;
            }

            output_matrix->buffer[row] = row_min;
#endif
        }

        return EIDSP_OK;
    }

    /**
     * Get the maximum value in a matrix per row
     * @param input_matrix Input matrix (MxN)
     * @param output_matrix Output matrix (Mx1)
     */
    static int max(matrix_t *input_matrix, matrix_t *output_matrix) {
        // The result is a column vector: one maximum for every input row.
        if (input_matrix->rows != output_matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < input_matrix->rows; row++) {
            // first element of the current row in the row-major buffer
            float *row_data = input_matrix->buffer + (row * input_matrix->cols);
#if EIDSP_USE_CMSIS_DSP
            float row_max;
            uint32_t max_ix; // position of the maximum; required by the API, not used
            arm_max_f32(row_data, input_matrix->cols, &row_max, &max_ix);
            output_matrix->buffer[row] = row_max;
#else
            // start from the most negative finite float so any larger sample replaces it
            float row_max = -FLT_MAX;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                const float sample = row_data[col];
                row_max = (sample > row_max) ? sample : row_max;
            }

            output_matrix->buffer[row] = row_max;
#endif
        }

        return EIDSP_OK;
    }

    /**
     * Get the stdev value in a matrix per row
     * @param input_matrix Input matrix (MxN)
     * @param output_matrix Output matrix (Mx1)
     */
    static int stdev(matrix_t *input_matrix, matrix_t *output_matrix) {
        // The result is a column vector: one standard deviation for every input row.
        if (input_matrix->rows != output_matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < input_matrix->rows; row++) {
            // first element of the current row in the row-major buffer
            float *row_data = input_matrix->buffer + (row * input_matrix->cols);
#if EIDSP_USE_CMSIS_DSP
            // standard deviation = sqrt(variance)
            float row_var;
            float row_std;
            cmsis_arm_variance(row_data, input_matrix->cols, &row_var);
            arm_sqrt_f32(row_var, &row_std);
            output_matrix->buffer[row] = row_std;
#else
            // pass 1: mean of the row
            float total = 0.0f;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                total += row_data[col];
            }
            const float row_mean = total / input_matrix->cols;

            // pass 2: sum of squared deviations from that mean
            float squared_dev = 0.0f;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                const float delta = row_data[col] - row_mean;
                squared_dev += delta * delta;
            }

            // population standard deviation (divides by the number of columns)
            output_matrix->buffer[row] = sqrt(squared_dev / input_matrix->cols);
#endif
        }

        return EIDSP_OK;
    }

    /**
     * Get the skewness value in a matrix per row
     * @param input_matrix Input matrix (MxN)
     * @param output_matrix Output matrix (Mx1)
     */
    static int skew(matrix_t *input_matrix, matrix_t *output_matrix) {
        // The result is a column vector: one skewness value for every input row.
        if (input_matrix->rows != output_matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < input_matrix->rows; row++) {
            // first element of the current row in the row-major buffer
            float *row_data = input_matrix->buffer + (row * input_matrix->cols);
#if EIDSP_USE_CMSIS_DSP
            // mean and variance of the row
            float row_mean;
            float row_var;
            arm_mean_f32(row_data, input_matrix->cols, &row_mean);
            cmsis_arm_variance(row_data, input_matrix->cols, &row_var);

            // third central moment
            float m_3;
            cmsis_arm_third_moment(row_data, input_matrix->cols, row_mean, &m_3);

            // denominator: (variance)^(3/2) = sqrt(variance^3)
            float denominator;
            arm_sqrt_f32(row_var * row_var * row_var, &denominator);

            // skew = m_3 / (variance)^(3/2); a zero denominator yields 0
            output_matrix->buffer[row] = (denominator == 0.0f) ? 0.0f : m_3 / denominator;
#else
            // pass 1: mean of the row
            float total = 0.0f;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                total += row_data[col];
            }
            const float row_mean = total / input_matrix->cols;

            // pass 2: accumulate the second and third central moments
            float m_3 = 0.0f;
            float m_2 = 0.0f;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                const float delta = row_data[col] - row_mean;
                m_3 += delta * delta * delta;
                m_2 += delta * delta;
            }
            m_3 = m_3 / input_matrix->cols;
            m_2 = m_2 / input_matrix->cols;

            // denominator: (m_2)^(3/2) = sqrt(m_2^3)
            const float denominator = sqrt(m_2 * m_2 * m_2);

            // skew = m_3 / (m_2)^(3/2); a zero denominator yields 0
            if (denominator == 0.0f) {
                output_matrix->buffer[row] = 0.0f;
            } else {
                output_matrix->buffer[row] = m_3 / denominator;
            }
#endif
        }

        return EIDSP_OK;
    }

    /**
     * Get the kurtosis value in a matrix per row
     * @param input_matrix Input matrix (MxN)
     * @param output_matrix Output matrix (Mx1)
     */
    static int kurtosis(matrix_t *input_matrix, matrix_t *output_matrix) {
        // The result is a column vector: one kurtosis value for every input row.
        if (input_matrix->rows != output_matrix->rows || output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        for (size_t row = 0; row < input_matrix->rows; row++) {
            // first element of the current row in the row-major buffer
            float *row_data = input_matrix->buffer + (row * input_matrix->cols);
#if EIDSP_USE_CMSIS_DSP
            // mean and variance of the row
            float row_mean;
            float row_var;
            arm_mean_f32(row_data, input_matrix->cols, &row_mean);
            cmsis_arm_variance(row_data, input_matrix->cols, &row_var);

            // fourth central moment
            float m_4;
            cmsis_arm_fourth_moment(row_data, input_matrix->cols, row_mean, &m_4);

            // Fisher kurtosis = (m_4 / variance^2) - 3; zero variance yields -3
            const float var_squared = row_var * row_var;
            output_matrix->buffer[row] = (var_squared == 0.0f) ? -3.0f : (m_4 / var_squared) - 3.0f;
#else
            // pass 1: mean of the row
            float total = 0.0f;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                total += row_data[col];
            }
            const float row_mean = total / input_matrix->cols;

            // pass 2: accumulate the variance and the fourth central moment
            float m_4 = 0.0f;
            float m_2 = 0.0f;
            for (size_t col = 0; col < input_matrix->cols; col++) {
                const float delta = row_data[col] - row_mean;
                const float delta_squared = delta * delta;
                m_2 += delta_squared;
                m_4 += delta_squared * delta_squared;
            }
            m_4 = m_4 / input_matrix->cols;
            m_2 = m_2 / input_matrix->cols;

            // Fisher kurtosis = (m_4 / variance^2) - 3; zero variance yields -3
            const float var_squared = m_2 * m_2;
            if (var_squared == 0.0f) {
                output_matrix->buffer[row] = -3.0f;
            } else {
                output_matrix->buffer[row] = (m_4 / var_squared) - 3.0f;
            }
#endif
        }

        return EIDSP_OK;
    }


    /**
     * Compute the one-dimensional discrete Fourier Transform for real input.
     * This function computes the one-dimensional n-point discrete Fourier Transform (DFT) of
     * a real-valued array by means of an efficient algorithm called the Fast Fourier Transform (FFT).
     * @param src Source buffer
     * @param src_size Size of the source buffer
     * @param output Output buffer
     * @param output_size Size of the output buffer, should be n_fft / 2 + 1
     * @returns 0 if OK
     */
    static int rfft(const float *src, size_t src_size, float *output, size_t output_size, size_t n_fft) {
        // `n_fft` is the transform length; the output holds one value per
        // non-redundant bin, i.e. (n_fft / 2) + 1 entries.
        size_t n_fft_out_features = (n_fft / 2) + 1;
        if (output_size != n_fft_out_features) {
            EIDSP_ERR(EIDSP_BUFFER_SIZE_MISMATCH);
        }

        // truncate if needed
        if (src_size > n_fft) {
            src_size = n_fft;
        }

        // declare input and output arrays
        EI_DSP_MATRIX(fft_input, 1, n_fft);
        if (!fft_input.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        // copy from src to fft_input
        memcpy(fft_input.buffer, src, src_size * sizeof(float));
        // pad to the right with zeros. The buffer holds floats, so the fill is sized
        // with sizeof(float) (as in the fft_complex_t overload), not kiss_fft_scalar.
        memset(fft_input.buffer + src_size, 0, (n_fft - src_size) * sizeof(float));

#if EIDSP_USE_CMSIS_DSP
        if (n_fft != 32 && n_fft != 64 && n_fft != 128 && n_fft != 256 &&
            n_fft != 512 && n_fft != 1024 && n_fft != 2048 && n_fft != 4096) {
            // lengths without an accelerated implementation fall back to software
            int ret = software_rfft(fft_input.buffer, output, n_fft, n_fft_out_features);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
        }
        else {
            // hardware acceleration only works for the powers above...
            arm_rfft_fast_instance_f32 rfft_instance;
            int status = cmsis_rfft_init_f32(&rfft_instance, n_fft);
            if (status != ARM_MATH_SUCCESS) {
                return status;
            }

            EI_DSP_MATRIX(fft_output, 1, n_fft);
            if (!fft_output.buffer) {
                EIDSP_ERR(EIDSP_OUT_OF_MEM);
            }

            arm_rfft_fast_f32(&rfft_instance, fft_input.buffer, fft_output.buffer, 0);

            // The first two packed values are the real parts of bin 0 and of the last bin.
            // NOTE(review): these are stored signed, while the software path stores
            // sqrt(r^2 + i^2) (never negative) for every bin - confirm this is intended.
            output[0] = fft_output.buffer[0];
            output[n_fft_out_features - 1] = fft_output.buffer[1];

            // The remaining values are (real, imag) pairs; rms over the pair times sqrt(2)
            // equals sqrt(real^2 + imag^2).
            size_t fft_output_buffer_ix = 2;
            for (size_t ix = 1; ix < n_fft_out_features - 1; ix += 1) {
                float rms_result;
                arm_rms_f32(fft_output.buffer + fft_output_buffer_ix, 2, &rms_result);
                output[ix] = rms_result * sqrt(2);

                fft_output_buffer_ix += 2;
            }
        }
#else
        int ret = software_rfft(fft_input.buffer, output, n_fft, n_fft_out_features);
        if (ret != EIDSP_OK) {
            EIDSP_ERR(ret);
        }
#endif

        return EIDSP_OK;
    }


    /**
     * Compute the one-dimensional discrete Fourier Transform for real input.
     * This function computes the one-dimensional n-point discrete Fourier Transform (DFT) of
     * a real-valued array by means of an efficient algorithm called the Fast Fourier Transform (FFT).
     * @param src Source buffer
     * @param src_size Size of the source buffer
     * @param output Output buffer
     * @param output_size Size of the output buffer, should be n_fft / 2 + 1
     * @returns 0 if OK
     */
    static int rfft(const float *src, size_t src_size, fft_complex_t *output, size_t output_size, size_t n_fft) {
        // one complex value per non-redundant bin of an n_fft-point real transform
        const size_t n_fft_out_features = (n_fft / 2) + 1;
        if (output_size != n_fft_out_features) {
            EIDSP_ERR(EIDSP_BUFFER_SIZE_MISMATCH);
        }

        // never feed more than n_fft samples into the transform
        if (src_size > n_fft) {
            src_size = n_fft;
        }

        // When the source already holds exactly n_fft samples it is used in place
        // (const is cast away); otherwise the scratch buffer is filled below.
        float *borrowed_src = (src_size == n_fft) ? (float*)src : NULL;

        EI_DSP_MATRIX_B(fft_input, 1, n_fft, borrowed_src);
        if (!fft_input.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        if (borrowed_src == NULL) {
            // copy the samples and zero-pad on the right up to n_fft
            memcpy(fft_input.buffer, src, src_size * sizeof(float));
            memset(fft_input.buffer + src_size, 0, (n_fft - src_size) * sizeof(float));
        }

#if EIDSP_USE_CMSIS_DSP
        // hardware acceleration only works for these lengths
        const bool hw_length_supported =
            n_fft == 32 || n_fft == 64 || n_fft == 128 || n_fft == 256 ||
            n_fft == 512 || n_fft == 1024 || n_fft == 2048 || n_fft == 4096;

        if (hw_length_supported) {
            arm_rfft_fast_instance_f32 rfft_instance;
            int status = cmsis_rfft_init_f32(&rfft_instance, n_fft);
            if (status != ARM_MATH_SUCCESS) {
                return status;
            }

            EI_DSP_MATRIX(fft_output, 1, n_fft);
            if (!fft_output.buffer) {
                EIDSP_ERR(EIDSP_OUT_OF_MEM);
            }

            // NOTE(review): CMSIS documents that arm_rfft_fast_f32 may modify its input
            // buffer; when `src` is used in place that would write through the caller's
            // const pointer - confirm callers tolerate this.
            arm_rfft_fast_f32(&rfft_instance, fft_input.buffer, fft_output.buffer, 0);

            // The first two packed values are the real parts of bin 0 and of the last bin;
            // both bins get a zero imaginary part.
            const size_t last_bin = n_fft_out_features - 1;
            output[0].r = fft_output.buffer[0];
            output[0].i = 0.0f;
            output[last_bin].r = fft_output.buffer[1];
            output[last_bin].i = 0.0f;

            // every other bin is stored as a (real, imag) pair at offset 2 * bin
            for (size_t bin = 1; bin < last_bin; bin++) {
                output[bin].r = fft_output.buffer[2 * bin];
                output[bin].i = fft_output.buffer[(2 * bin) + 1];
            }
        }
        else {
            int ret = software_rfft(fft_input.buffer, output, n_fft, n_fft_out_features);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }
        }
#else
        int ret = software_rfft(fft_input.buffer, output, n_fft, n_fft_out_features);
        if (ret != EIDSP_OK) {
            EIDSP_ERR(ret);
        }
#endif

        return EIDSP_OK;
    }


    /**
     * Return evenly spaced numbers over a specified interval.
     * Returns num evenly spaced samples, calculated over the interval [start, stop].
     * The endpoint `stop` is always included as the last sample.
     *
     * Based on https://github.com/ntessore/algo/blob/master/linspace.c
     * Licensed in public domain (see LICENSE in repository above)
     *
     * @param start The starting value of the sequence.
     * @param stop The end value of the sequence.
     * @param number Number of samples to generate.
     * @param out Out array, with size `number`
     * @returns 0 if OK
     */
    static int linspace(float start, float stop, uint32_t number, float *out)
    {
        // at least one sample and a destination buffer are required
        if (!out || number == 0) {
            EIDSP_ERR(EIDSP_PARAMETER_INVALID);
        }

        // index of the final sample; also the number of intervals between samples
        const uint32_t last = number - 1;

        // a single sample is just the start value
        if (last == 0) {
            out[0] = start;
            return EIDSP_OK;
        }

        // distance between two consecutive samples
        const float step = (stop - start) / last;

        // every sample except the final one is computed from the step
        for (uint32_t ix = 0; ix < last; ix++) {
            out[ix] = start + ix * step;
        }

        // the final sample is set to `stop` directly
        out[last] = stop;

        return EIDSP_OK;
    }

    /**
     * Return evenly spaced q31 numbers over a specified interval.
     * Returns num evenly spaced samples, calculated over the interval [start, stop].
     * The endpoint `stop` is always included as the last sample.
     *
     * Based on https://github.com/ntessore/algo/blob/master/linspace.c
     * Licensed in public domain (see LICENSE in repository above)
     *
     * @param start The starting value of the sequence.
     * @param stop The end value of the sequence.
     * @param number Number of samples to generate.
     * @param out Out array, with size `number`
     * @returns 0 if OK
     */
    static int linspace(EIDSP_i32 start, EIDSP_i32 stop, uint32_t number, EIDSP_i32 *out)
    {
        // at least one sample and a destination buffer are required
        if (number < 1 || !out) {
            EIDSP_ERR(EIDSP_PARAMETER_INVALID);
        }

        // a single sample is just the start value
        if (number == 1) {
            out[0] = start;
            return EIDSP_OK;
        }

        // Step size, computed in signed 64-bit arithmetic. Dividing the signed
        // difference by the unsigned `number - 1` would convert the difference to
        // unsigned first, giving a wrong step whenever stop < start; the wider type
        // also keeps `stop - start` from overflowing.
        const int64_t intervals = static_cast<int64_t>(number) - 1;
        const int64_t step = (static_cast<int64_t>(stop) - static_cast<int64_t>(start)) / intervals;

        // do steps; every value lies between start and stop, so it fits the output type
        for (uint32_t ix = 0; ix < number - 1; ix++) {
            out[ix] = static_cast<EIDSP_i32>(static_cast<int64_t>(start) + static_cast<int64_t>(ix) * step);
        }

        // last entry always stop
        out[number - 1] = stop;

        return EIDSP_OK;
    }

    /**
     * Convert an int32_t buffer into a float buffer, maps to -1..1
     * @param input
     * @param output
     * @param length
     * @returns 0 if OK
     */
    static int int32_to_float(const EIDSP_i32 *input, float *output, size_t length) {
#if EIDSP_USE_CMSIS_DSP
        arm_q31_to_float((q31_t *)input, output, length);
#else
        for (size_t ix = 0; ix < length; ix++) {
            output[ix] = (float)(input[ix]) / 2147483648.f;
        }
#endif
        return EIDSP_OK;
    }

    /**
     * Convert a float buffer into a fixed-point 32-bit buffer. Input values are
     * limited to the range -1..1
     * @param input
     * @param output
     * @param length
     * @returns 0 if OK
     */
    static int float_to_int32(const float *input, EIDSP_i32 *output, size_t length) {
#if EIDSP_USE_CMSIS_DSP
        arm_float_to_q31((float *)input, (q31_t *)output, length);
#else
        // scale by 2^31, drop the fraction, then clamp into the signed 32-bit range
        for (size_t ix = 0; ix < length; ix++) {
            const int64_t scaled = (int64_t)(input[ix] * 2147483648.f);
            output[ix] = (EIDSP_i32)saturate(scaled, 32);
        }
#endif
        return EIDSP_OK;
    }

    /**
     * Convert an int16_t buffer into a float buffer, maps to -1..1
     * @param input
     * @param output
     * @param length
     * @returns 0 if OK
     */
    static int int16_to_float(const EIDSP_i16 *input, float *output, size_t length) {
#if EIDSP_USE_CMSIS_DSP
        arm_q15_to_float((q15_t *)input, output, length);
#else
        for (size_t ix = 0; ix < length; ix++) {
            output[ix] = (float)(input[ix]) / 32768.f;
        }
#endif
        return EIDSP_OK;
    }

    /**
     * Convert a float buffer into a fixed-point 16-bit buffer. Input values are
     * limited to the range -1..1
     * @param input
     * @param output
     * @param length
     * @returns 0 if OK
     */
    static int float_to_int16(const float *input, EIDSP_i16 *output, size_t length) {
#if EIDSP_USE_CMSIS_DSP
        arm_float_to_q15((float *)input, output, length);
#else
        // scale by 2^15, drop the fraction, then clamp into the signed 16-bit range
        for (size_t ix = 0; ix < length; ix++) {
            const int32_t scaled = (int32_t)(input[ix] * 32768.f);
            output[ix] = (EIDSP_i16)saturate(scaled, 16);
        }
#endif
        return EIDSP_OK;
    }

    /**
     * Convert an int8_t buffer into a float buffer, maps to -1..1
     * @param input
     * @param output
     * @param length
     * @returns 0 if OK
     */
    static int int8_to_float(const EIDSP_i8 *input, float *output, size_t length) {
#if EIDSP_USE_CMSIS_DSP
        arm_q7_to_float((q7_t *)input, output, length);
#else
        for (size_t ix = 0; ix < length; ix++) {
            output[ix] = (float)(input[ix]) / 128;
        }
#endif
        return EIDSP_OK;
    }

#if EIDSP_SIGNAL_C_FN_POINTER == 0
    /**
     * Create a signal structure from a buffer.
     * This is useful for data that you keep in memory anyway. If you need to load from
     * flash, then create the structure yourself.
     * @param data Buffer, make sure to keep this pointer alive
     * @param data_size Size of the buffer
     * @param signal Output signal
     * @returns EIDSP_OK if ok
     */
    static int signal_from_buffer(const float *data, size_t data_size, signal_t *signal)
    {
        // The accessor only stores the `data` pointer (no copy is made), so the
        // buffer has to stay valid for as long as the signal is used.
#ifdef __MBED__
        signal->get_data = mbed::callback(&numpy::signal_get_data, data);
#else
        signal->get_data = [data](size_t start, size_t count, float *dest) {
            return numpy::signal_get_data(data, start, count, dest);
        };
#endif
        signal->total_length = data_size;

        return EIDSP_OK;
    }

#endif

#if defined ( __GNUC__ )
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif
    /**
     * > 50% faster then the math.h log() function
     * in return for a small loss in accuracy (0.00001 average diff with log())
     * From: https://stackoverflow.com/questions/39821367/very-fast-approximate-logarithm-natural-log-function-in-c/39822314#39822314
     * Licensed under the CC BY-SA 3.0
     * @param a Input number
     * @returns Natural log value of a
     */
    __attribute__((always_inline)) static inline float log(float a)
    {
        // Read the bit pattern of `a` as a 32-bit integer. memcpy is used instead of
        // a pointer cast because accessing a float through an int32_t pointer breaks
        // the strict-aliasing rule (undefined behaviour).
        int32_t g;
        memcpy(&g, &a, sizeof(g));

        // Split off a multiple of 2^23 (the exponent field) so that the remaining
        // bits encode a value m close to 1.
        int32_t e = (g - 0x3f2aaaab) & 0xff800000;
        g = g - e;
        float m;
        memcpy(&m, &g, sizeof(m));
        float i = (float)e * 1.19209290e-7f; // 0x1.0p-23
        /* m in [2/3, 4/3] */
        float f = m - 1.0f;
        float s = f * f;
        /* Compute log1p(f) for f in [-1/3, 1/3] */
        float r = fmaf(0.230836749f, f, -0.279208571f); // 0x1.d8c0f0p-3, -0x1.1de8dap-2
        float t = fmaf(0.331826031f, f, -0.498910338f); // 0x1.53ca34p-2, -0x1.fee25ap-2
        r = fmaf(r, s, t);
        r = fmaf(r, s, f);
        // add the exponent contribution: i * log(2)
        r = fmaf(i, 0.693147182f, r); // 0x1.62e430p-1 // log(2)

        // NOTE: there is no special handling for zero, negative, infinite or NaN input.
        return r;
    }

    /**
     * Fast log10 and log2 functions, significantly faster than the ones from math.h (~6x for log10 on M4F)
     * From https://community.arm.com/developer/tools-software/tools/f/armds-forum/4292/cmsis-dsp-new-functionality-proposal/22621#22621
     * @param a Input number
     * @returns Log2 value of a
     */
    __attribute__((always_inline)) static inline float log2(float a)
    {
        // |a| = mantissa * 2^exponent as returned by frexpf; the sign of `a` is dropped
        int exponent;
        const float mantissa = frexpf(fabsf(a), &exponent);

        // cubic polynomial in the mantissa (Horner form), approximating its log2
        float approx = 1.23149591368684f;
        approx = (approx * mantissa) + -4.11852516267426f;
        approx = (approx * mantissa) + 6.02197014179219f;
        approx = (approx * mantissa) + -3.13396450166353f;

        // log2(|a|) = log2(mantissa) + exponent
        approx += exponent;
        return approx;
    }

    /**
     * Fast log10 and log2 functions, significantly faster than the ones from math.h (~6x for log10 on M4F)
     * From https://community.arm.com/developer/tools-software/tools/f/armds-forum/4292/cmsis-dsp-new-functionality-proposal/22621#22621
     * @param a Input number
     * @returns Log10 value of a
     */
    __attribute__((always_inline)) static inline float log10(float a)
    {
        // log10(a) = log2(a) * log10(2)
        const float log10_of_2 = 0.3010299956639812f;
        const float log2_of_a = numpy::log2(a);
        return log2_of_a * log10_of_2;
    }
#if defined ( __GNUC__ )
#pragma GCC diagnostic pop
#endif

    /**
     * Calculate the natural log value of a matrix. Does an in-place replacement.
     * @param matrix Matrix (MxN)
     * @returns 0 if OK
     */
    static int log(matrix_t *matrix)
    {
        // every element is replaced by its (approximate) natural log
        const auto n_values = matrix->rows * matrix->cols;
        for (uint32_t ix = 0; ix < n_values; ix++) {
            const float value = matrix->buffer[ix];
            matrix->buffer[ix] = numpy::log(value);
        }

        return EIDSP_OK;
    }

    /**
     * Calculate the log10 of a matrix. Does an in-place replacement.
     * @param matrix Matrix (MxN)
     * @returns 0 if OK
     */
    static int log10(matrix_t *matrix)
    {
        // every element is replaced by its (approximate) base-10 log
        const auto n_values = matrix->rows * matrix->cols;
        for (uint32_t ix = 0; ix < n_values; ix++) {
            const float value = matrix->buffer[ix];
            matrix->buffer[ix] = numpy::log10(value);
        }

        return EIDSP_OK;
    }

    /**
     * @brief      Signed Saturate
     *
     * @param[in]  val   The value to be saturated
     * @param[in]  sat   Bit position to saturate to (1..32)
     *
     * @return     Saturated value
     */
    static int32_t saturate(int64_t val, uint32_t sat)
    {
        // bit positions outside 1..32 are not clamped: the value is only narrowed
        if ((sat < 1U) || (sat > 32U)) {
            return (int32_t)val;
        }

        // limits of a signed `sat`-bit integer: [-2^(sat-1), 2^(sat-1) - 1]
        const int64_t upper = (int64_t)((1U << (sat - 1U)) - 1U);
        const int64_t lower = -1 - upper;

        if (val > upper) {
            return (int32_t)upper;
        }
        if (val < lower) {
            return (int32_t)lower;
        }
        return (int32_t)val;
    }

    /**
     * Normalize a matrix to 0..1. Does an in-place replacement.
     * Normalization uses the minimum and maximum taken over the whole matrix
     * (all elements), not per row.
     * @param matrix
     */
    static int normalize(matrix_t *matrix) {
        // Python implementation:
        //  matrix = (matrix - np.min(matrix)) / (np.max(matrix) - np.min(matrix))
        int r;

        // view the same buffer as a single row so min/max span every element
        matrix_t temp_matrix(1, matrix->rows * matrix->cols, matrix->buffer);

        // smallest element
        matrix_t min_matrix(1, 1);
        if (!min_matrix.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        r = min(&temp_matrix, &min_matrix);
        if (r != EIDSP_OK) {
            EIDSP_ERR(r);
        }

        // largest element
        matrix_t max_matrix(1, 1);
        if (!max_matrix.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        r = max(&temp_matrix, &max_matrix);
        if (r != EIDSP_OK) {
            EIDSP_ERR(r);
        }

        const float lowest = min_matrix.buffer[0];
        const float min_max_diff = (max_matrix.buffer[0] - lowest);

        // a (near) zero range would divide by zero, so leave the scale at 1 in that case
        float row_scale;
        if (min_max_diff < 0.001) {
            row_scale = 1.0f;
        }
        else {
            row_scale = 1.0f / min_max_diff;
        }

        // shift so the minimum becomes 0 ...
        r = subtract(&temp_matrix, lowest);
        if (r != EIDSP_OK) {
            EIDSP_ERR(r);
        }

        // ... then scale by the reciprocal of the range
        r = scale(&temp_matrix, row_scale);
        if (r != EIDSP_OK) {
            EIDSP_ERR(r);
        }

        return EIDSP_OK;
    }

    /**
     * Clip (limit) the values in an array. Does an in-place replacement.
     * Values outside the interval are clipped to the interval edges.
     * For example, if an interval of [0, 1] is specified, values smaller than 0 become 0,
     * and values larger than 1 become 1.
     * @param matrix
     * @param min Min value to be clipped
     * @param max Max value to be clipped
     */
    static int clip(matrix_t *matrix, float min, float max) {
        // an inverted interval cannot be clipped to
        if (max < min) {
            EIDSP_ERR(EIDSP_PARAMETER_INVALID);
        }

        const size_t n_values = matrix->rows * matrix->cols;
        for (size_t ix = 0; ix < n_values; ix++) {
            float &value = matrix->buffer[ix];
            if (value < min) {
                value = min;
            }
            else if (value > max) {
                value = max;
            }
        }

        return EIDSP_OK;
    }

    /**
     * Round every value of a matrix to the nearest integer. Does an in-place replacement.
     * E.g. round([ 3.01, 4.89 ]) becomes [3, 5]
     * @param matrix
     */
    static int round(matrix_t *matrix) {
        // every element is replaced by the result of the C library round()
        const size_t n_values = matrix->rows * matrix->cols;
        for (size_t ix = 0; ix < n_values; ix++) {
            const float value = matrix->buffer[ix];
            matrix->buffer[ix] = ::round(value);
        }

        return EIDSP_OK;
    }

    /**
     * Software (kissfft) real FFT that writes the magnitude of every output bin.
     * @param fft_input Input samples, passed to kiss_fftr with a context of size n_fft
     * @param output Receives n_fft_out_features magnitudes
     * @param n_fft Transform length used to create the kissfft context
     * @param n_fft_out_features Number of output bins to allocate and write
     * @returns EIDSP_OK if OK, EIDSP_OUT_OF_MEM if an allocation fails
     */
    static int software_rfft(float *fft_input, float *output, size_t n_fft, size_t n_fft_out_features) {
        // temporary complex spectrum
        kiss_fft_cpx *fft_output = (kiss_fft_cpx*)ei_dsp_malloc(n_fft_out_features * sizeof(kiss_fft_cpx));
        if (!fft_output) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        size_t kiss_fftr_mem_length;

        // create fftr context
        kiss_fftr_cfg cfg = kiss_fftr_alloc(n_fft, 0, NULL, NULL, &kiss_fftr_mem_length);
        if (!cfg) {
            // release the spectrum buffer before bailing out
            ei_dsp_free(fft_output, n_fft_out_features * sizeof(kiss_fft_cpx));
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        ei_dsp_register_alloc(kiss_fftr_mem_length, cfg);

        // execute the rfft operation
        kiss_fftr(cfg, fft_input, fft_output);

        // and write back to the output: magnitude = sqrt(re^2 + im^2).
        // Plain multiplication replaces pow(x, 2); the sum is still formed in
        // double precision, as it was with pow().
        for (size_t ix = 0; ix < n_fft_out_features; ix++) {
            const double re = fft_output[ix].r;
            const double im = fft_output[ix].i;
            output[ix] = sqrt((re * re) + (im * im));
        }

        ei_dsp_free(cfg, kiss_fftr_mem_length);
        ei_dsp_free(fft_output, n_fft_out_features * sizeof(kiss_fft_cpx));

        return EIDSP_OK;
    }

    static int software_rfft(float *fft_input, fft_complex_t *output, size_t n_fft, size_t n_fft_out_features)
    {
        // create fftr context
        size_t kiss_fftr_mem_length;

        kiss_fftr_cfg cfg = kiss_fftr_alloc(n_fft, 0, NULL, NULL, &kiss_fftr_mem_length);
        if (!cfg) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        ei_dsp_register_alloc(kiss_fftr_mem_length, cfg);

        // execute the rfft operation
        kiss_fftr(cfg, fft_input, (kiss_fft_cpx*)output);

        ei_dsp_free(cfg, kiss_fftr_mem_length);

        return EIDSP_OK;
    }

    static int signal_get_data(const float *in_buffer, size_t offset, size_t length, float *out_ptr)
    {
        // copy `length` floats starting at element `offset`
        const float *first = in_buffer + offset;
        const size_t n_bytes = length * sizeof(float);
        memcpy(out_ptr, first, n_bytes);
        return 0;
    }

    static int signal_get_data_i16(int16_t *in_buffer, size_t offset, size_t length, int16_t *out_ptr)
    {
        // copy `length` 16-bit samples starting at element `offset`
        const int16_t *first = in_buffer + offset;
        const size_t n_bytes = length * sizeof(int16_t);
        memcpy(out_ptr, first, n_bytes);
        return 0;
    }

#if EIDSP_USE_CMSIS_DSP
    /**
     * @brief      The CMSIS std variance function with the same behaviour as the NumPy
     * implementation
     * @details    Variance in CMSIS version is calculated using fSum / (float32_t)(blockSize - 1)
     * @param[in]  pSrc       Pointer to float block
     * @param[in]  blockSize  Number of floats in block
     * @param      pResult    The variance
     */
    static void cmsis_arm_variance(const float32_t *pSrc, uint32_t blockSize, float32_t *pResult)
    {
        uint32_t blkCnt;
        float32_t sum = 0.0f;
        float32_t fSum = 0.0f;
        float32_t fMean, fValue;
        const float32_t *pInput = pSrc;

        if (blockSize <= 1U) {
            *pResult = 0;
            return;
        }
        blkCnt = blockSize >> 2U;

        while (blkCnt > 0U) {
            sum += *pInput++;
            sum += *pInput++;
            sum += *pInput++;
            sum += *pInput++;
            blkCnt--;
        }

        /* Loop unrolling: Compute remaining outputs */
        blkCnt = blockSize % 0x4U;

        while (blkCnt > 0U) {
            sum += *pInput++;
            blkCnt--;
        }

        fMean = sum / (float32_t)blockSize;

        pInput = pSrc;

        /* Loop unrolling: Compute 4 outputs at a time */
        blkCnt = blockSize >> 2U;

        while (blkCnt > 0U) {
            fValue = *pInput++ - fMean;
            fSum += fValue * fValue;
            fValue = *pInput++ - fMean;
            fSum += fValue * fValue;
            fValue = *pInput++ - fMean;
            fSum += fValue * fValue;
            fValue = *pInput++ - fMean;
            fSum += fValue * fValue;
            blkCnt--;
        }

        /* Loop unrolling: Compute remaining outputs */
        blkCnt = blockSize % 0x4U;

        while (blkCnt > 0U) {
            fValue = *pInput++ - fMean;
            fSum += fValue * fValue;
            blkCnt--;
        }

        /* Variance */
        *pResult = fSum / (float32_t)(blockSize);
    }

    /**
     * @brief      Standard deviation along axis 0 (one value per input column),
     *             explicitly using the CMSIS lib for the matrix transpose and sqrt
     * @details    The input is transposed into a temporary buffer so that every
     *             column becomes a contiguous row; cmsis_arm_variance is then run
     *             on each row and its square root is stored.
     * @param      input_matrix   The input matrix
     * @param      output_matrix  The output matrix, must be (input cols) x 1
     *
     * @return     EIDSP_OK if OK, EIDSP_MATRIX_SIZE_MISMATCH on a shape error,
     *             EIDSP_OUT_OF_MEM if the temporary buffer could not be allocated,
     *             or the status returned by arm_mat_trans_f32 if the transpose fails
     */
    static int std_axis0_CMSIS(matrix_t *input_matrix, matrix_t *output_matrix)
    {
        arm_matrix_instance_f32 arm_in_matrix, arm_transposed_matrix;

        if (input_matrix->cols != output_matrix->rows) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        if (output_matrix->cols != 1) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        /* Wrap the input matrix in an arm matrix (no copy) */
        arm_in_matrix.numRows = input_matrix->rows;
        arm_in_matrix.numCols = input_matrix->cols;
        arm_in_matrix.pData = &input_matrix->buffer[0];
        /* Create transposed matrix */
        arm_transposed_matrix.numRows = input_matrix->cols;
        arm_transposed_matrix.numCols = input_matrix->rows;
        arm_transposed_matrix.pData = (float *)ei_calloc(input_matrix->cols * input_matrix->rows * sizeof(float), 1);

        if (arm_transposed_matrix.pData == NULL) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        int ret = arm_mat_trans_f32(&arm_in_matrix, &arm_transposed_matrix);
        if (ret != EIDSP_OK) {
            /* Release the temporary buffer before bailing out, otherwise it leaks */
            ei_free(arm_transposed_matrix.pData);
            EIDSP_ERR(ret);
        }

        /* Every row of the transposed matrix is one column of the input */
        for (size_t row = 0; row < arm_transposed_matrix.numRows; row++) {
            float std;
            float var;

            cmsis_arm_variance(arm_transposed_matrix.pData + (row * arm_transposed_matrix.numCols),
                               arm_transposed_matrix.numCols, &var);
            arm_sqrt_f32(var, &std);

            output_matrix->buffer[row] = std;
        }

        ei_free(arm_transposed_matrix.pData);

        return EIDSP_OK;
    }

    /**
     * @brief      A copy of the CMSIS power function, adapted to calculate the third central moment
     * @details    Calculates the sum of cubes of a block with the mean value subtracted.
     * @param[in]  pSrc       Pointer to float block
     * @param[in]  blockSize  Number of floats in block
     * @param[in]  mean       The mean to subtract from each value before cubing
     * @param      pResult    The third central moment of the input
     */
    static void cmsis_arm_third_moment(const float32_t * pSrc, uint32_t blockSize, float32_t mean, float32_t * pResult)
    {
        uint32_t blkCnt;
        float32_t sum = 0.0f;
        float32_t in;

        /* Loop unrolling: Compute 4 outputs at a time */
        blkCnt = blockSize >> 2U;

        while (blkCnt > 0U) {

            /* Compute Power and store result in a temporary variable, sum. */
            in = *pSrc++;
            in = in - mean;
            sum += in * in * in;

            in = *pSrc++;
            in = in - mean;
            sum += in * in * in;

            in = *pSrc++;
            in = in - mean;
            sum += in * in * in;

            in = *pSrc++;
            in = in - mean;
            sum += in * in * in;

            /* Decrement loop counter */
            blkCnt--;
        }

        /* Loop unrolling: Compute remaining outputs */
        blkCnt = blockSize % 0x4U;

        while (blkCnt > 0U) {
            /* Compute Power and store result in a temporary variable, sum. */
            in = *pSrc++;
            in = in - mean;
            sum += in * in * in;

            /* Decrement loop counter */
            blkCnt--;
        }

        sum = sum / blockSize;
        /* Store result to destination */
        *pResult = sum;
    }

    /**
     * @brief      A copy of the CMSIS power function, adapted to calculate the fourth central moment
     * @details    Calculates the sum of fourth powers of a block with the mean value subtracted.
     * @param[in]  pSrc       Pointer to float block
     * @param[in]  blockSize  Number of floats in block
     * @param[in]  mean       The mean to subtract from each value before calculating fourth power
     * @param      pResult    The fourth central moment of the input
     */
    static void cmsis_arm_fourth_moment(const float32_t * pSrc, uint32_t blockSize, float32_t mean, float32_t * pResult)
    {
        uint32_t blkCnt;
        float32_t sum = 0.0f;
        float32_t in;

        /* Loop unrolling: Compute 4 outputs at a time */
        blkCnt = blockSize >> 2U;

        while (blkCnt > 0U) {

            /* Compute Power and store result in a temporary variable, sum. */
            in = *pSrc++;
            in = in - mean;
            float square;
            square = in * in;
            sum += square * square;

            in = *pSrc++;
            in = in - mean;
            square = in * in;
            sum += square * square;

            in = *pSrc++;
            in = in - mean;
            square = in * in;
            sum += square * square;

            in = *pSrc++;
            in = in - mean;
            square = in * in;
            sum += square * square;

            /* Decrement loop counter */
            blkCnt--;
        }

        /* Loop unrolling: Compute remaining outputs */
        blkCnt = blockSize % 0x4U;

        while (blkCnt > 0U) {
            /* Compute Power and store result in a temporary variable, sum. */
            in = *pSrc++;
            in = in - mean;
            float square;
            square = in * in;
            sum += square * square;

            /* Decrement loop counter */
            blkCnt--;
        }

        sum = sum / blockSize;
        /* Store result to destination */
        *pResult = sum;
    }
#endif // EIDSP_USE_CMSIS_DSP

    /**
     * Count the zero bits above the most significant set bit of a 32-bit value.
     * @param data Value to inspect
     * @returns Number of leading zero bits (32 when data is 0)
     */
    static uint8_t count_leading_zeros(uint32_t data)
    {
        if (data == 0U) {
            return 32U;
        }

        uint32_t n_zeros = 0U;
        // slide a single-bit probe down from bit 31 until it lands on a set bit
        for (uint32_t probe = 0x80000000U; (data & probe) == 0U; probe >>= 1U) {
            n_zeros++;
        }
        return n_zeros;
    }

    /**
     * Fixed-point square root of a 16-bit value (presumably Q15, per the name).
     * Strictly positive inputs are normalised by an even left shift, an inverse
     * square root is estimated from a float bit-trick seed refined by three
     * Newton-Raphson style iterations in integer arithmetic, multiplied back by the
     * normalised input and finally shifted down by half the normalisation shift.
     * Zero and negative inputs produce 0.
     * @param in Input value
     * @param pOut Receives the square root
     */
    static void sqrt_q15(int16_t in, int16_t *pOut)
    {
        int32_t bits_val1;
        int16_t number, temp1, var1, signBits1, half;
        float temp_float1;
        /* Used to reinterpret the bits of a float as an int32 and back */
        union {
            int32_t fracval;
            float floatval;
        } tempconv;

        number = in;

        /* If the input is a positive number then compute the signBits. */
        if (number > 0) {
            /* A positive 16-bit value widened to 32 bits always has 17 leading zeros
               (16 from widening + the sign bit); what is left is the headroom */
            signBits1 = count_leading_zeros(number) - 17;

            /* Shift by the number of signBits1, rounded down to an even count so
               that it can be halved exactly when undoing it on the result */
            if ((signBits1 % 2) == 0) {
                number = number << signBits1;
            } else {
                number = number << (signBits1 - 1);
            }

            /* Calculate half value of the number */
            half = number >> 1;
            /* Store the number for later use */
            temp1 = number;

            /* Convert to float (the factor is 2^-15) */
            temp_float1 = number * 3.051757812500000e-005f;
            /* Store as integer */
            tempconv.floatval = temp_float1;
            bits_val1 = tempconv.fracval;
            /* Subtract the shifted value from the magic number to give initial guess */
            bits_val1 = 0x5f3759df - (bits_val1 >> 1); /* gives initial guess */
            /* Store as float */
            tempconv.fracval = bits_val1;
            temp_float1 = tempconv.floatval;
            /* Convert to integer format (scaled by 2^14) */
            var1 = (int32_t)(temp_float1 * 16384);

            /* 1st iteration */
            var1 =
                ((int16_t)(
                    (int32_t)var1 *
                        (0x3000 -
                         ((int16_t)((((int16_t)(((int32_t)var1 * var1) >> 15)) * (int32_t)half) >> 15))) >>
                    15))
                << 2;
            /* 2nd iteration */
            var1 =
                ((int16_t)(
                    (int32_t)var1 *
                        (0x3000 -
                         ((int16_t)((((int16_t)(((int32_t)var1 * var1) >> 15)) * (int32_t)half) >> 15))) >>
                    15))
                << 2;
            /* 3rd iteration */
            var1 =
                ((int16_t)(
                    (int32_t)var1 *
                        (0x3000 -
                         ((int16_t)((((int16_t)(((int32_t)var1 * var1) >> 15)) * (int32_t)half) >> 15))) >>
                    15))
                << 2;

            /* Multiply the inverse square root with the original value */
            var1 = ((int16_t)(((int32_t)temp1 * var1) >> 15)) << 1;

            /* Shift the output down accordingly (half of the normalisation shift) */
            if ((signBits1 % 2) == 0) {
                var1 = var1 >> (signBits1 / 2);
            } else {
                var1 = var1 >> ((signBits1 - 1) / 2);
            }
            *pOut = var1;
        }
        /* If the number is zero or negative then store zero as its square root value */
        else {
            *pOut = 0;
        }
    }

#if EIDSP_USE_CMSIS_DSP
    /**
     * Initialize a CMSIS-DSP fast rfft structure
     * We do it this way as this means we can compile out fast_init calls which hints the compiler
     * to which tables can be removed
     *
     * When the model metadata provides FFT info (and neither ARM_MATH_MVEF nor
     * EI_CLASSIFIER_LOAD_ALL_FFTS is defined) the instance is filled in by hand, and only
     * for the FFT lengths whose EI_CLASSIFIER_LOAD_FFT_<n> flag is 1. In every other
     * configuration the call is forwarded to arm_rfft_fast_init_f32.
     *
     * @param rfft_instance Instance to initialize
     * @param n_fft Length of the real FFT
     * @returns ARM_MATH_SUCCESS for a supported length, EIDSP_FFT_TABLE_NOT_LOADED if the
     *          table for n_fft was not compiled in, or the result of arm_rfft_fast_init_f32
     */
    static int cmsis_rfft_init_f32(arm_rfft_fast_instance_f32 *rfft_instance, const size_t n_fft)
    {
// ARM cores (ex M55) with Helium extensions (MVEF) need special treatment (Issue 2843)
#if EI_CLASSIFIER_HAS_FFT_INFO == 1 && !defined(ARM_MATH_MVEF) && !defined(EI_CLASSIFIER_LOAD_ALL_FFTS)
        arm_status status;
        // every case pairs a complex FFT of half the length (n_fft / 2) with the
        // real-FFT twiddle table for n_fft
        switch (n_fft) {
#if EI_CLASSIFIER_LOAD_FFT_32 == 1
            case 32: {
                arm_cfft_instance_f32 *S = &(rfft_instance->Sint);
                S->fftLen = 16U;
                S->pTwiddle = NULL;
                S->bitRevLength = arm_cfft_sR_f32_len16.bitRevLength;
                S->pBitRevTable = arm_cfft_sR_f32_len16.pBitRevTable;
                S->pTwiddle = arm_cfft_sR_f32_len16.pTwiddle;
                rfft_instance->fftLenRFFT = 32U;
                rfft_instance->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_32;
                status = ARM_MATH_SUCCESS;
                break;
            }
#endif
#if EI_CLASSIFIER_LOAD_FFT_64 == 1
            case 64: {
                arm_cfft_instance_f32 *S = &(rfft_instance->Sint);
                S->fftLen = 32U;
                S->pTwiddle = NULL;
                S->bitRevLength = arm_cfft_sR_f32_len32.bitRevLength;
                S->pBitRevTable = arm_cfft_sR_f32_len32.pBitRevTable;
                S->pTwiddle = arm_cfft_sR_f32_len32.pTwiddle;
                rfft_instance->fftLenRFFT = 64U;
                rfft_instance->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_64;
                status = ARM_MATH_SUCCESS;
                break;
            }
#endif
#if EI_CLASSIFIER_LOAD_FFT_128 == 1
            case 128: {
                arm_cfft_instance_f32 *S = &(rfft_instance->Sint);
                S->fftLen = 64U;
                S->pTwiddle = NULL;
                S->bitRevLength = arm_cfft_sR_f32_len64.bitRevLength;
                S->pBitRevTable = arm_cfft_sR_f32_len64.pBitRevTable;
                S->pTwiddle = arm_cfft_sR_f32_len64.pTwiddle;
                rfft_instance->fftLenRFFT = 128U;
                rfft_instance->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_128;
                status = ARM_MATH_SUCCESS;
                break;
            }
#endif
#if EI_CLASSIFIER_LOAD_FFT_256 == 1
            case 256: {
                arm_cfft_instance_f32 *S = &(rfft_instance->Sint);
                S->fftLen = 128U;
                S->pTwiddle = NULL;
                S->bitRevLength = arm_cfft_sR_f32_len128.bitRevLength;
                S->pBitRevTable = arm_cfft_sR_f32_len128.pBitRevTable;
                S->pTwiddle = arm_cfft_sR_f32_len128.pTwiddle;
                rfft_instance->fftLenRFFT = 256U;
                rfft_instance->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_256;
                status = ARM_MATH_SUCCESS;
                break;
            }
#endif
#if EI_CLASSIFIER_LOAD_FFT_512 == 1
            case 512: {
                arm_cfft_instance_f32 *S = &(rfft_instance->Sint);
                S->fftLen = 256U;
                S->pTwiddle = NULL;
                S->bitRevLength = arm_cfft_sR_f32_len256.bitRevLength;
                S->pBitRevTable = arm_cfft_sR_f32_len256.pBitRevTable;
                S->pTwiddle = arm_cfft_sR_f32_len256.pTwiddle;
                rfft_instance->fftLenRFFT = 512U;
                rfft_instance->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_512;
                status = ARM_MATH_SUCCESS;
                break;
            }
#endif
#if EI_CLASSIFIER_LOAD_FFT_1024 == 1
            case 1024: {
                arm_cfft_instance_f32 *S = &(rfft_instance->Sint);
                S->fftLen = 512U;
                S->pTwiddle = NULL;
                S->bitRevLength = arm_cfft_sR_f32_len512.bitRevLength;
                S->pBitRevTable = arm_cfft_sR_f32_len512.pBitRevTable;
                S->pTwiddle = arm_cfft_sR_f32_len512.pTwiddle;
                rfft_instance->fftLenRFFT = 1024U;
                rfft_instance->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_1024;
                status = ARM_MATH_SUCCESS;
                break;
            }
#endif
#if EI_CLASSIFIER_LOAD_FFT_2048 == 1
            case 2048: {
                arm_cfft_instance_f32 *S = &(rfft_instance->Sint);
                S->fftLen = 1024U;
                S->pTwiddle = NULL;
                S->bitRevLength = arm_cfft_sR_f32_len1024.bitRevLength;
                S->pBitRevTable = arm_cfft_sR_f32_len1024.pBitRevTable;
                S->pTwiddle = arm_cfft_sR_f32_len1024.pTwiddle;
                rfft_instance->fftLenRFFT = 2048U;
                rfft_instance->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_2048;
                status = ARM_MATH_SUCCESS;
                break;
            }
#endif
#if EI_CLASSIFIER_LOAD_FFT_4096 == 1
            case 4096: {
                arm_cfft_instance_f32 *S = &(rfft_instance->Sint);
                S->fftLen = 2048U;
                S->pTwiddle = NULL;
                S->bitRevLength = arm_cfft_sR_f32_len2048.bitRevLength;
                S->pBitRevTable = arm_cfft_sR_f32_len2048.pBitRevTable;
                S->pTwiddle = arm_cfft_sR_f32_len2048.pTwiddle;
                rfft_instance->fftLenRFFT = 4096U;
                rfft_instance->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_4096;
                status = ARM_MATH_SUCCESS;
                break;
            }
#endif
            // length not compiled in (or not one of the sizes handled above)
            default:
                return EIDSP_FFT_TABLE_NOT_LOADED;
        }

        return status;
#else
        // no per-size table selection in this configuration: use the stock CMSIS init
        return arm_rfft_fast_init_f32(rfft_instance, n_fft);
#endif
    }
#endif // #if EIDSP_USE_CMSIS_DSP

    /**
     * Power spectrum of a frame
     * Runs numpy::rfft on the frame and replaces every output bin x by
     * (1 / fft_points) * x^2.
     * @param frame Row of a frame
     * @param frame_size Size of the frame
     * @param out_buffer Out buffer, size should be fft_points / 2 + 1
     * @param out_buffer_size Buffer size, must equal fft_points / 2 + 1
     * @param fft_points (int): The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
     * @returns EIDSP_OK if OK, EIDSP_MATRIX_SIZE_MISMATCH if out_buffer_size is wrong,
     *          otherwise the error reported by numpy::rfft
     */
    static int power_spectrum(
        float *frame,
        size_t frame_size,
        float *out_buffer,
        size_t out_buffer_size,
        uint16_t fft_points)
    {
        const size_t expected_bins = static_cast<size_t>(fft_points / 2 + 1);
        if (out_buffer_size != expected_bins) {
            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
        }

        // the FFT result lands directly in out_buffer
        const int rfft_status = numpy::rfft(frame, frame_size, out_buffer, out_buffer_size, fft_points);
        if (rfft_status != EIDSP_OK) {
            return rfft_status;
        }

        // square every bin and normalise by the FFT length
        const double inv_fft_points = 1.0 / static_cast<float>(fft_points);
        float *const bins_end = out_buffer + out_buffer_size;
        for (float *bin = out_buffer; bin != bins_end; ++bin) {
            *bin = inv_fft_points * ((*bin) * (*bin));
        }

        return EIDSP_OK;
    }

    static int welch_max_hold(
        float *input,
        size_t input_size,
        float *output,
        size_t start_bin,
        size_t stop_bin,
        size_t fft_points,
        bool do_overlap)
    {
        // save off one point to put back, b/c we're going to calculate in place
        float saved_point = 0;
        bool do_saved_point = false;
        size_t fft_out_size = fft_points / 2 + 1;
        float *fft_out;
        ei_unique_ptr_t p_fft_out(nullptr, ei_free);
        if (input_size < fft_points) {
            fft_out = (float *)ei_calloc(fft_out_size, sizeof(float));
            p_fft_out.reset(fft_out);
        }
        else {
            // set input as output for in place operation
            fft_out = input;
            // save off one point to put back, b/c we're going to calculate in place
            saved_point = input[fft_points / 2];
            do_saved_point = true;
        }

        // init the output to zeros
        memset(output, 0, sizeof(float) * (stop_bin - start_bin));
        int input_ix = 0;
        while (input_ix < (int)input_size) {
            // Figure out if we need any zero padding
            size_t n_input_points = input_ix + fft_points <= input_size ? fft_points
                                                                        : input_size - input_ix;
            EI_TRY(power_spectrum(
                input + input_ix,
                n_input_points,
                fft_out,
                fft_points / 2 + 1,
                fft_points));
            int j = 0;
            // keep the max of the last frame and everything before
            for (size_t i = start_bin; i < stop_bin; i++) {
                output[j] = std::max(output[j], fft_out[i]);
                j++;
            }
            if (do_overlap) {
                if (do_saved_point) {
                    // This step only matters first time through
                    input[fft_points / 2] = saved_point;
                    do_saved_point = false;
                }
                input_ix += fft_points / 2;
            }
            else {
                input_ix += fft_points;
            }
        }

        return EIDSP_OK;
    }

    /**
     * Sample variance of an array (sum of squared deviations divided by size - 1).
     * Uses arm_var_f32 when EIDSP_USE_CMSIS_DSP is set, otherwise a plain C++ loop.
     * @param input Array
     * @param size Size of array
     * @returns The variance; 0 when size is 0 or 1
     */
    static float variance(float *input, size_t size)
    {
        float temp;
#if EIDSP_USE_CMSIS_DSP
        arm_var_f32(input, size, &temp);
#else
        // arm_var_f32 reports 0 for blocks of 0 or 1 elements; do the same here instead
        // of dividing by zero (size == 1) or by a wrapped-around size - 1 (size == 0)
        if (size <= 1) {
            return 0.0f;
        }

        float mean = 0.0f;
        for (size_t i = 0; i < size; i++) {
            mean += input[i];
        }
        mean /= size;

        temp = 0.0f;
        for (size_t i = 0; i < size; i++) {
            temp += (input[i] - mean) * (input[i] - mean);
        }
        temp /= (size - 1);
#endif
        return temp;
    }

    /**
     * Replaces every element that compares equal to zero by 1e-10, so the array
     * can safely be passed to a log function afterwards.
     * @param input Array
     * @param input_size Size of array
     * @returns void
     */
    static void zero_handling(float *input, size_t input_size)
    {
        float *const stop = input + input_size;
        for (float *sample = input; sample != stop; ++sample) {
            if (*sample == 0) {
                *sample = 1e-10;
            }
        }
    }

    /**
     * Matrix flavour of zero_handling: applies the array version to the whole
     * buffer (rows * cols elements) so it can safely be passed to a log function.
     * @param input Matrix
     * @returns void
     */
    static void zero_handling(matrix_t *input)
    {
        float *const elements = input->buffer;
        zero_handling(elements, input->rows * input->cols);
    }

    /**
     * Multiply every element of a vector by a constant, in place.
     * @param v Vector to modify
     * @param scale Factor applied to each element
     */
    __attribute__((unused)) static void scale(fvec& v, float scale) {
        for (size_t ix = 0; ix < v.size(); ix++) {
            v[ix] *= scale;
        }
    }

    /**
     * Subtract a constant from every element of a vector, in place.
     * @param v Vector to modify
     * @param b Value subtracted from each element
     */
    __attribute__((unused)) static void sub(fvec& v, float b) {
        for (auto it = v.begin(); it != v.end(); ++it) {
            *it -= b;
        }
    }

    /**
     * Element-wise product of two arrays: y[i] = x[i] * b[i].
     * @param y Destination, receives n values
     * @param x First factor
     * @param b Second factor
     * @param n Number of elements
     */
    __attribute__((unused)) static void mul(float* y, const float* x, float* b, size_t n) {
        float* dst = y;
        const float* lhs = x;
        const float* rhs = b;
        for (size_t left = n; left > 0; left--) {
            *dst++ = *lhs++ * *rhs++;
        }
    }

    /**
     * First-order discrete difference: d[i] = v[i + 1] - v[i].
     * @param v Input array
     * @param n Number of elements in v
     * @returns Vector of n - 1 differences (empty when n is 0 or 1)
     */
    __attribute__((unused)) static fvec diff(const float* v, size_t n) {
        // n - 1 would wrap around for an empty input and request a gigantic vector
        if (n == 0) {
            return fvec();
        }

        fvec d(n - 1);
        for (size_t i = 0; i < d.size(); i++) {
            d[i] = v[i + 1] - v[i];
        }
        return d;
    }

    /**
     * Sum of all elements of an array, accumulated front to back.
     * @param v Input array
     * @param n Number of elements
     * @returns The sum (0 for an empty array)
     */
    __attribute__((unused)) static float sum(const float* v, size_t n) {
        float total = 0;
        const float* const stop = v + n;
        for (const float* p = v; p != stop; ++p) {
            total += *p;
        }
        return total;
    }

    static float mean(const fvec& v) {
        float mean = 0;
        for (auto x : v) {
            mean += x;
        }
        mean /= v.size();
        return mean;
    }

    /**
     * Arithmetic mean of an array.
     * @param v Input array
     * @param n Number of elements
     * @returns Sum of the elements divided by n
     */
    static float mean(const float* v, size_t n) {
        float total = 0;
        const float* const stop = v + n;
        for (const float* p = v; p != stop; ++p) {
            total += *p;
        }
        total /= n;
        return total;
    }

    static float median(const float* v, size_t n) {
        fvec vc(n);
        std::copy(v, v + n, vc.begin());
        std::sort(vc.begin(), vc.end());
        if (vc.size() % 2 == 0) {
            return (vc[vc.size() / 2 - 1] + vc[vc.size() / 2]) / 2;
        }
        return vc[vc.size() / 2];
    }

    __attribute__((unused)) static float median(const fvec& v) {
        return median(v.data(), v.size());
    }

    /**
     * Standard deviation of an array around a mean supplied by the caller.
     * @param v Input array
     * @param n Number of elements
     * @param m Mean of the array
     * @param ddof Delta degrees of freedom, the divisor is n - ddof
     * @returns Square root of (sum of squared deviations / (n - ddof))
     */
    static float stddev(const float* v, size_t n, float m /* mean */, int ddof = 0) {
        float sq_dev_total = 0;
        const float* const stop = v + n;
        for (const float* p = v; p != stop; ++p) {
            sq_dev_total += (*p - m) * (*p - m);
        }
        sq_dev_total /= n - ddof;
        return sqrt(sq_dev_total);
    }

    __attribute__((unused)) static float stddev(const float* v, size_t n) {
        return stddev(v, n, mean(v, n), 0);
    }

    __attribute__((unused)) static float stddev(const float* v, size_t n, int ddof) {
        return stddev(v, n, mean(v, n), ddof);
    }

    /**
     * Standard deviation of a vector; the mean is computed here.
     * @param v Input vector
     * @param ddof Delta degrees of freedom, the divisor is v.size() - ddof
     * @returns The standard deviation
     */
    __attribute__((unused)) static float stddev(const fvec& v, int ddof = 0) {
        const float m = mean(v);
        return stddev(v.data(), v.size(), m, ddof);
    }

    /**
     * Root mean square of an array.
     * @param v Input array
     * @param n Number of elements
     * @returns Square root of (sum of squares / n)
     */
    static float rms(const float* v, size_t n) {
        float sq_total = 0;
        const float* const stop = v + n;
        for (const float* p = v; p != stop; ++p) {
            sq_total += *p * *p;
        }
        sq_total /= n;
        return sqrt(sq_total);
    }

    __attribute__((unused)) static float rms(const fvec& v) {
        return rms(v.data(), v.size());
    }

    /**
     * Largest element of a vector, converted to float.
     * @param v Input vector
     * @returns The maximum value
     */
    template <typename T>
    static float max(const ei_vector<T>& v) {
        const auto largest = std::max_element(v.begin(), v.end());
        return *largest;
    }

    /**
     * Largest element of an array.
     * @param v Input array
     * @param n Number of elements
     * @returns The maximum value
     */
    __attribute__((unused)) static float max(const float* v, size_t n) {
        const float* const last = v + n;
        const float* const largest = std::max_element(v, last);
        return *largest;
    }

    /**
     * Smallest element of a vector, converted to float.
     * @param v Input vector
     * @returns The minimum value
     */
    template <typename T>
    static float min(const ei_vector<T>& v) {
        const auto smallest = std::min_element(v.begin(), v.end());
        return *smallest;
    }

    /**
     * Smallest element of an array.
     * @param v Input array
     * @param n Number of elements
     * @returns The minimum value
     */
    __attribute__((unused)) static float min(const float* v, size_t n) {
        const float* const last = v + n;
        const float* const smallest = std::min_element(v, last);
        return *smallest;
    }

    /**
     * Index of the largest element within [start, end) of a vector.
     * @param v Input vector
     * @param start First index to consider (inclusive)
     * @param end Last index to consider (exclusive)
     * @returns Index of the maximum, counted from the beginning of v
     */
    __attribute__((unused)) static int argmax(const fvec& v, int start, int end) {
        const auto first = v.begin() + start;
        const auto last = v.begin() + end;
        const auto largest = std::max_element(first, last);
        return largest - v.begin();
    }

    /**
     * Divide a constant by every element of an array: result[i] = num / den[i].
     * @param num Numerator
     * @param den Array of denominators
     * @param n Number of elements in den
     * @returns Vector with the n quotients
     */
    __attribute__((unused)) static fvec divide(float num, const float* den, size_t n) {
        fvec quotients(n);
        const float* divisor = den;
        for (auto& q : quotients) {
            q = num / *divisor++;
        }
        return quotients;
    }

    /**
     * Histogram with (b - a) / inc equally wide bins starting at a.
     * Samples whose bin index falls outside [0, number of bins) are ignored.
     * @param x Input samples
     * @param n Number of samples
     * @param a Lower edge of the first bin
     * @param b Upper edge of the range
     * @param inc Bin width
     * @returns Count per bin
     */
    __attribute__((unused)) static ivec histogram(const float* x, size_t n, int a, int b, int inc) {
        const int num_bins = (b - a) / inc;
        ivec bins(num_bins, 0);

        const float* const stop = x + n;
        for (const float* sample = x; sample != stop; ++sample) {
            // NOTE(review): the (int) cast truncates toward zero, so samples in
            // (a - inc, a) also end up in bin 0 - confirm this is intended
            const int bin = (int)((*sample - a) / inc);
            if (bin < 0 || bin >= num_bins) {
                continue;
            }
            bins[bin]++;
        }
        return bins;
    }

    /**
     * Cumulative sum: c[i] = v[0] + ... + v[i].
     * @param v Input array
     * @param n Number of elements
     * @returns Vector with the n running totals (empty when n is 0)
     */
    __attribute__((unused)) static fvec cumsum(const float* v, size_t n) {
        fvec c(n);
        // nothing to accumulate; also keeps us from touching c[0] / v[0] on empty input
        if (n == 0) {
            return c;
        }

        c[0] = v[0];
        for (size_t i = 1; i < n; i++) {
            c[i] = c[i - 1] + v[i];
        }
        return c;
    }

    /**
     * Evenly spaced values start, start + step, start + 2 * step, ...
     * The number of values is (end - start) / step truncated to an integer.
     * @param start First value, must be smaller than end
     * @param end End of the interval
     * @param step Spacing between values, must be positive
     * @returns Vector with the generated values
     */
    __attribute__((unused)) static fvec arrange(float start, float end, float step) {
        assert(start < end);
        assert(step > 0);

        const size_t count = (size_t)((end - start) / step);
        fvec values(count);
        size_t i = 0;
        for (auto& value : values) {
            value = start + i * step;
            i++;
        }
        return values;
    }

    /**
     * Element-wise in-place addition: v[i] += b[i] for every index of v.
     * @param v Vector that is modified
     * @param b Vector that is added; read at every index of v
     */
    __attribute__((unused)) static void add(fvec& v, fvec& b) {
        size_t ix = 0;
        for (auto& element : v) {
            element += b[ix];
            ix++;
        }
    }

    __attribute__((unused)) static float trapz(const fvec& x, const fvec& y, size_t lo, size_t hi) {
        float area = 0;
        for (size_t i = lo; i < hi; i++) {
            area += (x[i + 1] - x[i]) * (y[i + 1] + y[i]) / 2;
        }
        return area;
    }

    /**
     * Quantiles of the slice [start, end) of a vector. The slice is copied and
     * sorted; for every q[i] the element at index q[i] * (slice size), truncated,
     * is returned (no interpolation).
     * @param v Input vector
     * @param start First index of the slice (inclusive)
     * @param end Last index of the slice (exclusive), clamped to v.size()
     * @param q Requested quantiles as fractions
     * @returns One value per entry of q. Indices are clamped to the slice, so
     *          q[i] >= 1 yields the largest element. For an empty or inverted
     *          slice the freshly constructed result vector is returned as is.
     */
    __attribute__((unused)) static fvec quantile(const fvec& v, size_t start, size_t end, const fvec& q) {
        end = std::min(end, v.size());
        fvec res(q.size());
        // end - start would wrap around (or leave nothing to index) for such a slice
        if (start >= end) {
            return res;
        }

        fvec vc(end - start);
        std::copy(v.begin() + start, v.begin() + end, vc.begin());
        std::sort(vc.begin(), vc.end());

        const size_t last = vc.size() - 1;
        for (size_t i = 0; i < q.size(); i++) {
            const float pos = q[i] * vc.size();
            size_t ix = 0;
            if (pos >= vc.size()) {
                // q[i] == 1.0 used to index one past the end of the sorted copy
                ix = last;
            }
            else if (pos > 0) {
                ix = static_cast<size_t>(pos);
            }
            res[i] = vc[ix];
        }
        return res;
    }

    /**
     * Quantiles of an array. The array is copied and sorted; for every q[i] the
     * element at index q[i] * n, truncated, is returned (no interpolation).
     * @param v Input array
     * @param n Number of elements
     * @param q Requested quantiles as fractions
     * @returns One value per entry of q. Indices are clamped to the array, so
     *          q[i] >= 1 yields the largest element. For n == 0 the freshly
     *          constructed result vector is returned as is.
     */
    __attribute__((unused)) static fvec quantile(const float* v, size_t n, const fvec& q) {
        fvec res(q.size());
        // nothing to index into
        if (n == 0) {
            return res;
        }

        fvec vc(n);
        std::copy(v, v + n, vc.begin());
        std::sort(vc.begin(), vc.end());

        const size_t last = vc.size() - 1;
        for (size_t i = 0; i < q.size(); i++) {
            const float pos = q[i] * vc.size();
            size_t ix = 0;
            if (pos >= vc.size()) {
                // q[i] == 1.0 used to index one past the end of the sorted copy
                ix = last;
            }
            else if (pos > 0) {
                ix = static_cast<size_t>(pos);
            }
            res[i] = vc[ix];
        }
        return res;
    }

    /**
     * Dot product of two arrays, accumulated front to back.
     * @param x First array
     * @param y Second array
     * @param n Number of elements read from each array
     * @returns Sum of x[i] * y[i]
     */
    static float dot(const float* x, const float* y, size_t n) {
        float acc = 0;
        const float* lhs = x;
        const float* rhs = y;
        for (size_t left = n; left > 0; left--) {
            acc += *lhs++ * *rhs++;
        }
        return acc;
    }


    /**
     * Cosine similarity: dot(x, y) / sqrt(dot(x, x) * dot(y, y)).
     * The mixed product reads x.size() elements from both vectors.
     * @param x First vector
     * @param y Second vector
     * @returns The cosine of the angle between x and y
     */
    __attribute__((unused)) static float cosine_similarity(const fvec& x, const fvec& y) {
        const float* xs = x.data();
        const float* ys = y.data();

        const float xy = dot(xs, ys, x.size());
        const float magx = dot(xs, xs, x.size());
        const float magy = dot(ys, ys, y.size());
        return xy / sqrt(magx * magy);
    }

    /**
     * Replace every element of a vector by its natural logarithm, in place.
     * @param v Vector to modify
     */
    __attribute__((unused)) static void ln(fvec& v) {
        for (size_t ix = 0; ix < v.size(); ix++) {
            v[ix] = log(v[ix]);
        }
    }

    /**
     * Smallest power of two that is greater than or equal to x.
     * @param x Lower bound
     * @returns 1 for x <= 1, otherwise the first power of two not below x
     */
    static size_t next_power_of_2(size_t x) {
        size_t candidate = 1;
        for (; candidate < x; candidate *= 2) {
            // keep doubling until the candidate reaches x
        }
        return candidate;
    }

    /**
     * Remove the linear trend from an array, in place.
     * The least-squares slope of the data against the positions 1..n is computed,
     * slope * position is subtracted from every sample and the result is then
     * re-centred so that its mean is zero.
     * @param data Samples to detrend (modified)
     * @param n Number of samples; nothing is done for n == 0, and a single sample
     *          becomes 0
     */
    static void detrend(float* data, size_t n) {
        // every mean below divides by n
        if (n == 0) {
            return;
        }

        // Calculate the mean of the data points
        float mean = 0.0;
        for (size_t i = 0; i < n; i++) {
            mean += data[i];
        }
        mean /= n;

        // Calculate the slope of the best-fit line
        float x_mean = (n + 1) / 2.0;
        float y_mean = mean;
        float numerator = 0.0;
        float denominator = 0.0;
        for (size_t i = 0; i < n; i++) {
            numerator += (i + 1 - x_mean) * (data[i] - y_mean);
            denominator += (i + 1 - x_mean) * (i + 1 - x_mean);
        }
        // a single sample has no spread in x: 0 / 0 would turn the output into NaN,
        // so treat the trend as flat in that case
        float slope = 0.0f;
        if (denominator != 0.0f) {
            slope = numerator / denominator;
        }

        // Subtract the best-fit line from the data points to get the detrended data
        for (size_t i = 0; i < n; i++) {
            data[i] = data[i] - (slope * (i + 1));
        }

        // Calculate the mean of the detrended data
        float detrended_mean = 0.0;
        for (size_t i = 0; i < n; i++) {
            detrended_mean += data[i];
        }
        detrended_mean /= n;

        // Subtract the mean of the detrended data from each element
        for (size_t i = 0; i < n; i++) {
            data[i] -= detrended_mean;
        }
    }

    /**
     * Detrended copy of a vector; the input is left untouched.
     * @param data Input vector
     * @returns A copy of data run through the in-place detrend
     */
    static fvec detrend(const fvec& data) {
        fvec detrended(data);
        detrend(detrended.data(), detrended.size());
        return detrended;
    }

};

struct fmat {
    ei_matrix* mat = nullptr;
    fmat(size_t rows, size_t cols) {
        mat = new ei_matrix(rows, cols);
        assert(mat);
    }

    ~fmat() {
        delete mat;
    }

    void resize(size_t rows, size_t cols) {
        delete mat;
        mat = new ei_matrix(rows, cols);
    }

    float* operator[](size_t i) {
        if (mat == nullptr || i >= mat->rows) {
            return nullptr;
        }
        return mat->get_row_ptr(i);
    }

    void fill(float x) {
        if (mat == nullptr) {
            return;
        }
        for (size_t i = 0; i < mat->rows; i++) {
            for (size_t j = 0; j < mat->cols; j++) {
                (*this)[i][j] = x;
            }
        }
    }

    void fill_col(size_t col, float x) {
        if (mat == nullptr) {
            return;
        }
        for (size_t i = 0; i < mat->rows; i++) {
            (*this)[i][col] = x;
        }
    }

    void fill_row(size_t row, float x) {
        if (mat == nullptr) {
            return;
        }
        for (size_t i = 0; i < mat->cols; i++) {
            (*this)[row][i] = x;
        }
    }
};
} // namespace ei

#endif // _EIDSP_NUMPY_H_