Upload 1028 files

b7b614e over 2 years ago

30 kB

	/*
	* Copyright (c) 2022 EdgeImpulse Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an "AS
	* IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
	* express or implied. See the License for the specific language
	* governing permissions and limitations under the License.
	*
	* SPDX-License-Identifier: Apache-2.0
	*/

	#ifndef _EIDSP_SPEECHPY_FEATURE_H_
	#define _EIDSP_SPEECHPY_FEATURE_H_

	#include <stdint.h>
	#include "../../porting/ei_classifier_porting.h"
	#include "../ei_utils.h"
	#include "functions.hpp"
	#include "processing.hpp"
	#include "../memory.hpp"
	#include "../returntypes.hpp"
	#include "../ei_vector.h"

	namespace ei {
	namespace speechpy {

	class feature {
	public:
	/**
	 * Compute the Mel filterbanks. Each filter is stored in one row;
	 * the columns correspond to FFT bins.
	 *
	 * @param filterbanks Matrix of size num_filter * coefficients. The buffer is
	 *                    zeroed before the filters are written. When
	 *                    output_transposed is set, its rows/cols fields are
	 *                    swapped before returning.
	 * @param num_filter the number of filters in the filterbank
	 * @param coefficients (fftpoints//2 + 1)
	 * @param sampling_freq the sample rate of the signal we are working
	 *                      with. It affects mel spacing.
	 * @param low_freq lowest band edge of mel filters, in Hz
	 *                 (no default is substituted by this function)
	 * @param high_freq highest band edge of mel filters, in Hz
	 *                  (no default is substituted by this function)
	 * @param output_transposed If set to true the filters are written in transposed
	 *                          layout. This is more memory efficient than calling
	 *                          this function and then transposing, as the latter
	 *                          requires the filterbank to be allocated twice
	 *                          (for a short while).
	 * @returns EIDSP_OK if OK, EIDSP_MATRIX_SIZE_MISMATCH if `filterbanks` is not
	 *          num_filter x coefficients, EIDSP_OUT_OF_MEM if a scratch buffer
	 *          could not be allocated.
	 */
	static int filterbanks(
	#if EIDSP_QUANTIZE_FILTERBANK
	    quantized_matrix_t *filterbanks,
	#else
	    matrix_t *filterbanks,
	#endif
	    uint16_t num_filter, int coefficients, uint32_t sampling_freq,
	    uint32_t low_freq, uint32_t high_freq,
	    bool output_transposed = false
	)
	{
	    // Validate the output shape before allocating any scratch memory, so a
	    // mismatch cannot leave an allocation behind.
	    if (filterbanks->rows != num_filter || filterbanks->cols != static_cast<uint32_t>(coefficients)) {
	        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	    }

	    // For num_filter filters we need num_filter + 2 edge points.
	    const size_t num_points = static_cast<size_t>(num_filter) + 2;
	    const size_t mels_mem_size = num_points * sizeof(float);
	    const size_t hertz_mem_size = num_points * sizeof(float);
	    const size_t freq_index_mem_size = num_points * sizeof(int);

	    float *mels = (float*)ei_dsp_malloc(mels_mem_size);
	    if (!mels) {
	        EIDSP_ERR(EIDSP_OUT_OF_MEM);
	    }

	#if EIDSP_QUANTIZE_FILTERBANK
	    memset(filterbanks->buffer, 0, filterbanks->rows * filterbanks->cols * sizeof(uint8_t));
	#else
	    memset(filterbanks->buffer, 0, filterbanks->rows * filterbanks->cols * sizeof(float));
	#endif

	    // Computing the Mel filterbank:
	    // convert the lower and upper frequencies to Mels and space
	    // num_filter + 2 points evenly between them.
	    numpy::linspace(
	        functions::frequency_to_mel(static_cast<float>(low_freq)),
	        functions::frequency_to_mel(static_cast<float>(high_freq)),
	        num_filter + 2,
	        mels);

	    // Convert Mels back to Hertz because the start and end points
	    // should be at the desired frequencies.
	    float *hertz = (float*)ei_dsp_malloc(hertz_mem_size);
	    if (!hertz) {
	        ei_dsp_free(mels, mels_mem_size);
	        EIDSP_ERR(EIDSP_OUT_OF_MEM);
	    }
	    for (size_t ix = 0; ix < num_points; ix++) {
	        hertz[ix] = functions::mel_to_frequency(mels[ix]);
	        // clamp to the requested band
	        if (hertz[ix] < low_freq) {
	            hertz[ix] = low_freq;
	        }
	        if (hertz[ix] > high_freq) {
	            hertz[ix] = high_freq;
	        }

	        // here is a really annoying bug in Speechpy which calculates the frequency index wrong for the last bucket
	        // the last 'hertz' value is not 8,000 (with sampling rate 16,000) but 7,999.999999
	        // thus calculating the bucket to 64, not 65.
	        // we're adjusting this here a tiny bit to ensure we have the same result
	        if (ix == num_points - 1) {
	            hertz[ix] -= 0.001;
	        }
	    }
	    ei_dsp_free(mels, mels_mem_size);

	    // The frequency resolution required to put filters at the
	    // exact points calculated above should be extracted,
	    // so map each frequency onto an FFT bin index (rounding down).
	    int *freq_index = (int*)ei_dsp_malloc(freq_index_mem_size);
	    if (!freq_index) {
	        ei_dsp_free(hertz, hertz_mem_size);
	        EIDSP_ERR(EIDSP_OUT_OF_MEM);
	    }
	    for (size_t ix = 0; ix < num_points; ix++) {
	        freq_index[ix] = static_cast<int>(floor((coefficients + 1) * hertz[ix] / sampling_freq));
	    }
	    ei_dsp_free(hertz, hertz_mem_size);

	    for (size_t i = 0; i < num_filter; i++) {
	        const int left = freq_index[i];
	        const int middle = freq_index[i + 1];
	        const int right = freq_index[i + 2];
	        const int width = right - left + 1;

	        // z holds one triangle spanning bins [left, right]
	        EI_DSP_MATRIX(z, 1, width);
	        if (!z.buffer) {
	            ei_dsp_free(freq_index, freq_index_mem_size);
	            EIDSP_ERR(EIDSP_OUT_OF_MEM);
	        }
	        numpy::linspace(left, right, width, z.buffer);
	        functions::triangle(z.buffer, width, left, middle, right);

	        // so... z now contains some values that we need to overwrite in the filterbank
	        for (int zx = 0; zx < width; zx++) {
	            size_t index = (i * filterbanks->cols) + (left + zx);

	            if (output_transposed) {
	                // rows/cols still hold the untransposed shape at this point
	                index = ((left + zx) * filterbanks->rows) + i;
	            }

	#if EIDSP_QUANTIZE_FILTERBANK
	            filterbanks->buffer[index] = numpy::quantize_zero_one(z.buffer[zx]);
	#else
	            filterbanks->buffer[index] = z.buffer[zx];
	#endif
	        }
	    }

	    if (output_transposed) {
	        // swap the dimensions using the field's own type to avoid truncation
	        const auto previous_rows = filterbanks->rows;
	        filterbanks->rows = filterbanks->cols;
	        filterbanks->cols = previous_rows;
	    }

	    ei_dsp_free(freq_index, freq_index_mem_size);

	    return EIDSP_OK;
	}

	/**
	 * @brief Map a frequency in hertz onto an FFT bin index
	 *
	 * @param fft_size Size of fft
	 * @param hertz Desired frequency, in Hz
	 * @param sampling_freq Sample rate, in Hz
	 * @return int the bin index, computed as
	 *         floor((fft_size + 1) * hertz / sampling_freq)
	 */
	static int get_fft_bin_from_hertz(uint16_t fft_size, float hertz, uint32_t sampling_freq)
	{
	    // fractional bin position; rounded down below
	    const auto bin_position = (fft_size + 1) * hertz / sampling_freq;
	    return static_cast<int>(floor(bin_position));
	}

	/**
	* Compute Mel-filterbank energy features from an audio signal.
	* @param out_features Use `calculate_mfe_buffer_size` to allocate the right matrix.
	* @param out_energies A matrix in the form of Mx1 where M is the rows from `calculate_mfe_buffer_size`
	* @param signal: audio signal structure with functions to retrieve data from a signal
	* @param sampling_frequency (int): the sampling frequency of the signal
	* we are working with.
	* @param frame_length (float): the length of each frame in seconds.
	* Default is 0.020s
	* @param frame_stride (float): the step between successive frames in seconds.
	* Default is 0.02s (means no overlap)
	* @param num_filters (int): the number of filters in the filterbank,
	* default 40.
	* @param fft_length (int): number of FFT points. Default is 512.
	* @param low_frequency (int): lowest band edge of mel filters.
	* In Hz, default is 0.
	* @param high_frequency (int): highest band edge of mel filters.
	* In Hz, default is samplerate/2
	* @EIDSP_OK if OK
	*/
	static int mfe(matrix_t out_features, matrix_t out_energies,
	signal_t *signal,
	uint32_t sampling_frequency,
	float frame_length, float frame_stride, uint16_t num_filters,
	uint16_t fft_length, uint32_t low_frequency, uint32_t high_frequency,
	uint16_t version
	)
	{
	int ret = 0;

	if (high_frequency == 0) {
	high_frequency = sampling_frequency / 2;
	}

	if (version<4) {
	if (low_frequency == 0) {
	low_frequency = 300;
	}
	}

	stack_frames_info_t stack_frame_info = { 0 };
	stack_frame_info.signal = signal;

	ret = processing::stack_frames(
	&stack_frame_info,
	sampling_frequency,
	frame_length,
	frame_stride,
	false,
	version
	);
	if (ret != 0) {
	EIDSP_ERR(ret);
	}

	if (stack_frame_info.frame_ixs.size() != out_features->rows) {
	EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	}

	if (num_filters != out_features->cols) {
	EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	}

	if (out_energies) {
	if (stack_frame_info.frame_ixs.size() != out_energies->rows \|\| out_energies->cols != 1) {
	EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	}
	}

	for (uint32_t i = 0; i < out_features->rows * out_features->cols; i++) {
	*(out_features->buffer + i) = 0;
	}

	const size_t power_spectrum_frame_size = (fft_length / 2 + 1);
	// Computing the Mel filterbank
	// converting the upper and lower frequencies to Mels.
	// num_filter + 2 is because for num_filter filterbanks we need
	// num_filter+2 point.
	float *mels;
	const int MELS_SIZE = num_filters + 2;
	mels = (float*)ei_calloc(MELS_SIZE, sizeof(float));
	EI_ERR_AND_RETURN_ON_NULL(mels, EIDSP_OUT_OF_MEM);
	ei_unique_ptr_t __ptr__(mels,ei_free);
	uint16_t* bins = reinterpret_cast<uint16_t*>(mels); // alias the mels array so we can reuse the space

	numpy::linspace(
	functions::frequency_to_mel(static_cast<float>(low_frequency)),
	functions::frequency_to_mel(static_cast<float>(high_frequency)),
	num_filters + 2,
	mels);

	uint16_t max_bin = version >= 4 ? fft_length : power_spectrum_frame_size; // preserve a bug in v<4
	// go to -1 size b/c special handling, see after
	for (uint16_t ix = 0; ix < MELS_SIZE-1; ix++) {
	mels[ix] = functions::mel_to_frequency(mels[ix]);
	if (mels[ix] < low_frequency) {
	mels[ix] = low_frequency;
	}
	if (mels[ix] > high_frequency) {
	mels[ix] = high_frequency;
	}
	bins[ix] = get_fft_bin_from_hertz(max_bin, mels[ix], sampling_frequency);
	}

	// here is a really annoying bug in Speechpy which calculates the frequency index wrong for the last bucket
	// the last 'hertz' value is not 8,000 (with sampling rate 16,000) but 7,999.999999
	// thus calculating the bucket to 64, not 65.
	// we're adjusting this here a tiny bit to ensure we have the same result
	mels[MELS_SIZE-1] = functions::mel_to_frequency(mels[MELS_SIZE-1]);
	if (mels[MELS_SIZE-1] > high_frequency) {
	mels[MELS_SIZE-1] = high_frequency;
	}
	mels[MELS_SIZE-1] -= 0.001;
	bins[MELS_SIZE-1] = get_fft_bin_from_hertz(max_bin, mels[MELS_SIZE-1], sampling_frequency);

	EI_DSP_MATRIX(power_spectrum_frame, 1, power_spectrum_frame_size);
	if (!power_spectrum_frame.buffer) {
	EIDSP_ERR(EIDSP_OUT_OF_MEM);
	}

	// get signal data from the audio file
	EI_DSP_MATRIX(signal_frame, 1, stack_frame_info.frame_length);

	for (size_t ix = 0; ix < stack_frame_info.frame_ixs.size(); ix++) {
	// don't read outside of the audio buffer... we'll automatically zero pad then
	size_t signal_offset = stack_frame_info.frame_ixs.at(ix);
	size_t signal_length = stack_frame_info.frame_length;
	if (signal_offset + signal_length > stack_frame_info.signal->total_length) {
	signal_length = signal_length -
	(stack_frame_info.signal->total_length - (signal_offset + signal_length));
	}

	ret = stack_frame_info.signal->get_data(
	signal_offset,
	signal_length,
	signal_frame.buffer
	);
	if (ret != 0) {
	EIDSP_ERR(ret);
	}

	ret = numpy::power_spectrum(
	signal_frame.buffer,
	stack_frame_info.frame_length,
	power_spectrum_frame.buffer,
	power_spectrum_frame_size,
	fft_length
	);

	if (ret != 0) {
	EIDSP_ERR(ret);
	}

	float energy = numpy::sum(power_spectrum_frame.buffer, power_spectrum_frame_size);
	if (energy == 0) {
	energy = 1e-10;
	}

	if (out_energies) {
	out_energies->buffer[ix] = energy;
	}

	auto row_ptr = out_features->get_row_ptr(ix);
	for (size_t i = 0; i < num_filters; i++) {
	size_t left = bins[i];
	size_t middle = bins[i+1];
	size_t right = bins[i+2];

	assert(right < power_spectrum_frame_size);
	// now we have weights and locations to move from fft to mel sgram
	// both left and right become zero weights, so skip them

	// middle always has weight of 1.0
	// since we skip left and right, if left = middle we need to handle that
	row_ptr[i] = power_spectrum_frame.buffer[middle];

	for (size_t bin = left+1; bin < right; bin++) {
	if (bin < middle) {
	row_ptr[i] +=
	((static_cast<float>(bin) - left) / (middle - left)) * // weight *
	power_spectrum_frame.buffer[bin];
	}
	// intentionally skip middle, handled above
	if (bin > middle) {
	row_ptr[i] +=
	((right - static_cast<float>(bin)) / (right - middle)) * // weight *
	power_spectrum_frame.buffer[bin];
	}
	}
	}

	if (ret != 0) {
	EIDSP_ERR(ret);
	}
	}

	numpy::zero_handling(out_features);

	return EIDSP_OK;
	}

	/**
	* Compute Mel-filterbank energy features from an audio signal.
	* @param out_features Use `calculate_mfe_buffer_size` to allocate the right matrix.
	* @param out_energies A matrix in the form of Mx1 where M is the rows from `calculate_mfe_buffer_size`
	* @param signal: audio signal structure with functions to retrieve data from a signal
	* @param sampling_frequency (int): the sampling frequency of the signal
	* we are working with.
	* @param frame_length (float): the length of each frame in seconds.
	* Default is 0.020s
	* @param frame_stride (float): the step between successive frames in seconds.
	* Default is 0.02s (means no overlap)
	* @param num_filters (int): the number of filters in the filterbank,
	* default 40.
	* @param fft_length (int): number of FFT points. Default is 512.
	* @param low_frequency (int): lowest band edge of mel filters.
	* In Hz, default is 0.
	* @param high_frequency (int): highest band edge of mel filters.
	* In Hz, default is samplerate/2
	* @EIDSP_OK if OK
	*/
	static int mfe_v3(matrix_t out_features, matrix_t out_energies,
	signal_t *signal,
	uint32_t sampling_frequency,
	float frame_length, float frame_stride, uint16_t num_filters,
	uint16_t fft_length, uint32_t low_frequency, uint32_t high_frequency,
	uint16_t version
	)
	{
	int ret = 0;

	if (high_frequency == 0) {
	high_frequency = sampling_frequency / 2;
	}

	if (low_frequency == 0) {
	low_frequency = 300;
	}

	stack_frames_info_t stack_frame_info = { 0 };
	stack_frame_info.signal = signal;

	ret = processing::stack_frames(
	&stack_frame_info,
	sampling_frequency,
	frame_length,
	frame_stride,
	false,
	version
	);
	if (ret != 0) {
	EIDSP_ERR(ret);
	}

	if (stack_frame_info.frame_ixs.size() != out_features->rows) {
	EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	}

	if (num_filters != out_features->cols) {
	EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	}

	if (out_energies) {
	if (stack_frame_info.frame_ixs.size() != out_energies->rows \|\| out_energies->cols != 1) {
	EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	}
	}

	for (uint32_t i = 0; i < out_features->rows * out_features->cols; i++) {
	*(out_features->buffer + i) = 0;
	}

	uint16_t coefficients = fft_length / 2 + 1;

	// calculate the filterbanks first... preferably I would want to do the matrix multiplications
	// whenever they happen, but OK...
	#if EIDSP_QUANTIZE_FILTERBANK
	EI_DSP_QUANTIZED_MATRIX(filterbanks, num_filters, coefficients, &numpy::dequantize_zero_one);
	#else
	EI_DSP_MATRIX(filterbanks, num_filters, coefficients);
	#endif
	if (!filterbanks.buffer) {
	EIDSP_ERR(EIDSP_OUT_OF_MEM);
	}

	ret = feature::filterbanks(
	&filterbanks, num_filters, coefficients, sampling_frequency, low_frequency, high_frequency, true);
	if (ret != 0) {
	EIDSP_ERR(ret);
	}
	for (size_t ix = 0; ix < stack_frame_info.frame_ixs.size(); ix++) {
	size_t power_spectrum_frame_size = (fft_length / 2 + 1);

	EI_DSP_MATRIX(power_spectrum_frame, 1, power_spectrum_frame_size);
	if (!power_spectrum_frame.buffer) {
	EIDSP_ERR(EIDSP_OUT_OF_MEM);
	}

	// get signal data from the audio file
	EI_DSP_MATRIX(signal_frame, 1, stack_frame_info.frame_length);

	// don't read outside of the audio buffer... we'll automatically zero pad then
	size_t signal_offset = stack_frame_info.frame_ixs.at(ix);
	size_t signal_length = stack_frame_info.frame_length;
	if (signal_offset + signal_length > stack_frame_info.signal->total_length) {
	signal_length = signal_length -
	(stack_frame_info.signal->total_length - (signal_offset + signal_length));
	}

	ret = stack_frame_info.signal->get_data(
	signal_offset,
	signal_length,
	signal_frame.buffer
	);
	if (ret != 0) {
	EIDSP_ERR(ret);
	}

	ret = numpy::power_spectrum(
	signal_frame.buffer,
	stack_frame_info.frame_length,
	power_spectrum_frame.buffer,
	power_spectrum_frame_size,
	fft_length
	);

	if (ret != 0) {
	EIDSP_ERR(ret);
	}

	float energy = numpy::sum(power_spectrum_frame.buffer, power_spectrum_frame_size);
	if (energy == 0) {
	energy = 1e-10;
	}

	if (out_energies) {
	out_energies->buffer[ix] = energy;
	}

	// calculate the out_features directly here
	ret = numpy::dot_by_row(
	ix,
	power_spectrum_frame.buffer,
	power_spectrum_frame_size,
	&filterbanks,
	out_features
	);

	if (ret != 0) {
	EIDSP_ERR(ret);
	}
	}

	numpy::zero_handling(out_features);

	return EIDSP_OK;
	}

	/**
	 * Compute spectrogram from a sensor signal.
	 * @param out_features Output matrix with one row per frame and
	 *     (fft_length / 2 + 1) columns; any other shape is rejected with
	 *     EIDSP_MATRIX_SIZE_MISMATCH.
	 * @param signal: audio signal structure with functions to retrieve data from a signal
	 * @param sampling_frequency (float): the sampling frequency of the signal
	 *     we are working with.
	 * @param frame_length (float): the length of each frame in seconds.
	 * @param frame_stride (float): the step between successive frames in seconds.
	 * @param fft_length (int): number of FFT points.
	 * @param version implementation version; for version >= 3 frames containing
	 *     values outside [-1, 1] are scaled by 1/32768 before the FFT. Also
	 *     forwarded to `processing::stack_frames`.
	 * @returns EIDSP_OK if OK
	 */
	static int spectrogram(matrix_t *out_features,
	    signal_t *signal, float sampling_frequency,
	    float frame_length, float frame_stride, uint16_t fft_length,
	    uint16_t version
	)
	{
	    int ret = 0;

	    stack_frames_info_t stack_frame_info = { 0 };
	    stack_frame_info.signal = signal;

	    ret = processing::stack_frames(
	        &stack_frame_info,
	        sampling_frequency,
	        frame_length,
	        frame_stride,
	        false,
	        version
	    );
	    if (ret != 0) {
	        EIDSP_ERR(ret);
	    }

	    if (stack_frame_info.frame_ixs.size() != out_features->rows) {
	        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	    }

	    uint16_t coefficients = fft_length / 2 + 1;

	    if (coefficients != out_features->cols) {
	        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	    }

	    for (uint32_t i = 0; i < out_features->rows * out_features->cols; i++) {
	        *(out_features->buffer + i) = 0;
	    }

	    for (size_t ix = 0; ix < stack_frame_info.frame_ixs.size(); ix++) {
	        // get signal data from the audio file
	        EI_DSP_MATRIX(signal_frame, 1, stack_frame_info.frame_length);
	        if (!signal_frame.buffer) {
	            EIDSP_ERR(EIDSP_OUT_OF_MEM);
	        }

	        // don't read outside of the audio buffer: clamp the read to the samples
	        // that are left and zero pad the remainder of the frame
	        const size_t signal_offset = stack_frame_info.frame_ixs.at(ix);
	        const size_t frame_size = stack_frame_info.frame_length;
	        const size_t total_length = stack_frame_info.signal->total_length;
	        size_t signal_length = frame_size;
	        if (signal_offset + signal_length > total_length) {
	            signal_length = signal_offset < total_length ? total_length - signal_offset : 0;
	            for (size_t pad = signal_length; pad < frame_size; pad++) {
	                signal_frame.buffer[pad] = 0.0f;
	            }
	        }

	        if (signal_length > 0) {
	            ret = stack_frame_info.signal->get_data(
	                signal_offset,
	                signal_length,
	                signal_frame.buffer
	            );
	            if (ret != 0) {
	                EIDSP_ERR(ret);
	            }
	        }

	        // normalize data (only when version is 3 or above)
	        if (version >= 3) {
	            // it might be that everything is already normalized here...
	            bool all_between_min_1_and_1 = true;
	            const size_t sample_count = signal_frame.rows * signal_frame.cols;
	            for (size_t sample = 0; sample < sample_count; sample++) {
	                if (signal_frame.buffer[sample] < -1.0f || signal_frame.buffer[sample] > 1.0f) {
	                    all_between_min_1_and_1 = false;
	                    break;
	                }
	            }

	            if (!all_between_min_1_and_1) {
	                ret = numpy::scale(&signal_frame, 1.0f / 32768.0f);
	                if (ret != 0) {
	                    EIDSP_ERR(ret);
	                }
	            }
	        }

	        // write this frame's power spectrum straight into its output row
	        ret = numpy::power_spectrum(
	            signal_frame.buffer,
	            stack_frame_info.frame_length,
	            out_features->buffer + (ix * coefficients),
	            coefficients,
	            fft_length
	        );

	        if (ret != 0) {
	            EIDSP_ERR(ret);
	        }
	    }

	    numpy::zero_handling(out_features);

	    return EIDSP_OK;
	}

	/**
	 * Work out the dimensions of the matrix that holds MFE output.
	 * @param signal_length: Length of the signal.
	 * @param sampling_frequency (int): The sampling frequency of the signal.
	 * @param frame_length (float): The length of the frame in second.
	 * @param frame_stride (float): The stride between frames.
	 * @param num_filters Number of filters; becomes the column count.
	 * @param version Forwarded to `processing::calculate_no_of_stack_frames`.
	 * @returns rows = number of stacked frames, cols = num_filters
	 */
	static matrix_size_t calculate_mfe_buffer_size(
	    size_t signal_length,
	    uint32_t sampling_frequency,
	    float frame_length, float frame_stride, uint16_t num_filters,
	    uint16_t version)
	{
	    // one output row per stacked frame
	    const int32_t frame_count = processing::calculate_no_of_stack_frames(
	        signal_length, sampling_frequency, frame_length, frame_stride, false, version);
	    const int32_t filter_count = num_filters;

	    matrix_size_t dimensions;
	    dimensions.rows = static_cast<uint32_t>(frame_count);
	    dimensions.cols = static_cast<uint32_t>(filter_count);
	    return dimensions;
	}

	/**
	* Compute MFCC features from an audio signal.
	* @param out_features Use `calculate_mfcc_buffer_size` to allocate the right matrix.
	* @param signal: audio signal structure from which to compute features.
	* has functions to retrieve data from a signal lazily.
	* @param sampling_frequency (int): the sampling frequency of the signal
	* we are working with.
	* @param frame_length (float): the length of each frame in seconds.
	* Default is 0.020s
	* @param frame_stride (float): the step between successive frames in seconds.
	* Default is 0.01s (means no overlap)
	* @param num_cepstral (int): Number of cepstral coefficients.
	* @param num_filters (int): the number of filters in the filterbank,
	* default 40.
	* @param fft_length (int): number of FFT points. Default is 512.
	* @param low_frequency (int): lowest band edge of mel filters.
	* In Hz, default is 0.
	* @param high_frequency (int): highest band edge of mel filters.
	* In Hz, default is samplerate/2
	* @param dc_elimination Whether the first dc component should
	* be eliminated or not.
	* @returns 0 if OK
	*/
	static int mfcc(matrix_t out_features, signal_t signal,
	uint32_t sampling_frequency, float frame_length, float frame_stride,
	uint8_t num_cepstral, uint16_t num_filters, uint16_t fft_length,
	uint32_t low_frequency, uint32_t high_frequency, bool dc_elimination,
	uint16_t version)
	{
	if (out_features->cols != num_cepstral) {
	EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	}

	matrix_size_t mfe_matrix_size =
	calculate_mfe_buffer_size(
	signal->total_length,
	sampling_frequency,
	frame_length,
	frame_stride,
	num_filters,
	version);

	if (out_features->rows != mfe_matrix_size.rows) {
	EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
	}

	int ret = EIDSP_OK;

	// allocate some memory for the MFE result
	EI_DSP_MATRIX(features_matrix, mfe_matrix_size.rows, mfe_matrix_size.cols);
	if (!features_matrix.buffer) {
	EIDSP_ERR(EIDSP_OUT_OF_MEM);
	}

	EI_DSP_MATRIX(energy_matrix, mfe_matrix_size.rows, 1);
	if (!energy_matrix.buffer) {
	EIDSP_ERR(EIDSP_OUT_OF_MEM);
	}

	ret = mfe(&features_matrix, &energy_matrix, signal,
	sampling_frequency, frame_length, frame_stride, num_filters, fft_length,
	low_frequency, high_frequency, version);
	if (ret != EIDSP_OK) {
	EIDSP_ERR(ret);
	}

	// ok... now we need to calculate the MFCC from this...
	// first do log() over all features...
	ret = numpy::log(&features_matrix);
	if (ret != EIDSP_OK) {
	EIDSP_ERR(ret);
	}

	// now do DST type 2
	ret = numpy::dct2(&features_matrix, DCT_NORMALIZATION_ORTHO);
	if (ret != EIDSP_OK) {
	EIDSP_ERR(ret);
	}

	// replace first cepstral coefficient with log of frame energy for DC elimination
	if (dc_elimination) {
	for (size_t row = 0; row < features_matrix.rows; row++) {
	features_matrix.buffer[row * features_matrix.cols] = numpy::log(energy_matrix.buffer[row]);
	}
	}

	// copy to the output...
	for (size_t row = 0; row < features_matrix.rows; row++) {
	for(int i = 0; i < num_cepstral; i++) {
	(out_features->buffer + (num_cepstral row) + i) = (features_matrix.buffer + (features_matrix.cols row) + i);
	}
	}

	return EIDSP_OK;
	}

	/**
	 * Work out the dimensions of the matrix that holds MFCC output.
	 * @param signal_length: Length of the signal.
	 * @param sampling_frequency (int): The sampling frequency of the signal.
	 * @param frame_length (float): The length of the frame in second.
	 * @param frame_stride (float): The stride between frames.
	 * @param num_cepstral Number of cepstral coefficients; becomes the column count.
	 * @param version Forwarded to `processing::calculate_no_of_stack_frames`.
	 * @returns rows = number of stacked frames, cols = num_cepstral
	 */
	static matrix_size_t calculate_mfcc_buffer_size(
	    size_t signal_length,
	    uint32_t sampling_frequency,
	    float frame_length, float frame_stride, uint16_t num_cepstral,
	    uint16_t version)
	{
	    // one output row per stacked frame
	    const int32_t frame_count = processing::calculate_no_of_stack_frames(
	        signal_length, sampling_frequency, frame_length, frame_stride, false, version);
	    const int32_t coefficient_count = num_cepstral;

	    matrix_size_t dimensions;
	    dimensions.rows = static_cast<uint32_t>(frame_count);
	    dimensions.cols = static_cast<uint32_t>(coefficient_count);
	    return dimensions;
	}
	};

	} // namespace speechpy
	} // namespace ei

	#endif // _EIDSP_SPEECHPY_FEATURE_H_