// NOTE(review): removed non-source scrape artifact that preceded the file
// ("luisomoreau's picture / Upload 1028 files / b7b614e") -- it is not valid C++.
/*
* Copyright (c) 2022 EdgeImpulse Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS
* IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef _EIDSP_SPEECHPY_PROCESSING_H_
#define _EIDSP_SPEECHPY_PROCESSING_H_
#include "../numpy.hpp"
namespace ei {
namespace speechpy {
// one stack frame returned by stack_frames
typedef struct ei_stack_frames_info {
signal_t *signal; // source signal the frame indices refer to (not owned by this struct)
ei_vector<uint32_t> frame_ixs; // start offset (in samples) of each frame within the signal
int frame_length; // number of samples per frame (set by stack_frames)
} stack_frames_info_t;
namespace processing {
/**
* Lazy Preemphasising on the signal.
* @param signal: The input signal.
* @param shift (int): The shift step.
* @param cof (float): The preemphasising coefficient. 0 equals to no filtering.
*/
class preemphasis {
public:
    /**
     * Lazy preemphasis filter over a signal.
     * @param signal The input signal (not owned; must outlive this object).
     * @param shift The shift step. A negative shift is interpreted as
     *              "total_length + shift" (i.e. counted from the end).
     * @param cof The preemphasis coefficient. 0 equals no filtering.
     * @param rescale If true, rescale output by 1/32768 when any sample
     *                falls outside [-1, 1].
     */
    preemphasis(ei_signal_t *signal, int shift, float cof, bool rescale)
        : _signal(signal), _shift(shift), _cof(cof), _rescale(rescale)
    {
        // Normalize a negative shift BEFORE allocating: the original code
        // allocated with the raw (possibly negative) shift, which casts to a
        // huge size_t in ei_dsp_calloc and reads out of bounds below.
        if (shift < 0) {
            _shift = signal->total_length + shift;
        }
        _prev_buffer = (float*)ei_dsp_calloc(_shift * sizeof(float), 1);
        _end_of_signal_buffer = (float*)ei_dsp_calloc(_shift * sizeof(float), 1);
        _next_offset_should_be = 0;
        // Allocation failure is reported lazily from get_data (ctors cannot
        // return an error code).
        if (!_prev_buffer || !_end_of_signal_buffer) return;
        // cache the last _shift samples; they are used for the wrap-around
        // (numpy.roll-style) part at the very start of the signal
        _signal->get_data(_signal->total_length - _shift, _shift, _end_of_signal_buffer);
    }

    /**
     * Get preemphasized data from the underlying audio buffer...
     * This retrieves data from the signal then preemphasizes it.
     * @param offset Offset in the audio signal
     * @param length Length of the audio signal
     * @param out_buffer Output buffer, must hold at least `length` floats
     * @returns EIDSP_OK if OK, negative error code otherwise
     */
    int get_data(size_t offset, size_t length, float *out_buffer) {
        if (!_prev_buffer || !_end_of_signal_buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }
        if (offset + length > _signal->total_length) {
            EIDSP_ERR(EIDSP_OUT_OF_BOUNDS);
        }

        int ret;
        // preload the _shift samples that precede `offset` into the history
        // buffer; if offset < _shift we instead read from the end-of-signal
        // cache inside the loop, so no fetch is needed here
        if (static_cast<int32_t>(offset) - _shift >= 0) {
            ret = _signal->get_data(offset - _shift, _shift, _prev_buffer);
            if (ret != 0) {
                EIDSP_ERR(ret);
            }
        }

        ret = _signal->get_data(offset, length, out_buffer);
        if (ret != 0) {
            EIDSP_ERR(ret);
        }

        // it might be that everything is already normalized here...
        bool all_between_min_1_and_1 = true;

        // now we have the signal and we can preemphasize:
        // y[n] = x[n] - cof * x[n - shift], wrapping to the signal tail
        for (size_t ix = 0; ix < length; ix++) {
            float now = out_buffer[ix];

            // under shift? read from end (wrap-around, like numpy.roll)
            if (offset + ix < static_cast<uint32_t>(_shift)) {
                out_buffer[ix] = now - (_cof * _end_of_signal_buffer[offset + ix]);
            }
            // otherwise read from history buffer; after the roll below,
            // _prev_buffer[0] always holds sample (offset + ix - _shift)
            else {
                out_buffer[ix] = now - (_cof * _prev_buffer[0]);
            }

            if (_rescale && all_between_min_1_and_1) {
                if (out_buffer[ix] < -1.0f || out_buffer[ix] > 1.0f) {
                    all_between_min_1_and_1 = false;
                }
            }

            // roll through and overwrite last element (no-op when _shift == 1,
            // since the single slot is simply overwritten below)
            if (_shift != 1) {
                numpy::roll(_prev_buffer, _shift, -1);
            }
            _prev_buffer[_shift - 1] = now;
        }

        _next_offset_should_be += length;

        // rescale from [-1 .. 1] ?
        if (_rescale && !all_between_min_1_and_1) {
            matrix_t scale_matrix(length, 1, out_buffer);
            ret = numpy::scale(&scale_matrix, 1.0f / 32768.0f);
            if (ret != 0) {
                EIDSP_ERR(ret);
            }
        }

        return EIDSP_OK;
    }

    ~preemphasis() {
        if (_prev_buffer) {
            ei_dsp_free(_prev_buffer, _shift * sizeof(float));
        }
        if (_end_of_signal_buffer) {
            ei_dsp_free(_end_of_signal_buffer, _shift * sizeof(float));
        }
    }

private:
    ei_signal_t *_signal;      // not owned
    int _shift;                // always >= 0 after construction
    float _cof;                // preemphasis coefficient
    float *_prev_buffer;       // last _shift samples seen (history)
    float *_end_of_signal_buffer; // cached tail of the signal for wrap-around
    size_t _next_offset_should_be; // bookkeeping only; never read back
    bool _rescale;
};
}
namespace processing {
/**
* Preemphasising on the signal. This modifies the signal in place!
* For memory consumption reasons you **probably** want the preemphasis class,
* which lazily loads the signal in.
* @param signal (array): The input signal.
* @param shift (int): The shift step.
* @param cof (float): The preemphasising coefficient. 0 equals to no filtering.
* @returns 0 when successful
*/
__attribute__((unused)) static int preemphasis(float *signal, size_t signal_size, int shift = 1, float cof = 0.98f)
{
if (shift < 0) {
shift = signal_size + shift;
}
// so we need to keep some history
float *prev_buffer = (float*)ei_dsp_calloc(shift * sizeof(float), 1);
// signal - cof * xt::roll(signal, shift)
for (size_t ix = 0; ix < signal_size; ix++) {
float now = signal[ix];
// under shift? read from end
if (ix < static_cast<uint32_t>(shift)) {
signal[ix] = now - (cof * signal[signal_size - shift + ix]);
}
// otherwise read from history buffer
else {
signal[ix] = now - (cof * prev_buffer[0]);
}
// roll through and overwrite last element
numpy::roll(prev_buffer, shift, -1);
prev_buffer[shift - 1] = now;
}
ei_dsp_free(prev_buffer, shift * sizeof(float));
return EIDSP_OK;
}
/**
* frame_length is a float and can thus be off by a little bit, e.g.
* frame_length = 0.018f actually can yield 0.018000011f
* thus screwing up our frame calculations here...
*/
/**
 * Round up, unless the value sits within 0.001 above an integer -- in that
 * case snap down to that integer. This compensates for float imprecision in
 * frame_length, e.g. 0.018f can actually be 0.018000011f, which would
 * otherwise inflate the frame calculations by one sample.
 */
static float ceil_unless_very_close_to_floor(float v) {
    const float fl = floor(v);
    // within the tolerance band just above an integer? snap down
    return (v > fl && v - fl < 0.001f) ? fl : ceil(v);
}
/**
 * Calculate the length of a signal that will be used for the settings provided.
 * @param signal_size: The number of frames in the signal
 * @param sampling_frequency (int): The sampling frequency of the signal.
 * @param frame_length (float): The length of the frame in seconds.
 * @param frame_stride (float): The stride between frames.
 * @returns Number of samples used, or a negative number if an error occurred
 */
static int calculate_signal_used(
size_t signal_size,
uint32_t sampling_frequency,
float frame_length,
float frame_stride,
bool zero_padding,
uint16_t version)
{
int frame_sample_length;
int length;
if (version == 1) {
// v1: plain rounding of seconds -> samples
frame_sample_length = static_cast<int>(round(static_cast<float>(sampling_frequency) * frame_length));
frame_stride = round(static_cast<float>(sampling_frequency) * frame_stride);
length = frame_sample_length;
}
else {
// v2+: ceil, unless the product is within 0.001 above an integer
// (guards against frame_length float imprecision, see helper above)
frame_sample_length = static_cast<int>(ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_length));
float frame_stride_arg = frame_stride;
frame_stride = ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_stride_arg);
length = (frame_sample_length - (int)frame_stride);
}
// NOTE(review): volatile presumably forces the compiler to materialize the
// intermediate int values (matching stack_frames exactly) -- TODO confirm
volatile int numframes;
volatile int len_sig;
if (zero_padding) {
// Calculation of number of frames
numframes = static_cast<int>(
ceil(static_cast<float>(signal_size - length) / frame_stride));
// Zero padding
len_sig = static_cast<int>(static_cast<float>(numframes) * frame_stride) + frame_sample_length;
}
else {
// without padding the last partial frame is dropped (floor)
numframes = static_cast<int>(
floor(static_cast<float>(signal_size - length) / frame_stride));
len_sig = static_cast<int>(
(static_cast<float>(numframes - 1) * frame_stride + frame_sample_length));
}
return len_sig;
}
/**
* Frame a signal into overlapping frames.
* @param info This is both the base object and where we'll store our results.
* @param sampling_frequency (int): The sampling frequency of the signal.
* @param frame_length (float): The length of the frame in second.
* @param frame_stride (float): The stride between frames.
* @param zero_padding (bool): If the samples is not a multiple of
* frame_length(number of frames sample), zero padding will
* be done for generating last frame.
* @returns EIDSP_OK if OK
*/
static int stack_frames(stack_frames_info_t *info,
float sampling_frequency,
float frame_length,
float frame_stride,
bool zero_padding,
uint16_t version)
{
// guard against a missing/empty signal before touching it
if (!info->signal || !info->signal->get_data || info->signal->total_length == 0) {
EIDSP_ERR(EIDSP_SIGNAL_SIZE_MISMATCH);
}
size_t length_signal = info->signal->total_length;
int frame_sample_length;
int length;
if (version == 1) {
// v1: plain rounding of seconds -> samples
frame_sample_length = static_cast<int>(round(static_cast<float>(sampling_frequency) * frame_length));
frame_stride = round(static_cast<float>(sampling_frequency) * frame_stride);
length = frame_sample_length;
}
else {
// v2+: ceil, unless within 0.001 above an integer (float-imprecision guard)
frame_sample_length = static_cast<int>(ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_length));
float frame_stride_arg = frame_stride;
frame_stride = ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_stride_arg);
length = (frame_sample_length - (int)frame_stride);
}
// NOTE(review): volatile presumably forces the compiler to materialize the
// intermediate values in this exact order -- TODO confirm
volatile int numframes;
volatile int len_sig;
if (zero_padding) {
// Calculation of number of frames
numframes = static_cast<int>(
ceil(static_cast<float>(length_signal - length) / frame_stride));
// Zero padding
len_sig = static_cast<int>(static_cast<float>(numframes) * frame_stride) + frame_sample_length;
// shrink/grow the signal view to exactly the samples the frames cover
info->signal->total_length = static_cast<size_t>(len_sig);
}
else {
// without padding the last partial frame is dropped (floor)
numframes = static_cast<int>(
floor(static_cast<float>(length_signal - length) / frame_stride));
len_sig = static_cast<int>(
(static_cast<float>(numframes - 1) * frame_stride + frame_sample_length));
info->signal->total_length = static_cast<size_t>(len_sig);
}
// record the start offset of every frame; capped at numframes entries
info->frame_ixs.clear();
int frame_count = 0;
for (size_t ix = 0; ix < static_cast<uint32_t>(len_sig); ix += static_cast<size_t>(frame_stride)) {
if (++frame_count > numframes) break;
info->frame_ixs.push_back(ix);
}
info->frame_length = frame_sample_length;
return EIDSP_OK;
}
/**
* Calculate the number of stack frames for the settings provided.
* This is needed to allocate the right buffer size for the output of f.e. the MFE
* blocks.
* @param signal_size: The number of frames in the signal
* @param sampling_frequency (int): The sampling frequency of the signal.
* @param frame_length (float): The length of the frame in second.
* @param frame_stride (float): The stride between frames.
* @param zero_padding (bool): If the samples is not a multiple of
* frame_length(number of frames sample), zero padding will
* be done for generating last frame.
* @returns Number of frames required, or a negative number if an error occured
*/
static int32_t calculate_no_of_stack_frames(
size_t signal_size,
uint32_t sampling_frequency,
float frame_length,
float frame_stride,
bool zero_padding,
uint16_t version)
{
int frame_sample_length;
int length;
if (version == 1) {
// v1: plain rounding of seconds -> samples
frame_sample_length = static_cast<int>(round(static_cast<float>(sampling_frequency) * frame_length));
frame_stride = round(static_cast<float>(sampling_frequency) * frame_stride);
length = frame_sample_length;
}
else {
// v2+: ceil, unless within 0.001 above an integer (float-imprecision guard);
// must match stack_frames above so buffer sizing agrees with framing
frame_sample_length = static_cast<int>(ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_length));
float frame_stride_arg = frame_stride;
frame_stride = ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_stride_arg);
length = (frame_sample_length - (int)frame_stride);
}
// NOTE(review): volatile mirrors stack_frames, presumably to keep the exact
// float->int conversion order across optimization levels -- TODO confirm
volatile int numframes;
if (zero_padding) {
// Calculation of number of frames (last partial frame is padded, so ceil)
numframes = static_cast<int>(
ceil(static_cast<float>(signal_size - length) / frame_stride));
}
else {
// last partial frame is dropped, so floor
numframes = static_cast<int>(
floor(static_cast<float>(signal_size - length) / frame_stride));
}
return numframes;
}
/**
* This function performs local cepstral mean and
* variance normalization on a sliding window. The code assumes that
* there is one observation per row.
* @param features_matrix input feature matrix, will be modified in place
* @param win_size The size of sliding window for local normalization.
* Default=301 which is around 3s if 100 Hz rate is
* considered(== 10ms frame stide)
* @param variance_normalization If the variance normilization should
* be performed or not.
* @param scale Scale output to 0..1
* @returns 0 if OK
*/
/**
 * Local cepstral mean and (optionally) variance normalization over a sliding
 * window; one observation per row, modified in place.
 * @param features_matrix input feature matrix, will be modified in place
 * @param win_size size of the sliding window (rows); 0 is a no-op
 * @param variance_normalization also divide by the windowed std-dev
 * @param scale scale output to 0..1 afterwards
 * @returns 0 if OK, negative error code otherwise
 */
static int cmvnw(matrix_t *features_matrix, uint16_t win_size = 301, bool variance_normalization = false,
bool scale = false)
{
if (win_size == 0) {
return EIDSP_OK;
}
// symmetric padding of half a window on each side
uint16_t pad_size = (win_size - 1) / 2;
int ret;
float *features_buffer_ptr;
// mean & variance normalization
EI_DSP_MATRIX(vec_pad, features_matrix->rows + (pad_size * 2), features_matrix->cols);
if (!vec_pad.buffer) {
EIDSP_ERR(EIDSP_OUT_OF_MEM);
}
ret = numpy::pad_1d_symmetric(features_matrix, &vec_pad, pad_size, pad_size);
if (ret != EIDSP_OK) {
EIDSP_ERR(ret);
}
EI_DSP_MATRIX(mean_matrix, vec_pad.cols, 1);
if (!mean_matrix.buffer) {
EIDSP_ERR(EIDSP_OUT_OF_MEM);
}
EI_DSP_MATRIX(window_variance, vec_pad.cols, 1);
if (!window_variance.buffer) {
// FIX: was a bare `return EIDSP_OUT_OF_MEM;` -- use EIDSP_ERR like every
// other error path in this function
EIDSP_ERR(EIDSP_OUT_OF_MEM);
}
// pass 1: subtract the windowed mean from every row
for (size_t ix = 0; ix < features_matrix->rows; ix++) {
// create a slice on the vec_pad
EI_DSP_MATRIX_B(window, win_size, vec_pad.cols, vec_pad.buffer + (ix * vec_pad.cols));
if (!window.buffer) {
EIDSP_ERR(EIDSP_OUT_OF_MEM);
}
ret = numpy::mean_axis0(&window, &mean_matrix);
if (ret != EIDSP_OK) {
EIDSP_ERR(ret);
}
// subtract the mean for the features
for (size_t fm_col = 0; fm_col < features_matrix->cols; fm_col++) {
features_matrix->buffer[(ix * features_matrix->cols) + fm_col] =
features_matrix->buffer[(ix * features_matrix->cols) + fm_col] - mean_matrix.buffer[fm_col];
}
}
// re-pad the now mean-normalized features for the variance pass
ret = numpy::pad_1d_symmetric(features_matrix, &vec_pad, pad_size, pad_size);
if (ret != EIDSP_OK) {
EIDSP_ERR(ret);
}
// pass 2: divide by the windowed standard deviation (if requested)
for (size_t ix = 0; ix < features_matrix->rows; ix++) {
// create a slice on the vec_pad
EI_DSP_MATRIX_B(window, win_size, vec_pad.cols, vec_pad.buffer + (ix * vec_pad.cols));
if (!window.buffer) {
EIDSP_ERR(EIDSP_OUT_OF_MEM);
}
if (variance_normalization == true) {
ret = numpy::std_axis0(&window, &window_variance);
if (ret != EIDSP_OK) {
EIDSP_ERR(ret);
}
features_buffer_ptr = &features_matrix->buffer[ix * vec_pad.cols];
for (size_t col = 0; col < vec_pad.cols; col++) {
// epsilon avoids division by zero on constant windows
*(features_buffer_ptr) = (*(features_buffer_ptr)) /
(window_variance.buffer[col] + 1e-10);
features_buffer_ptr++;
}
}
}
if (scale) {
ret = numpy::normalize(features_matrix);
if (ret != EIDSP_OK) {
EIDSP_ERR(ret);
}
}
return EIDSP_OK;
}
/**
* Perform normalization for MFE frames, this converts the signal to dB,
* then add a hard filter, and quantize / dequantize the output
* @param features_matrix input feature matrix, will be modified in place
*/
/**
 * Normalize MFE frames in place: convert each power value to dB, apply the
 * noise-floor offset and scaling into [0..1], then quantize/dequantize to
 * 8 bits to match the Python training pipeline.
 * @param features_matrix input feature matrix, will be modified in place
 * @param noise_floor_db noise floor (negative dB); values at or below it map to 0
 * @returns EIDSP_OK
 */
static int mfe_normalization(matrix_t *features_matrix, int noise_floor_db) {
    const float noise = static_cast<float>(noise_floor_db * -1);
    const float noise_scale = 1.0f / (static_cast<float>(noise_floor_db * -1) + 12.0f);
    const size_t total_items = features_matrix->rows * features_matrix->cols;

    for (size_t idx = 0; idx < total_items; idx++) {
        float val = features_matrix->buffer[idx];
        // floor tiny values so log10 stays finite
        if (val < 1e-30) {
            val = 1e-30;
        }
        // power -> dB, then shift by the noise floor and scale toward [0..1]
        val = numpy::log10(val);
        val *= 10.0f;
        val += noise;
        val *= noise_scale;
        /* Mirror the Python reference:
           # Quantize to 8 bits and dequantize back to float32
           mfe = np.uint8(np.around(mfe * 2**8))
           # clip to 2**8
           mfe = np.clip(mfe, 0, 255)
           mfe = np.float32(mfe / 2**8)
        */
        val = roundf(val * 256) / 256;
        // final clip into [0, 1]
        if (val < 0.0f) {
            val = 0.0f;
        }
        else if (val > 1.0f) {
            val = 1.0f;
        }
        features_matrix->buffer[idx] = val;
    }

    return EIDSP_OK;
}
/**
* Perform normalization for spectrogram frames, this converts the signal to dB,
* then add a hard filter
* @param features_matrix input feature matrix, will be modified in place
*/
/**
 * Normalize spectrogram frames in place: convert each power value to dB,
 * apply the noise-floor offset and scaling, then clip into [0..1].
 * Identical to mfe_normalization but without the 8-bit quantization step.
 * @param features_matrix input feature matrix, will be modified in place
 * @param noise_floor_db noise floor (negative dB); values at or below it map to 0
 * @returns EIDSP_OK
 */
static int spectrogram_normalization(matrix_t *features_matrix, int noise_floor_db) {
    const float noise = static_cast<float>(noise_floor_db * -1);
    const float noise_scale = 1.0f / (static_cast<float>(noise_floor_db * -1) + 12.0f);
    const size_t total_items = features_matrix->rows * features_matrix->cols;

    for (size_t idx = 0; idx < total_items; idx++) {
        float val = features_matrix->buffer[idx];
        // floor tiny values so log10 stays finite
        if (val < 1e-30) {
            val = 1e-30;
        }
        // power -> dB, then shift by the noise floor and scale toward [0..1]
        val = numpy::log10(val);
        val *= 10.0f;
        val += noise;
        val *= noise_scale;
        // clip into [0, 1]
        if (val < 0.0f) {
            val = 0.0f;
        }
        else if (val > 1.0f) {
            val = 1.0f;
        }
        features_matrix->buffer[idx] = val;
    }

    return EIDSP_OK;
}
};
} // namespace speechpy
} // namespace ei
#endif // _EIDSP_SPEECHPY_PROCESSING_H_