|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef _EIDSP_SPEECHPY_PROCESSING_H_ |
|
|
#define _EIDSP_SPEECHPY_PROCESSING_H_ |
|
|
|
|
|
#include "../numpy.hpp" |
|
|
|
|
|
namespace ei { |
|
|
namespace speechpy { |
|
|
|
|
|
|
|
|
typedef struct ei_stack_frames_info { |
|
|
signal_t *signal; |
|
|
ei_vector<uint32_t> frame_ixs; |
|
|
int frame_length; |
|
|
} stack_frames_info_t; |
|
|
|
|
|
namespace processing { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class preemphasis { |
|
|
public: |
|
|
preemphasis(ei_signal_t *signal, int shift, float cof, bool rescale) |
|
|
: _signal(signal), _shift(shift), _cof(cof), _rescale(rescale) |
|
|
{ |
|
|
_prev_buffer = (float*)ei_dsp_calloc(shift * sizeof(float), 1); |
|
|
_end_of_signal_buffer = (float*)ei_dsp_calloc(shift * sizeof(float), 1); |
|
|
_next_offset_should_be = 0; |
|
|
|
|
|
if (shift < 0) { |
|
|
_shift = signal->total_length + shift; |
|
|
} |
|
|
|
|
|
if (!_prev_buffer || !_end_of_signal_buffer) return; |
|
|
|
|
|
|
|
|
signal->get_data(signal->total_length - shift, shift, _end_of_signal_buffer); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int get_data(size_t offset, size_t length, float *out_buffer) { |
|
|
if (!_prev_buffer || !_end_of_signal_buffer) { |
|
|
EIDSP_ERR(EIDSP_OUT_OF_MEM); |
|
|
} |
|
|
if (offset + length > _signal->total_length) { |
|
|
EIDSP_ERR(EIDSP_OUT_OF_BOUNDS); |
|
|
} |
|
|
|
|
|
int ret; |
|
|
if (static_cast<int32_t>(offset) - _shift >= 0) { |
|
|
ret = _signal->get_data(offset - _shift, _shift, _prev_buffer); |
|
|
if (ret != 0) { |
|
|
EIDSP_ERR(ret); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
ret = _signal->get_data(offset, length, out_buffer); |
|
|
if (ret != 0) { |
|
|
EIDSP_ERR(ret); |
|
|
} |
|
|
|
|
|
|
|
|
bool all_between_min_1_and_1 = true; |
|
|
|
|
|
|
|
|
for (size_t ix = 0; ix < length; ix++) { |
|
|
float now = out_buffer[ix]; |
|
|
|
|
|
|
|
|
if (offset + ix < static_cast<uint32_t>(_shift)) { |
|
|
out_buffer[ix] = now - (_cof * _end_of_signal_buffer[offset + ix]); |
|
|
} |
|
|
|
|
|
else { |
|
|
out_buffer[ix] = now - (_cof * _prev_buffer[0]); |
|
|
} |
|
|
|
|
|
if (_rescale && all_between_min_1_and_1) { |
|
|
if (out_buffer[ix] < -1.0f || out_buffer[ix] > 1.0f) { |
|
|
all_between_min_1_and_1 = false; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
if (_shift != 1) { |
|
|
numpy::roll(_prev_buffer, _shift, -1); |
|
|
} |
|
|
_prev_buffer[_shift - 1] = now; |
|
|
} |
|
|
|
|
|
_next_offset_should_be += length; |
|
|
|
|
|
|
|
|
if (_rescale && !all_between_min_1_and_1) { |
|
|
matrix_t scale_matrix(length, 1, out_buffer); |
|
|
ret = numpy::scale(&scale_matrix, 1.0f / 32768.0f); |
|
|
if (ret != 0) { |
|
|
EIDSP_ERR(ret); |
|
|
} |
|
|
} |
|
|
|
|
|
return EIDSP_OK; |
|
|
} |
|
|
|
|
|
~preemphasis() { |
|
|
if (_prev_buffer) { |
|
|
ei_dsp_free(_prev_buffer, _shift * sizeof(float)); |
|
|
} |
|
|
if (_end_of_signal_buffer) { |
|
|
ei_dsp_free(_end_of_signal_buffer, _shift * sizeof(float)); |
|
|
} |
|
|
} |
|
|
|
|
|
private: |
|
|
ei_signal_t *_signal; |
|
|
int _shift; |
|
|
float _cof; |
|
|
float *_prev_buffer; |
|
|
float *_end_of_signal_buffer; |
|
|
size_t _next_offset_should_be; |
|
|
bool _rescale; |
|
|
}; |
|
|
} |
|
|
|
|
|
namespace processing { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__attribute__((unused)) static int preemphasis(float *signal, size_t signal_size, int shift = 1, float cof = 0.98f) |
|
|
{ |
|
|
if (shift < 0) { |
|
|
shift = signal_size + shift; |
|
|
} |
|
|
|
|
|
|
|
|
float *prev_buffer = (float*)ei_dsp_calloc(shift * sizeof(float), 1); |
|
|
|
|
|
|
|
|
for (size_t ix = 0; ix < signal_size; ix++) { |
|
|
float now = signal[ix]; |
|
|
|
|
|
|
|
|
if (ix < static_cast<uint32_t>(shift)) { |
|
|
signal[ix] = now - (cof * signal[signal_size - shift + ix]); |
|
|
} |
|
|
|
|
|
else { |
|
|
signal[ix] = now - (cof * prev_buffer[0]); |
|
|
} |
|
|
|
|
|
|
|
|
numpy::roll(prev_buffer, shift, -1); |
|
|
prev_buffer[shift - 1] = now; |
|
|
} |
|
|
|
|
|
ei_dsp_free(prev_buffer, shift * sizeof(float)); |
|
|
|
|
|
return EIDSP_OK; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Rounds `v` up to the next whole number, except when `v` lies strictly above
 * a whole number by less than 0.001 - then that whole number is returned.
 * Values that already are whole numbers are returned unchanged.
 */
static float ceil_unless_very_close_to_floor(float v) {
    const float whole = floor(v);
    const bool barely_above_whole = (v > whole) && (v - whole < 0.001f);
    if (barely_above_whole) {
        return whole;
    }
    return ceil(v);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int calculate_signal_used( |
|
|
size_t signal_size, |
|
|
uint32_t sampling_frequency, |
|
|
float frame_length, |
|
|
float frame_stride, |
|
|
bool zero_padding, |
|
|
uint16_t version) |
|
|
{ |
|
|
int frame_sample_length; |
|
|
int length; |
|
|
if (version == 1) { |
|
|
frame_sample_length = static_cast<int>(round(static_cast<float>(sampling_frequency) * frame_length)); |
|
|
frame_stride = round(static_cast<float>(sampling_frequency) * frame_stride); |
|
|
length = frame_sample_length; |
|
|
} |
|
|
else { |
|
|
frame_sample_length = static_cast<int>(ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_length)); |
|
|
float frame_stride_arg = frame_stride; |
|
|
frame_stride = ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_stride_arg); |
|
|
length = (frame_sample_length - (int)frame_stride); |
|
|
} |
|
|
|
|
|
volatile int numframes; |
|
|
volatile int len_sig; |
|
|
|
|
|
if (zero_padding) { |
|
|
|
|
|
numframes = static_cast<int>( |
|
|
ceil(static_cast<float>(signal_size - length) / frame_stride)); |
|
|
|
|
|
|
|
|
len_sig = static_cast<int>(static_cast<float>(numframes) * frame_stride) + frame_sample_length; |
|
|
} |
|
|
else { |
|
|
numframes = static_cast<int>( |
|
|
floor(static_cast<float>(signal_size - length) / frame_stride)); |
|
|
len_sig = static_cast<int>( |
|
|
(static_cast<float>(numframes - 1) * frame_stride + frame_sample_length)); |
|
|
} |
|
|
|
|
|
return len_sig; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Splits a signal into (possibly overlapping) frames.
 *
 * On success `info->frame_ixs` holds the start offset of every frame,
 * `info->frame_length` the number of samples per frame, and
 * `info->signal->total_length` is overwritten with the number of samples
 * covered by the frames.
 *
 * @param info               In/out. `info->signal` must be set, have a
 *                           get_data callback and a non-zero total_length.
 * @param sampling_frequency Sampling frequency, multiplied with the two
 *                           frame_* arguments to obtain sample counts.
 * @param frame_length       Frame length (times sampling_frequency = samples).
 * @param frame_stride       Frame stride (times sampling_frequency = samples).
 *                           Must resolve to a non-zero number of samples.
 * @param zero_padding       When true the frame count is rounded up (the
 *                           covered length may exceed the signal); otherwise
 *                           it is rounded down.
 * @param version            1 selects plain rounding of the sample counts;
 *                           any other value uses
 *                           ceil_unless_very_close_to_floor() and subtracts
 *                           the stride from the frame length when counting.
 * @return EIDSP_OK on success, EIDSP_SIGNAL_SIZE_MISMATCH when the signal is
 *         missing/empty or too short to be framed with these settings.
 */
static int stack_frames(stack_frames_info_t *info,
                        float sampling_frequency,
                        float frame_length,
                        float frame_stride,
                        bool zero_padding,
                        uint16_t version)
{
    if (!info->signal || !info->signal->get_data || info->signal->total_length == 0) {
        EIDSP_ERR(EIDSP_SIGNAL_SIZE_MISMATCH);
    }

    size_t length_signal = info->signal->total_length;
    int frame_sample_length;
    int length;
    if (version == 1) {
        frame_sample_length = static_cast<int>(round(static_cast<float>(sampling_frequency) * frame_length));
        frame_stride = round(static_cast<float>(sampling_frequency) * frame_stride);
        length = frame_sample_length;
    }
    else {
        frame_sample_length = static_cast<int>(ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_length));
        float frame_stride_arg = frame_stride;
        frame_stride = ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_stride_arg);
        length = (frame_sample_length - (int)frame_stride);
    }

    // Subtract in signed 64-bit arithmetic. `length_signal - length` in
    // unsigned arithmetic wraps to a huge value when the signal is shorter
    // than one frame, and converting that to int is undefined. For
    // length_signal >= length the value is identical to the unsigned result.
    const float remaining = static_cast<float>(
        static_cast<int64_t>(length_signal) - static_cast<int64_t>(length));

    // NOTE(review): `volatile` is kept from the original code; presumably it
    // works around a compiler / floating-point issue - confirm before removing.
    volatile int numframes;
    volatile int len_sig;

    if (zero_padding) {
        // Round up: a partially filled last frame still counts.
        numframes = static_cast<int>(ceil(remaining / frame_stride));

        len_sig = static_cast<int>(static_cast<float>(numframes) * frame_stride) + frame_sample_length;
    }
    else {
        // Round down: only completely filled frames count.
        numframes = static_cast<int>(floor(remaining / frame_stride));
        len_sig = static_cast<int>(
            (static_cast<float>(numframes - 1) * frame_stride + frame_sample_length));
    }

    // A negative frame count or length means the signal cannot be framed.
    // Bail out before touching `info`, otherwise the negative length would
    // be stored as a huge size_t in total_length.
    if (numframes < 0 || len_sig < 0) {
        EIDSP_ERR(EIDSP_SIGNAL_SIZE_MISMATCH);
    }

    info->signal->total_length = static_cast<size_t>(len_sig);

    info->frame_ixs.clear();

    int frame_count = 0;

    // One start offset per frame, `frame_stride` samples apart, capped at
    // `numframes` entries.
    for (size_t ix = 0; ix < static_cast<uint32_t>(len_sig); ix += static_cast<size_t>(frame_stride)) {
        if (++frame_count > numframes) break;

        info->frame_ixs.push_back(ix);
    }

    info->frame_length = frame_sample_length;

    return EIDSP_OK;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int32_t calculate_no_of_stack_frames( |
|
|
size_t signal_size, |
|
|
uint32_t sampling_frequency, |
|
|
float frame_length, |
|
|
float frame_stride, |
|
|
bool zero_padding, |
|
|
uint16_t version) |
|
|
{ |
|
|
int frame_sample_length; |
|
|
int length; |
|
|
if (version == 1) { |
|
|
frame_sample_length = static_cast<int>(round(static_cast<float>(sampling_frequency) * frame_length)); |
|
|
frame_stride = round(static_cast<float>(sampling_frequency) * frame_stride); |
|
|
length = frame_sample_length; |
|
|
} |
|
|
else { |
|
|
frame_sample_length = static_cast<int>(ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_length)); |
|
|
float frame_stride_arg = frame_stride; |
|
|
frame_stride = ceil_unless_very_close_to_floor(static_cast<float>(sampling_frequency) * frame_stride_arg); |
|
|
length = (frame_sample_length - (int)frame_stride); |
|
|
} |
|
|
|
|
|
volatile int numframes; |
|
|
|
|
|
if (zero_padding) { |
|
|
|
|
|
numframes = static_cast<int>( |
|
|
ceil(static_cast<float>(signal_size - length) / frame_stride)); |
|
|
} |
|
|
else { |
|
|
numframes = static_cast<int>( |
|
|
floor(static_cast<float>(signal_size - length) / frame_stride)); |
|
|
} |
|
|
|
|
|
return numframes; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Sliding-window mean (and optionally variance) normalisation over the rows
 * of a feature matrix, performed in place.
 *
 * For every row a window of `win_size` rows is taken from a symmetrically
 * padded copy of the matrix; the per-column mean of that window is
 * subtracted from the row. With variance normalisation the mean-normalised
 * matrix is padded again and every row is divided by the per-column
 * numpy::std_axis0 result of its window (plus 1e-10).
 *
 * @param features_matrix        Matrix to normalise in place.
 * @param win_size               Window height in rows. 0 disables the
 *                               function (returns EIDSP_OK untouched).
 * @param variance_normalization Also divide by the windowed deviation.
 * @param scale                  Run numpy::normalize on the result.
 * @return EIDSP_OK on success, EIDSP_OUT_OF_MEM on allocation failure, or
 *         the error returned by one of the numpy helpers.
 */
static int cmvnw(matrix_t *features_matrix, uint16_t win_size = 301, bool variance_normalization = false,
    bool scale = false)
{
    if (win_size == 0) {
        return EIDSP_OK;
    }

    // The padded matrix needs rows + win_size - 1 rows so that the window of
    // the LAST row still fits. Padding (win_size - 1) / 2 on both sides only
    // achieves that for odd sizes; for even sizes it is one row short and the
    // last window read past the buffer. Put the remainder at the end instead.
    // Odd sizes behave exactly as before.
    const uint16_t pad_before = static_cast<uint16_t>((win_size - 1) / 2);
    const uint16_t pad_after = static_cast<uint16_t>((win_size - 1) - pad_before);

    int ret;

    EI_DSP_MATRIX(vec_pad, features_matrix->rows + pad_before + pad_after, features_matrix->cols);
    if (!vec_pad.buffer) {
        EIDSP_ERR(EIDSP_OUT_OF_MEM);
    }

    ret = numpy::pad_1d_symmetric(features_matrix, &vec_pad, pad_before, pad_after);
    if (ret != EIDSP_OK) {
        EIDSP_ERR(ret);
    }

    // One mean per column of the current window.
    EI_DSP_MATRIX(mean_matrix, vec_pad.cols, 1);
    if (!mean_matrix.buffer) {
        EIDSP_ERR(EIDSP_OUT_OF_MEM);
    }

    // Pass 1: subtract the windowed mean from every row.
    for (size_t ix = 0; ix < features_matrix->rows; ix++) {
        // View of `win_size` padded rows starting at row ix (no copy).
        EI_DSP_MATRIX_B(window, win_size, vec_pad.cols, vec_pad.buffer + (ix * vec_pad.cols));
        if (!window.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        ret = numpy::mean_axis0(&window, &mean_matrix);
        if (ret != EIDSP_OK) {
            EIDSP_ERR(ret);
        }

        float *row = features_matrix->buffer + (ix * features_matrix->cols);
        for (size_t fm_col = 0; fm_col < features_matrix->cols; fm_col++) {
            row[fm_col] = row[fm_col] - mean_matrix.buffer[fm_col];
        }
    }

    // Pass 2 (optional): divide by the windowed deviation. Skipped entirely
    // when not requested - the padding and the loop had no other effect.
    if (variance_normalization) {
        EI_DSP_MATRIX(window_variance, vec_pad.cols, 1);
        if (!window_variance.buffer) {
            EIDSP_ERR(EIDSP_OUT_OF_MEM);
        }

        // Re-pad, this time from the mean-normalised features.
        ret = numpy::pad_1d_symmetric(features_matrix, &vec_pad, pad_before, pad_after);
        if (ret != EIDSP_OK) {
            EIDSP_ERR(ret);
        }

        for (size_t ix = 0; ix < features_matrix->rows; ix++) {
            EI_DSP_MATRIX_B(window, win_size, vec_pad.cols, vec_pad.buffer + (ix * vec_pad.cols));
            if (!window.buffer) {
                EIDSP_ERR(EIDSP_OUT_OF_MEM);
            }

            ret = numpy::std_axis0(&window, &window_variance);
            if (ret != EIDSP_OK) {
                EIDSP_ERR(ret);
            }

            float *features_buffer_ptr = &features_matrix->buffer[ix * vec_pad.cols];
            for (size_t col = 0; col < vec_pad.cols; col++) {
                // 1e-10 guards against division by zero.
                *(features_buffer_ptr) = (*(features_buffer_ptr)) /
                                         (window_variance.buffer[col] + 1e-10);
                features_buffer_ptr++;
            }
        }
    }

    if (scale) {
        ret = numpy::normalize(features_matrix);
        if (ret != EIDSP_OK) {
            EIDSP_ERR(ret);
        }
    }

    return EIDSP_OK;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Normalises MFE features in place.
 *
 * Every element is floored at 1e-30, converted with 10 * log10(x), offset by
 * -noise_floor_db, scaled by 1 / (-noise_floor_db + 12), quantised to
 * multiples of 1/256 and finally clamped to [0, 1].
 *
 * @param features_matrix Matrix to normalise in place.
 * @param noise_floor_db  Noise floor in dB.
 * @return Always EIDSP_OK.
 */
static int mfe_normalization(matrix_t *features_matrix, int noise_floor_db) {
    const float noise = static_cast<float>(noise_floor_db * -1);
    const float noise_scale = 1.0f / (noise + 12.0f);

    const size_t element_count = features_matrix->rows * features_matrix->cols;
    float *values = features_matrix->buffer;

    for (size_t i = 0; i < element_count; i++) {
        float value = values[i];

        // Keep log10 away from zero / negative input.
        if (value < 1e-30) {
            value = 1e-30;
        }

        value = numpy::log10(value);
        value *= 10.0f;
        value += noise;
        value *= noise_scale;

        // Quantise to steps of 1/256.
        value = roundf(value*256)/256;

        // Clamp to [0, 1].
        if (value < 0.0f) {
            value = 0.0f;
        }
        else if (value > 1.0f) {
            value = 1.0f;
        }

        values[i] = value;
    }

    return EIDSP_OK;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Normalises spectrogram features in place.
 *
 * Every element is floored at 1e-30, converted with 10 * log10(x), offset by
 * -noise_floor_db, scaled by 1 / (-noise_floor_db + 12) and clamped to [0, 1].
 *
 * @param features_matrix Matrix to normalise in place.
 * @param noise_floor_db  Noise floor in dB.
 * @return Always EIDSP_OK.
 */
static int spectrogram_normalization(matrix_t *features_matrix, int noise_floor_db) {
    const float noise = static_cast<float>(noise_floor_db * -1);
    const float noise_scale = 1.0f / (noise + 12.0f);

    const size_t element_count = features_matrix->rows * features_matrix->cols;
    float *values = features_matrix->buffer;

    for (size_t i = 0; i < element_count; i++) {
        float value = values[i];

        // Keep log10 away from zero / negative input.
        if (value < 1e-30) {
            value = 1e-30;
        }

        value = numpy::log10(value);
        value *= 10.0f;
        value += noise;
        value *= noise_scale;

        // Clamp to [0, 1].
        if (value < 0.0f) {
            value = 0.0f;
        }
        else if (value > 1.0f) {
            value = 1.0f;
        }

        values[i] = value;
    }

    return EIDSP_OK;
}
|
|
}; |
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|
#endif |
|
|
|