#include "audio_encoder_lib.h"

// NOTE(review): the header names and template arguments in this file were
// reconstructed from how the identifiers are used; confirm them against the
// build configuration and audio_encoder_lib.h.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>  // For memcpy
#include <fstream>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

// Include specific ONNX Runtime headers for implementation
#include <onnxruntime_cxx_api.h>

// Include specific Eigen headers for implementation
#include <Eigen/Dense>

// Include specific KissFFT headers for implementation
#include <kiss_fft.h>
#include <kiss_fftr.h>

// Define M_PI if it's not already defined
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

// --- Global parameters for feature extraction (matching Python script) ---
// These are constants derived from the Python preprocessing script and are
// internal to the feature extraction logic.
namespace {  // Anonymous namespace for internal linkage

constexpr float PREEMPHASIS_COEFF = 0.97f;
constexpr int N_FFT = 512;                 // FFT size
constexpr int WIN_LENGTH = 400;            // Window length (samples)
constexpr int HOP_LENGTH = 160;            // Hop length (samples)
constexpr int N_MELS = 80;                 // Number of Mel filterbank channels
constexpr int TARGET_SAMPLE_RATE = 16000;  // Target sample rate for feature extraction
constexpr int N_BINS = N_FFT / 2 + 1;      // Number of real-FFT output bins

static_assert(WIN_LENGTH >= 2, "pre-emphasis needs at least two samples per frame");
static_assert(WIN_LENGTH <= N_FFT, "a frame must fit into the zero-padded FFT buffer");
static_assert(HOP_LENGTH > 0, "hop length must be positive");

/**
 * @brief Returns every non-null name in `names` to `allocator` and empties the list.
 *
 * Written as a template so it does not depend on the exact member types
 * declared in the class header.
 */
template <typename Allocator, typename NameList>
void freeNodeNames(Allocator& allocator, NameList& names) {
    for (const char* name : names) {
        if (name != nullptr) {
            allocator.Free(const_cast<char*>(name));
        }
    }
    names.clear();
}

}  // namespace

// --- Implementation of AudioInferenceEngine methods ---

/**
 * @brief Creates the ONNX Runtime session and precomputes the Mel filterbank.
 *
 * @param modelPath Path to the ONNX model file.
 * @throws Ort::Exception if the model has no input or no output nodes
 *         (ONNX Runtime calls may throw it as well).
 * @throws std::runtime_error if the Mel filterbank comes back empty.
 */
AudioInferenceEngine::AudioInferenceEngine(const std::string& modelPath) {
    // 1. Initialize ONNX Runtime Environment
    env_ = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "AudioInferenceEngine");

    // 2. Configure Session Options
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(0);
    session_options.SetGraphOptimizationLevel(ORT_ENABLE_EXTENDED);

    // 3. Create ONNX Runtime Session
    // NOTE(review): this passes a narrow-character path; confirm that is what
    // the Ort::Session constructor expects on every target platform.
    session_ = std::make_unique<Ort::Session>(*env_, modelPath.c_str(), session_options);

    // 4. Initialize Allocator
    allocator_ = std::make_unique<Ort::AllocatorWithDefaultOptions>();

    // 5. Validate the model before anything is allocated by hand, so these
    //    early exits cannot leak node-name strings.
    const size_t numInputNodes = session_->GetInputCount();
    if (numInputNodes == 0) {
        throw Ort::Exception("ONNX model has no input nodes.", ORT_FAIL);
    }
    const size_t numOutputNodes = session_->GetOutputCount();
    if (numOutputNodes == 0) {
        throw Ort::Exception("ONNX model has no output nodes.", ORT_FAIL);
    }

    // 6. Precompute Mel filterbank
    // The Python example uses fmax=16000//2-80-230.
    const float mel_fmax = static_cast<float>(TARGET_SAMPLE_RATE) / 2.0f - 80.0f - 230.0f;
    mel_filterbank_ = speechlibMel(TARGET_SAMPLE_RATE, N_FFT, N_MELS, 0.0f, mel_fmax);
    if (mel_filterbank_.rows() == 0 || mel_filterbank_.cols() == 0) {
        throw std::runtime_error("Failed to create Mel filterbank during initialization.");
    }

    // 7. Get Input and Output Node Names
    // The names are allocated through the allocator and kept as C-style
    // strings for Ort::Session::Run. release() hands ownership to this object;
    // the destructor frees them. The destructor does not run when a
    // constructor throws, so a failure part-way through is cleaned up here.
    try {
        input_node_names_.resize(numInputNodes);
        for (size_t i = 0; i < numInputNodes; ++i) {
            input_node_names_[i] = session_->GetInputNameAllocated(i, *allocator_).release();
        }
        output_node_names_.resize(numOutputNodes);
        for (size_t i = 0; i < numOutputNodes; ++i) {
            output_node_names_[i] = session_->GetOutputNameAllocated(i, *allocator_).release();
        }
    } catch (...) {
        freeNodeNames(*allocator_, input_node_names_);
        freeNodeNames(*allocator_, output_node_names_);
        throw;
    }

    std::cout << "AudioInferenceEngine initialized successfully with model: " << modelPath << std::endl;
}

/**
 * @brief Frees the node-name strings taken over in the constructor.
 */
AudioInferenceEngine::~AudioInferenceEngine() {
    // Release allocated names
    if (allocator_) {
        freeNodeNames(*allocator_, input_node_names_);
        freeNodeNames(*allocator_, output_node_names_);
    }
    // unique_ptr automatically handles deletion of env_ and session_
}

/**
 * @brief Private helper: Loads audio data from a WAV file.
 *
 * Accepts 16-bit PCM with one or two channels. Stereo input is averaged to
 * mono. Samples are scaled by 1/32768.
 *
 * @param filename            Path of the WAV file.
 * @param actual_sample_rate  Receives the sample rate stored in the header
 *                            (written once the format checks have passed).
 * @return Mono samples, or an empty vector on any error (a message is written
 *         to std::cerr).
 *
 * NOTE(review): the header and the samples are read as raw bytes, which
 * assumes a little-endian host, and WavHeader is assumed to cover exactly the
 * bytes up to the end of the "fmt " chunk — confirm against its declaration.
 */
std::vector<float> AudioInferenceEngine::loadWavToFloatArray(const std::string& filename,
                                                             int& actual_sample_rate) {
    std::ifstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Error: Could not open WAV file: " << filename << std::endl;
        return {};
    }

    WavHeader header;
    if (!file.read(reinterpret_cast<char*>(&header), sizeof(WavHeader))) {
        // A short read would leave part of the header uninitialized.
        std::cerr << "Error: WAV file is too short to contain a valid header: " << filename << std::endl;
        return {};
    }

    if (std::string(header.riff_id, 4) != "RIFF" ||
        std::string(header.wave_id, 4) != "WAVE" ||
        std::string(header.fmt_id, 4) != "fmt ") {
        std::cerr << "Error: Invalid WAV header (RIFF, WAVE, or fmt chunk missing/invalid)." << std::endl;
        return {};
    }

    if (header.audio_format != 1) {  // 1 = PCM
        std::cerr << "Error: Only PCM audio format (1) is supported. Found: " << header.audio_format << std::endl;
        return {};
    }

    if (header.bits_per_sample != 16) {
        std::cerr << "Error: Only 16-bit PCM is supported. Found: " << header.bits_per_sample
                  << " bits per sample." << std::endl;
        return {};
    }

    actual_sample_rate = header.sample_rate;

    if (header.num_channels != 1 && header.num_channels != 2) {
        std::cerr << "Error: Unsupported number of channels: " << header.num_channels << std::endl;
        return {};
    }

    // Walk the chunk list until the "data" chunk. The loop condition is the
    // read itself, so a failed read can never be mistaken for a chunk header.
    WavDataChunk data_chunk;
    bool data_chunk_found = false;
    while (file.read(reinterpret_cast<char*>(&data_chunk.data_id), 4) &&
           file.read(reinterpret_cast<char*>(&data_chunk.data_size), 4)) {
        if (std::string(data_chunk.data_id, 4) == "data") {
            data_chunk_found = true;
            break;
        }
        // RIFF chunks are padded to an even length; skip the pad byte too.
        const std::streamoff skip =
            static_cast<std::streamoff>(data_chunk.data_size) + static_cast<std::streamoff>(data_chunk.data_size & 1u);
        file.seekg(skip, std::ios::cur);
    }

    if (!data_chunk_found) {
        std::cerr << "Error: 'data' chunk not found in WAV file." << std::endl;
        return {};
    }

    // Clamp the declared size to what the file really holds so that a bogus
    // header cannot trigger a huge allocation.
    const std::streampos data_start = file.tellg();
    file.seekg(0, std::ios::end);
    const std::streampos file_end = file.tellg();
    file.seekg(data_start);
    if (!file || file_end < data_start) {
        std::cerr << "Error: Could not determine the size of the WAV data." << std::endl;
        return {};
    }

    const std::uint64_t declared_bytes = static_cast<std::uint64_t>(data_chunk.data_size);
    const std::uint64_t available_bytes = static_cast<std::uint64_t>(file_end - data_start);
    const std::uint64_t byte_count = std::min(declared_bytes, available_bytes);
    if (byte_count < declared_bytes) {
        std::cerr << "Warning: WAV data chunk is shorter than its declared size; "
                     "using the samples that are present." << std::endl;
    }

    // Read all samples in one call rather than one stream read per sample.
    std::vector<std::int16_t> pcm(static_cast<std::size_t>(byte_count / sizeof(std::int16_t)));
    std::size_t samples_read = 0;
    if (!pcm.empty()) {
        file.read(reinterpret_cast<char*>(pcm.data()),
                  static_cast<std::streamsize>(pcm.size() * sizeof(std::int16_t)));
        samples_read = static_cast<std::size_t>(file.gcount()) / sizeof(std::int16_t);
    }

    const std::size_t channels = static_cast<std::size_t>(header.num_channels);
    const std::size_t num_frames = samples_read / channels;  // drops an incomplete trailing frame

    std::vector<float> audioData;
    audioData.reserve(num_frames);
    if (channels == 1) {
        for (std::size_t i = 0; i < num_frames; ++i) {
            audioData.push_back(static_cast<float>(pcm[i]) / 32768.0f);
        }
    } else {
        for (std::size_t i = 0; i < num_frames; ++i) {
            const float left = static_cast<float>(pcm[2 * i]) / 32768.0f;
            const float right = static_cast<float>(pcm[2 * i + 1]) / 32768.0f;
            audioData.push_back((left + right) / 2.0f);
        }
    }
    return audioData;
}

/**
 * @brief Private helper: Generates a Hamming window.
 *
 * @param window_length Number of points in the window.
 * @return w[i] = 0.54 - 0.46 * cos(2*pi*i / (window_length - 1)). An empty
 *         vector for a non-positive length; {1.0f} for length 1, where the
 *         formula would divide by zero.
 */
std::vector<float> AudioInferenceEngine::generateHammingWindow(int window_length) {
    if (window_length <= 0) {
        return {};
    }
    if (window_length == 1) {
        return {1.0f};
    }
    std::vector<float> window(window_length);
    for (int i = 0; i < window_length; ++i) {
        window[i] = 0.54f - 0.46f * std::cos(2 * M_PI * i / static_cast<float>(window_length - 1));
    }
    return window;
}

/**
 * @brief Private helper: Extracts spectrogram features.
 *
 * Cuts the signal into frames of WIN_LENGTH samples every HOP_LENGTH samples
 * (trailing samples that do not fill a frame are dropped), applies
 * pre-emphasis, a scale of 32768 and a Hamming window, zero-pads to N_FFT and
 * takes the magnitude of the real FFT.
 *
 * @param wav Mono samples.
 * @param fs  Sample rate; not used by the current implementation.
 * @return Matrix of shape (frames, N_FFT / 2 + 1). It has zero rows when the
 *         input is shorter than one window or the FFT setup fails.
 */
Eigen::MatrixXf AudioInferenceEngine::extractSpectrogram(const std::vector<float>& wav, int fs) {
    static_cast<void>(fs);

    // Test the length first: wav.size() is unsigned, so subtracting WIN_LENGTH
    // from a shorter signal would wrap around instead of going negative.
    if (wav.size() < static_cast<std::size_t>(WIN_LENGTH)) {
        return Eigen::MatrixXf(0, N_BINS);
    }
    const int n_batch = static_cast<int>((wav.size() - WIN_LENGTH) / HOP_LENGTH) + 1;

    const std::vector<float> fft_window = generateHammingWindow(WIN_LENGTH);

    kiss_fftr_cfg fft_cfg = kiss_fftr_alloc(N_FFT, 0 /* is_inverse_fft */, nullptr, nullptr);
    if (!fft_cfg) {
        std::cerr << "Error: Failed to allocate KissFFT configuration." << std::endl;
        return Eigen::MatrixXf(0, N_BINS);
    }

    Eigen::MatrixXf spec_matrix(n_batch, N_BINS);

    // Zero-initialized once: indices [WIN_LENGTH, N_FFT) are the zero padding
    // and are never written inside the loop.
    kiss_fft_scalar fft_input[N_FFT] = {};
    kiss_fft_cpx fft_output[N_BINS];

    for (int i = 0; i < n_batch; ++i) {
        const float* frame = wav.data() + static_cast<std::size_t>(i) * HOP_LENGTH;

        // Pre-emphasis is applied within each frame. Rolling a frame right by
        // one gives prev[j] = frame[j - 1]; overwriting prev[0] with prev[1]
        // then makes the first sample's predecessor frame[0] itself.
        // NOTE(review): the previous code used frame[1] for the first sample;
        // confirm against the Python preprocessing script.
        fft_input[0] = (frame[0] - PREEMPHASIS_COEFF * frame[0]) * 32768.0f;
        for (int j = 1; j < WIN_LENGTH; ++j) {
            fft_input[j] = (frame[j] - PREEMPHASIS_COEFF * frame[j - 1]) * 32768.0f;
        }
        for (int j = 0; j < WIN_LENGTH; ++j) {
            fft_input[j] *= fft_window[j];
        }

        kiss_fftr(fft_cfg, fft_input, fft_output);

        for (int j = 0; j < N_BINS; ++j) {
            spec_matrix(i, j) =
                std::sqrt(fft_output[j].r * fft_output[j].r + fft_output[j].i * fft_output[j].i);
        }
    }

    kiss_fftr_free(fft_cfg);
    return spec_matrix;
}

/**
 * @brief Private helper: Creates a Mel filter-bank matrix.
 *
 * Builds triangular filters that are equally spaced on the Mel scale
 * mel(f) = 1127 * ln(1 + f / 700) between fmin and fmax.
 *
 * @param sample_rate Sample rate in Hz.
 * @param n_fft       FFT size; the matrix has n_fft / 2 + 1 columns.
 * @param n_mels      Number of filters (rows).
 * @param fmin        Lower edge in Hz.
 * @param fmax        Upper edge in Hz; 0 selects sample_rate / 2.
 * @return Matrix of shape (n_mels, n_fft / 2 + 1), or a 0x0 matrix when the
 *         arguments are invalid (non-positive sizes, negative fmin, or
 *         fmin >= fmax).
 */
Eigen::MatrixXf AudioInferenceEngine::speechlibMel(int sample_rate, int n_fft, int n_mels,
                                                   float fmin, float fmax) {
    if (sample_rate <= 0 || n_fft <= 0 || n_mels <= 0) {
        return Eigen::MatrixXf(0, 0);
    }
    const int bank_width = n_fft / 2 + 1;

    if (fmax == 0.0f) {
        fmax = sample_rate / 2.0f;
    }
    // An empty or inverted band would make the Mel step zero or negative and
    // turn the division below into inf/NaN.
    if (fmin < 0.0f || !(fmin < fmax)) {
        return Eigen::MatrixXf(0, 0);
    }

    auto mel = [](float f) { return 1127.0f * std::log(1.0f + f / 700.0f); };
    auto bin2mel = [&](int fft_bin) {
        return 1127.0f * std::log(1.0f + static_cast<float>(fft_bin) * sample_rate /
                                             (static_cast<float>(n_fft) * 700.0f));
    };
    auto f2bin = [&](float f) { return static_cast<int>((f * n_fft / sample_rate) + 0.5f); };

    // First FFT bin that may receive a weight. The upper limit comes from the
    // triangle test below, so no separate upper bin index is needed.
    const int klo = f2bin(fmin) + 1;

    const float mlo = mel(fmin);
    const float mhi = mel(fmax);
    const float ms = (mhi - mlo) / (n_mels + 1);  // spacing between triangle corners

    std::vector<float> m_centers(n_mels + 2);
    for (int i = 0; i < n_mels + 2; ++i) {
        m_centers[i] = mlo + i * ms;
    }

    // The Mel position of a bin is the same for every filter; compute it once.
    std::vector<float> bin_mels(bank_width, 0.0f);
    for (int fft_bin = klo; fft_bin < bank_width; ++fft_bin) {
        bin_mels[fft_bin] = bin2mel(fft_bin);
    }

    Eigen::MatrixXf matrix = Eigen::MatrixXf::Zero(n_mels, bank_width);
    for (int m = 0; m < n_mels; ++m) {
        const float left = m_centers[m];
        const float center = m_centers[m + 1];
        const float right = m_centers[m + 2];
        for (int fft_bin = klo; fft_bin < bank_width; ++fft_bin) {
            const float mbin = bin_mels[fft_bin];
            if (left < mbin && mbin < right) {
                matrix(m, fft_bin) = 1.0f - std::abs(center - mbin) / ms;
            }
        }
    }
    return matrix;
}

/**
 * @brief Public method: Preprocesses an audio WAV file.
 *
 * Loads the file, extracts the magnitude spectrogram, squares it, applies the
 * precomputed Mel filterbank, floors the result at 1.0 and takes the natural
 * logarithm.
 *
 * @param wavFilePath Path of the WAV file.
 * @return Matrix of shape (frames, N_MELS), or a matrix with zero rows when
 *         loading or spectrogram extraction fails.
 */
Eigen::MatrixXf AudioInferenceEngine::preprocessAudio(const std::string& wavFilePath) {
    int actual_wav_sample_rate = 0;
    const std::vector<float> audioWav = loadWavToFloatArray(wavFilePath, actual_wav_sample_rate);
    if (audioWav.empty()) {
        std::cerr << "Failed to load audio data from " << wavFilePath << "." << std::endl;
        return Eigen::MatrixXf(0, N_MELS);
    }

    if (actual_wav_sample_rate != TARGET_SAMPLE_RATE) {
        std::cerr << "Warning: WAV file sample rate (" << actual_wav_sample_rate
                  << " Hz) does not match the target sample rate for feature extraction ("
                  << TARGET_SAMPLE_RATE << " Hz)." << std::endl;
        std::cerr << "This example does NOT include resampling. Features will be extracted at "
                  << TARGET_SAMPLE_RATE
                  << " Hz, which might lead to incorrect results if the WAV file's sample rate is different."
                  << std::endl;
    }

    const Eigen::MatrixXf spec = extractSpectrogram(audioWav, TARGET_SAMPLE_RATE);
    if (spec.rows() == 0) {
        std::cerr << "Error: Spectrogram extraction failed." << std::endl;
        return Eigen::MatrixXf(0, N_MELS);
    }

    const Eigen::MatrixXf spec_power = spec.array().square();
    // (frames x bins) * (bins x mels): the filterbank is stored as (mels x bins).
    Eigen::MatrixXf fbank_power = spec_power * mel_filterbank_.transpose();
    // Floor at 1.0 so the logarithm is never negative or undefined.
    fbank_power = fbank_power.array().max(1.0f);
    Eigen::MatrixXf log_fbank = fbank_power.array().log();
    return log_fbank;
}

/**
 * @brief Public method: Runs inference on the loaded ONNX model.
 *
 * The features are passed as a single tensor of shape
 * [1, features.rows(), features.cols()], bound to the first input name.
 *
 * @param features Feature matrix, one frame per row.
 * @return The elements of the first output tensor, or an empty vector when
 *         the input is empty or the output is not a float tensor. Exceptions
 *         thrown by ONNX Runtime are not caught here.
 */
std::vector<float> AudioInferenceEngine::runInference(const Eigen::MatrixXf& features) {
    if (features.rows() == 0 || features.cols() == 0) {
        std::cerr << "Error: Input features are empty for inference." << std::endl;
        return {};
    }

    // Prepare Input Tensor Shape: [batch, frames, feature_size]
    const std::vector<int64_t> inputTensorShape = {1, static_cast<int64_t>(features.rows()),
                                                   static_cast<int64_t>(features.cols())};

    // Flatten into row-major order. Eigen::MatrixXf is column-major by
    // default, so the copy goes through a row-major view of the buffer.
    using RowMajorMatrixXf = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
    std::vector<float> inputTensorData(static_cast<std::size_t>(features.rows()) *
                                       static_cast<std::size_t>(features.cols()));
    Eigen::Map<RowMajorMatrixXf>(inputTensorData.data(), features.rows(), features.cols()) = features;

    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    // The tensor only refers to inputTensorData, which stays alive until the
    // end of this function.
    Ort::Value inputTensor =
        Ort::Value::CreateTensor<float>(memory_info, inputTensorData.data(), inputTensorData.size(),
                                        inputTensorShape.data(), inputTensorShape.size());
    if (!inputTensor.IsTensor()) {
        std::cerr << "Error: Created input tensor is not valid!" << std::endl;
        return {};
    }

    // Run Inference
    std::vector<Ort::Value> outputTensors =
        session_->Run(Ort::RunOptions{nullptr}, input_node_names_.data(), &inputTensor, 1,
                      output_node_names_.data(), output_node_names_.size());
    if (outputTensors.empty() || !outputTensors[0].IsTensor()) {
        std::cerr << "Error: No valid output tensors received from the model." << std::endl;
        return {};
    }

    // Copy output data
    Ort::TensorTypeAndShapeInfo outputShapeInfo = outputTensors[0].GetTensorTypeAndShapeInfo();
    if (outputShapeInfo.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
        // Reading a non-float buffer through a float pointer would return garbage.
        std::cerr << "Error: Model output tensor is not of type float." << std::endl;
        return {};
    }
    const size_t outputSize = outputShapeInfo.GetElementCount();
    const float* outputData = outputTensors[0].GetTensorMutableData<float>();
    return std::vector<float>(outputData, outputData + outputSize);
}

/**
 * @brief Public method: Runs inference on a tensor prepared by the caller.
 *
 * @param inputTensor Tensor bound to the first input name.
 * @return All output tensors as returned by Ort::Session::Run. Exceptions
 *         thrown by ONNX Runtime are not caught here.
 */
std::vector<Ort::Value> AudioInferenceEngine::runInference_tensor(const Ort::Value& inputTensor) {
    // Run Inference
    return session_->Run(Ort::RunOptions{nullptr}, input_node_names_.data(), &inputTensor, 1,
                         output_node_names_.data(), output_node_names_.size());
}