#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <stdexcept>

#include "vad_onnx.h"

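// Collect the model's input node names. ONNX Runtime hands back allocated
// strings, so std::string copies are kept alive alongside const char* views
// that can be passed to Session::Run().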
static void get_input_names(Ort::Session* session, std::vector<std::string> &input_names_str,
                            std::vector<const char *> &input_names_char) {
    Ort::AllocatorWithDefaultOptions allocator;
    size_t nodes_num = session->GetInputCount();
    input_names_str.resize(nodes_num);
    input_names_char.resize(nodes_num);

    for (size_t i = 0; i != nodes_num; ++i) {
        auto t = session->GetInputNameAllocated(i, allocator);
        input_names_str[i] = t.get();
        input_names_char[i] = input_names_str[i].c_str();
    }
}

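// Same idea as get_input_names(), but for the model's output node names.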
static void get_output_names(Ort::Session* session, std::vector<std::string> &output_names_,
                             std::vector<const char *> &vad_out_names_) {
    Ort::AllocatorWithDefaultOptions allocator;
    size_t nodes_num = session->GetOutputCount();
    output_names_.resize(nodes_num);
    vad_out_names_.resize(nodes_num);
    for (size_t i = 0; i != nodes_num; ++i) {
        auto t = session->GetOutputNameAllocated(i, allocator);
        output_names_[i] = t.get();
        vad_out_names_[i] = output_names_[i].c_str();
    }
}

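// Construct the detector: load the ONNX model, cache the input/output node
// names, and size the recurrent state and context buffers for the configured
// sample rate and batch size.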
VadOnnx::VadOnnx(const std::string& model_path,
                 int batch_size,
                 int thread_num,
                 float threshold,
                 int sampling_rate,
                 int min_silence_duration_ms,
                 float max_speech_duration_s,
                 int speech_pad_ms)
    : batch_size_(batch_size),
      thread_num_(thread_num),
      threshold_(threshold),
      sample_rates_(sampling_rate),
      min_silence_samples_(sampling_rate * min_silence_duration_ms / 1000.0),
      speech_pad_samples_(sampling_rate * speech_pad_ms / 1000.0),
      triggered_(false),
      temp_end_(0),
      current_sample_(0),
      start_(0),
      memory_info(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU))
{
    init_onnx_model(model_path);

    get_input_names(session.get(), input_names_, vad_in_names_);
    get_output_names(session.get(), output_names_, vad_out_names_);

    sr.resize(1);
    sr[0] = sample_rates_;

    // The recurrent state has shape [2, batch, 128]; allocate it for the
    // configured batch size (including batch_size_ == 1).
    state_shape = {2, batch_size_, 128};
    state_size = 2 * batch_size_ * 128;
    state_.resize(state_size);

    // The model expects a short context carried over from the previous chunk:
    // 64 samples at 16 kHz, 32 samples at 8 kHz.
    context_size = (sample_rates_ == 16000) ? 64 : 32;
    context_.resize(context_size);

    // Each model input is one window plus the carried-over context.
    effective_window_size = window_size_samples + context_size;
    input_node_shape[0] = 1;
    input_node_shape[1] = effective_window_size;

    reset_states();
}

VadOnnx::~VadOnnx() = default;

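// Clear the recurrent state, the carried-over audio context, and all
// streaming bookkeeping so detection starts from scratch.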
void VadOnnx::reset_states() {
    std::memset(state_.data(), 0, state_.size() * sizeof(float));
    std::fill(context_.begin(), context_.end(), 0.0f);
    triggered_ = false;
    temp_end_ = 0;
    current_sample_ = 0;
    start_ = 0;
    last_sr_ = 0;
    last_batch_size_ = 0;
}

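// Run the model on one chunk: prepend the context from the previous chunk,
// feed audio + state + sample rate to the session, keep the returned state,
// and remember the tail of this chunk as the next context.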
float VadOnnx::forward_infer(std::vector<float>& data_chunk) {
    // Prepend the carried-over context to the new chunk.
    std::vector<float> x_with_context(effective_window_size, 0.0f);
    std::copy(context_.begin(), context_.end(), x_with_context.begin());
    std::copy(data_chunk.begin(), data_chunk.end(), x_with_context.begin() + context_size);
    input = x_with_context;

    // Wrap the audio, the recurrent state, and the sample rate as ORT tensors.
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
        memory_info, input.data(), input.size(), input_node_shape.data(), 2);
    Ort::Value state_tensor = Ort::Value::CreateTensor<float>(
        memory_info, state_.data(), state_.size(), state_shape.data(), 3);
    Ort::Value sr_tensor = Ort::Value::CreateTensor<int64_t>(
        memory_info, sr.data(), 1, sr_shape.data(), 1);

    ort_inputs.clear();
    ort_inputs.emplace_back(std::move(input_tensor));
    ort_inputs.emplace_back(std::move(state_tensor));
    ort_inputs.emplace_back(std::move(sr_tensor));

    ort_outputs = session->Run(
        Ort::RunOptions{nullptr}, vad_in_names_.data(), ort_inputs.data(),
        ort_inputs.size(), vad_out_names_.data(), vad_out_names_.size());

    // Output 0 holds the speech probability for this chunk.
    float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];

    // Output 1 holds the updated recurrent state; carry it into the next call.
    float* stateN = ort_outputs[1].GetTensorMutableData<float>();
    std::memcpy(state_.data(), stateN, state_size * sizeof(float));

    // Keep the last context_size samples as context for the next chunk.
    std::copy(x_with_context.end() - context_size, x_with_context.end(), context_.begin());

    return speech_prob;
}

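// Offline API: zero-pad the audio to a whole number of windows and return one
// speech probability per window.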
std::vector<float> VadOnnx::vad_dectect(std::vector<float>& audio) {
    std::vector<float> result;

    int pad_num = (window_size_samples - (audio.size() % window_size_samples)) % window_size_samples;
    audio.insert(audio.end(), pad_num, 0.0f);

    for (size_t i = 0; i < audio.size(); i += window_size_samples) {
        std::vector<float> chunk(audio.begin() + i, audio.begin() + i + window_size_samples);
        auto prob = forward_infer(chunk);
        result.emplace_back(prob);
    }

    return result;
}

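// Streaming API: buffer incoming audio, score it window by window, and report
// "start"/"end" boundaries in samples (or seconds when return_seconds is set).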
std::map<std::string, double> VadOnnx::vad_dectect(std::vector<float>& audio, bool return_seconds) {
    std::map<std::string, double> result;

    // Append the new samples to the streaming buffer.
    buffer_.insert(buffer_.end(), audio.begin(), audio.end());

    while (!buffer_.empty()) {
        std::map<std::string, double> tmp;
        std::vector<float> chunk(buffer_.begin(),
                                 buffer_.begin() + std::min(static_cast<int>(buffer_.size()), window_size_samples));
        // Zero-pad the last (possibly short) chunk to a full window.
        if (chunk.size() < static_cast<size_t>(window_size_samples)) {
            chunk.resize(window_size_samples, 0.0f);
        }

        current_sample_ += window_size_samples;

        float speech_prob = forward_infer(chunk);

        // Speech reappeared before the silence timeout: cancel the pending end.
        if (speech_prob >= threshold_ && temp_end_ > 0) {
            temp_end_ = 0;
        }

        // Rising edge: mark the start of a speech segment.
        if (speech_prob >= threshold_ && !triggered_) {
            triggered_ = true;
            start_ = std::max(0.0, current_sample_ - window_size_samples);
            tmp["start"] = return_seconds ? start_ / sample_rates_ : start_;
        }

        // Falling edge with hysteresis: only close the segment after
        // min_silence_samples_ of continued silence below (threshold_ - 0.15).
        if (speech_prob < (threshold_ - 0.15) && triggered_) {
            if (temp_end_ == 0) {
                temp_end_ = current_sample_;
            }

            if (current_sample_ - temp_end_ >= min_silence_samples_) {
                double speech_end = temp_end_;
                tmp["end"] = return_seconds ? speech_end / sample_rates_ : speech_end;
                temp_end_ = 0;
                triggered_ = false;
            }
        }

        // Drop the consumed window from the front of the buffer.
        if (static_cast<size_t>(window_size_samples) >= buffer_.size()) {
            buffer_.clear();
        } else {
            buffer_.erase(buffer_.begin(), buffer_.begin() + window_size_samples);
        }

        // Merge this window's events into the result for the whole call:
        // keep the first "start", update "end", and drop a stale "end" when a
        // new segment starts within the same call.
        if (result.empty()) {
            result = tmp;
        } else if (!tmp.empty()) {
            if (tmp.find("end") != tmp.end()) {
                result["end"] = tmp["end"];
            }

            if (tmp.find("start") != tmp.end() && result.find("end") != result.end()) {
                result.erase("end");
            }
        }
    }

    return result;
}

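// Create the ONNX Runtime environment and session for the given model file.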
void VadOnnx::init_onnx_model(const std::string& model_path) {
    init_engine_threads(1, 1);
    init_exec_provider();

    env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "VadOnnx");
#ifdef _WIN32
    // ONNX Runtime expects a wide-character model path on Windows
    // (byte-wise widening is only valid for ASCII paths).
    std::wstring w_model_path(model_path.begin(), model_path.end());
    session = std::make_unique<Ort::Session>(env_, w_model_path.c_str(), session_options);
#else
    session = std::make_unique<Ort::Session>(env_, model_path.c_str(), session_options);
#endif
}

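// Configure session threading and enable full graph optimization.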
void VadOnnx::init_engine_threads(int inter_threads, int intra_threads) {
    session_options.SetInterOpNumThreads(inter_threads);
    session_options.SetIntraOpNumThreads(intra_threads);
    session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
}

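// Use the CUDA execution provider when this ONNX Runtime build exposes it;
// otherwise the session falls back to the default CPU provider.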
void VadOnnx::init_exec_provider() {
    std::vector<std::string> providers = Ort::GetAvailableProviders();

    if (std::find(providers.begin(), providers.end(), "CUDAExecutionProvider") != providers.end()) {
        OrtCUDAProviderOptions cuda_options{};
        session_options.AppendExecutionProvider_CUDA(cuda_options);
    }
}

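// Minimal usage sketch (illustrative only; the model path, the hypothetical
// load_mono_pcm() helper, and 16 kHz mono float input are assumptions, not
// part of this file):
//
//   std::vector<float> pcm = load_mono_pcm("audio.wav");
//   VadOnnx vad("silero_vad.onnx", /*batch_size=*/1, /*thread_num=*/1,
//               /*threshold=*/0.5f, /*sampling_rate=*/16000,
//               /*min_silence_duration_ms=*/100, /*max_speech_duration_s=*/30.0f,
//               /*speech_pad_ms=*/30);
//   std::vector<float> probs = vad.vad_dectect(pcm);  // one probability per window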