|
|
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

#include <onnxruntime_cxx_api.h>
|
|
|
|
|
|
|
|
|
|
|
#ifndef M_PI |
|
|
#define M_PI 3.14159265358979323846 |
|
|
#endif |
|
|
|
|
|
|
|
|
/// Loads a headerless (raw) PCM file and converts it to normalized floats.
///
/// @param filename     Path to the raw PCM file (no WAV/RIFF header expected).
/// @param bitDepth     Sample bit depth; only 16-bit signed little-endian PCM
///                     is supported by this example.
/// @param numChannels  Declared channel count. NOTE(review): this parameter is
///                     currently not used — interleaved multi-channel data is
///                     returned as-is (flattened), not de-interleaved or mixed
///                     down. Callers should pass mono data.
/// @return Samples scaled to [-1.0, 1.0) (int16 / 32768), or an empty vector
///         on any error (file missing, unsupported bit depth).
std::vector<float> loadPcmToFloatArray(const std::string& filename, int bitDepth, int numChannels) {
    (void)numChannels;  // see note above — accepted for interface compatibility only

    std::ifstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Error: Could not open PCM file: " << filename << std::endl;
        return {};
    }

    if (bitDepth != 16) {
        std::cerr << "Error: Unsupported bit depth: " << bitDepth << ". This example only supports 16-bit PCM." << std::endl;
        return {};
    }

    // Determine the file size so the whole payload can be read in one call
    // instead of one 2-byte read() per sample (much faster for long clips).
    file.seekg(0, std::ios::end);
    const std::streamsize byteCount = file.tellg();
    file.seekg(0, std::ios::beg);
    if (byteCount <= 0) {
        return {};
    }

    // Truncating division drops a trailing partial sample, matching the
    // original loop's behavior of stopping at the last complete int16.
    const size_t sampleCount = static_cast<size_t>(byteCount) / sizeof(int16_t);
    std::vector<int16_t> raw(sampleCount);
    if (!file.read(reinterpret_cast<char*>(raw.data()),
                   static_cast<std::streamsize>(sampleCount * sizeof(int16_t)))) {
        // Short read: keep only the samples actually delivered.
        raw.resize(static_cast<size_t>(file.gcount()) / sizeof(int16_t));
    }

    // Normalize: int16 range [-32768, 32767] -> float [-1.0, ~0.99997].
    std::vector<float> audioData;
    audioData.reserve(raw.size());
    for (const int16_t sample : raw) {
        audioData.push_back(static_cast<float>(sample) / 32768.0f);
    }

    // std::ifstream closes itself via RAII; no explicit close() needed.
    return audioData;
}
|
|
|
|
|
int main() { |
|
|
|
|
|
std::string pcmFilename = "/mnt/data-2t/jeff/codes/llm/cpp/sample_data/pickup_breezy-common_voice_zh-TW_17376838-breezyvoice-00818.pcm"; |
|
|
int bitDepth = 16; |
|
|
int numChannels = 1; |
|
|
int sampleRate = 16000; |
|
|
std::string onnxModelPath = "/mnt/data-2t/jeff/codes/llm/cpp/onnx_files/speech_init_export/phi-4-mm-speech.onnx"; |
|
|
|
|
|
|
|
|
std::vector<float> audioInput = loadPcmToFloatArray(pcmFilename, bitDepth, numChannels); |
|
|
|
|
|
if (audioInput.empty()) { |
|
|
std::cerr << "Failed to load audio data from " << pcmFilename << ". Exiting." << std::endl; |
|
|
return 1; |
|
|
} |
|
|
|
|
|
std::cout << "Successfully loaded " << audioInput.size() << " samples from " << pcmFilename << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
std::ifstream onnxModelCheck(onnxModelPath, std::ios::binary); |
|
|
if (!onnxModelCheck.is_open()) { |
|
|
std::cerr << "\nError: ONNX model file '" << onnxModelPath << "' not found." << std::endl; |
|
|
std::cerr << "Please provide a valid ONNX model file. If you need a simple dummy one for testing, " |
|
|
<< "you can create it using Python (e.g., with PyTorch) like this:" << std::endl; |
|
|
std::cerr << "```python" << std::endl; |
|
|
std::cerr << "import torch" << std::endl; |
|
|
std::cerr << "import torch.nn as nn" << std::endl; |
|
|
std::cerr << "" << std::endl; |
|
|
std::cerr << "class SimpleAudioModel(nn.Module):" << std::endl; |
|
|
std::cerr << " def __init__(self, input_size, output_size):" << std::endl; |
|
|
std::cerr << " super(SimpleAudioModel, self).__init__()" << std::endl; |
|
|
std::cerr << " # This is a very simple linear layer. Your actual model will be more complex." << std::endl; |
|
|
std::cerr << " # This model expects input of shape [batch_size, input_size]" << std::endl; |
|
|
std::cerr << " self.linear = nn.Linear(input_size, output_size)" << std::endl; |
|
|
std::cerr << "" << std::endl; |
|
|
std::cerr << " def forward(self, x):" << std::endl; |
|
|
std::cerr << " # If your model expects a different input shape (e.g., [batch_size, channels, samples])," << std::endl; |
|
|
std::cerr << " # you might need to reshape 'x' here before passing it to your layers (e.g., x.view(x.size(0), 1, -1))." << std::endl; |
|
|
std::cerr << " return self.linear(x)" << std::endl; |
|
|
std::cerr << "" << std::endl; |
|
|
std::cerr << "# --- IMPORTANT: Define model input and output sizes. Adjust these to match your actual model's requirements. ---" << std::endl; |
|
|
std::cerr << "# For this dummy model, we'll assume an input size matching our 2-second, 44.1kHz mono audio." << std::endl; |
|
|
std::cerr << "DUMMY_INPUT_SIZE = " << (sampleRate * 2) << " # Corresponds to " << (sampleRate * 2) / static_cast<float>(sampleRate) << " seconds of audio at " << sampleRate << " Hz mono" << std::endl; |
|
|
std::cerr << "DUMMY_OUTPUT_SIZE = 10 # Example: 10 classification scores or features" << std::endl; |
|
|
std::cerr << "" << std::endl; |
|
|
std::cerr << "model = SimpleAudioModel(DUMMY_INPUT_SIZE, DUMMY_OUTPUT_SIZE)" << std::endl; |
|
|
std::cerr << "dummy_input_tensor = torch.randn(1, DUMMY_INPUT_SIZE) # Batch size 1, DUMMY_INPUT_SIZE features" << std::endl; |
|
|
std::cerr << "" << std::endl; |
|
|
std::cerr << "torch.onnx.export(" << std::endl; |
|
|
std::cerr << " model," << std::endl; |
|
|
std::cerr << " dummy_input_tensor," << std::endl; |
|
|
std::cerr << " \"model.onnx\"," << std::endl; |
|
|
std::cerr << " verbose=True," << std::endl; |
|
|
std::cerr << " input_names=['input'], # Name of the input tensor in the ONNX graph" << std::endl; |
|
|
std::cerr << " output_names=['output'], # Name of the output tensor in the ONNX graph" << std::endl; |
|
|
std::cerr << " # Optional: Define dynamic axes if your batch size or sequence length can vary" << std::endl; |
|
|
std::cerr << " dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}" << std::endl; |
|
|
std::cerr << ")" << std::endl; |
|
|
std::cerr << "print(\"Dummy model.onnx created successfully. Remember to adjust DUMMY_INPUT_SIZE in this script to match the length of your audio data or ensure your C++ code pads/truncates the audio data to the model's expected input size.\")" << std::endl; |
|
|
std::cerr << "```" << std::endl; |
|
|
return 1; |
|
|
} |
|
|
onnxModelCheck.close(); |
|
|
std::cout << "ONNX model '" << onnxModelPath << "' found. Proceeding with inference." << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
try { |
|
|
|
|
|
|
|
|
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "AudioInference"); |
|
|
|
|
|
|
|
|
Ort::SessionOptions session_options; |
|
|
session_options.SetIntraOpNumThreads(1); |
|
|
session_options.SetGraphOptimizationLevel(ORT_ENABLE_EXTENDED); |
|
|
|
|
|
|
|
|
Ort::Session session(env, onnxModelPath.c_str(), session_options); |
|
|
|
|
|
|
|
|
|
|
|
Ort::AllocatorWithDefaultOptions allocator; |
|
|
|
|
|
|
|
|
size_t numInputNodes = session.GetInputCount(); |
|
|
std::vector<const char*> inputNodeNames(numInputNodes); |
|
|
|
|
|
std::cout << "\n--- Model Input Information ---" << std::endl; |
|
|
|
|
|
for (size_t i = 0; i < numInputNodes; ++i) { |
|
|
|
|
|
inputNodeNames[i] = session.GetInputNameAllocated(i, allocator).get(); |
|
|
|
|
|
|
|
|
Ort::TypeInfo type_info = session.GetInputTypeInfo(i); |
|
|
auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); |
|
|
std::vector<int64_t> actualInputShape = tensor_info.GetShape(); |
|
|
|
|
|
std::cout << " Input " << i << " : Name='" << inputNodeNames[i] << "', Shape=["; |
|
|
for (size_t j = 0; j < actualInputShape.size(); ++j) { |
|
|
std::cout << actualInputShape[j] << (j == actualInputShape.size() - 1 ? "" : ", "); |
|
|
} |
|
|
std::cout << "]" << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<int64_t> inputTensorShape; |
|
|
|
|
|
if (actualInputShape.size() == 2 && actualInputShape[0] == 1) { |
|
|
|
|
|
int64_t expected_length = actualInputShape[1]; |
|
|
|
|
|
|
|
|
if (audioInput.size() != expected_length) { |
|
|
std::cout << " Warning: Loaded audio input size (" << audioInput.size() |
|
|
<< ") does not match model's expected input length (" << expected_length << ")." << std::endl; |
|
|
std::cout << " Padding/truncating audio data to match model input size." << std::endl; |
|
|
audioInput.resize(expected_length, 0.0f); |
|
|
} |
|
|
inputTensorShape = {1, expected_length}; |
|
|
} else if (actualInputShape.size() == 1) { |
|
|
|
|
|
int64_t expected_length = actualInputShape[0]; |
|
|
|
|
|
if (audioInput.size() != expected_length) { |
|
|
std::cout << " Warning: Loaded audio input size (" << audioInput.size() |
|
|
<< ") does not match model's expected input length (" << expected_length << ")." << std::endl; |
|
|
std::cout << " Padding/truncating audio data to match model input size." << std::endl; |
|
|
audioInput.resize(expected_length, 0.0f); |
|
|
} |
|
|
inputTensorShape = {expected_length}; |
|
|
} else { |
|
|
std::cerr << "Error: Model input shape is not supported by this example ([N] or [1, N]). " |
|
|
<< "Please adjust the input tensor shape creation logic in C++ to match your model's specific requirements." << std::endl; |
|
|
return 1; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memory_info, audioInput.data(), audioInput.size(), |
|
|
inputTensorShape.data(), inputTensorShape.size()); |
|
|
|
|
|
|
|
|
if (!inputTensor.IsTensor()) { |
|
|
std::cerr << "Error: Created input tensor is not valid! This might indicate a shape mismatch or data issue." << std::endl; |
|
|
return 1; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t numOutputNodes = session.GetOutputCount(); |
|
|
std::vector<const char*> outputNodeNames(numOutputNodes); |
|
|
|
|
|
std::cout << "\n--- Model Output Information ---" << std::endl; |
|
|
|
|
|
for (size_t k = 0; k < numOutputNodes; ++k) { |
|
|
outputNodeNames[k] = session.GetOutputNameAllocated(k, allocator).get(); |
|
|
Ort::TypeInfo type_info_out = session.GetOutputTypeInfo(k); |
|
|
auto tensor_info_out = type_info_out.GetTensorTypeAndShapeInfo(); |
|
|
std::vector<int64_t> outputShape = tensor_info_out.GetShape(); |
|
|
std::cout << " Output " << k << " : Name='" << outputNodeNames[k] << "', Shape=["; |
|
|
for (size_t l = 0; l < outputShape.size(); ++l) { |
|
|
std::cout << outputShape[l] << (l == outputShape.size() - 1 ? "" : ", "); |
|
|
} |
|
|
std::cout << "]" << std::endl; |
|
|
} |
|
|
|
|
|
|
|
|
std::cout << "\nRunning ONNX model inference..." << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<Ort::Value> outputTensors = session.Run(Ort::RunOptions{nullptr}, |
|
|
inputNodeNames.data(), &inputTensor, 1, |
|
|
outputNodeNames.data(), numOutputNodes); |
|
|
|
|
|
|
|
|
if (outputTensors.empty()) { |
|
|
std::cerr << "Error: No output tensors received from the model." << std::endl; |
|
|
return 1; |
|
|
} |
|
|
|
|
|
|
|
|
if (outputTensors[0].IsTensor()) { |
|
|
|
|
|
float* outputData = outputTensors[0].GetTensorMutableData<float>(); |
|
|
Ort::TensorTypeAndShapeInfo outputShapeInfo = outputTensors[0].GetTensorTypeAndShapeInfo(); |
|
|
std::vector<int64_t> outputShape = outputShapeInfo.GetShape(); |
|
|
size_t outputSize = outputShapeInfo.GetElementCount(); |
|
|
|
|
|
std::cout << "\n--- Model Inference Result (first few elements) ---" << std::endl; |
|
|
|
|
|
for (size_t k = 0; k < std::min((size_t)10, outputSize); ++k) { |
|
|
std::cout << outputData[k] << (k == std::min((size_t)10, outputSize) - 1 ? "" : ", "); |
|
|
} |
|
|
std::cout << std::endl; |
|
|
|
|
|
std::cout << "Full output tensor size: " << outputSize << " elements." << std::endl; |
|
|
std::cout << "Full output tensor shape: ["; |
|
|
for (size_t k = 0; k < outputShape.size(); ++k) { |
|
|
std::cout << outputShape[k] << (k == outputShape.size() - 1 ? "" : ", "); |
|
|
} |
|
|
std::cout << "]" << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} else { |
|
|
std::cerr << "Error: First output tensor is not of the expected type (float tensor)." << std::endl; |
|
|
} |
|
|
} |
|
|
|
|
|
} catch (const Ort::Exception& e) { |
|
|
|
|
|
std::cerr << "ONNX Runtime Exception: " << e.what() << std::endl; |
|
|
return 1; |
|
|
} catch (const std::exception& e) { |
|
|
|
|
|
std::cerr << "Standard Exception: " << e.what() << std::endl; |
|
|
return 1; |
|
|
} |
|
|
|
|
|
std::cout << "\nProgram finished successfully." << std::endl; |
|
|
return 0; |
|
|
} |