File size: 18,527 Bytes
a2dca42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 |
#include <iostream>
#include <vector>
#include <fstream> // For file input/output operations (e.g., std::ifstream, std::ofstream)
#include <cstdint> // For fixed-width integer types (e.g., int16_t)
#include <cmath> // For mathematical functions (e.g., std::sin, M_PI)
#include <numeric> // For numerical operations (not strictly used in this version but often useful)
#include <algorithm> // For algorithms like std::min
// Include the ONNX Runtime C++ API header
// You need to have ONNX Runtime installed and linked correctly in your build system.
// For example, using CMake, you might add:
// find_package(ONNXRuntime REQUIRED)
// target_link_libraries(your_executable PRIVATE ONNXRuntime::onnxruntime_cxx_api)
#include <onnxruntime_cxx_api.h>
// Define M_PI if it's not already defined by cmath or your compiler.
// This is common on Windows with MSVC unless _USE_MATH_DEFINES is set.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
/// Loads raw PCM audio from `filename` and converts it to normalized floats.
///
/// @param filename    Path to a headerless PCM file.
/// @param bitDepth    Bit depth of the samples; only 16-bit signed is supported.
/// @param numChannels Channel count of the stream. NOTE: currently unused —
///                    samples are converted in file order, so interleaved
///                    multi-channel data remains interleaved in the result.
/// @return Samples normalized to [-1.0, 1.0), or an empty vector on any error
///         (file not found, unsupported bit depth).
std::vector<float> loadPcmToFloatArray(const std::string& filename, int bitDepth, int numChannels) {
    (void)numChannels; // kept for interface compatibility; see note above

    // Open the PCM file in binary mode, positioned at the end so we can
    // query the file size with tellg() and read the whole payload at once
    // instead of one 2-byte read per sample.
    std::ifstream file(filename, std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        std::cerr << "Error: Could not open PCM file: " << filename << std::endl;
        return {}; // Return empty vector on failure
    }

    // Reject unsupported formats before doing any reading.
    if (bitDepth != 16) {
        std::cerr << "Error: Unsupported bit depth: " << bitDepth << ". This example only supports 16-bit PCM." << std::endl;
        return {}; // Return empty vector for unsupported bit depth
    }

    const std::streamsize fileBytes = file.tellg();
    file.seekg(0, std::ios::beg);

    // Integer division drops a trailing partial sample, matching the
    // behavior of a per-sample read loop that stops when read() fails.
    const std::size_t numSamples = static_cast<std::size_t>(fileBytes) / sizeof(int16_t);

    std::vector<int16_t> raw(numSamples);
    if (numSamples > 0) {
        file.read(reinterpret_cast<char*>(raw.data()),
                  static_cast<std::streamsize>(numSamples * sizeof(int16_t)));
    }

    std::vector<float> audioData;
    audioData.reserve(numSamples); // single allocation for the output
    for (const int16_t sample : raw) {
        // Normalize 16-bit signed integer to float in range [-1.0, 1.0].
        // Dividing by 32768.0f (2^15) maps 32767 to slightly less than 1.0
        // and -32768 exactly to -1.0, preserving the full dynamic range
        // without overflow for the most negative sample.
        audioData.push_back(static_cast<float>(sample) / 32768.0f);
    }
    return audioData; // file closed automatically by ifstream's destructor (RAII)
}
int main() {
// --- Configuration for Audio and ONNX Model ---
std::string pcmFilename = "/mnt/data-2t/jeff/codes/llm/cpp/sample_data/pickup_breezy-common_voice_zh-TW_17376838-breezyvoice-00818.pcm"; // Name of the PCM audio file to load
int bitDepth = 16; // Bit depth of the PCM data (e.g., 16-bit)
int numChannels = 1; // Number of audio channels (e.g., 1 for mono)
int sampleRate = 16000; // Sample rate of the audio (e.g., 16000 Hz)
std::string onnxModelPath = "/mnt/data-2t/jeff/codes/llm/cpp/onnx_files/speech_init_export/phi-4-mm-speech.onnx"; // Path to your ONNX model file
// --- 2. Load PCM audio data into a float array ---
std::vector<float> audioInput = loadPcmToFloatArray(pcmFilename, bitDepth, numChannels);
if (audioInput.empty()) {
std::cerr << "Failed to load audio data from " << pcmFilename << ". Exiting." << std::endl;
return 1; // Exit if audio data loading failed
}
std::cout << "Successfully loaded " << audioInput.size() << " samples from " << pcmFilename << std::endl;
// --- 3. Check for ONNX model existence and provide guidance if missing ---
// This step is critical. You need a valid ONNX model.
std::ifstream onnxModelCheck(onnxModelPath, std::ios::binary);
if (!onnxModelCheck.is_open()) {
std::cerr << "\nError: ONNX model file '" << onnxModelPath << "' not found." << std::endl;
std::cerr << "Please provide a valid ONNX model file. If you need a simple dummy one for testing, "
<< "you can create it using Python (e.g., with PyTorch) like this:" << std::endl;
std::cerr << "```python" << std::endl;
std::cerr << "import torch" << std::endl;
std::cerr << "import torch.nn as nn" << std::endl;
std::cerr << "" << std::endl;
std::cerr << "class SimpleAudioModel(nn.Module):" << std::endl;
std::cerr << " def __init__(self, input_size, output_size):" << std::endl;
std::cerr << " super(SimpleAudioModel, self).__init__()" << std::endl;
std::cerr << " # This is a very simple linear layer. Your actual model will be more complex." << std::endl;
std::cerr << " # This model expects input of shape [batch_size, input_size]" << std::endl;
std::cerr << " self.linear = nn.Linear(input_size, output_size)" << std::endl;
std::cerr << "" << std::endl;
std::cerr << " def forward(self, x):" << std::endl;
std::cerr << " # If your model expects a different input shape (e.g., [batch_size, channels, samples])," << std::endl;
std::cerr << " # you might need to reshape 'x' here before passing it to your layers (e.g., x.view(x.size(0), 1, -1))." << std::endl;
std::cerr << " return self.linear(x)" << std::endl;
std::cerr << "" << std::endl;
std::cerr << "# --- IMPORTANT: Define model input and output sizes. Adjust these to match your actual model's requirements. ---" << std::endl;
std::cerr << "# For this dummy model, we'll assume an input size matching our 2-second, 44.1kHz mono audio." << std::endl;
std::cerr << "DUMMY_INPUT_SIZE = " << (sampleRate * 2) << " # Corresponds to " << (sampleRate * 2) / static_cast<float>(sampleRate) << " seconds of audio at " << sampleRate << " Hz mono" << std::endl;
std::cerr << "DUMMY_OUTPUT_SIZE = 10 # Example: 10 classification scores or features" << std::endl;
std::cerr << "" << std::endl;
std::cerr << "model = SimpleAudioModel(DUMMY_INPUT_SIZE, DUMMY_OUTPUT_SIZE)" << std::endl;
std::cerr << "dummy_input_tensor = torch.randn(1, DUMMY_INPUT_SIZE) # Batch size 1, DUMMY_INPUT_SIZE features" << std::endl;
std::cerr << "" << std::endl;
std::cerr << "torch.onnx.export(" << std::endl;
std::cerr << " model," << std::endl;
std::cerr << " dummy_input_tensor," << std::endl;
std::cerr << " \"model.onnx\"," << std::endl;
std::cerr << " verbose=True," << std::endl;
std::cerr << " input_names=['input'], # Name of the input tensor in the ONNX graph" << std::endl;
std::cerr << " output_names=['output'], # Name of the output tensor in the ONNX graph" << std::endl;
std::cerr << " # Optional: Define dynamic axes if your batch size or sequence length can vary" << std::endl;
std::cerr << " dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}" << std::endl;
std::cerr << ")" << std::endl;
std::cerr << "print(\"Dummy model.onnx created successfully. Remember to adjust DUMMY_INPUT_SIZE in this script to match the length of your audio data or ensure your C++ code pads/truncates the audio data to the model's expected input size.\")" << std::endl;
std::cerr << "```" << std::endl;
return 1; // Exit if the ONNX model is not found
}
onnxModelCheck.close();
std::cout << "ONNX model '" << onnxModelPath << "' found. Proceeding with inference." << std::endl;
// --- 4. ONNX Runtime Inference ---
try {
// Create an ONNX Runtime environment. This is the entry point for all ONNX Runtime operations.
// ORT_LOGGING_LEVEL_WARNING suppresses verbose output unless there's a warning or error.
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "AudioInference");
// Configure session options.
Ort::SessionOptions session_options;
session_options.SetIntraOpNumThreads(1); // Use 1 thread for operations within a single node
session_options.SetGraphOptimizationLevel(ORT_ENABLE_EXTENDED); // Apply all available graph optimizations
// Create an ONNX Runtime session by loading the model.
Ort::Session session(env, onnxModelPath.c_str(), session_options);
// Get model input and output names and shapes.
// An allocator is needed to manage memory for allocated strings (like node names).
Ort::AllocatorWithDefaultOptions allocator;
// --- Get Input Node Information ---
size_t numInputNodes = session.GetInputCount();
std::vector<const char*> inputNodeNames(numInputNodes); // To store input node names
std::cout << "\n--- Model Input Information ---" << std::endl;
// Iterate through all input nodes (models usually have one main input)
for (size_t i = 0; i < numInputNodes; ++i) {
// Get the input node name
inputNodeNames[i] = session.GetInputNameAllocated(i, allocator).get();
// Get the type and shape information for the input tensor
Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
std::vector<int64_t> actualInputShape = tensor_info.GetShape(); // Get the shape the model *expects*
std::cout << " Input " << i << " : Name='" << inputNodeNames[i] << "', Shape=[";
for (size_t j = 0; j < actualInputShape.size(); ++j) {
std::cout << actualInputShape[j] << (j == actualInputShape.size() - 1 ? "" : ", ");
}
std::cout << "]" << std::endl;
// --- Prepare Input Tensor Shape ---
// This is a CRITICAL step. The `audioInput` vector must be reshaped
// to precisely match the ONNX model's expected input tensor shape.
// The dummy Python model provided above creates an input of shape [1, DUMMY_INPUT_SIZE].
// We need to ensure `audioInput` matches `DUMMY_INPUT_SIZE` or pad/truncate it.
std::vector<int64_t> inputTensorShape; // This will be the shape of the tensor we create
if (actualInputShape.size() == 2 && actualInputShape[0] == 1) {
// Case: Model expects a 2D input with batch size 1 (e.g., [1, num_features])
int64_t expected_length = actualInputShape[1]; // The expected number of features/samples
// Check if the loaded audio data size matches the model's expected input length
if (audioInput.size() != expected_length) {
std::cout << " Warning: Loaded audio input size (" << audioInput.size()
<< ") does not match model's expected input length (" << expected_length << ")." << std::endl;
std::cout << " Padding/truncating audio data to match model input size." << std::endl;
audioInput.resize(expected_length, 0.0f); // Pad with zeros or truncate the audio data
}
inputTensorShape = {1, expected_length}; // Set the tensor shape for ONNX Runtime
} else if (actualInputShape.size() == 1) {
// Case: Model expects a 1D input (e.g., [num_features])
int64_t expected_length = actualInputShape[0];
if (audioInput.size() != expected_length) {
std::cout << " Warning: Loaded audio input size (" << audioInput.size()
<< ") does not match model's expected input length (" << expected_length << ")." << std::endl;
std::cout << " Padding/truncating audio data to match model input size." << std::endl;
audioInput.resize(expected_length, 0.0f); // Pad with zeros or truncate
}
inputTensorShape = {expected_length}; // Set the tensor shape for ONNX Runtime
} else {
std::cerr << "Error: Model input shape is not supported by this example ([N] or [1, N]). "
<< "Please adjust the input tensor shape creation logic in C++ to match your model's specific requirements." << std::endl;
return 1; // Exit if the input shape is not handled
}
// Create an ONNX Runtime memory info object for CPU memory.
// This specifies where the tensor data is located (CPU in this case).
Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
// Create the input tensor from the audio data.
// `audioInput.data()` provides a pointer to the raw float data.
// `audioInput.size()` is the total number of elements.
// `inputTensorShape.data()` provides the shape array.
// `inputTensorShape.size()` is the number of dimensions.
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memory_info, audioInput.data(), audioInput.size(),
inputTensorShape.data(), inputTensorShape.size());
// Verify that the created input tensor is valid
if (!inputTensor.IsTensor()) {
std::cerr << "Error: Created input tensor is not valid! This might indicate a shape mismatch or data issue." << std::endl;
return 1; // Exit if the tensor is invalid
}
// At this point, `inputTensor` is ready to be fed into the model.
// For simplicity, we assume there's only one input to the model.
// If your model has multiple inputs, you'd need to create multiple Ort::Value objects.
// --- Get Output Node Information ---
size_t numOutputNodes = session.GetOutputCount();
std::vector<const char*> outputNodeNames(numOutputNodes); // To store output node names
std::cout << "\n--- Model Output Information ---" << std::endl;
// Iterate through all output nodes
for (size_t k = 0; k < numOutputNodes; ++k) {
outputNodeNames[k] = session.GetOutputNameAllocated(k, allocator).get();
Ort::TypeInfo type_info_out = session.GetOutputTypeInfo(k);
auto tensor_info_out = type_info_out.GetTensorTypeAndShapeInfo();
std::vector<int64_t> outputShape = tensor_info_out.GetShape();
std::cout << " Output " << k << " : Name='" << outputNodeNames[k] << "', Shape=[";
for (size_t l = 0; l < outputShape.size(); ++l) {
std::cout << outputShape[l] << (l == outputShape.size() - 1 ? "" : ", ");
}
std::cout << "]" << std::endl;
}
// --- Run Inference ---
std::cout << "\nRunning ONNX model inference..." << std::endl;
// The `session.Run` method executes the model.
// Arguments:
// - Ort::RunOptions{nullptr}: Default run options.
// - inputNodeNames.data(): Array of C-style strings for input names.
// - &inputTensor: Pointer to the array of input tensors (here, just one).
// - 1: Number of input tensors.
// - outputNodeNames.data(): Array of C-style strings for output names.
// - numOutputNodes: Number of output tensors expected.
std::vector<Ort::Value> outputTensors = session.Run(Ort::RunOptions{nullptr},
inputNodeNames.data(), &inputTensor, 1,
outputNodeNames.data(), numOutputNodes);
// --- Process Output ---
if (outputTensors.empty()) {
std::cerr << "Error: No output tensors received from the model." << std::endl;
return 1; // Exit if no output
}
// Assuming the first output is a float tensor (common for most models)
if (outputTensors[0].IsTensor()) {
// Get a mutable pointer to the raw data of the output tensor
float* outputData = outputTensors[0].GetTensorMutableData<float>();
Ort::TensorTypeAndShapeInfo outputShapeInfo = outputTensors[0].GetTensorTypeAndShapeInfo();
std::vector<int64_t> outputShape = outputShapeInfo.GetShape();
size_t outputSize = outputShapeInfo.GetElementCount(); // Total number of elements in the output tensor
std::cout << "\n--- Model Inference Result (first few elements) ---" << std::endl;
// Print the first 10 elements of the output (or fewer if output is smaller)
for (size_t k = 0; k < std::min((size_t)10, outputSize); ++k) {
std::cout << outputData[k] << (k == std::min((size_t)10, outputSize) - 1 ? "" : ", ");
}
std::cout << std::endl;
std::cout << "Full output tensor size: " << outputSize << " elements." << std::endl;
std::cout << "Full output tensor shape: [";
for (size_t k = 0; k < outputShape.size(); ++k) {
std::cout << outputShape[k] << (k == outputShape.size() - 1 ? "" : ", ");
}
std::cout << "]" << std::endl;
// Here you would typically interpret the model's output based on its purpose.
// For example:
// - For classification: Find the index of the maximum value (highest probability).
// - For regression: Use the numerical output directly.
// - For feature extraction: Use the output vector as features for further processing.
} else {
std::cerr << "Error: First output tensor is not of the expected type (float tensor)." << std::endl;
}
} // End of loop for input nodes (assuming single input for simplicity in this example)
} catch (const Ort::Exception& e) {
// Catch ONNX Runtime specific exceptions
std::cerr << "ONNX Runtime Exception: " << e.what() << std::endl;
return 1;
} catch (const std::exception& e) {
// Catch other standard exceptions
std::cerr << "Standard Exception: " << e.what() << std::endl;
return 1;
}
std::cout << "\nProgram finished successfully." << std::endl;
return 0;
} |