File size: 18,527 Bytes
a2dca42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#include <iostream>
#include <vector>
#include <fstream>   // For file input/output operations (e.g., std::ifstream, std::ofstream)
#include <cstdint>   // For fixed-width integer types (e.g., int16_t)
#include <cmath>     // For mathematical functions (e.g., std::sin, M_PI)
#include <numeric>   // For numerical operations (not strictly used in this version but often useful)
#include <algorithm> // For algorithms like std::min

// Include the ONNX Runtime C++ API header
// You need to have ONNX Runtime installed and linked correctly in your build system.
// For example, using CMake, you might add:
// find_package(ONNXRuntime REQUIRED)
// target_link_libraries(your_executable PRIVATE ONNXRuntime::onnxruntime_cxx_api)
#include <onnxruntime_cxx_api.h>

// Define M_PI if it's not already defined by cmath or your compiler.
// This is common on Windows with MSVC unless _USE_MATH_DEFINES is set.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif


// Loads raw PCM audio from `filename` and normalizes it to floats in [-1.0, 1.0].
//
// Parameters:
//   filename    - path to a headerless PCM file.
//   bitDepth    - bits per sample; only 16-bit signed little-endian is supported.
//   numChannels - channel count of the stream. Samples are loaded as-is
//                 (interleaved); this function does not de-interleave or mix.
//
// Returns: the normalized samples, or an empty vector on any failure
// (file missing / unsupported bit depth). A trailing odd byte, if present,
// is ignored, matching a whole-sample read.
std::vector<float> loadPcmToFloatArray(const std::string& filename, int bitDepth, int numChannels) {
    (void)numChannels; // interleaved data is kept as-is; parameter retained for API compatibility

    // Reject unsupported formats before touching the file contents.
    if (bitDepth != 16) {
        std::cerr << "Error: Unsupported bit depth: " << bitDepth << ". This example only supports 16-bit PCM." << std::endl;
        return {};
    }

    // Open the PCM file in binary mode; RAII closes it on every return path.
    std::ifstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Error: Could not open PCM file: " << filename << std::endl;
        return {};
    }

    // Determine the file size up front so we can read in one bulk operation
    // instead of one read() syscall per sample, and reserve exact capacity.
    file.seekg(0, std::ios::end);
    const std::streamoff byteCount = file.tellg();
    file.seekg(0, std::ios::beg);

    const std::size_t sampleCount = static_cast<std::size_t>(byteCount) / sizeof(int16_t);
    std::vector<int16_t> rawSamples(sampleCount);
    if (sampleCount > 0) {
        file.read(reinterpret_cast<char*>(rawSamples.data()),
                  static_cast<std::streamsize>(sampleCount * sizeof(int16_t)));
    }

    // Normalize 16-bit signed integers to float in [-1.0, 1.0].
    // Dividing by 32768.0f (2^15) maps 32767 to just under 1.0 and
    // -32768 exactly to -1.0, preserving the full dynamic range without overflow.
    std::vector<float> audioData;
    audioData.reserve(sampleCount);
    for (int16_t sample : rawSamples) {
        audioData.push_back(static_cast<float>(sample) / 32768.0f);
    }
    return audioData;
}

int main() {
    // --- Configuration for Audio and ONNX Model ---
    std::string pcmFilename = "/mnt/data-2t/jeff/codes/llm/cpp/sample_data/pickup_breezy-common_voice_zh-TW_17376838-breezyvoice-00818.pcm"; // Name of the PCM audio file to load
    int bitDepth = 16;                     // Bit depth of the PCM data (e.g., 16-bit)
    int numChannels = 1;                   // Number of audio channels (e.g., 1 for mono)
    int sampleRate = 16000;                // Sample rate of the audio (e.g., 16000 Hz)
    std::string onnxModelPath = "/mnt/data-2t/jeff/codes/llm/cpp/onnx_files/speech_init_export/phi-4-mm-speech.onnx"; // Path to your ONNX model file

    // --- 2. Load PCM audio data into a float array ---
    std::vector<float> audioInput = loadPcmToFloatArray(pcmFilename, bitDepth, numChannels);

    if (audioInput.empty()) {
        std::cerr << "Failed to load audio data from " << pcmFilename << ". Exiting." << std::endl;
        return 1; // Exit if audio data loading failed
    }

    std::cout << "Successfully loaded " << audioInput.size() << " samples from " << pcmFilename << std::endl;

    // --- 3. Check for ONNX model existence and provide guidance if missing ---
    // This step is critical. You need a valid ONNX model.
    std::ifstream onnxModelCheck(onnxModelPath, std::ios::binary);
    if (!onnxModelCheck.is_open()) {
        std::cerr << "\nError: ONNX model file '" << onnxModelPath << "' not found." << std::endl;
        std::cerr << "Please provide a valid ONNX model file. If you need a simple dummy one for testing, "
                  << "you can create it using Python (e.g., with PyTorch) like this:" << std::endl;
        std::cerr << "```python" << std::endl;
        std::cerr << "import torch" << std::endl;
        std::cerr << "import torch.nn as nn" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "class SimpleAudioModel(nn.Module):" << std::endl;
        std::cerr << "    def __init__(self, input_size, output_size):" << std::endl;
        std::cerr << "        super(SimpleAudioModel, self).__init__()" << std::endl;
        std::cerr << "        # This is a very simple linear layer. Your actual model will be more complex." << std::endl;
        std::cerr << "        # This model expects input of shape [batch_size, input_size]" << std::endl;
        std::cerr << "        self.linear = nn.Linear(input_size, output_size)" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "    def forward(self, x):" << std::endl;
        std::cerr << "        # If your model expects a different input shape (e.g., [batch_size, channels, samples])," << std::endl;
        std::cerr << "        # you might need to reshape 'x' here before passing it to your layers (e.g., x.view(x.size(0), 1, -1))." << std::endl;
        std::cerr << "        return self.linear(x)" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "# --- IMPORTANT: Define model input and output sizes. Adjust these to match your actual model's requirements. ---" << std::endl;
        std::cerr << "# For this dummy model, we'll assume an input size matching our 2-second, 44.1kHz mono audio." << std::endl;
        std::cerr << "DUMMY_INPUT_SIZE = " << (sampleRate * 2) << " # Corresponds to " << (sampleRate * 2) / static_cast<float>(sampleRate) << " seconds of audio at " << sampleRate << " Hz mono" << std::endl;
        std::cerr << "DUMMY_OUTPUT_SIZE = 10   # Example: 10 classification scores or features" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "model = SimpleAudioModel(DUMMY_INPUT_SIZE, DUMMY_OUTPUT_SIZE)" << std::endl;
        std::cerr << "dummy_input_tensor = torch.randn(1, DUMMY_INPUT_SIZE) # Batch size 1, DUMMY_INPUT_SIZE features" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "torch.onnx.export(" << std::endl;
        std::cerr << "    model," << std::endl;
        std::cerr << "    dummy_input_tensor," << std::endl;
        std::cerr << "    \"model.onnx\"," << std::endl;
        std::cerr << "    verbose=True," << std::endl;
        std::cerr << "    input_names=['input'],   # Name of the input tensor in the ONNX graph" << std::endl;
        std::cerr << "    output_names=['output'], # Name of the output tensor in the ONNX graph" << std::endl;
        std::cerr << "    # Optional: Define dynamic axes if your batch size or sequence length can vary" << std::endl;
        std::cerr << "    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}" << std::endl;
        std::cerr << ")" << std::endl;
        std::cerr << "print(\"Dummy model.onnx created successfully. Remember to adjust DUMMY_INPUT_SIZE in this script to match the length of your audio data or ensure your C++ code pads/truncates the audio data to the model's expected input size.\")" << std::endl;
        std::cerr << "```" << std::endl;
        return 1; // Exit if the ONNX model is not found
    }
    onnxModelCheck.close();
    std::cout << "ONNX model '" << onnxModelPath << "' found. Proceeding with inference." << std::endl;


    // --- 4. ONNX Runtime Inference ---
    try {
        // Create an ONNX Runtime environment. This is the entry point for all ONNX Runtime operations.
        // ORT_LOGGING_LEVEL_WARNING suppresses verbose output unless there's a warning or error.
        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "AudioInference");

        // Configure session options.
        Ort::SessionOptions session_options;
        session_options.SetIntraOpNumThreads(1);         // Use 1 thread for operations within a single node
        session_options.SetGraphOptimizationLevel(ORT_ENABLE_EXTENDED); // Apply all available graph optimizations

        // Create an ONNX Runtime session by loading the model.
        Ort::Session session(env, onnxModelPath.c_str(), session_options);

        // Get model input and output names and shapes.
        // An allocator is needed to manage memory for allocated strings (like node names).
        Ort::AllocatorWithDefaultOptions allocator;

        // --- Get Input Node Information ---
        size_t numInputNodes = session.GetInputCount();
        std::vector<const char*> inputNodeNames(numInputNodes); // To store input node names

        std::cout << "\n--- Model Input Information ---" << std::endl;
        // Iterate through all input nodes (models usually have one main input)
        for (size_t i = 0; i < numInputNodes; ++i) {
            // Get the input node name
            inputNodeNames[i] = session.GetInputNameAllocated(i, allocator).get();

            // Get the type and shape information for the input tensor
            Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
            auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
            std::vector<int64_t> actualInputShape = tensor_info.GetShape(); // Get the shape the model *expects*

            std::cout << "  Input " << i << " : Name='" << inputNodeNames[i] << "', Shape=[";
            for (size_t j = 0; j < actualInputShape.size(); ++j) {
                std::cout << actualInputShape[j] << (j == actualInputShape.size() - 1 ? "" : ", ");
            }
            std::cout << "]" << std::endl;

            // --- Prepare Input Tensor Shape ---
            // This is a CRITICAL step. The `audioInput` vector must be reshaped
            // to precisely match the ONNX model's expected input tensor shape.
            // The dummy Python model provided above creates an input of shape [1, DUMMY_INPUT_SIZE].
            // We need to ensure `audioInput` matches `DUMMY_INPUT_SIZE` or pad/truncate it.
            std::vector<int64_t> inputTensorShape; // This will be the shape of the tensor we create

            if (actualInputShape.size() == 2 && actualInputShape[0] == 1) {
                // Case: Model expects a 2D input with batch size 1 (e.g., [1, num_features])
                int64_t expected_length = actualInputShape[1]; // The expected number of features/samples

                // Check if the loaded audio data size matches the model's expected input length
                if (audioInput.size() != expected_length) {
                    std::cout << "  Warning: Loaded audio input size (" << audioInput.size()
                              << ") does not match model's expected input length (" << expected_length << ")." << std::endl;
                    std::cout << "  Padding/truncating audio data to match model input size." << std::endl;
                    audioInput.resize(expected_length, 0.0f); // Pad with zeros or truncate the audio data
                }
                inputTensorShape = {1, expected_length}; // Set the tensor shape for ONNX Runtime
            } else if (actualInputShape.size() == 1) {
                // Case: Model expects a 1D input (e.g., [num_features])
                int64_t expected_length = actualInputShape[0];

                if (audioInput.size() != expected_length) {
                    std::cout << "  Warning: Loaded audio input size (" << audioInput.size()
                              << ") does not match model's expected input length (" << expected_length << ")." << std::endl;
                    std::cout << "  Padding/truncating audio data to match model input size." << std::endl;
                    audioInput.resize(expected_length, 0.0f); // Pad with zeros or truncate
                }
                inputTensorShape = {expected_length}; // Set the tensor shape for ONNX Runtime
            } else {
                std::cerr << "Error: Model input shape is not supported by this example ([N] or [1, N]). "
                          << "Please adjust the input tensor shape creation logic in C++ to match your model's specific requirements." << std::endl;
                return 1; // Exit if the input shape is not handled
            }

            // Create an ONNX Runtime memory info object for CPU memory.
            // This specifies where the tensor data is located (CPU in this case).
            Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);

            // Create the input tensor from the audio data.
            // `audioInput.data()` provides a pointer to the raw float data.
            // `audioInput.size()` is the total number of elements.
            // `inputTensorShape.data()` provides the shape array.
            // `inputTensorShape.size()` is the number of dimensions.
            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memory_info, audioInput.data(), audioInput.size(),
                                                                      inputTensorShape.data(), inputTensorShape.size());

            // Verify that the created input tensor is valid
            if (!inputTensor.IsTensor()) {
                std::cerr << "Error: Created input tensor is not valid! This might indicate a shape mismatch or data issue." << std::endl;
                return 1; // Exit if the tensor is invalid
            }

            // At this point, `inputTensor` is ready to be fed into the model.
            // For simplicity, we assume there's only one input to the model.
            // If your model has multiple inputs, you'd need to create multiple Ort::Value objects.

            // --- Get Output Node Information ---
            size_t numOutputNodes = session.GetOutputCount();
            std::vector<const char*> outputNodeNames(numOutputNodes); // To store output node names

            std::cout << "\n--- Model Output Information ---" << std::endl;
            // Iterate through all output nodes
            for (size_t k = 0; k < numOutputNodes; ++k) {
                outputNodeNames[k] = session.GetOutputNameAllocated(k, allocator).get();
                Ort::TypeInfo type_info_out = session.GetOutputTypeInfo(k);
                auto tensor_info_out = type_info_out.GetTensorTypeAndShapeInfo();
                std::vector<int64_t> outputShape = tensor_info_out.GetShape();
                std::cout << "  Output " << k << " : Name='" << outputNodeNames[k] << "', Shape=[";
                for (size_t l = 0; l < outputShape.size(); ++l) {
                    std::cout << outputShape[l] << (l == outputShape.size() - 1 ? "" : ", ");
                }
                std::cout << "]" << std::endl;
            }

            // --- Run Inference ---
            std::cout << "\nRunning ONNX model inference..." << std::endl;
            // The `session.Run` method executes the model.
            // Arguments:
            //   - Ort::RunOptions{nullptr}: Default run options.
            //   - inputNodeNames.data(): Array of C-style strings for input names.
            //   - &inputTensor: Pointer to the array of input tensors (here, just one).
            //   - 1: Number of input tensors.
            //   - outputNodeNames.data(): Array of C-style strings for output names.
            //   - numOutputNodes: Number of output tensors expected.
            std::vector<Ort::Value> outputTensors = session.Run(Ort::RunOptions{nullptr},
                                                                inputNodeNames.data(), &inputTensor, 1,
                                                                outputNodeNames.data(), numOutputNodes);

            // --- Process Output ---
            if (outputTensors.empty()) {
                std::cerr << "Error: No output tensors received from the model." << std::endl;
                return 1; // Exit if no output
            }

            // Assuming the first output is a float tensor (common for most models)
            if (outputTensors[0].IsTensor()) {
                // Get a mutable pointer to the raw data of the output tensor
                float* outputData = outputTensors[0].GetTensorMutableData<float>();
                Ort::TensorTypeAndShapeInfo outputShapeInfo = outputTensors[0].GetTensorTypeAndShapeInfo();
                std::vector<int64_t> outputShape = outputShapeInfo.GetShape();
                size_t outputSize = outputShapeInfo.GetElementCount(); // Total number of elements in the output tensor

                std::cout << "\n--- Model Inference Result (first few elements) ---" << std::endl;
                // Print the first 10 elements of the output (or fewer if output is smaller)
                for (size_t k = 0; k < std::min((size_t)10, outputSize); ++k) {
                    std::cout << outputData[k] << (k == std::min((size_t)10, outputSize) - 1 ? "" : ", ");
                }
                std::cout << std::endl;

                std::cout << "Full output tensor size: " << outputSize << " elements." << std::endl;
                std::cout << "Full output tensor shape: [";
                for (size_t k = 0; k < outputShape.size(); ++k) {
                    std::cout << outputShape[k] << (k == outputShape.size() - 1 ? "" : ", ");
                }
                std::cout << "]" << std::endl;

                // Here you would typically interpret the model's output based on its purpose.
                // For example:
                // - For classification: Find the index of the maximum value (highest probability).
                // - For regression: Use the numerical output directly.
                // - For feature extraction: Use the output vector as features for further processing.
            } else {
                std::cerr << "Error: First output tensor is not of the expected type (float tensor)." << std::endl;
            }
        } // End of loop for input nodes (assuming single input for simplicity in this example)

    } catch (const Ort::Exception& e) {
        // Catch ONNX Runtime specific exceptions
        std::cerr << "ONNX Runtime Exception: " << e.what() << std::endl;
        return 1;
    } catch (const std::exception& e) {
        // Catch other standard exceptions
        std::cerr << "Standard Exception: " << e.what() << std::endl;
        return 1;
    }

    std::cout << "\nProgram finished successfully." << std::endl;
    return 0;
}