File size: 18,527 Bytes
a2dca42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#include <iostream>
#include <vector>
#include <fstream>   // For file input/output operations (e.g., std::ifstream, std::ofstream)
#include <cstdint>   // For fixed-width integer types (e.g., int16_t)
#include <cmath>     // For mathematical functions (e.g., std::sin, M_PI)
#include <numeric>   // For numerical operations (not strictly used in this version but often useful)
#include <algorithm> // For algorithms like std::min

// Include the ONNX Runtime C++ API header
// You need to have ONNX Runtime installed and linked correctly in your build system.
// For example, using CMake, you might add:
// find_package(ONNXRuntime REQUIRED)
// target_link_libraries(your_executable PRIVATE ONNXRuntime::onnxruntime_cxx_api)
#include <onnxruntime_cxx_api.h>

// Define M_PI if it's not already defined by cmath or your compiler.
// This is common on Windows with MSVC unless _USE_MATH_DEFINES is set.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif


// Loads raw PCM audio from `filename` and normalizes it to floats in [-1.0, 1.0].
//
// Parameters:
//   filename    - path to a headerless PCM file.
//   bitDepth    - bits per sample; only 16-bit signed little-endian is supported.
//   numChannels - channel count of the stream. Samples are loaded as-is
//                 (interleaved); this function does not de-interleave or mix.
//
// Returns: the normalized samples, or an empty vector on any failure
// (file missing / unsupported bit depth). A trailing odd byte, if present,
// is ignored, matching a whole-sample read.
std::vector<float> loadPcmToFloatArray(const std::string& filename, int bitDepth, int numChannels) {
    (void)numChannels; // interleaved data is kept as-is; parameter retained for API compatibility

    // Reject unsupported formats before touching the file contents.
    if (bitDepth != 16) {
        std::cerr << "Error: Unsupported bit depth: " << bitDepth << ". This example only supports 16-bit PCM." << std::endl;
        return {};
    }

    // Open the PCM file in binary mode; RAII closes it on every return path.
    std::ifstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Error: Could not open PCM file: " << filename << std::endl;
        return {};
    }

    // Determine the file size up front so we can read in one bulk operation
    // instead of one read() syscall per sample, and reserve exact capacity.
    file.seekg(0, std::ios::end);
    const std::streamoff byteCount = file.tellg();
    file.seekg(0, std::ios::beg);

    const std::size_t sampleCount = static_cast<std::size_t>(byteCount) / sizeof(int16_t);
    std::vector<int16_t> rawSamples(sampleCount);
    if (sampleCount > 0) {
        file.read(reinterpret_cast<char*>(rawSamples.data()),
                  static_cast<std::streamsize>(sampleCount * sizeof(int16_t)));
    }

    // Normalize 16-bit signed integers to float in [-1.0, 1.0].
    // Dividing by 32768.0f (2^15) maps 32767 to just under 1.0 and
    // -32768 exactly to -1.0, preserving the full dynamic range without overflow.
    std::vector<float> audioData;
    audioData.reserve(sampleCount);
    for (int16_t sample : rawSamples) {
        audioData.push_back(static_cast<float>(sample) / 32768.0f);
    }
    return audioData;
}

int main() {
    // --- Configuration for Audio and ONNX Model ---
    std::string pcmFilename = "/mnt/data-2t/jeff/codes/llm/cpp/sample_data/pickup_breezy-common_voice_zh-TW_17376838-breezyvoice-00818.pcm"; // Name of the PCM audio file to load
    int bitDepth = 16;                     // Bit depth of the PCM data (e.g., 16-bit)
    int numChannels = 1;                   // Number of audio channels (e.g., 1 for mono)
    int sampleRate = 16000;                // Sample rate of the audio (e.g., 16000 Hz)
    std::string onnxModelPath = "/mnt/data-2t/jeff/codes/llm/cpp/onnx_files/speech_init_export/phi-4-mm-speech.onnx"; // Path to your ONNX model file

    // --- 2. Load PCM audio data into a float array ---
    std::vector<float> audioInput = loadPcmToFloatArray(pcmFilename, bitDepth, numChannels);

    if (audioInput.empty()) {
        std::cerr << "Failed to load audio data from " << pcmFilename << ". Exiting." << std::endl;
        return 1; // Exit if audio data loading failed
    }

    std::cout << "Successfully loaded " << audioInput.size() << " samples from " << pcmFilename << std::endl;

    // --- 3. Check for ONNX model existence and provide guidance if missing ---
    // This step is critical. You need a valid ONNX model.
    std::ifstream onnxModelCheck(onnxModelPath, std::ios::binary);
    if (!onnxModelCheck.is_open()) {
        std::cerr << "\nError: ONNX model file '" << onnxModelPath << "' not found." << std::endl;
        std::cerr << "Please provide a valid ONNX model file. If you need a simple dummy one for testing, "
                  << "you can create it using Python (e.g., with PyTorch) like this:" << std::endl;
        std::cerr << "```python" << std::endl;
        std::cerr << "import torch" << std::endl;
        std::cerr << "import torch.nn as nn" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "class SimpleAudioModel(nn.Module):" << std::endl;
        std::cerr << "    def __init__(self, input_size, output_size):" << std::endl;
        std::cerr << "        super(SimpleAudioModel, self).__init__()" << std::endl;
        std::cerr << "        # This is a very simple linear layer. Your actual model will be more complex." << std::endl;
        std::cerr << "        # This model expects input of shape [batch_size, input_size]" << std::endl;
        std::cerr << "        self.linear = nn.Linear(input_size, output_size)" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "    def forward(self, x):" << std::endl;
        std::cerr << "        # If your model expects a different input shape (e.g., [batch_size, channels, samples])," << std::endl;
        std::cerr << "        # you might need to reshape 'x' here before passing it to your layers (e.g., x.view(x.size(0), 1, -1))." << std::endl;
        std::cerr << "        return self.linear(x)" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "# --- IMPORTANT: Define model input and output sizes. Adjust these to match your actual model's requirements. ---" << std::endl;
        std::cerr << "# For this dummy model, we'll assume an input size matching our 2-second, 44.1kHz mono audio." << std::endl;
        std::cerr << "DUMMY_INPUT_SIZE = " << (sampleRate * 2) << " # Corresponds to " << (sampleRate * 2) / static_cast<float>(sampleRate) << " seconds of audio at " << sampleRate << " Hz mono" << std::endl;
        std::cerr << "DUMMY_OUTPUT_SIZE = 10   # Example: 10 classification scores or features" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "model = SimpleAudioModel(DUMMY_INPUT_SIZE, DUMMY_OUTPUT_SIZE)" << std::endl;
        std::cerr << "dummy_input_tensor = torch.randn(1, DUMMY_INPUT_SIZE) # Batch size 1, DUMMY_INPUT_SIZE features" << std::endl;
        std::cerr << "" << std::endl;
        std::cerr << "torch.onnx.export(" << std::endl;
        std::cerr << "    model," << std::endl;
        std::cerr << "    dummy_input_tensor," << std::endl;
        std::cerr << "    \"model.onnx\"," << std::endl;
        std::cerr << "    verbose=True," << std::endl;
        std::cerr << "    input_names=['input'],   # Name of the input tensor in the ONNX graph" << std::endl;
        std::cerr << "    output_names=['output'], # Name of the output tensor in the ONNX graph" << std::endl;
        std::cerr << "    # Optional: Define dynamic axes if your batch size or sequence length can vary" << std::endl;
        std::cerr << "    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}" << std::endl;
        std::cerr << ")" << std::endl;
        std::cerr << "print(\"Dummy model.onnx created successfully. Remember to adjust DUMMY_INPUT_SIZE in this script to match the length of your audio data or ensure your C++ code pads/truncates the audio data to the model's expected input size.\")" << std::endl;
        std::cerr << "```" << std::endl;
        return 1; // Exit if the ONNX model is not found
    }
    onnxModelCheck.close();
    std::cout << "ONNX model '" << onnxModelPath << "' found. Proceeding with inference." << std::endl;


    // --- 4. ONNX Runtime Inference ---
    try {
        // Create an ONNX Runtime environment. This is the entry point for all ONNX Runtime operations.
        // ORT_LOGGING_LEVEL_WARNING suppresses verbose output unless there's a warning or error.
        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "AudioInference");

        // Configure session options.
        Ort::SessionOptions session_options;
        session_options.SetIntraOpNumThreads(1);         // Use 1 thread for operations within a single node
        session_options.SetGraphOptimizationLevel(ORT_ENABLE_EXTENDED); // Apply all available graph optimizations

        // Create an ONNX Runtime session by loading the model.
        Ort::Session session(env, onnxModelPath.c_str(), session_options);

        // Get model input and output names and shapes.
        // An allocator is needed to manage memory for allocated strings (like node names).
        Ort::AllocatorWithDefaultOptions allocator;

        // --- Get Input Node Information ---
        size_t numInputNodes = session.GetInputCount();
        std::vector<const char*> inputNodeNames(numInputNodes); // To store input node names

        std::cout << "\n--- Model Input Information ---" << std::endl;
        // Iterate through all input nodes (models usually have one main input)
        for (size_t i = 0; i < numInputNodes; ++i) {
            // Get the input node name
            inputNodeNames[i] = session.GetInputNameAllocated(i, allocator).get();

            // Get the type and shape information for the input tensor
            Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
            auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
            std::vector<int64_t> actualInputShape = tensor_info.GetShape(); // Get the shape the model *expects*

            std::cout << "  Input " << i << " : Name='" << inputNodeNames[i] << "', Shape=[";
            for (size_t j = 0; j < actualInputShape.size(); ++j) {
                std::cout << actualInputShape[j] << (j == actualInputShape.size() - 1 ? "" : ", ");
            }
            std::cout << "]" << std::endl;

            // --- Prepare Input Tensor Shape ---
            // This is a CRITICAL step. The `audioInput` vector must be reshaped
            // to precisely match the ONNX model's expected input tensor shape.
            // The dummy Python model provided above creates an input of shape [1, DUMMY_INPUT_SIZE].
            // We need to ensure `audioInput` matches `DUMMY_INPUT_SIZE` or pad/truncate it.
            std::vector<int64_t> inputTensorShape; // This will be the shape of the tensor we create

            if (actualInputShape.size() == 2 && actualInputShape[0] == 1) {
                // Case: Model expects a 2D input with batch size 1 (e.g., [1, num_features])
                int64_t expected_length = actualInputShape[1]; // The expected number of features/samples

                // Check if the loaded audio data size matches the model's expected input length
                if (audioInput.size() != expected_length) {
                    std::cout << "  Warning: Loaded audio input size (" << audioInput.size()
                              << ") does not match model's expected input length (" << expected_length << ")." << std::endl;
                    std::cout << "  Padding/truncating audio data to match model input size." << std::endl;
                    audioInput.resize(expected_length, 0.0f); // Pad with zeros or truncate the audio data
                }
                inputTensorShape = {1, expected_length}; // Set the tensor shape for ONNX Runtime
            } else if (actualInputShape.size() == 1) {
                // Case: Model expects a 1D input (e.g., [num_features])
                int64_t expected_length = actualInputShape[0];

                if (audioInput.size() != expected_length) {
                    std::cout << "  Warning: Loaded audio input size (" << audioInput.size()
                              << ") does not match model's expected input length (" << expected_length << ")." << std::endl;
                    std::cout << "  Padding/truncating audio data to match model input size." << std::endl;
                    audioInput.resize(expected_length, 0.0f); // Pad with zeros or truncate
                }
                inputTensorShape = {expected_length}; // Set the tensor shape for ONNX Runtime
            } else {
                std::cerr << "Error: Model input shape is not supported by this example ([N] or [1, N]). "
                          << "Please adjust the input tensor shape creation logic in C++ to match your model's specific requirements." << std::endl;
                return 1; // Exit if the input shape is not handled
            }

            // Create an ONNX Runtime memory info object for CPU memory.
            // This specifies where the tensor data is located (CPU in this case).
            Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);

            // Create the input tensor from the audio data.
            // `audioInput.data()` provides a pointer to the raw float data.
            // `audioInput.size()` is the total number of elements.
            // `inputTensorShape.data()` provides the shape array.
            // `inputTensorShape.size()` is the number of dimensions.
            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memory_info, audioInput.data(), audioInput.size(),
                                                                      inputTensorShape.data(), inputTensorShape.size());

            // Verify that the created input tensor is valid
            if (!inputTensor.IsTensor()) {
                std::cerr << "Error: Created input tensor is not valid! This might indicate a shape mismatch or data issue." << std::endl;
                return 1; // Exit if the tensor is invalid
            }

            // At this point, `inputTensor` is ready to be fed into the model.
            // For simplicity, we assume there's only one input to the model.
            // If your model has multiple inputs, you'd need to create multiple Ort::Value objects.

            // --- Get Output Node Information ---
            size_t numOutputNodes = session.GetOutputCount();
            std::vector<const char*> outputNodeNames(numOutputNodes); // To store output node names

            std::cout << "\n--- Model Output Information ---" << std::endl;
            // Iterate through all output nodes
            for (size_t k = 0; k < numOutputNodes; ++k) {
                outputNodeNames[k] = session.GetOutputNameAllocated(k, allocator).get();
                Ort::TypeInfo type_info_out = session.GetOutputTypeInfo(k);
                auto tensor_info_out = type_info_out.GetTensorTypeAndShapeInfo();
                std::vector<int64_t> outputShape = tensor_info_out.GetShape();
                std::cout << "  Output " << k << " : Name='" << outputNodeNames[k] << "', Shape=[";
                for (size_t l = 0; l < outputShape.size(); ++l) {
                    std::cout << outputShape[l] << (l == outputShape.size() - 1 ? "" : ", ");
                }
                std::cout << "]" << std::endl;
            }

            // --- Run Inference ---
            std::cout << "\nRunning ONNX model inference..." << std::endl;
            // The `session.Run` method executes the model.
            // Arguments:
            //   - Ort::RunOptions{nullptr}: Default run options.
            //   - inputNodeNames.data(): Array of C-style strings for input names.
            //   - &inputTensor: Pointer to the array of input tensors (here, just one).
            //   - 1: Number of input tensors.
            //   - outputNodeNames.data(): Array of C-style strings for output names.
            //   - numOutputNodes: Number of output tensors expected.
            std::vector<Ort::Value> outputTensors = session.Run(Ort::RunOptions{nullptr},
                                                                inputNodeNames.data(), &inputTensor, 1,
                                                                outputNodeNames.data(), numOutputNodes);

            // --- Process Output ---
            if (outputTensors.empty()) {
                std::cerr << "Error: No output tensors received from the model." << std::endl;
                return 1; // Exit if no output
            }

            // Assuming the first output is a float tensor (common for most models)
            if (outputTensors[0].IsTensor()) {
                // Get a mutable pointer to the raw data of the output tensor
                float* outputData = outputTensors[0].GetTensorMutableData<float>();
                Ort::TensorTypeAndShapeInfo outputShapeInfo = outputTensors[0].GetTensorTypeAndShapeInfo();
                std::vector<int64_t> outputShape = outputShapeInfo.GetShape();
                size_t outputSize = outputShapeInfo.GetElementCount(); // Total number of elements in the output tensor

                std::cout << "\n--- Model Inference Result (first few elements) ---" << std::endl;
                // Print the first 10 elements of the output (or fewer if output is smaller)
                for (size_t k = 0; k < std::min((size_t)10, outputSize); ++k) {
                    std::cout << outputData[k] << (k == std::min((size_t)10, outputSize) - 1 ? "" : ", ");
                }
                std::cout << std::endl;

                std::cout << "Full output tensor size: " << outputSize << " elements." << std::endl;
                std::cout << "Full output tensor shape: [";
                for (size_t k = 0; k < outputShape.size(); ++k) {
                    std::cout << outputShape[k] << (k == outputShape.size() - 1 ? "" : ", ");
                }
                std::cout << "]" << std::endl;

                // Here you would typically interpret the model's output based on its purpose.
                // For example:
                // - For classification: Find the index of the maximum value (highest probability).
                // - For regression: Use the numerical output directly.
                // - For feature extraction: Use the output vector as features for further processing.
            } else {
                std::cerr << "Error: First output tensor is not of the expected type (float tensor)." << std::endl;
            }
        } // End of loop for input nodes (assuming single input for simplicity in this example)

    } catch (const Ort::Exception& e) {
        // Catch ONNX Runtime specific exceptions
        std::cerr << "ONNX Runtime Exception: " << e.what() << std::endl;
        return 1;
    } catch (const std::exception& e) {
        // Catch other standard exceptions
        std::cerr << "Standard Exception: " << e.what() << std::endl;
        return 1;
    }

    std::cout << "\nProgram finished successfully." << std::endl;
    return 0;
}