// go / cpp /inference /test copy 2.cpp
//
// Upload 71 files
//
// a2dca42 verified 3 months ago
//
// 27.9 kB

	#include <algorithm> // For algorithms like std::min, std::max
	#include <cmath> // For mathematical functions (e.g., std::sin, M_PI, std::log)
	#include <cstddef> // For std::size_t
	#include <cstdint> // For fixed-width integer types (e.g., int16_t)
	#include <fstream> // For file input/output operations (e.g., std::ifstream, std::ofstream)
	#include <iostream> // For standard input/output operations (e.g., std::cout, std::cerr)
	#include <numeric> // For numerical operations (e.g., std::iota)
	#include <string> // For std::string
	#include <vector> // For dynamic arrays (e.g., std::vector<float>)
	// Include the ONNX Runtime C++ API header
	#include <onnxruntime_cxx_api.h>

	// Include Eigen for powerful matrix operations.
	// You need to download Eigen and set up your include paths.
	// E.g., if Eigen is in 'C:/Libraries/eigen-3.4.0', you'd compile with -I C:/Libraries/eigen-3.4.0
	#include <Eigen/Dense>

	// Include KissFFT for Fast Fourier Transform.
	// You need to download KissFFT and set up your include paths.
	// E.g., if KissFFT is in 'C:/Libraries/kissfft-1.3.0', you'd compile with -I C:/Libraries/kissfft-1.3.0
	// You also need to compile kiss_fft.c and kiss_fftr.c and link them.
	#include "kiss_fft.h"
	#include "kiss_fftr.h" // For real-valued FFT

	// Define M_PI if it's not already defined by cmath or your compiler.
	#ifndef M_PI
	#define M_PI 3.14159265358979323846
	#endif

	// --- Feature-extraction settings shared by the functions below (mirroring the Python script) ---
	constexpr float PREEMPHASIS_COEFF = 0.97f; // Weight applied to the previous sample during pre-emphasis
	constexpr int N_FFT = 512; // Number of points in each FFT
	constexpr int WIN_LENGTH = 400; // Samples per analysis window
	constexpr int HOP_LENGTH = 160; // Samples between the starts of consecutive frames
	constexpr int N_MELS = 80; // Channels in the Mel filterbank
	constexpr int TARGET_SAMPLE_RATE = 16000; // Sample rate assumed by the feature extraction

	/**
	* @brief Loads raw 16-bit PCM audio from a file into a float vector.
	*
	* Samples are decoded explicitly as little-endian signed 16-bit integers, so the
	* result does not depend on the byte order of the machine running the program.
	* Each sample is divided by 32768, which maps it into [-1.0, 1.0). A trailing odd
	* byte (an incomplete sample) is ignored.
	*
	* @param filename The path to the PCM audio file.
	* @return A std::vector<float> containing the normalized audio samples, or an empty
	* vector if the file cannot be opened or holds no complete sample.
	*/
	std::vector<float> loadPcmToFloatArray(const std::string& filename) {
	    std::ifstream file(filename, std::ios::binary);
	    if (!file.is_open()) {
	        std::cerr << "Error: Could not open PCM file: " << filename << std::endl;
	        return {};
	    }

	    std::vector<float> audioData;

	    // Read in large chunks instead of issuing one stream read per sample. The chunk
	    // size is even and istream::read only comes up short at end-of-file, so a sample
	    // is never split across two chunks.
	    std::vector<unsigned char> chunk(64 * 1024);
	    do {
	        file.read(reinterpret_cast<char*>(chunk.data()), static_cast<std::streamsize>(chunk.size()));
	        const std::size_t sampleCount = static_cast<std::size_t>(file.gcount()) / 2;
	        for (std::size_t s = 0; s < sampleCount; ++s) {
	            // Low byte first, then sign-extend the 16-bit value by hand so the decode
	            // is independent of host endianness and of integer-conversion rules.
	            int value = chunk[2 * s] | (chunk[2 * s + 1] << 8);
	            if (value >= 32768) {
	                value -= 65536;
	            }
	            audioData.push_back(static_cast<float>(value) / 32768.0f);
	        }
	    } while (file);

	    return audioData;
	}

	/**
	* @brief Generates a symmetric Hamming window.
	*
	* Coefficient i is 0.54 - 0.46 * cos(2*pi*i / (window_length - 1)).
	*
	* @param window_length The length of the window. Non-positive lengths yield an
	* empty vector, and a length of 1 yields {1.0f} (the formula would otherwise
	* divide zero by zero and produce NaN).
	* @return A std::vector<float> containing the Hamming window coefficients.
	*/
	std::vector<float> generateHammingWindow(int window_length) {
	    if (window_length <= 0) {
	        return {};
	    }
	    if (window_length == 1) {
	        return {1.0f};
	    }

	    // 2*pi spelled out so the function does not rely on the optional M_PI macro.
	    const double kTwoPi = 6.283185307179586476925286766559;
	    const double lastIndex = static_cast<double>(window_length - 1);

	    std::vector<float> window(static_cast<std::size_t>(window_length));
	    for (int i = 0; i < window_length; ++i) {
	        // The angle and cosine are evaluated in double; only the result is narrowed.
	        const double angle = kTwoPi * i / lastIndex;
	        window[static_cast<std::size_t>(i)] = static_cast<float>(0.54f - 0.46f * std::cos(angle));
	    }
	    return window;
	}

	/**
	* @brief Extracts a magnitude spectrogram from a waveform, following the steps of
	* Python's _extract_spectrogram that are quoted in the comments below.
	*
	* The waveform is cut into WIN_LENGTH-sample frames taken every HOP_LENGTH samples
	* (samples left over after the last full frame are dropped). Each frame is
	* pre-emphasized on its own, scaled by 32768, multiplied by a Hamming window,
	* zero-padded to N_FFT samples and passed through a real FFT. The magnitude of
	* every output bin is stored.
	*
	* @param wav The input waveform (1D array of floats).
	* @param fs The sampling rate of the waveform. Currently unused: the framing is
	* driven entirely by the file-level constants.
	* @return A 2D Eigen::MatrixXf representing the spectrogram (frames x (N_FFT/2 + 1)).
	* It has zero rows when the waveform is shorter than one window or when the FFT
	* configuration cannot be allocated.
	*/
	Eigen::MatrixXf extractSpectrogram(const std::vector<float>& wav, int fs) {
	    (void)fs; // Kept for interface compatibility; see the @param note above.

	    static_assert(WIN_LENGTH >= 1, "A frame must contain at least one sample.");
	    static_assert(WIN_LENGTH <= N_FFT, "A frame must fit into the FFT input buffer.");
	    const int n_bins = N_FFT / 2 + 1;

	    // Compare before subtracting: wav.size() is unsigned, so size() - WIN_LENGTH would
	    // wrap around to a huge value for short input instead of going negative.
	    if (wav.size() < static_cast<std::size_t>(WIN_LENGTH)) {
	        std::cerr << "Warning: Input waveform too short for feature extraction. Returning empty spectrogram." << std::endl;
	        return Eigen::MatrixXf(0, n_bins);
	    }
	    const int n_batch = static_cast<int>((wav.size() - static_cast<std::size_t>(WIN_LENGTH)) / HOP_LENGTH + 1);

	    // Everything that can throw is allocated before the FFT configuration, so the
	    // configuration cannot leak through an exception.
	    const std::vector<float> fft_window = generateHammingWindow(WIN_LENGTH);
	    Eigen::MatrixXf spec_matrix(n_batch, n_bins); // rows = frames, columns = FFT bins
	    kiss_fft_scalar fft_input[N_FFT];
	    kiss_fft_cpx fft_output[N_FFT / 2 + 1];

	    kiss_fftr_cfg fft_cfg = kiss_fftr_alloc(N_FFT, 0 /* is_inverse_fft */, nullptr, nullptr);
	    if (!fft_cfg) {
	        std::cerr << "Error: Failed to allocate KissFFT configuration." << std::endl;
	        return Eigen::MatrixXf(0, n_bins);
	    }

	    // The zero padding beyond the window never changes (kiss_fftr takes its input
	    // as const), so it is written once.
	    for (int j = WIN_LENGTH; j < N_FFT; ++j) {
	        fft_input[j] = 0.0f;
	    }

	    for (int i = 0; i < n_batch; ++i) {
	        const float* frame = wav.data() + static_cast<std::size_t>(i) * HOP_LENGTH;

	        // Pre-emphasis within the frame, as in the Python reference:
	        //   y_frames_prev = np.roll(y_frames, 1, axis=1)
	        //   y_frames_prev[:, 0] = y_frames_prev[:, 1]
	        //   y_frames = (y_frames - preemphasis * y_frames_prev) * 32768
	        // After the roll, prev[j] = frame[j - 1] for j >= 1; prev[0] is then set to
	        // prev[1], i.e. frame[0]. So the first sample is emphasized against itself.
	        for (int j = 0; j < WIN_LENGTH; ++j) {
	            const float previous = frame[j == 0 ? 0 : j - 1];
	            const float emphasized = frame[j] - PREEMPHASIS_COEFF * previous;
	            fft_input[j] = emphasized * 32768.0f * fft_window[static_cast<std::size_t>(j)];
	        }

	        kiss_fftr(fft_cfg, fft_input, fft_output);

	        // Magnitude of each complex bin.
	        for (int j = 0; j < n_bins; ++j) {
	            const float re = fft_output[j].r;
	            const float im = fft_output[j].i;
	            spec_matrix(i, j) = std::sqrt(re * re + im * im);
	        }
	    }

	    kiss_fftr_free(fft_cfg);
	    return spec_matrix;
	}

	/**
	* @brief Creates a Mel filter-bank matrix, ported from Python's speechlib_mel.
	*
	* The filters are triangles that are equally spaced in Mel space. Only FFT bins in
	* the half-open range [f2bin(fmin) + 1, f2bin(fmax)) receive weights; the DC bin is
	* never used.
	*
	* @param sample_rate Sample rate in Hz (must be positive).
	* @param n_fft FFT size (must be positive).
	* @param n_mels Mel filter size (must be positive).
	* @param fmin Lowest frequency (in Hz), must be >= 0 and below fmax.
	* @param fmax Highest frequency (in Hz). 0.0f is a sentinel meaning sample_rate / 2.
	* Must not exceed sample_rate / 2.
	* @return An Eigen::MatrixXf of shape ((1 + n_fft/2) x n_mels), i.e. already
	* transposed so that (frames x bins) * result gives (frames x n_mels). An empty
	* (0 x 0) matrix is returned when the arguments are invalid.
	*/
	Eigen::MatrixXf speechlibMel(int sample_rate, int n_fft, int n_mels, float fmin, float fmax) {
	    if (sample_rate <= 0 || n_fft <= 0 || n_mels <= 0) {
	        std::cerr << "Error: speechlibMel requires positive sample_rate, n_fft and n_mels." << std::endl;
	        return Eigen::MatrixXf(0, 0);
	    }

	    const int bank_width = n_fft / 2 + 1;
	    const float nyquist = sample_rate / 2.0f;
	    if (fmax == 0.0f) fmax = nyquist; // 0.0f stands in for Python's None

	    // A non-increasing range gives zero-width triangles (ms <= 0), and frequencies
	    // above Nyquist have no FFT bin to map to.
	    if (fmin < 0.0f || !(fmin < fmax) || fmax > nyquist) {
	        std::cerr << "Error: speechlibMel requires 0 <= fmin < fmax <= sample_rate / 2." << std::endl;
	        return Eigen::MatrixXf(0, 0);
	    }

	    // Helper functions for Mel scale conversion
	    const auto mel = [](float f) { return 1127.0f * std::log(1.0f + f / 700.0f); };
	    const auto bin2mel = [&](int fft_bin) {
	        return 1127.0f * std::log(1.0f + static_cast<float>(fft_bin) * sample_rate / (static_cast<float>(n_fft) * 700.0f));
	    };
	    const auto f2bin = [&](float f) { return static_cast<int>((f * n_fft / sample_rate) + 0.5f); };

	    // Spec 1: skip the DC bin and stop before the bin of fmax. khi is an exclusive
	    // bound (which is what the max(khi, klo) clamp implies: the range becomes empty
	    // rather than negative); it is also capped at the matrix height.
	    const int klo = f2bin(fmin) + 1;
	    const int khi = std::min(std::max(f2bin(fmax), klo), bank_width);

	    // Spec 2: SpeechLib uses triangles in Mel space
	    const float mlo = mel(fmin);
	    const float mhi = mel(fmax);

	    // n_mels + 2 equally spaced points: each filter spans three consecutive points.
	    std::vector<float> m_centers(static_cast<std::size_t>(n_mels) + 2);
	    const float ms = (mhi - mlo) / (n_mels + 1);
	    for (std::size_t i = 0; i < m_centers.size(); ++i) {
	        m_centers[i] = mlo + static_cast<float>(i) * ms;
	    }

	    // Filled directly in the transposed (bins x filters) layout the caller multiplies by.
	    Eigen::MatrixXf matrix = Eigen::MatrixXf::Zero(bank_width, n_mels);
	    for (int fft_bin = klo; fft_bin < khi; ++fft_bin) {
	        const float mbin = bin2mel(fft_bin); // computed once per bin, not once per filter
	        for (int m = 0; m < n_mels; ++m) {
	            const float left = m_centers[static_cast<std::size_t>(m)];
	            const float center = m_centers[static_cast<std::size_t>(m) + 1];
	            const float right = m_centers[static_cast<std::size_t>(m) + 2];
	            if (left < mbin && mbin < right) {
	                matrix(fft_bin, m) = 1.0f - std::abs(center - mbin) / ms;
	            }
	        }
	    }
	    return matrix;
	}

	/**
	* @brief Computes log Mel filterbank features for a waveform (the counterpart of
	* Python's _extract_features).
	*
	* Pipeline: magnitude spectrogram -> element-wise square -> projection onto the Mel
	* filterbank -> floor at 1.0 -> natural logarithm.
	*
	* @param wav The input waveform (1D array of floats).
	* @param fs The sampling rate of the waveform, forwarded to extractSpectrogram.
	* @param mel_filterbank The pre-computed Mel filterbank; it is right-multiplied onto
	* the power spectrogram, so its row count must equal the number of spectrogram bins.
	* @return An Eigen::MatrixXf with one row per frame and one column per filterbank
	* column, or a (0 x N_MELS) matrix when no spectrogram could be extracted.
	*/
	Eigen::MatrixXf extractFeatures(const std::vector<float>& wav, int fs, const Eigen::MatrixXf& mel_filterbank) {
	    const Eigen::MatrixXf magnitude = extractSpectrogram(wav, fs);
	    if (magnitude.rows() == 0) {
	        // Nothing to project: the waveform was too short or the FFT setup failed.
	        return Eigen::MatrixXf(0, N_MELS);
	    }

	    // np.clip(spec_power.dot(_mel), 1.0, None): square, project, then floor at 1.0.
	    const Eigen::MatrixXf mel_energy = magnitude.array().square().matrix() * mel_filterbank;

	    // np.log(...): flooring at 1.0 first keeps every logarithm non-negative.
	    Eigen::MatrixXf log_mel = mel_energy.array().max(1.0f).log();
	    return log_mel;
	}


	int main(int argc, char* argv[]) {
	// --- 1. Process command-line arguments ---
	if (argc != 3) {
	std::cerr << "Usage: " << argv[0] << " <path_to_onnx_model> <path_to_pcm_file>" << std::endl;
	std::cerr << "Example: " << argv[0] << " model.onnx audio.pcm" << std::endl;
	return 1;
	}

	std::string onnxModelPath = argv[1];
	std::string pcmFilename = argv[2];

	// --- Configuration for Audio and ONNX Model ---
	// These are fixed by the Python preprocessor code and model requirements.
	int bitDepth = 16;
	// numChannels is handled within loadPcmToFloatArray and then implicitly by feature extraction
	// which squeezes to 1D and takes mean if stereo. For simplicity, we assume mono PCM input.
	// If your PCM is stereo, you'd need to adjust loadPcmToFloatArray to handle channel interleaving
	// and then average or select a channel before passing to extractSpectrogram.
	int numChannels = 1;

	// --- Create a dummy PCM file if it doesn't exist for demonstration ---
	// This is helpful for initial testing without needing an actual PCM file.
	std::ifstream pcmCheck(pcmFilename, std::ios::binary);
	if (!pcmCheck.is_open()) {
	std::cerr << "PCM file '" << pcmFilename << "' not found. Creating a dummy one for demonstration." << std::endl;
	std::ofstream dummyPcmFile(pcmFilename, std::ios::binary);
	if (dummyPcmFile.is_open()) {
	std::cout << "Creating a dummy PCM file: " << pcmFilename << " ("
	<< (TARGET_SAMPLE_RATE * 2 * sizeof(int16_t)) / 1024 << " KB)" << std::endl;
	for (int i = 0; i < TARGET_SAMPLE_RATE * 2; ++i) { // Generate 2 seconds of audio
	int16_t sample = static_cast<int16_t>(30000 * std::sin(2 * M_PI * 440 * i / static_cast<double>(TARGET_SAMPLE_RATE)));
	dummyPcmFile.write(reinterpret_cast<char*>(&sample), sizeof(sample));
	}
	dummyPcmFile.close();
	} else {
	std::cerr << "Error: Could not create dummy PCM file '" << pcmFilename
	<< "'. Please ensure the directory is writable." << std::endl;
	return 1;
	}
	} else {
	pcmCheck.close();
	}


	// --- 2. Load PCM audio data into a float array ---
	std::vector<float> audioWav = loadPcmToFloatArray(pcmFilename);

	if (audioWav.empty()) {
	std::cerr << "Failed to load audio data from " << pcmFilename << ". Exiting." << std::endl;
	return 1;
	}

	std::cout << "Successfully loaded " << audioWav.size() << " samples from " << pcmFilename << std::endl;

	// --- 3. Precompute Mel filterbank (as it's constant for a given sample rate/FFT size) ---
	// The Python example uses fmax=16000//2-80-230. This translates to TARGET_SAMPLE_RATE/2 - 80 - 230.
	// Using 0.0f for fmin as sentinel for None.
	float mel_fmax = static_cast<float>(TARGET_SAMPLE_RATE) / 2.0f - 80.0f - 230.0f;
	Eigen::MatrixXf mel_filterbank = speechlibMel(TARGET_SAMPLE_RATE, N_FFT, N_MELS, 0.0f, mel_fmax);

	if (mel_filterbank.rows() == 0 \|\| mel_filterbank.cols() == 0) {
	std::cerr << "Error: Failed to create Mel filterbank. Exiting." << std::endl;
	return 1;
	}
	std::cout << "Mel filterbank created with shape: [" << mel_filterbank.rows() << ", " << mel_filterbank.cols() << "]" << std::endl;


	// --- 4. Apply feature extraction (preprocessor) ---
	std::cout << "Extracting features from audio..." << std::endl;
	Eigen::MatrixXf features = extractFeatures(audioWav, TARGET_SAMPLE_RATE, mel_filterbank);

	std::ofstream outputFile("matrix_output.txt");
	// Check if the file was opened successfully
	if (outputFile.is_open()) {
	// Iterate through rows and columns to write elements
	for (int i = 0; i < features.rows(); ++i) {
	for (int j = 0; j < features.cols(); ++j) {
	outputFile << features(i, j); // Write the element
	if (j < features.cols() - 1) {
	outputFile << ","; // Add a space separator between elements in a row
	}
	}
	outputFile << std::endl; // Move to the next line after each row
	}
	outputFile.close(); // Close the file
	std::cout << "Matrix successfully written to matrix_output.txt" << std::endl;
	}


	if (features.rows() == 0 \|\| features.cols() == 0) {
	std::cerr << "Error: Feature extraction resulted in an empty matrix. Exiting." << std::endl;
	return 1;
	}
	std::cout << "Features extracted with shape: [" << features.rows() << ", " << features.cols() << "]" << std::endl;
	std::cout << "First few feature values (first frame): [";
	for (int i = 0; i < std::min((int)features.cols(), 5); ++i) {
	std::cout << features(0, i) << (i == std::min((int)features.cols(), 5) - 1 ? "" : ", ");
	}
	std::cout << "]" << std::endl;

	// --- 5. Check for ONNX model existence and provide guidance if missing ---
	std::ifstream onnxModelCheck(onnxModelPath, std::ios::binary);
	if (!onnxModelCheck.is_open()) {
	std::cerr << "\nError: ONNX model file '" << onnxModelPath << "' not found." << std::endl;
	std::cerr << "Please provide a valid ONNX model file. If you need a simple dummy one for testing, "
	<< "you can create it using Python (e.g., with PyTorch) like this:" << std::endl;
	std::cerr << "```python" << std::endl;
	std::cerr << "import torch" << std::endl;
	std::cerr << "import torch.nn as nn" << std::endl;
	std::cerr << "" << std::endl;
	std::cerr << "class SimpleAudioModel(nn.Module):" << std::endl;
	std::cerr << " def __init__(self, input_frames, feature_size, output_size):" << std::endl;
	std::cerr << " super(SimpleAudioModel, self).__init__()" << std::endl;
	std::cerr << " # This model expects input of shape [batch_size, frames, feature_size]" << std::endl;
	std::cerr << " # Example: a simple linear layer that flattens input and processes it." << std::endl;
	std::cerr << " self.flatten = nn.Flatten()" << std::endl;
	std::cerr << " self.linear = nn.Linear(input_frames * feature_size, output_size)" << std::endl;
	std::cerr << "" << std::endl;
	std::cerr << " def forward(self, x):" << std::endl;
	std::cerr << " x = self.flatten(x)" << std::endl;
	std::cerr << " return self.linear(x)" << std::endl;
	std::cerr << "" << std::endl;
	std::cerr << "# --- IMPORTANT: Define model input and output sizes. Adjust these to match your actual model's requirements. ---" << std::endl;
	std::cerr << "# The C++ preprocessor will produce features of shape [frames, 80]." << std::endl;
	std::cerr << "# For a dummy model, we need to provide a fixed 'frames' value for ONNX export." << std::endl;
	std::cerr << "# A typical audio segment might be 2 seconds at 16kHz, which is 32000 samples." << std::endl;
	std::cerr << "# Frames = (32000 - 400) / 160 + 1 = 198.75 + 1 = 199 frames (approx)" << std::endl;
	std::cerr << "# Let's use a representative number of frames, e.g., 200 for a dummy input." << std::endl;
	std::cerr << "DUMMY_INPUT_FRAMES = 200 # This should be representative of your typical audio segment's frames" << std::endl;
	std::cerr << "DUMMY_FEATURE_SIZE = 80 # Fixed by the Mel filterbank (N_MELS)" << std::endl;
	std::cerr << "DUMMY_OUTPUT_SIZE = 10 # Example: 10 classification scores or features" << std::endl;
	std::cerr << "" << std::endl;
	std::cerr << "model = SimpleAudioModel(DUMMY_INPUT_FRAMES, DUMMY_FEATURE_SIZE, DUMMY_OUTPUT_SIZE)" << std::endl;
	std::cerr << "dummy_input_tensor = torch.randn(1, DUMMY_INPUT_FRAMES, DUMMY_FEATURE_SIZE) # Batch size 1" << std::endl;
	std::cerr << "" << std::endl;
	std::cerr << "torch.onnx.export(" << std::endl;
	std::cerr << " model," << std::endl;
	std::cerr << " dummy_input_tensor," << std::endl;
	std::cerr << " \"model.onnx\"," << std::endl;
	std::cerr << " verbose=True," << std::endl;
	std::cerr << " input_names=['input'], # Name of the input tensor in the ONNX graph" << std::endl;
	std::cerr << " output_names=['output'], # Name of the output tensor in the ONNX graph" << std::endl;
	std::cerr << " # Define dynamic axes for batch_size and frames" << std::endl;
	std::cerr << " dynamic_axes={'input': {0: 'batch_size', 1: 'frames'}, 'output': {0: 'batch_size'}}" << std::endl;
	std::cerr << ")" << std::endl;
	std::cerr << "print(\"Dummy model.onnx created successfully. Remember to adjust DUMMY_INPUT_FRAMES in this script to match the expected number of frames from your audio segments.\")" << std::endl;
	std::cerr << "```" << std::endl;
	return 1;
	}
	onnxModelCheck.close();
	std::cout << "ONNX model '" << onnxModelPath << "' found. Proceeding with inference." << std::endl;


	// --- 6. ONNX Runtime Inference ---
	try {
	Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "AudioInference");
	Ort::SessionOptions session_options;
	session_options.SetIntraOpNumThreads(1);
	// session_options.SetGraphOptimizationLevel(ORT_ENABLE_EXTENDED);

	Ort::Session session(env, onnxModelPath.c_str(), session_options);
	std::cout << "Model loaded successfully from: " << onnxModelPath << std::endl;
	Ort::AllocatorWithDefaultOptions allocator;

	// --- Get Input Node Information ---
	size_t numInputNodes = session.GetInputCount();
	std::vector<const char*> inputNodeNames(numInputNodes);

	std::cout << "\n--- Model Input Information ---" << std::endl;
	if (numInputNodes == 0) {
	std::cerr << "Error: Model has no input nodes. Exiting." << std::endl;
	return 1;
	}

	// Assuming a single input node for simplicity
	inputNodeNames[0] = "audio_embeds";
	Ort::TypeInfo type_info = session.GetInputTypeInfo(0);
	auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
	std::vector<int64_t> actualInputShape = tensor_info.GetShape();

	std::cout << " Input 0 : Name='" << inputNodeNames[0] << "', Shape=[";
	for (size_t j = 0; j < actualInputShape.size(); ++j) {
	// Print -1 for dynamic dimensions
	if (actualInputShape[j] == -1) {
	std::cout << "-1";
	} else {
	std::cout << actualInputShape[j];
	}
	std::cout << (j == actualInputShape.size() - 1 ? "" : ", ");
	}
	std::cout << "]" << std::endl;

	// --- Prepare Input Tensor Shape ---
	// The ONNX model input is [batch, frames, feature_size] = [-1, -1, 80]
	// Our extracted features are [frames, 80]. We need to add a batch dimension of 1.
	std::vector<int64_t> inputTensorShape = {1, features.rows(), features.cols()};
	std::cout << " Preparing input tensor with shape: [" << inputTensorShape[0] << ", "
	<< inputTensorShape[1] << ", " << inputTensorShape[2] << "]" << std::endl;

	// Flatten the Eigen::MatrixXf into a std::vector<float> for ONNX Runtime
	std::vector<float> inputTensorData(features.data(), features.data() + features.size());

	Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
	Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memory_info, inputTensorData.data(), inputTensorData.size(),
	inputTensorShape.data(), inputTensorShape.size());

	if (!inputTensor.IsTensor()) {
	std::cerr << "Error: Created input tensor is not valid! Exiting." << std::endl;
	return 1;
	}

	// --- Get Output Node Information ---
	size_t numOutputNodes = session.GetOutputCount();
	std::vector<const char*> outputNodeNames(numOutputNodes);

	std::cout << "\n--- Model Output Information ---" << std::endl;
	for (size_t k = 0; k < numOutputNodes; ++k) {
	outputNodeNames[k] = "audio_features";
	Ort::TypeInfo type_info_out = session.GetOutputTypeInfo(k);
	auto tensor_info_out = type_info_out.GetTensorTypeAndShapeInfo();
	std::vector<int64_t> outputShape = tensor_info_out.GetShape();
	std::cout << " Output " << k << " : Name='" << outputNodeNames[k] << "', Shape=[";
	for (size_t l = 0; l < outputShape.size(); ++l) {
	if (outputShape[l] == -1) {
	std::cout << "-1";
	} else {
	std::cout << outputShape[l];
	}
	std::cout << (l == outputShape.size() - 1 ? "" : ", ");
	}
	std::cout << "]" << std::endl;
	}

	// --- Run Inference ---
	std::cout << "\nRunning ONNX model inference..." << std::endl;
	std::vector<Ort::Value> outputTensors = session.Run(Ort::RunOptions{nullptr},
	inputNodeNames.data(), &inputTensor, 1,
	outputNodeNames.data(), numOutputNodes);
	std::ofstream output_file("f0.txt");
	for (auto& ort_value : outputTensors) {
	// Example: Assuming Ort::Value contains a float tensor
	if (ort_value.IsTensor()) {
	float* data = ort_value.GetTensorMutableData<float>();
	Ort::TensorTypeAndShapeInfo info = ort_value.GetTensorTypeAndShapeInfo();
	size_t num_elements = info.GetElementCount();

	for (size_t i = 0; i < num_elements; ++i) {
	output_file << data[i];
	if (i < num_elements - 1) {
	output_file << ","; // Space separator between elements
	}
	}
	output_file << std::endl; // Newline after each Ort::Value's content
	} else {
	// Handle other Ort::Value types if necessary (e.g., sequences, maps)
	output_file << "Non-tensor Ort::Value" << std::endl;
	}
	}

	output_file.close();


	// --- Process Output ---
	if (outputTensors.empty()) {
	std::cerr << "Error: No output tensors received from the model." << std::endl;
	return 1;
	}

	if (outputTensors[0].IsTensor()) {
	float* outputData = outputTensors[0].GetTensorMutableData<float>();
	Ort::TensorTypeAndShapeInfo outputShapeInfo = outputTensors[0].GetTensorTypeAndShapeInfo();
	std::vector<int64_t> outputShape = outputShapeInfo.GetShape();
	size_t outputSize = outputShapeInfo.GetElementCount();

	std::cout << "\n--- Model Inference Result (first few elements) ---" << std::endl;
	for (size_t k = 0; k < std::min((size_t)10, outputSize); ++k) {
	std::cout << outputData[k] << (k == std::min((size_t)10, outputSize) - 1 ? "" : ", ");
	}
	std::cout << std::endl;

	std::cout << "Full output tensor size: " << outputSize << " elements." << std::endl;
	std::cout << "Full output tensor shape: [";
	for (size_t k = 0; k < outputShape.size(); ++k) {
	std::cout << outputShape[k] << (k == outputShape.size() - 1 ? "" : ", ");
	}
	std::cout << "]" << std::endl;
	} else {
	std::cerr << "Error: First output tensor is not of the expected type (float tensor)." << std::endl;
	}

	} catch (const Ort::Exception& e) {
	std::cerr << "ONNX Runtime Exception: " << e.what() << std::endl;
	return 1;
	} catch (const std::exception& e) {
	std::cerr << "Standard Exception: " << e.what() << std::endl;
	return 1;
	}

	std::cout << "\nProgram finished successfully." << std::endl;
	return 0;
	}