/* * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ //! //! sampleNonZeroPlugin.cpp //! This file contains a sample demonstrating a plugin for NonZero. //! It can be run with the following command line: //! Command: ./sample_non_zero_plugin [-h or --help] [-d=/path/to/data/dir or --datadir=/path/to/data/dir] //! // Define TRT entrypoints used in common code #define DEFINE_TRT_ENTRYPOINTS 1 #define DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT 0 #include "argsParser.h" #include "buffers.h" #include "common.h" #include "logger.h" #include "nonZeroKernel.h" #include "parserOnnxConfig.h" #include "NvInfer.h" #include #include #include #include #include #include using namespace nvinfer1; using samplesCommon::SampleUniquePtr; std::string const kSAMPLE_NAME = "TensorRT.sample_non_zero_plugin"; using half = __half; void nonZeroIndicesHelper(nvinfer1::DataType type, void const* X, void* indices, void* count, void const* K, int32_t R, int32_t C, bool rowOrder, cudaStream_t stream) { if (type == nvinfer1::DataType::kFLOAT) { nonZeroIndicesImpl(static_cast(X), static_cast(indices), static_cast(count), static_cast(K), R, C, rowOrder, stream); } else if (type == nvinfer1::DataType::kHALF) { nonZeroIndicesImpl(static_cast(X), static_cast(indices), static_cast(count), static_cast(K), R, C, rowOrder, stream); } else { ASSERT(false && "Unsupported data type"); } } 
class NonZeroPlugin : public IPluginV3, public IPluginV3OneCore, public IPluginV3OneBuild, public IPluginV3OneRuntime { public: NonZeroPlugin(NonZeroPlugin const& p) = default; NonZeroPlugin(bool rowOrder) : mRowOrder(rowOrder) { initFieldsToSerialize(); } void initFieldsToSerialize() { mDataToSerialize.clear(); mDataToSerialize.emplace_back(PluginField("rowOrder", &mRowOrder, PluginFieldType::kINT32, 1)); mFCToSerialize.nbFields = mDataToSerialize.size(); mFCToSerialize.fields = mDataToSerialize.data(); } // IPluginV3 methods IPluginCapability* getCapabilityInterface(PluginCapabilityType type) noexcept override { try { if (type == PluginCapabilityType::kBUILD) { return static_cast(this); } if (type == PluginCapabilityType::kRUNTIME) { return static_cast(this); } ASSERT(type == PluginCapabilityType::kCORE); return static_cast(this); } catch (std::exception const& e) { sample::gLogError << e.what() << std::endl; } return nullptr; } IPluginV3* clone() noexcept override { auto clone = std::make_unique(*this); clone->initFieldsToSerialize(); return clone.release(); } // IPluginV3OneCore methods char const* getPluginName() const noexcept override { return "NonZeroPlugin"; } char const* getPluginVersion() const noexcept override { return "0"; } char const* getPluginNamespace() const noexcept override { return ""; } // IPluginV3OneBuild methods int32_t getNbOutputs() const noexcept override { return 2; } int32_t configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override { return 0; } bool supportsFormatCombination( int32_t pos, DynamicPluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override { bool typeOk{false}; if (pos == 0) { typeOk = inOut[0].desc.type == DataType::kFLOAT || inOut[0].desc.type == DataType::kHALF; } else if (pos == 1) { typeOk = inOut[1].desc.type == DataType::kINT32; } else // pos == 2 { // size tensor outputs must be NCHW INT32 typeOk = 
inOut[2].desc.type == DataType::kINT32; } return inOut[pos].desc.format == PluginFormat::kLINEAR && typeOk; } int32_t getOutputDataTypes( DataType* outputTypes, int32_t nbOutputs, DataType const* inputTypes, int32_t nbInputs) const noexcept override { outputTypes[0] = DataType::kINT32; outputTypes[1] = DataType::kINT32; return 0; } int32_t getOutputShapes(DimsExprs const* inputs, int32_t nbInputs, DimsExprs const* shapeInputs, int32_t nbShapeInputs, DimsExprs* outputs, int32_t nbOutputs, IExprBuilder& exprBuilder) noexcept override { // The input tensor must be 2-D if (inputs[0].nbDims != 2) { return -1; } outputs[0].nbDims = 2; auto upperBound = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[0], *inputs[0].d[1]); // On average, we can assume that half of all elements will be non-zero auto optValue = exprBuilder.operation(DimensionOperation::kFLOOR_DIV, *upperBound, *exprBuilder.constant(2)); auto numNonZeroSizeTensor = exprBuilder.declareSizeTensor(1, *optValue, *upperBound); if (!mRowOrder) { outputs[0].d[0] = exprBuilder.constant(2); outputs[0].d[1] = numNonZeroSizeTensor; } else { outputs[0].d[0] = numNonZeroSizeTensor; outputs[0].d[1] = exprBuilder.constant(2); } // output at index 1 is a size tensor outputs[1].nbDims = 0; // size tensors must be declared as 0-D return 0; } // IPluginV3OneRuntime methods int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override { int32_t const R = inputDesc[0].dims.d[0]; int32_t const C = inputDesc[0].dims.d[1]; auto type = inputDesc[0].type; if (!(type == nvinfer1::DataType::kHALF || type == nvinfer1::DataType::kFLOAT)) { sample::gLogError << "Unsupported: Sample only supports DataType::kHALF and DataType::FLOAT" << std::endl; return -1; } cudaMemsetAsync(outputs[1], 0, sizeof(int32_t), stream); if (workspace == nullptr) { sample::gLogError << "Unsupported: workspace is null" << 
std::endl; return -1; } if (!mRowOrder) { // When constructing a column major output, the kernel needs to be aware of the total number of non-zero // elements so as to write the non-zero indices at the correct places. Therefore, we will launch the kernel // twice: first, only to calculate the total non-zero count, which will be stored in workspace; and // then to actually write the non-zero indices to the outputs[0] buffer. cudaMemsetAsync(workspace, 0, sizeof(int32_t), stream); nonZeroIndicesHelper(type, inputs[0], nullptr, workspace, 0, R, C, mRowOrder, stream); nonZeroIndicesHelper(type, inputs[0], outputs[0], outputs[1], workspace, R, C, mRowOrder, stream); } else { nonZeroIndicesHelper(type, inputs[0], outputs[0], outputs[1], 0, R, C, mRowOrder, stream); } return 0; } int32_t onShapeChange( PluginTensorDesc const* in, int32_t nbInputs, PluginTensorDesc const* out, int32_t nbOutputs) noexcept override { return 0; } IPluginV3* attachToContext(IPluginResourceContext* context) noexcept override { return clone(); } PluginFieldCollection const* getFieldsToSerialize() noexcept override { return &mFCToSerialize; } size_t getWorkspaceSize(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override { return sizeof(int32_t); } private: bool mRowOrder{true}; std::vector mDataToSerialize; nvinfer1::PluginFieldCollection mFCToSerialize; }; class NonZeroPluginCreator : public nvinfer1::IPluginCreatorV3One { public: NonZeroPluginCreator() { mPluginAttributes.clear(); mPluginAttributes.emplace_back(PluginField("rowOrder", nullptr, PluginFieldType::kINT32, 1)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } char const* getPluginName() const noexcept override { return "NonZeroPlugin"; } char const* getPluginVersion() const noexcept override { return "0"; } PluginFieldCollection const* getFieldNames() noexcept override { return &mFC; } IPluginV3* createPlugin(char 
const* name, PluginFieldCollection const* fc, TensorRTPhase phase) noexcept override { try { bool rowOrder{true}; for (int32_t i = 0; i < fc->nbFields; ++i) { auto const fieldName(fc->fields[i].name); if (std::strcmp(fieldName, "rowOrder") == 0) { rowOrder = *static_cast(fc->fields[i].data); } } return new NonZeroPlugin(rowOrder); } catch (std::exception const& e) { sample::gLogError << e.what() << std::endl; } return nullptr; } char const* getPluginNamespace() const noexcept override { return ""; } private: nvinfer1::PluginFieldCollection mFC; std::vector mPluginAttributes; }; namespace { struct NonZeroParams : public samplesCommon::SampleParams { bool rowOrder{true}; }; } // namespace //! \brief The SampleNonZeroPlugin class implements a NonZero plugin //! //! \details The plugin is able to output the non-zero indices in row major or column major order //! class SampleNonZeroPlugin { public: SampleNonZeroPlugin(NonZeroParams const& params) : mParams(params) , mRuntime(nullptr) , mEngine(nullptr) { mSeed = static_cast(time(nullptr)); } //! //! \brief Function builds the network engine //! bool build(); //! //! \brief Runs the TensorRT inference engine for this sample //! bool infer(); private: NonZeroParams mParams; //!< The parameters for the sample. nvinfer1::Dims mInputDims; //!< The dimensions of the input to the network. nvinfer1::Dims mOutputDims; //!< The dimensions of the output to the network. std::shared_ptr mRuntime; //!< The TensorRT runtime used to deserialize the engine std::shared_ptr mEngine; //!< The TensorRT engine used to run the network uint32_t mSeed{}; //! //! \brief Creates a TensorRT network and inserts a NonZero plugin //! bool constructNetwork(SampleUniquePtr& builder, SampleUniquePtr& network, SampleUniquePtr& config); //! //! \brief Reads the input and stores the result in a managed buffer //! bool processInput(samplesCommon::BufferManager const& buffers); //! //! \brief Verifies the result //! 
bool verifyOutput(samplesCommon::BufferManager const& buffers); }; //! //! \brief Creates the network, configures the builder and creates the network engine //! //! \details This function creates a network containing a NonZeroPlugin and builds //! the engine that will be used to run the plugin (mEngine) //! //! \return true if the engine was created successfully and false otherwise //! bool SampleNonZeroPlugin::build() { auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); if (!builder) { return false; } auto network = SampleUniquePtr(builder->createNetworkV2(0)); if (!network) { return false; } auto config = SampleUniquePtr(builder->createBuilderConfig()); if (!config) { return false; } auto pluginCreator = std::make_unique(); getPluginRegistry()->registerCreator(*pluginCreator, ""); auto constructed = constructNetwork(builder, network, config); if (!constructed) { return false; } // CUDA stream used for profiling by the builder. auto profileStream = samplesCommon::makeCudaStream(); if (!profileStream) { return false; } config->setProfileStream(*profileStream); SampleUniquePtr plan{builder->buildSerializedNetwork(*network, *config)}; if (!plan) { return false; } mRuntime = std::shared_ptr(createInferRuntime(sample::gLogger.getTRTLogger())); if (!mRuntime) { return false; } mEngine = std::shared_ptr( mRuntime->deserializeCudaEngine(plan->data(), plan->size()), samplesCommon::InferDeleter()); if (!mEngine) { return false; } ASSERT(network->getNbInputs() == 1); mInputDims = network->getInput(0)->getDimensions(); ASSERT(mInputDims.nbDims == 2); ASSERT(network->getNbOutputs() == 2); mOutputDims = network->getOutput(0)->getDimensions(); ASSERT(mOutputDims.nbDims == 2); return true; } //! //! \brief Creates a network with a single custom layer containing the NonZero plugin and marks the //! output layers //! //! \param network Pointer to the network that will be populated with the NonZero plugin //! //! 
\param builder Pointer to the engine builder //! bool SampleNonZeroPlugin::constructNetwork(SampleUniquePtr& builder, SampleUniquePtr& network, SampleUniquePtr& config) { if (mParams.fp16) { config->setFlag(BuilderFlag::kFP16); } std::default_random_engine generator(mSeed); std::uniform_int_distribution distr(10, 25); int32_t const R = distr(generator); int32_t const C = distr(generator); auto* in = network->addInput("Input", DataType::kFLOAT, {2, {R, C}}); ASSERT(in != nullptr); std::vector const vecPF{{"rowOrder", &mParams.rowOrder, PluginFieldType::kINT32, 1}}; PluginFieldCollection pfc{static_cast(vecPF.size()), vecPF.data()}; auto pluginCreator = static_cast(getPluginRegistry()->getCreator("NonZeroPlugin", "0", "")); auto plugin = std::unique_ptr(pluginCreator->createPlugin("NonZeroPlugin", &pfc, TensorRTPhase::kBUILD)); std::vector inputsVec{in}; auto pluginNonZeroLayer = network->addPluginV3(inputsVec.data(), inputsVec.size(), nullptr, 0, *plugin); ASSERT(pluginNonZeroLayer != nullptr); ASSERT(pluginNonZeroLayer->getOutput(0) != nullptr); ASSERT(pluginNonZeroLayer->getOutput(1) != nullptr); pluginNonZeroLayer->getOutput(0)->setName("Output0"); pluginNonZeroLayer->getOutput(1)->setName("Output1"); network->markOutput(*(pluginNonZeroLayer->getOutput(0))); network->markOutput(*(pluginNonZeroLayer->getOutput(1))); return true; } //! //! \brief Runs the TensorRT inference engine for this sample //! //! \details This function is the main execution function of the sample. It allocates the buffer, //! sets inputs and executes the engine. //! 
bool SampleNonZeroPlugin::infer() { // Since the data dependent output size cannot be inferred from the engine denote a sufficient size for the // corresponding output buffer (along with the rest of the I/O tensors) std::vector ioVolumes = {mInputDims.d[0] * mInputDims.d[1], mInputDims.d[0] * mInputDims.d[1] * 2, 1}; // Create RAII buffer manager object samplesCommon::BufferManager buffers(mEngine, ioVolumes); auto context = SampleUniquePtr(mEngine->createExecutionContext()); if (!context) { return false; } for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; ++i) { auto const name = mEngine->getIOTensorName(i); context->setTensorAddress(name, buffers.getDeviceBuffer(name)); } // Read the input data into the managed buffers ASSERT(mParams.inputTensorNames.size() == 1); if (!processInput(buffers)) { return false; } // Create CUDA stream for the execution of this inference. cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // Memcpy from host input buffers to device input buffers buffers.copyInputToDeviceAsync(stream); bool status = context->enqueueV3(stream); if (!status) { return false; } // Asynchronously copy data from device output buffers to host output buffers. buffers.copyOutputToHostAsync(stream); // Wait for the work in the stream to complete. CHECK(cudaStreamSynchronize(stream)); // Release stream. CHECK(cudaStreamDestroy(stream)); // Verify results if (!verifyOutput(buffers)) { return false; } return true; } //! //! \brief Reads the input and stores the result in a managed buffer //! 
bool SampleNonZeroPlugin::processInput(samplesCommon::BufferManager const& buffers)
{
    int32_t const inputH = mInputDims.d[0];
    int32_t const inputW = mInputDims.d[1];

    std::vector<uint8_t> fileData(inputH * inputW);

    // Same seed as constructNetwork, so the digit choice is reproducible per run.
    std::default_random_engine generator(mSeed);
    std::uniform_int_distribution<int32_t> distr(0, 9);
    auto const number = distr(generator);
    samplesCommon::readPGMFile(
        samplesCommon::locateFile(std::to_string(number) + ".pgm", mParams.dataDirs), fileData.data(), inputH, inputW);

    float* hostDataBuffer = static_cast<float*>(buffers.getHostBuffer(mParams.inputTensorNames[0]));
    // Invert the PGM intensities so that digit strokes become the non-zero elements.
    for (int32_t i = 0; i < inputH * inputW; ++i)
    {
        auto const raw = 1.0 - float(fileData[i] / 255.0);
        hostDataBuffer[i] = raw;
    }

    sample::gLogInfo << "Input:" << std::endl;
    for (int32_t i = 0; i < inputH; ++i)
    {
        for (int32_t j = 0; j < inputW; ++j)
        {
            sample::gLogInfo << hostDataBuffer[i * inputW + j];
            if (j < inputW - 1)
            {
                sample::gLogInfo << ", ";
            }
        }
        sample::gLogInfo << std::endl;
    }
    sample::gLogInfo << std::endl;

    return true;
}

//!
//! \brief Verify result
//!
//! \return whether the output correctly identifies all (and only) non-zero elements
//!
bool SampleNonZeroPlugin::verifyOutput(samplesCommon::BufferManager const& buffers) { float* input = static_cast(buffers.getHostBuffer(mParams.inputTensorNames[0])); int32_t* output = static_cast(buffers.getHostBuffer(mParams.outputTensorNames[0])); int32_t count = *static_cast(buffers.getHostBuffer(mParams.outputTensorNames[1])); std::vector covered(mInputDims.d[0] * mInputDims.d[1], false); sample::gLogInfo << "Output:" << std::endl; if (mParams.rowOrder) { for (int32_t i = 0; i < count; ++i) { for (int32_t j = 0; j < 2; ++j) { sample::gLogInfo << output[j + 2 * i] << " "; } sample::gLogInfo << std::endl; } } else { for (int32_t i = 0; i < 2; ++i) { for (int32_t j = 0; j < count; ++j) { sample::gLogInfo << output[j + count * i] << " "; } sample::gLogInfo << std::endl; } } if (!mParams.rowOrder) { for (int32_t i = 0; i < count; ++i) { auto const idx = output[i] * mInputDims.d[1] + output[i + count]; covered[idx] = true; if (input[idx] == 0.F) { return false; } } } else { for (int32_t i = 0; i < count; ++i) { auto const idx = output[2 * i] * mInputDims.d[1] + output[2 * i + 1]; covered[idx] = true; if (input[idx] == 0.F) { return false; } } } for (int32_t i = 0; i < static_cast(covered.size()); ++i) { if (!covered[i]) { if (input[i] != 0.F) { return false; } } } return true; } //! //! \brief Initializes members of the params struct using the command line args //! NonZeroParams initializeSampleParams(samplesCommon::Args const& args) { NonZeroParams params; if (args.dataDirs.empty()) // Use default directories if user hasn't provided directory paths { params.dataDirs.push_back("data/mnist/"); params.dataDirs.push_back("data/samples/mnist/"); } else // Use the data directory provided by the user { params.dataDirs = args.dataDirs; } params.inputTensorNames.push_back("Input"); params.outputTensorNames.push_back("Output0"); params.outputTensorNames.push_back("Output1"); params.fp16 = args.runInFp16; params.rowOrder = args.rowOrder; return params; } //! //! 
\brief Prints the help information for running this sample //! void printHelpInfo() { std::cout << "Usage: ./sample_non_zero_plugin [-h or --help] [-d or --datadir=]" << std::endl; std::cout << "--help Display help information" << std::endl; std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " "multiple times to add multiple directories. If no data directories are given, the default is to use " "(data/samples/mnist/, data/mnist/)" << std::endl; std::cout << "--fp16 Run in FP16 mode." << std::endl; std::cout << "--columnOrder Run plugin in column major output mode." << std::endl; } int main(int argc, char** argv) { samplesCommon::Args args; bool argsOK = samplesCommon::parseArgs(args, argc, argv); if (!argsOK) { sample::gLogError << "Invalid arguments" << std::endl; printHelpInfo(); return EXIT_FAILURE; } if (args.help) { printHelpInfo(); return EXIT_SUCCESS; } auto sampleTest = sample::gLogger.defineTest(kSAMPLE_NAME, argc, argv); sample::gLogger.reportTestStart(sampleTest); SampleNonZeroPlugin sample(initializeSampleParams(args)); sample::gLogInfo << "Building and running a GPU inference engine for NonZero plugin" << std::endl; if (!sample.build()) { return sample::gLogger.reportFail(sampleTest); } if (!sample.infer()) { return sample::gLogger.reportFail(sampleTest); } return sample::gLogger.reportPass(sampleTest); }