/* * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "NvOnnxParser.h" #include "ErrorRecorder.h" #include "common.h" #include "logger.h" #include "sampleDevice.h" #include "sampleEngines.h" #include "sampleOptions.h" #include "sampleUtils.h" #if ENABLE_UNIFIED_BUILDER #include "NvInferConsistency.h" #include "safeErrorRecorder.h" #endif using namespace nvinfer1; namespace sample { namespace { class FileStreamWriter final : public nvinfer1::IStreamWriter { protected: std::ofstream mStream; int64_t mTotalWrittenSize; public: FileStreamWriter(std::string const& path) : mStream(path, std::ios::binary) , mTotalWrittenSize(0) { } virtual int64_t write(void const* data, int64_t nbBytes) final { SMP_RETVAL_IF_FALSE( (mStream.is_open() && mStream.good()), "Cannot write to FileStreamWriter", -1, sample::gLogError); auto const* src = reinterpret_cast(data); mStream.write(src, nbBytes); mTotalWrittenSize += nbBytes; return nbBytes; } int64_t finalize() { mStream.close(); return mTotalWrittenSize; } }; std::map readScalesFromCalibrationCache(std::string const& calibrationFile) { std::map tensorScales; std::ifstream cache{calibrationFile}; if (!cache.is_open()) { sample::gLogError << "[TRT] Can not open provided calibration cache file" << std::endl; return tensorScales; } std::string line; while (std::getline(cache, line)) { auto colonPos = line.find_last_of(':'); if (colonPos != std::string::npos) { // Scales should be stored in calibration cache as 32-bit floating numbers encoded as 32-bit integers int32_t scalesAsInt = std::stoi(line.substr(colonPos + 2, 8), nullptr, 16); auto const tensorName = line.substr(0, colonPos); tensorScales[tensorName] = *reinterpret_cast(&scalesAsInt); } } cache.close(); return tensorScales; } } // namespace nvinfer1::ICudaEngine* LazilyDeserializedEngine::get() { SMP_RETVAL_IF_FALSE( !mIsSafe, "Safe mode is enabled, but trying to get standard engine!", nullptr, sample::gLogError); if (mEngine == nullptr) { SMP_RETVAL_IF_FALSE(getAsyncFileReader().isOpen() || getFileReader().isOpen() || !getBlob().empty(), "Engine is empty. Nothing to deserialize!", nullptr, sample::gLogError); using time_point = std::chrono::time_point; using duration = std::chrono::duration; time_point const deserializeStartTime{std::chrono::high_resolution_clock::now()}; if (mLeanDLLPath.empty()) { mRuntime.reset(createRuntime()); } else { mParentRuntime.reset(createRuntime()); ASSERT(mParentRuntime != nullptr); mRuntime.reset(mParentRuntime->loadRuntime(mLeanDLLPath.c_str())); } ASSERT(mRuntime != nullptr); if (mVersionCompatible) { // Application needs to opt into allowing deserialization of engines with embedded lean runtime. mRuntime->setEngineHostCodeAllowed(true); } if (!mTempdir.empty()) { mRuntime->setTemporaryDirectory(mTempdir.c_str()); } mRuntime->setTempfileControlFlags(mTempfileControls); SMP_RETVAL_IF_FALSE(mRuntime != nullptr, "runtime creation failed", nullptr, sample::gLogError); if (mDLACore != -1) { mRuntime->setDLACore(mDLACore); } mRuntime->setErrorRecorder(&gRecorder); for (auto const& pluginPath : mDynamicPlugins) { mRuntime->getPluginRegistry().loadLibrary(pluginPath.c_str()); } if (getAsyncFileReader().isOpen()) { mEngine.reset(mRuntime->deserializeCudaEngine(getAsyncFileReader())); } else if (getFileReader().isOpen()) { mEngine.reset(mRuntime->deserializeCudaEngine(getFileReader())); } else { auto const& engineBlob = getBlob(); mEngine.reset(mRuntime->deserializeCudaEngine(engineBlob.data, engineBlob.size)); } SMP_RETVAL_IF_FALSE(mEngine != nullptr, "Engine deserialization failed", nullptr, sample::gLogError); time_point const deserializeEndTime{std::chrono::high_resolution_clock::now()}; sample::gLogInfo << "Engine deserialized in " << duration(deserializeEndTime - deserializeStartTime).count() << " sec." << std::endl; } return mEngine.get(); } nvinfer1::ICudaEngine* LazilyDeserializedEngine::release() { return mEngine.release(); } bool LazilyDeserializedEngine::checkDLASafe() { ASSERT(sample::hasSafeRuntime()); SMP_RETVAL_IF_FALSE(mDLACore == -1, "Safe DLA engine built with kDLA_STANDALONE should not be run via TRT!", false, sample::gLogError); return true; } void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector const& inputFormats, std::vector const& outputFormats, std::string const& calibrationFile) { auto const tensorScales = readScalesFromCalibrationCache(calibrationFile); bool const broadcastInputFormats = broadcastIOFormats(inputFormats, network.getNbInputs()); for (int32_t i = 0, n = network.getNbInputs(); i < n; ++i) { int32_t formatIdx = broadcastInputFormats ? 0 : i; if (!inputFormats.empty() && inputFormats[formatIdx].first == DataType::kINT8) { auto* input = network.getInput(i); auto const calibScale = tensorScales.at(input->getName()); input->setDynamicRange(-127 * calibScale, 127 * calibScale); } } bool const broadcastOutputFormats = broadcastIOFormats(outputFormats, network.getNbOutputs()); for (int32_t i = 0, n = network.getNbOutputs(); i < n; ++i) { int32_t formatIdx = broadcastOutputFormats ? 0 : i; if (!outputFormats.empty() && outputFormats[formatIdx].first == DataType::kINT8) { auto* output = network.getOutput(i); auto const calibScale = tensorScales.at(output->getName()); output->setDynamicRange(-127 * calibScale, 127 * calibScale); } } } //! //! \brief Generate a network definition for a given model //! //! \param[in] model Model options for this network //! \param[in,out] network Network storing the parsed results //! \param[in,out] err Error stream //! \param[out] vcPluginLibrariesUsed If not nullptr, will be populated with paths to VC plugin libraries required by //! the parsed network. //! //! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid //! parser (the returned parser converts to false if tested) //! //! Constant input dimensions in the model must not be changed in the corresponding //! network definition, because its correctness may rely on the constants. //! //! \see Parser::operator bool() //! Parser modelToNetwork(ModelOptions const& model, BuildOptions const& build, nvinfer1::INetworkDefinition& network, std::ostream& err, std::vector* vcPluginLibrariesUsed) { sample::gLogInfo << "Start parsing network model." << std::endl; auto const tBegin = std::chrono::high_resolution_clock::now(); Parser parser; switch (model.baseModel.format) { case ModelFormat::kONNX: { using namespace nvonnxparser; parser.onnxParser.reset(createONNXParser(network)); ASSERT(parser.onnxParser != nullptr); // kNATIVE_INSTANCENORM is ON by default in the parser and must be cleared to use the plugin implementation. if (build.pluginInstanceNorm) { parser.onnxParser->clearFlag(OnnxParserFlag::kNATIVE_INSTANCENORM); } if (build.enableUInt8AsymmetricQuantizationDLA) { parser.onnxParser->setFlag(OnnxParserFlag::kENABLE_UINT8_AND_ASYMMETRIC_QUANTIZATION_DLA); } if (!parser.onnxParser->parseFromFile( model.baseModel.model.c_str(), static_cast(sample::gLogger.getReportableSeverity()))) { err << "Failed to parse onnx file" << std::endl; parser.onnxParser.reset(); } if (vcPluginLibrariesUsed && parser.onnxParser.get()) { int64_t nbPluginLibs; char const* const* pluginLibArray = parser.onnxParser->getUsedVCPluginLibraries(nbPluginLibs); if (nbPluginLibs >= 0) { vcPluginLibrariesUsed->reserve(nbPluginLibs); for (int64_t i = 0; i < nbPluginLibs; ++i) { sample::gLogInfo << "Using VC plugin library " << pluginLibArray[i] << std::endl; vcPluginLibrariesUsed->emplace_back(std::string{pluginLibArray[i]}); } } else { sample::gLogWarning << "Failure to query VC plugin libraries required by parsed ONNX network" << std::endl; } } break; } case ModelFormat::kANY: break; } auto const tEnd = std::chrono::high_resolution_clock::now(); float const parseTime = std::chrono::duration(tEnd - tBegin).count(); sample::gLogInfo << "Finished parsing network model. Parse time: " << parseTime << std::endl; return parser; } namespace { class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { public: RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, nvinfer1::INetworkDefinition const& network, std::ostream& err); ~RndInt8Calibrator() override { for (auto& elem : mInputDeviceBuffers) { CHECK_WITH_STREAM(cudaFree(elem.second), mErr); } } bool getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept override; int32_t getBatchSize() const noexcept override { return 1; } void const* readCalibrationCache(size_t& length) noexcept override; void writeCalibrationCache(void const*, size_t) noexcept override {} private: int32_t mBatches{}; int32_t mCurrentBatch{}; std::string mCacheFile; std::map mInputDeviceBuffers; std::vector mCalibrationCache; std::ostream& mErr; }; RndInt8Calibrator::RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, INetworkDefinition const& network, std::ostream& err) : mBatches(batches) , mCurrentBatch(0) , mCacheFile(cacheFile) , mErr(err) { std::ifstream tryCache(cacheFile, std::ios::binary); if (tryCache.good()) { return; } std::default_random_engine generator; std::uniform_real_distribution distribution(-1.0F, 1.0F); auto gen = [&generator, &distribution]() { return distribution(generator); }; for (int32_t i = 0; i < network.getNbInputs(); i++) { auto* input = network.getInput(i); std::vector rnd_data(elemCount[i]); std::generate_n(rnd_data.begin(), elemCount[i], gen); void* data; CHECK_WITH_STREAM(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); CHECK_WITH_STREAM( cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), cudaMemcpyHostToDevice), mErr); mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); } } bool RndInt8Calibrator::getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept { if (mCurrentBatch >= mBatches) { return false; } for (int32_t i = 0; i < nbBindings; ++i) { bindings[i] = mInputDeviceBuffers[names[i]]; } ++mCurrentBatch; return true; } void const* RndInt8Calibrator::readCalibrationCache(size_t& length) noexcept { mCalibrationCache.clear(); std::ifstream input(mCacheFile, std::ios::binary); input >> std::noskipws; if (input.good()) { std::copy( std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); } length = mCalibrationCache.size(); return !mCalibrationCache.empty() ? mCalibrationCache.data() : nullptr; } bool setTensorDynamicRange(INetworkDefinition const& network, float inRange = 2.0F, float outRange = 4.0F) { // Ensure that all layer inputs have a dynamic range. for (int32_t l = 0; l < network.getNbLayers(); l++) { auto* layer = network.getLayer(l); for (int32_t i = 0; i < layer->getNbInputs(); i++) { ITensor* input{layer->getInput(i)}; // Optional inputs are nullptr here and are from RNN layers. if (input && !input->dynamicRangeIsSet()) { // Concat should propagate dynamic range from outputs to inputs to avoid // Re-quantization during the concatenation auto dynRange = (layer->getType() == LayerType::kCONCATENATION) ? outRange : inRange; if (!input->setDynamicRange(-dynRange, dynRange)) { return false; }} } for (int32_t o = 0; o < layer->getNbOutputs(); o++) { ITensor* output{layer->getOutput(o)}; // Optional outputs are nullptr here and are from RNN layers. if (output && !output->dynamicRangeIsSet()) { // Pooling must have the same input and output dynamic range. if (layer->getType() == LayerType::kPOOLING) { if (!output->setDynamicRange(-inRange, inRange)) { return false; } } else { if (!output->setDynamicRange(-outRange, outRange)) { return false; } } } } } return true; } bool isNonActivationType(nvinfer1::DataType const type) { return type == nvinfer1::DataType::kINT32 || type == nvinfer1::DataType::kINT64 || type == nvinfer1::DataType::kBOOL || type == nvinfer1::DataType::kUINT8; } void setLayerPrecisions(INetworkDefinition& network, LayerPrecisions const& layerPrecisions) { bool hasLayerPrecisionSkipped{false}; for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) { auto* layer = network.getLayer(layerIdx); auto const layerName = layer->getName(); auto exactMatch = layerPrecisions.find(layerName); auto plausibleMatch = findPlausible(layerPrecisions, layerName); if (exactMatch != layerPrecisions.end()) { sample::gLogInfo << "Set layer " << layerName << " to precision " << exactMatch->second << std::endl; layer->setPrecision(exactMatch->second); } else if (plausibleMatch != layerPrecisions.end()) { if (isNonActivationType(layer->getPrecision())) { hasLayerPrecisionSkipped = true; sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because the " << " default layer precision is of non-activation type." << std::endl; continue; } if (layer->getType() == nvinfer1::LayerType::kCONSTANT && (isNonActivationType(static_cast(layer)->getWeights().type))) { hasLayerPrecisionSkipped = true; sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " << "constant layer has weights of non-activation type." << std::endl; continue; } if (layer->getNbInputs() >= 1 && layer->getInput(0)->isShapeTensor()) { hasLayerPrecisionSkipped = true; sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this layer " << "operates on a shape tensor." << std::endl; continue; } if (layer->getNbInputs() >= 1 && isNonActivationType(layer->getInput(0)->getType()) && layer->getNbOutputs() >= 1 && isNonActivationType(layer->getOutput(0)->getType())) { hasLayerPrecisionSkipped = true; sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " << "layer has input and output of non-activation type." << std::endl; continue; } // All heuristics passed. Set the layer precision. sample::gLogInfo << "Set layer " << layerName << " to precision " << plausibleMatch->second << std::endl; layer->setPrecision(plausibleMatch->second); } } if (hasLayerPrecisionSkipped) { sample::gLogInfo << "Skipped setting precisions for some layers. Check verbose logs for more details." << std::endl; } } void setLayerOutputTypes(INetworkDefinition& network, LayerOutputTypes const& layerOutputTypes) { bool const hasGlobalOutputType{layerOutputTypes.find("*") != layerOutputTypes.end()}; auto const globalOutputType = hasGlobalOutputType ? layerOutputTypes.at("*").at(0) : nvinfer1::DataType::kFLOAT; bool hasLayerOutputTypeSkipped{false}; for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) { auto* layer = network.getLayer(layerIdx); auto const layerName = layer->getName(); auto const nbOutputs = layer->getNbOutputs(); auto exactMatch = layerOutputTypes.find(layerName); auto plausibleMatch = findPlausible(layerOutputTypes, layerName); if (exactMatch != layerOutputTypes.end()) { auto const& outputTypes = exactMatch->second; bool const isBroadcast = (outputTypes.size() == 1); if (!isBroadcast && static_cast(outputTypes.size()) != nbOutputs) { sample::gLogError << "Layer " << layerName << " has " << nbOutputs << " outputs but " << outputTypes.size() << " output types are given in --layerOutputTypes flag." << std::endl; throw std::invalid_argument("Invalid --layerOutputTypes flag."); } for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) { auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType << std::endl; layer->setOutputType(outputIdx, outputType); } } else if (plausibleMatch != layerOutputTypes.end()) { auto const& outputTypes = plausibleMatch->second; bool const isBroadcast = (outputTypes.size() == 1); // We should not set the layer output types if its default precision is INT32 or Bool. if (layer->getPrecision() == nvinfer1::DataType::kINT32 || layer->getPrecision() == nvinfer1::DataType::kBOOL) { hasLayerOutputTypeSkipped = true; sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because the " << " default layer precision is INT32 or Bool." << std::endl; continue; } // We should not set the constant layer output types if its weights are in INT32. if (layer->getType() == nvinfer1::LayerType::kCONSTANT && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) { hasLayerOutputTypeSkipped = true; sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because this " << "constant layer has INT32 weights." << std::endl; continue; } for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) { // We should not set the output type if the output is a shape tensor. if (layer->getOutput(0)->isShapeTensor()) { hasLayerOutputTypeSkipped = true; sample::gLogVerbose << "Skipped setting output type for output " << outputIdx << " of layer " << layerName << " because it is a shape tensor." << std::endl; continue; } auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType << std::endl; layer->setOutputType(outputIdx, globalOutputType); } } } if (hasLayerOutputTypeSkipped) { sample::gLogInfo << "Skipped setting output types for some layers. Check verbose logs for more details." << std::endl; } } void setLayerDeviceTypes( INetworkDefinition const& network, IBuilderConfig& config, LayerDeviceTypes const& layerDeviceTypes) { for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) { auto* layer = network.getLayer(layerIdx); auto const layerName = layer->getName(); auto match = findPlausible(layerDeviceTypes, layerName); if (match != layerDeviceTypes.end()) { DeviceType const deviceType = match->second; sample::gLogInfo << "Set layer " << layerName << " to device type " << deviceType << std::endl; config.setDeviceType(layer, deviceType); } } } void markDebugTensors(INetworkDefinition& network, StringSet const& debugTensors) { for (int64_t inputIndex = 0; inputIndex < network.getNbInputs(); ++inputIndex) { auto* t = network.getInput(inputIndex); auto const tensorName = t->getName(); if (debugTensors.count(tensorName) > 0) { network.markDebug(*t); } } for (int64_t layerIndex = 0; layerIndex < network.getNbLayers(); ++layerIndex) { auto* layer = network.getLayer(layerIndex); for (int64_t outputIndex = 0; outputIndex < layer->getNbOutputs(); ++outputIndex) { auto* t = layer->getOutput(outputIndex); auto const tensorName = t->getName(); if (debugTensors.count(tensorName) > 0) { network.markDebug(*t); } } } } void setMemoryPoolLimits(IBuilderConfig& config, BuildOptions const& build) { auto const roundToBytes = [](double const size, bool fromMB = true) { return static_cast(size * (fromMB ? 1.0_MiB : 1.0_KiB)); }; if (build.workspace >= 0) { config.setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, roundToBytes(build.workspace)); } if (build.dlaSRAM >= 0) { size_t const sizeInBytes = roundToBytes(build.dlaSRAM); size_t sizeInPowerOf2{1}; // Using 2^30 bytes as a loose upper bound to prevent the possibility of overflows and infinite loops. while (sizeInPowerOf2 < 31 && (static_cast(1) << sizeInPowerOf2) <= sizeInBytes) { ++sizeInPowerOf2; } --sizeInPowerOf2; if (sizeInPowerOf2 == 30) { sample::gLogWarning << "User-specified DLA managed SRAM size is too large and has been clipped to 2^30 bytes. " << "Please make sure that this is the intended managed SRAM size." << std::endl; } config.setMemoryPoolLimit(MemoryPoolType::kDLA_MANAGED_SRAM, static_cast(1) << sizeInPowerOf2); } if (build.dlaLocalDRAM >= 0) { config.setMemoryPoolLimit(MemoryPoolType::kDLA_LOCAL_DRAM, roundToBytes(build.dlaLocalDRAM)); } if (build.dlaGlobalDRAM >= 0) { config.setMemoryPoolLimit(MemoryPoolType::kDLA_GLOBAL_DRAM, roundToBytes(build.dlaGlobalDRAM)); } if (build.tacticSharedMem >= 0) { config.setMemoryPoolLimit(MemoryPoolType::kTACTIC_SHARED_MEMORY, roundToBytes(build.tacticSharedMem, false)); } } void setPreviewFeatures(IBuilderConfig& config, BuildOptions const& build) { auto const setFlag = [&](PreviewFeature feat) { int32_t featVal = static_cast(feat); if (build.previewFeatures.find(featVal) != build.previewFeatures.end()) { config.setPreviewFeature(feat, build.previewFeatures.at(featVal)); } }; setFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); setFlag(PreviewFeature::kRUNTIME_ACTIVATION_RESIZE_10_10); } [[nodiscard]] bool setupTilingSettings(BuildOptions const& build, IBuilderConfig& config, std::ostream& err) { if (!config.setTilingOptimizationLevel(static_cast(build.tilingOptimizationLevel))) { err << "Can not set tilingOptimizationLevel(" << build.tilingOptimizationLevel << ")" << std::endl; return false; } if (build.l2LimitForTiling != -1) { if (!config.setL2LimitForTiling(build.l2LimitForTiling)) { err << "Can not set l2LimitForTiling(" << build.l2LimitForTiling << ")" << std::endl; return false; } } return true; } bool setupNetworkAndConfig(BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, INetworkDefinition& network, IBuilderConfig& config, std::unique_ptr& calibrator, std::ostream& err, std::vector>& sparseWeights) { std::vector profiles{}; profiles.resize(build.optProfiles.size()); for (auto& profile : profiles) { profile = builder.createOptimizationProfile(); } bool hasDynamicShapes{false}; bool broadcastInputFormats = broadcastIOFormats(build.inputFormats, network.getNbInputs()); // Check if the provided input tensor names match the input tensors of the engine. // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. for (auto const& shapes : build.optProfiles) { for (auto const& shape : shapes) { bool tensorNameFound{false}; for (int32_t i = 0; i < network.getNbInputs(); ++i) { if (matchStringWithOneWildcard(shape.first, network.getInput(i)->getName())) { tensorNameFound = true; break; } } if (!tensorNameFound) { sample::gLogError << "Cannot find input tensor with name \"" << shape.first << "\" in the network " << "inputs! Please make sure the input tensor names are correct." << std::endl; return false; } } } for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) { // Set formats and data types of inputs auto* input = network.getInput(i); if (!build.inputFormats.empty()) { int32_t inputFormatIndex = broadcastInputFormats ? 0 : i; input->setType(build.inputFormats[inputFormatIndex].first); input->setAllowedFormats(build.inputFormats[inputFormatIndex].second); } auto const dims = input->getDimensions(); auto const isScalar = dims.nbDims == 0; auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) || input->isShapeTensor(); if (isDynamicInput) { hasDynamicShapes = true; for (size_t i = 0; i < build.optProfiles.size(); i++) { auto const& optShapes = build.optProfiles[i]; auto profile = profiles[i]; auto const tensorName = input->getName(); auto shape = findPlausible(optShapes, tensorName); ShapeRange shapes{}; // If no shape is provided, set dynamic dimensions to 1. if (shape == optShapes.end()) { constexpr int32_t kDEFAULT_DIMENSION{1}; std::vector staticDims; if (input->isShapeTensor()) { if (isScalar) { staticDims.push_back(1); } else { staticDims.resize(dims.d[0]); std::fill(staticDims.begin(), staticDims.end(), kDEFAULT_DIMENSION); } } else { staticDims.resize(dims.nbDims); std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), [&](int dimension) { return dimension > 0 ? dimension : kDEFAULT_DIMENSION; }); } sample::gLogWarning << "Dynamic dimensions required for input: " << tensorName << ", but no shapes were provided. Automatically overriding shape to: " << staticDims << std::endl; std::fill(shapes.begin(), shapes.end(), staticDims); } else { shapes = shape->second; } std::vector profileDims{}; if (input->isShapeTensor()) { profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; SMP_RETVAL_IF_FALSE(profile->setShapeValuesV2(tensorName, OptProfileSelector::kMIN, profileDims.data(), static_cast(profileDims.size())), "Error in set shape values MIN", false, err); profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; SMP_RETVAL_IF_FALSE(profile->setShapeValuesV2(tensorName, OptProfileSelector::kOPT, profileDims.data(), static_cast(profileDims.size())), "Error in set shape values OPT", false, err); profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; SMP_RETVAL_IF_FALSE(profile->setShapeValuesV2(tensorName, OptProfileSelector::kMAX, profileDims.data(), static_cast(profileDims.size())), "Error in set shape values MAX", false, err); sample::gLogInfo << "Set input shape tensor " << tensorName << " for optimization profile " << i << " to:" << " MIN=" << shapes[static_cast(OptProfileSelector::kMIN)] << " OPT=" << shapes[static_cast(OptProfileSelector::kOPT)] << " MAX=" << shapes[static_cast(OptProfileSelector::kMAX)] << std::endl; } else { profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; SMP_RETVAL_IF_FALSE( profile->setDimensions(tensorName, OptProfileSelector::kMIN, toDims(profileDims)), "Error in set dimensions to profile MIN", false, err); profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; SMP_RETVAL_IF_FALSE( profile->setDimensions(tensorName, OptProfileSelector::kOPT, toDims(profileDims)), "Error in set dimensions to profile OPT", false, err); profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; SMP_RETVAL_IF_FALSE( profile->setDimensions(tensorName, OptProfileSelector::kMAX, toDims(profileDims)), "Error in set dimensions to profile MAX", false, err); sample::gLogInfo << "Set shape of input tensor " << tensorName << " for optimization profile " << i << " to:" << " MIN=" << shapes[static_cast(OptProfileSelector::kMIN)] << " OPT=" << shapes[static_cast(OptProfileSelector::kOPT)] << " MAX=" << shapes[static_cast(OptProfileSelector::kMAX)] << std::endl; } } } } for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) { auto* output = network.getOutput(i); auto const dims = output->getDimensions(); // A shape tensor output with known static dimensions may have dynamic shape values inside it. auto const isDynamicOutput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) || output->isShapeTensor(); if (isDynamicOutput) { hasDynamicShapes = true; } } if (!hasDynamicShapes && !build.optProfiles[0].empty()) { sample::gLogError << "Static model does not take explicit shapes since the shape of inference tensors will be " "determined by the model itself" << std::endl; return false; } if (hasDynamicShapes) { for (auto profile : profiles) { SMP_RETVAL_IF_FALSE(profile->isValid(), "Required optimization profile is invalid", false, err); SMP_RETVAL_IF_FALSE( config.addOptimizationProfile(profile) != -1, "Error in add optimization profile", false, err); } } bool broadcastOutputFormats = broadcastIOFormats(build.outputFormats, network.getNbOutputs(), false); for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) { // Set formats and data types of outputs auto* output = network.getOutput(i); if (!build.outputFormats.empty()) { int32_t outputFormatIndex = broadcastOutputFormats ? 0 : i; output->setType(build.outputFormats[outputFormatIndex].first); output->setAllowedFormats(build.outputFormats[outputFormatIndex].second); } } setMemoryPoolLimits(config, build); setPreviewFeatures(config, build); if (build.builderOptimizationLevel != defaultBuilderOptimizationLevel) { config.setBuilderOptimizationLevel(build.builderOptimizationLevel); } if (build.maxTactics != defaultMaxTactics) { config.setMaxNbTactics(build.maxTactics); } if (build.timingCacheMode == TimingCacheMode::kDISABLE) { config.setFlag(BuilderFlag::kDISABLE_TIMING_CACHE); } if (build.disableCompilationCache) { config.setFlag(BuilderFlag::kDISABLE_COMPILATION_CACHE); } if (build.errorOnTimingCacheMiss) { config.setFlag(BuilderFlag::kERROR_ON_TIMING_CACHE_MISS); } if (!build.tf32) { config.clearFlag(BuilderFlag::kTF32); } if (build.refittable) { config.setFlag(BuilderFlag::kREFIT); } if (build.stripWeights) { // The kREFIT_IDENTICAL is enabled by default when kSTRIP_PLAN is on. config.setFlag(BuilderFlag::kSTRIP_PLAN); } if (build.versionCompatible) { config.setFlag(BuilderFlag::kVERSION_COMPATIBLE); } std::vector pluginPaths; for (auto const& pluginPath : sys.setPluginsToSerialize) { sample::gLogVerbose << "Setting plugin to serialize: " << pluginPath << std::endl; pluginPaths.push_back(pluginPath.c_str()); } if (!pluginPaths.empty()) { config.setPluginsToSerialize(pluginPaths.data(), pluginPaths.size()); } if (build.excludeLeanRuntime) { config.setFlag(BuilderFlag::kEXCLUDE_LEAN_RUNTIME); } if (build.sparsity != SparsityFlag::kDISABLE) { config.setFlag(BuilderFlag::kSPARSE_WEIGHTS); if (build.sparsity == SparsityFlag::kFORCE) { sparsify(network, sparseWeights); } } if (build.enableMonitorMemory) { config.setFlag(BuilderFlag::kMONITOR_MEMORY); } if (build.distributiveIndependence) { config.setFlag(BuilderFlag::kDISTRIBUTIVE_INDEPENDENCE); } config.setProfilingVerbosity(build.profilingVerbosity); config.setAvgTimingIterations(build.avgTiming); if (build.fp16) { config.setFlag(BuilderFlag::kFP16); } if (build.int8) { config.setFlag(BuilderFlag::kINT8); } if (build.bf16) { config.setFlag(BuilderFlag::kBF16); } SMP_RETVAL_IF_FALSE(!(build.int8 && build.fp8), "FP8 and INT8 precisions have been specified", false, err); if (build.fp8) { config.setFlag(BuilderFlag::kFP8); } if (build.int4) { config.setFlag(BuilderFlag::kINT4); } if (build.int8 && !build.fp16) { sample::gLogInfo << "FP32 and INT8 precisions have been specified - more performance might be enabled by additionally " "specifying --fp16 or --best" << std::endl; } auto isInt8 = [](IOFormat const& format) { return format.first == DataType::kINT8; }; auto int8IO = std::count_if(build.inputFormats.begin(), build.inputFormats.end(), isInt8) + std::count_if(build.outputFormats.begin(), build.outputFormats.end(), isInt8); auto hasQDQLayers = [](INetworkDefinition& network) { // Determine if our network has QDQ layers. auto const nbLayers = network.getNbLayers(); for (int32_t i = 0; i < nbLayers; i++) { auto const& layer = network.getLayer(i); if (layer->getType() == LayerType::kQUANTIZE || layer->getType() == LayerType::kDEQUANTIZE) { return true; } } return false; }; if (!hasQDQLayers(network) && (build.int8 || int8IO) && build.calibration.empty()) { // Explicitly set int8 scales if no calibrator is provided and if I/O tensors use int8, // because auto calibration does not support this case. SMP_RETVAL_IF_FALSE(setTensorDynamicRange(network), "Error in set tensor dynamic range.", false, err); } else if (build.int8) { if (!hasQDQLayers(network) && int8IO) { try { // Set dynamic ranges of int8 inputs / outputs to match scales loaded from calibration cache // TODO http://nvbugs/3262234 Change the network validation so that this workaround can be removed setTensorScalesFromCalibration(network, build.inputFormats, build.outputFormats, build.calibration); } catch (std::exception&) { sample::gLogError << "Int8IO was specified but impossible to read tensor scales from provided calibration cache file" << std::endl; return false; } } IOptimizationProfile* profileCalib{nullptr}; if (!build.shapesCalib.empty()) { profileCalib = builder.createOptimizationProfile(); for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) { auto* input = network.getInput(i); Dims profileDims{}; auto const tensorName = input->getName(); auto shape = findPlausible(build.shapesCalib, tensorName); if (shape == build.shapesCalib.end()) { std::ostringstream msg; msg << "Calibration profile for tensor " << tensorName << " cannot be found!"; throw std::invalid_argument(msg.str()); } auto shapesCalib = shape->second; profileDims = toDims(shapesCalib[static_cast(OptProfileSelector::kOPT)]); // Here we check only kMIN as all profileDims are the same. SMP_RETVAL_IF_FALSE(profileCalib->setDimensions(tensorName, OptProfileSelector::kMIN, profileDims), "Error in set dimensions to calibration profile OPT", false, err); profileCalib->setDimensions(tensorName, OptProfileSelector::kOPT, profileDims); profileCalib->setDimensions(tensorName, OptProfileSelector::kMAX, profileDims); sample::gLogInfo << "Set calibration profile for input tensor " << tensorName << " to " << profileDims << std::endl; } SMP_RETVAL_IF_FALSE(profileCalib->isValid(), "Calibration profile is invalid", false, err); SMP_RETVAL_IF_FALSE( config.setCalibrationProfile(profileCalib), "Error in set calibration profile", false, err); } std::vector elemCount{}; for (int i = 0; i < network.getNbInputs(); i++) { auto* input = network.getInput(i); auto const dims = input->getDimensions(); auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); if (profileCalib) { elemCount.push_back(volume(profileCalib->getDimensions(input->getName(), OptProfileSelector::kOPT))); } else if (!profiles.empty() && isDynamicInput) { elemCount.push_back( volume(profiles[build.calibProfile]->getDimensions(input->getName(), OptProfileSelector::kOPT))); } else { elemCount.push_back(volume(input->getDimensions())); } } calibrator.reset(new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); config.setInt8Calibrator(calibrator.get()); } if (build.directIO) { config.setFlag(BuilderFlag::kDIRECT_IO); } switch (build.precisionConstraints) { case PrecisionConstraints::kNONE: // It's the default for TensorRT. break; case PrecisionConstraints::kOBEY: config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); break; case PrecisionConstraints::kPREFER: config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); break; } if (!build.layerPrecisions.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) { setLayerPrecisions(network, build.layerPrecisions); } if (!build.layerOutputTypes.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) { setLayerOutputTypes(network, build.layerOutputTypes); } if (!build.layerDeviceTypes.empty()) { setLayerDeviceTypes(network, config, build.layerDeviceTypes); } if (!build.debugTensors.empty()) { markDebugTensors(network, build.debugTensors); } if (build.markUnfusedTensorsAsDebugTensors) { network.markUnfusedTensorsAsDebugTensors(); } if (build.safe && sys.DLACore == -1) { config.setEngineCapability(EngineCapability::kSAFETY); } if (build.restricted) { config.setFlag(BuilderFlag::kSAFETY_SCOPE); } if (sys.DLACore != -1) { if (sys.DLACore < builder.getNbDLACores()) { config.setDefaultDeviceType(DeviceType::kDLA); config.setDLACore(sys.DLACore); config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); if (build.buildDLAStandalone) { config.setEngineCapability(EngineCapability::kDLA_STANDALONE); } if (build.allowGPUFallback) { config.setFlag(BuilderFlag::kGPU_FALLBACK); } else { // Reformatting runs on GPU, so avoid I/O reformatting. config.setFlag(BuilderFlag::kDIRECT_IO); } if (!build.int8) { config.setFlag(BuilderFlag::kFP16); } } else { err << "Cannot create DLA engine, " << sys.DLACore << " not available" << std::endl; return false; } } if (build.enabledTactics || build.disabledTactics) { TacticSources tacticSources = config.getTacticSources(); tacticSources |= build.enabledTactics; tacticSources &= ~build.disabledTactics; config.setTacticSources(tacticSources); } config.setHardwareCompatibilityLevel(build.hardwareCompatibilityLevel); config.setRuntimePlatform(build.runtimePlatform); if (build.maxAuxStreams != defaultMaxAuxStreams) { config.setMaxAuxStreams(build.maxAuxStreams); } if (build.allowWeightStreaming) { config.setFlag(BuilderFlag::kWEIGHT_STREAMING); } if (!setupTilingSettings(build, config, err)) { return false; } config.setRemoteAutoTuningConfig(build.remoteAutoTuningConfig.c_str()); return true; } } // namespace //! //! \brief Create a serialized engine for a network defintion //! //! \return Whether the engine creation succeeds or fails. //! bool networkToSerializedEngine( BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, BuildEnvironment& env, std::ostream& err) { std::unique_ptr config{builder.createBuilderConfig()}; std::unique_ptr calibrator; std::vector> sparseWeights; SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", false, err); SMP_RETVAL_IF_FALSE( setupNetworkAndConfig(build, sys, builder, *env.network, *config, calibrator, err, sparseWeights), "Network And Config setup failed", false, err); std::unique_ptr timingCache{}; // Try to load cache from file. Create a fresh cache if the file doesn't exist if (build.timingCacheMode == TimingCacheMode::kGLOBAL) { timingCache = samplesCommon::buildTimingCacheFromFile(gLogger.getTRTLogger(), *config, build.timingCacheFile); } // CUDA stream used for profiling by the builder. auto profileStream = samplesCommon::makeCudaStream(); SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", false, err); config->setProfileStream(*profileStream); auto const tBegin = std::chrono::high_resolution_clock::now(); if (!(build.safe || build.buildDLAStandalone) && build.save) { auto const engineFile = build.engine; FileStreamWriter writer(engineFile); SMP_RETVAL_IF_FALSE(builder.buildSerializedNetworkToStream(*env.network, *config, writer), "Engine could not be created from network", false, err); auto const engineSize = writer.finalize(); std::vector streamEngine(engineSize, 0); std::ifstream reader(engineFile, std::ios::binary); SMP_RETVAL_IF_FALSE((reader.is_open() && reader.good()), "Failed to open engine file for reading", false, err); reader.read(reinterpret_cast(streamEngine.data()), engineSize); SMP_RETVAL_IF_FALSE((!reader.fail()), "Error when reading engine file", false, err); reader.close(); sample::gLogInfo << "Created engine with size: " << (engineSize / 1.0_MiB) << " MiB" << std::endl; env.engine.setBlob(std::move(streamEngine)); } else { std::unique_ptr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; SMP_RETVAL_IF_FALSE(serializedEngine != nullptr, "Engine could not be created from network", false, err); sample::gLogInfo << "Created engine with size: " << (serializedEngine->size() / 1.0_MiB) << " MiB" << std::endl; if (build.safe && build.consistency) { if (!checkSafeEngine(serializedEngine->data(), serializedEngine->size())) { sample::gLogError << "Consistency validation is not supported." << std::endl; return false; } } env.engine.setBlob(serializedEngine); } auto const tEnd = std::chrono::high_resolution_clock::now(); float const buildTime = std::chrono::duration(tEnd - tBegin).count(); sample::gLogInfo << "Engine built in " << buildTime << " sec." << std::endl; if (build.timingCacheMode == TimingCacheMode::kGLOBAL) { auto timingCache = config->getTimingCache(); samplesCommon::updateTimingCacheFile(gLogger.getTRTLogger(), build.timingCacheFile, timingCache, builder); } return true; } //! //! \brief Parse a given model, create a network and an engine. //! bool modelToBuildEnv( ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) { env.builder.reset(createBuilder()); SMP_RETVAL_IF_FALSE(env.builder != nullptr, "Builder creation failed", false, err); env.builder->setErrorRecorder(&gRecorder); auto networkFlags = (build.stronglyTyped) ? 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kSTRONGLY_TYPED) : 0U; for (auto const& pluginPath : sys.dynamicPlugins) { env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); } env.network.reset(env.builder->createNetworkV2(networkFlags)); std::vector vcPluginLibrariesUsed; SMP_RETVAL_IF_FALSE(env.network != nullptr, "Network creation failed", false, err); env.parser = modelToNetwork(model, build, *env.network, err, build.versionCompatible ? &vcPluginLibrariesUsed : nullptr); SMP_RETVAL_IF_FALSE(env.parser.operator bool(), "Parsing model failed", false, err); if (build.versionCompatible && !sys.ignoreParsedPluginLibs && !vcPluginLibrariesUsed.empty()) { sample::gLogInfo << "The following plugin libraries were identified by the parser as required for a " "version-compatible engine:" << std::endl; for (auto const& lib : vcPluginLibrariesUsed) { sample::gLogInfo << " " << lib << std::endl; } if (!build.excludeLeanRuntime) { sample::gLogInfo << "These libraries will be added to --setPluginsToSerialize since --excludeLeanRuntime " "was not specified." << std::endl; std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), std::back_inserter(sys.setPluginsToSerialize)); } sample::gLogInfo << "These libraries will be added to --dynamicPlugins for use at inference time." << std::endl; std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), std::back_inserter(sys.dynamicPlugins)); // Implicitly-added plugins from ONNX parser should be loaded into plugin registry as well. for (auto const& pluginPath : vcPluginLibrariesUsed) { env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); } sample::gLogInfo << "Use --ignoreParsedPluginLibs to disable this behavior." << std::endl; } SMP_RETVAL_IF_FALSE( networkToSerializedEngine(build, sys, *env.builder, env, err), "Building engine failed", false, err); return true; } namespace { std::pair, std::vector> getLayerWeightsRolePair(IRefitter& refitter) { // Get number of refittable items. auto const nbAll = refitter.getAll(0, nullptr, nullptr); std::vector layerNames(nbAll); // Allocate buffers for the items and get them. std::vector weightsRoles(nbAll); refitter.getAll(nbAll, layerNames.data(), weightsRoles.data()); std::vector layerNameStrs(nbAll); std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { if (name == nullptr) { return std::string{}; } return std::string{name}; }); return {layerNameStrs, weightsRoles}; } std::pair, std::vector> getMissingLayerWeightsRolePair(IRefitter& refitter) { // Get number of refittable items. auto const nbMissing = refitter.getMissing(0, nullptr, nullptr); std::vector layerNames(nbMissing); // Allocate buffers for the items and get them. std::vector weightsRoles(nbMissing); refitter.getMissing(nbMissing, layerNames.data(), weightsRoles.data()); std::vector layerNameStrs(nbMissing); std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { if (name == nullptr) { return std::string{}; } return std::string{name}; }); return {layerNameStrs, weightsRoles}; } } // namespace bool loadStreamingEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err) { auto& reader = env.engine.getFileReader(); SMP_RETVAL_IF_FALSE(reader.open(filepath), "", false, err << "Error opening engine file: " << filepath); return true; } bool loadAsyncStreamingEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err) { auto& asyncReader = env.engine.getAsyncFileReader(); SMP_RETVAL_IF_FALSE(asyncReader.open(filepath), "", false, err << "Error opening engine file: " << filepath); return true; } bool loadEngineToBuildEnv( std::string const& filepath, BuildEnvironment& env, std::ostream& err, bool const enableConsistency) { auto const tBegin = std::chrono::high_resolution_clock::now(); std::ifstream engineFile(filepath, std::ios::binary); SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error opening engine file: " << filepath); engineFile.seekg(0, std::ifstream::end); int64_t fsize = engineFile.tellg(); engineFile.seekg(0, std::ifstream::beg); std::vector engineBlob(fsize); engineFile.read(reinterpret_cast(engineBlob.data()), fsize); SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error loading engine file: " << filepath); auto const tEnd = std::chrono::high_resolution_clock::now(); float const loadTime = std::chrono::duration(tEnd - tBegin).count(); sample::gLogInfo << "Engine loaded in " << loadTime << " sec." << std::endl; sample::gLogInfo << "Loaded engine with size: " << (fsize / 1.0_MiB) << " MiB" << std::endl; if (enableConsistency) { if (!checkSafeEngine(engineBlob.data(), fsize)) { sample::gLogError << "Consistency validation is not enabled." << std::endl; return false; } } env.engine.setBlob(std::move(engineBlob)); return true; } bool printPlanVersion(BuildEnvironment& env, std::ostream& err) { constexpr int64_t kPLAN_SIZE{28}; std::vector data(kPLAN_SIZE); auto blob = data.data(); auto& reader = env.engine.getFileReader(); auto& asyncReader = env.engine.getAsyncFileReader(); if (reader.isOpen()) { SMP_RETVAL_IF_FALSE(reader.read(data.data(), kPLAN_SIZE) == kPLAN_SIZE, "Failed to read plan file", false, err); } else if (asyncReader.isOpen()) { SMP_RETVAL_IF_FALSE(asyncReader.read(data.data(), kPLAN_SIZE, cudaStream_t{}) == kPLAN_SIZE, "Failed to read plan file", false, err); } else { SMP_RETVAL_IF_FALSE(env.engine.getBlob().data != nullptr, "Plan file is empty", false, err); SMP_RETVAL_IF_FALSE(env.engine.getBlob().size >= 28, "Plan file is incorrect", false, err); blob = static_cast(env.engine.getBlob().data); } auto blob32 = reinterpret_cast(blob); //! Correct TensorRT plan file starts with this tag constexpr uint32_t kPLAN_FILE_TAG{0x74727466U}; SMP_RETVAL_IF_FALSE(blob32[0] == kPLAN_FILE_TAG, "Failed to verify a plan tag.", false, err); switch (blob32[1]) { case 0U: { // Blob index to store the plan version may depend on the serialization version. sample::gLogInfo << "Plan was created with TensorRT version " << static_cast(blob[24]) << "." << static_cast(blob[25]) << "." << static_cast(blob[26]) << "." << static_cast(blob[27]) << std::endl; return true; } } sample::gLogError << "Serialization version is not supported." << std::endl; return false; } void dumpRefittable(nvinfer1::ICudaEngine& engine) { std::unique_ptr refitter{createRefitter(engine)}; if (refitter == nullptr) { sample::gLogError << "Failed to create a refitter." << std::endl; return; } auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); auto const& layerNames = layerWeightsRolePair.first; auto const& weightsRoles = layerWeightsRolePair.second; auto const nbAll = layerWeightsRolePair.first.size(); for (size_t i = 0; i < nbAll; ++i) { sample::gLogInfo << layerNames[i] << " " << weightsRoles[i] << std::endl; } } ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err) { BuildEnvironment env(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults()); return loadEngineToBuildEnv(engine, env, err, false) ? env.engine.release() : nullptr; } bool saveEngine(ICudaEngine const& engine, std::string const& fileName, std::ostream& err) { std::ofstream engineFile(fileName, std::ios::binary); if (!engineFile) { err << "Cannot open engine file: " << fileName << std::endl; return false; } std::unique_ptr serializedEngine{engine.serialize()}; if (serializedEngine == nullptr) { err << "Engine serialization failed" << std::endl; return false; } engineFile.write(static_cast(serializedEngine->data()), serializedEngine->size()); return !engineFile.fail(); } bool getEngineBuildEnv( ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) { bool createEngineSuccess{false}; if (build.load) { if (build.safe) { createEngineSuccess = loadEngineToBuildEnv(build.engine, env, err, build.safe && build.consistency); } else { if (build.asyncFileReader) { createEngineSuccess = loadAsyncStreamingEngineToBuildEnv(build.engine, env, err); } else { createEngineSuccess = loadStreamingEngineToBuildEnv(build.engine, env, err); } } } else { createEngineSuccess = modelToBuildEnv(model, build, sys, env, err); } SMP_RETVAL_IF_FALSE(createEngineSuccess, "Failed to create engine from model or file.", false, err); if (build.getPlanVersionOnly && build.load) { SMP_RETVAL_IF_FALSE(printPlanVersion(env, err), "Failed to get plan file version.", false, err); return true; } if (build.save) { std::ofstream engineFile(build.engine, std::ios::binary); auto& engineBlob = env.engine.getBlob(); engineFile.write(static_cast(engineBlob.data), engineBlob.size); SMP_RETVAL_IF_FALSE(!engineFile.fail(), "Saving engine to file failed.", false, err); engineFile.flush(); engineFile.close(); if (!build.safe) { env.engine.releaseBlob(); if (build.asyncFileReader) { SMP_RETVAL_IF_FALSE(loadAsyncStreamingEngineToBuildEnv(build.engine, env, err), "Reading engine file via async stream reader failed.", false, err); } else { SMP_RETVAL_IF_FALSE(loadStreamingEngineToBuildEnv(build.engine, env, err), "Reading engine file via stream reader failed.", false, err); } } } return true; } // There is not a getWeightsName API, so we need to use WeightsRole. std::vector> getAllRefitWeightsForLayer(ILayer const& l) { switch (l.getType()) { case LayerType::kCONSTANT: { auto const& layer = static_cast(l); auto const weights = layer.getWeights(); switch (weights.type) { case DataType::kFLOAT: case DataType::kHALF: case DataType::kBF16: case DataType::kINT8: case DataType::kINT32: case DataType::kINT64: return {std::make_pair(WeightsRole::kCONSTANT, weights)}; case DataType::kBOOL: case DataType::kUINT8: case DataType::kFP8: case DataType::kINT4: case DataType::kFP4: case DataType::kE8M0: // Refit not supported for these types. break; } break; } case LayerType::kCONVOLUTION: { auto const& layer = static_cast(l); return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; } case LayerType::kDECONVOLUTION: { auto const& layer = static_cast(l); return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; } case LayerType::kSCALE: { auto const& layer = static_cast(l); return {std::make_pair(WeightsRole::kSCALE, layer.getScale()), std::make_pair(WeightsRole::kSHIFT, layer.getShift())}; } case LayerType::kACTIVATION: case LayerType::kASSERTION: case LayerType::kCAST: case LayerType::kCONCATENATION: case LayerType::kCONDITION: case LayerType::kCONDITIONAL_INPUT: case LayerType::kCONDITIONAL_OUTPUT: case LayerType::kCUMULATIVE: case LayerType::kDEQUANTIZE: case LayerType::kDYNAMIC_QUANTIZE: case LayerType::kEINSUM: case LayerType::kELEMENTWISE: case LayerType::kFILL: case LayerType::kGATHER: case LayerType::kGRID_SAMPLE: case LayerType::kIDENTITY: case LayerType::kITERATOR: case LayerType::kLOOP_OUTPUT: case LayerType::kLRN: case LayerType::kMATRIX_MULTIPLY: case LayerType::kNMS: case LayerType::kNON_ZERO: case LayerType::kNORMALIZATION: case LayerType::kONE_HOT: case LayerType::kPADDING: case LayerType::kPARAMETRIC_RELU: case LayerType::kPLUGIN: case LayerType::kPLUGIN_V2: case LayerType::kPLUGIN_V3: case LayerType::kPOOLING: case LayerType::kQUANTIZE: case LayerType::kRAGGED_SOFTMAX: case LayerType::kRECURRENCE: case LayerType::kREDUCE: case LayerType::kRESIZE: case LayerType::kREVERSE_SEQUENCE: case LayerType::kSCATTER: case LayerType::kSELECT: case LayerType::kSHAPE: case LayerType::kSHUFFLE: case LayerType::kSLICE: case LayerType::kSOFTMAX: case LayerType::kSQUEEZE: case LayerType::kTOPK: case LayerType::kTRIP_LIMIT: case LayerType::kUNARY: case LayerType::kUNSQUEEZE: return {}; } return {}; } bool timeRefit(INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, bool multiThreading) { using time_point = std::chrono::time_point; using durationMs = std::chrono::duration; auto const nbLayers = network.getNbLayers(); std::unique_ptr refitter{createRefitter(engine)}; // Set max threads that can be used by refitter. if (multiThreading && !refitter->setMaxThreads(10)) { sample::gLogError << "Failed to set max threads to refitter." << std::endl; return false; } auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); // We use std::string instead of char const* since we can have copies of layer names. std::set> layerRoleSet; auto const& layerNames = layerWeightsRolePair.first; auto const& weightsRoles = layerWeightsRolePair.second; std::transform(layerNames.begin(), layerNames.end(), weightsRoles.begin(), std::inserter(layerRoleSet, layerRoleSet.begin()), [](std::string const& layerName, WeightsRole const role) { return std::make_pair(layerName, role); }); auto const isRefittable = [&layerRoleSet](char const* layerName, WeightsRole const role) { return layerRoleSet.find(std::make_pair(layerName, role)) != layerRoleSet.end(); }; auto const setWeights = [&] { for (int32_t i = 0; i < nbLayers; i++) { auto const layer = network.getLayer(i); auto const roleWeightsVec = getAllRefitWeightsForLayer(*layer); for (auto const& roleWeights : roleWeightsVec) { if (isRefittable(layer->getName(), roleWeights.first)) { bool const success = refitter->setWeights(layer->getName(), roleWeights.first, roleWeights.second); if (!success) { return false; } } } } return true; }; auto const reportMissingWeights = [&] { auto const& missingPair = getMissingLayerWeightsRolePair(*refitter); auto const& layerNames = missingPair.first; auto const& weightsRoles = missingPair.second; for (size_t i = 0; i < layerNames.size(); ++i) { sample::gLogError << "Missing (" << layerNames[i] << ", " << weightsRoles[i] << ") for refitting." << std::endl; } return layerNames.empty(); }; // Skip weights validation since we are confident that the new weights are similar to the weights used to build // engine. refitter->setWeightsValidation(false); // Warm up and report missing weights // We only need to set weights for the first time and that can be reused in later refitting process. bool const success = setWeights() && reportMissingWeights() && refitter->refitCudaEngine(); if (!success) { return false; } TrtCudaStream stream; constexpr int32_t kLOOP = 10; time_point const refitStartTime{std::chrono::steady_clock::now()}; { for (int32_t l = 0; l < kLOOP; l++) { if (!refitter->refitCudaEngineAsync(stream.get())) { return false; } } } stream.synchronize(); time_point const refitEndTime{std::chrono::steady_clock::now()}; sample::gLogInfo << "Engine refitted" << " in " << durationMs(refitEndTime - refitStartTime).count() / kLOOP << " ms." << std::endl; return true; } namespace { void* initSafeRuntime() { void* handle{nullptr}; // Currently libnvinfer_safe_debug.so for samplesCommon::isDebug() is not ready. #if !defined(_WIN32) std::string const dllName{"libnvinfer_safe.so"}; #if SANITIZER_BUILD handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); #else // RTLD_GLOBAL is used for symbol resolution of subsequently loaded plugin libraries handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_GLOBAL); #endif #endif return handle; } void* initConsistencyCheckerLibrary() { void* handle{nullptr}; #if !defined(_WIN32) std::string const dllName{"libnvinfer_checker_shared.so"}; #if SANITIZER_BUILD handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); #else handle = dlopen(dllName.c_str(), RTLD_LAZY); #endif #endif return handle; } #if !defined(_WIN32) struct DllDeleter { void operator()(void* handle) { if (handle != nullptr) { dlclose(handle); } } }; const std::unique_ptr safeRuntimeLibrary{initSafeRuntime()}; const std::unique_ptr consistencyCheckerLibrary{initConsistencyCheckerLibrary()}; #endif } // namespace bool hasSafeRuntime() { #if defined(_WIN32) return false; #else return (safeRuntimeLibrary != nullptr); #endif } bool hasConsistencyChecker() { #if defined(_WIN32) return false; #else return (consistencyCheckerLibrary != nullptr); #endif } #if ENABLE_UNIFIED_BUILDER nvinfer2::safe::consistency::IConsistencyChecker* createConsistencyChecker( sample::SampleSafeRecorder& recorder, void const* serializedEngine, int32_t const engineSize) noexcept { nvinfer2::safe::consistency::IConsistencyChecker* checker{nullptr}; if (serializedEngine == nullptr || engineSize == 0) { return checker; } #if !defined(_WIN32) constexpr char symbolName[] = "createConsistencyChecker"; typedef ErrorCode (*CreateCheckerFn)(nvinfer2::safe::consistency::IConsistencyChecker * &checker, sample::SampleSafeRecorder & recorder, void const* data, size_t size); if (hasSafeRuntime()) { auto createFn = reinterpret_cast(dlsym(consistencyCheckerLibrary.get(), symbolName)); if (createFn != nullptr) { ErrorCode errorCode = createFn(checker, recorder, serializedEngine, engineSize); if (errorCode != ErrorCode::kSUCCESS) { return nullptr; } } } #endif return checker; } #endif bool checkSafeEngine(void const* serializedEngine, int64_t const engineSize) { if (!hasConsistencyChecker()) { sample::gLogError << "Cannot perform consistency check because the checker is not loaded.." << std::endl; return false; } #if ENABLE_UNIFIED_BUILDER sample::SampleSafeRecorder recorder{nvinfer2::safe::Severity::kINFO}; auto checker = std::unique_ptr( createConsistencyChecker(recorder, serializedEngine, engineSize)); if (checker.get() == nullptr) { sample::gLogError << "Failed to create consistency checker." << std::endl; return false; } sample::gLogInfo << "Start consistency checking." << std::endl; if (!checker->validate()) { sample::gLogError << "Consistency validation failed." << std::endl; return false; } sample::gLogInfo << "Consistency validation passed." << std::endl; return true; #else return false; #endif } } // namespace sample