/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cuda_profiler_api.h>
#include <fstream>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

#if defined(__QNX__)
#include <sys/neutrino.h>
#include <sys/syspage.h>
#endif

#include "NvInferRuntime.h"
#include "bfloat16.h"
#include "common.h"
#include "debugTensorWriter.h"
#include "logger.h"
#include "sampleDevice.h"
#include "sampleEngines.h"
#include "sampleInference.h"
#include "sampleOptions.h"
#include "sampleReporting.h"
#include "sampleUtils.h"

#include <cuda.h>
#if CUDA_VERSION >= 11060
#include <cuda_fp8.h>
#endif

using namespace nvinfer1;
#if ENABLE_UNIFIED_BUILDER
using namespace nvinfer2::safe;
// Provide a weak default definition that can be overridden
__attribute__((weak)) std::shared_ptr<ISafeRecorder> gSafeRecorder
    = std::make_shared<SampleSafeRecorder>(nvinfer2::safe::Severity::kINFO);
#endif

namespace sample
{

#if !TRT_STATIC
std::string const& getRuntimeLibraryName(RuntimeMode const mode)
{
    switch (mode)
    {
    case RuntimeMode::kFULL: return kNVINFER_LIBNAME;
    case RuntimeMode::kDISPATCH: return kNVINFER_DISPATCH_LIBNAME;
    case RuntimeMode::kLEAN: return kNVINFER_LEAN_LIBNAME;
    case RuntimeMode::kSAFE: return kNVINFER_SAFE_LIBNAME;
    }
    throw std::runtime_error("Unknown runtime mode");
}
#endif // !TRT_STATIC

#if ENABLE_UNIFIED_BUILDER
namespace safe
{
namespace
{
std::function<nvinfer1::ErrorCode(
    nvinfer2::safe::ITRTGraph*&, void const*, int64_t, ISafeRecorder&, bool, ISafeMemAllocator*)>
    pcreateTRTGraphInternal{};
std::function<nvinfer1::ErrorCode(nvinfer2::safe::ITRTGraph*&)> pdestroyTRTGraphInternal{};
std::function<nvinfer2::safe::ISafePluginRegistry*(ISafeRecorder&)> pgetSafePluginRegistryInternal{};
} // namespace

//! Track runtime used for the execution of trtexec.
//! Must be tracked as a global variable due to how library init functions APIs are organized.
RuntimeMode gUseRuntime = RuntimeMode::kSAFE;

//!
//! \brief Initialize the NVIDIA Inference Safe Runtime library
//!
//! This function dynamically loads the Safe TensorRT runtime library and initializes
//! function pointers for safe TensorRT operations. It is used to set up the safe runtime
//! environment for inference with safety-certified TensorRT engines.
//!
//! The function performs the following operations:
//! - Dynamically loads the safe TensorRT runtime library
//! - Retrieves and stores function pointers for:
//!   - createTRTGraph: Creates a safe TRT graph from serialized engine data
//!   - destroyTRTGraph: Destroys a safe TRT graph and releases resources
//!   - getSafePluginRegistry: Gets the safe plugin registry for loading plugins
//!
//! \return true if the safe runtime library was successfully loaded and initialized,
//!         false otherwise (e.g., in static builds or if library loading fails)
//!
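//! A minimal usage sketch (blobData/blobSize are hypothetical; error handling elided). The wrappers
//! defined below call initNvinferSafe() internally before dispatching into the loaded library:
//! \code
//! nvinfer2::safe::ITRTGraph* graph{nullptr};
//! if (sample::safe::createSafeTRTGraph(graph, blobData, blobSize, *gSafeRecorder,
//!         /* useManaged */ false, /* allocator */ nullptr) == nvinfer1::ErrorCode::kSUCCESS)
//! {
//!     sample::safe::destroySafeTRTGraph(graph);
//! }
//! \endcode
//!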
bool initNvinferSafe()
{
#if !TRT_STATIC
    static LibraryPtr libnvinfersafePtr{};
    auto fetchPtrs = [](samplesCommon::DynamicLibrary* l) {
        if (gUseRuntime == RuntimeMode::kSAFE)
        {
            pcreateTRTGraphInternal = l->symbolAddress<nvinfer1::ErrorCode(
                nvinfer2::safe::ITRTGraph*&, void const*, int64_t, ISafeRecorder&, bool, ISafeMemAllocator*)>(
                "createTRTGraph");
            pdestroyTRTGraphInternal
                = l->symbolAddress<nvinfer1::ErrorCode(nvinfer2::safe::ITRTGraph*&)>("destroyTRTGraph");
            pgetSafePluginRegistryInternal
                = l->symbolAddress<nvinfer2::safe::ISafePluginRegistry*(ISafeRecorder&)>("getSafePluginRegistry");
        }
    };
    return initLibrary(libnvinfersafePtr, sample::getRuntimeLibraryName(gUseRuntime), fetchPtrs);
#else
    return false;
#endif // !TRT_STATIC
}

//!
//! \brief Create a safe TRT graph from serialized engine data
//!
//! This function creates a safe TRT graph from serialized engine data. It is used to create
//! a safe TRT graph for inference with safety-certified TensorRT engines.
//!
nvinfer1::ErrorCode createSafeTRTGraph(nvinfer2::safe::ITRTGraph*& graph, void const* blob, int64_t size,
    ISafeRecorder& recorder, bool useManaged, ISafeMemAllocator* allocator)
{
    if (!initNvinferSafe())
    {
        return nvinfer1::ErrorCode::kINTERNAL_ERROR;
    }
    ASSERT(pcreateTRTGraphInternal != nullptr);
    return pcreateTRTGraphInternal(graph, blob, size, recorder, useManaged, allocator);
}

//!
//! \brief Destroy a safe TRT graph and release resources
//!
//! This function destroys a safe TRT graph and releases the associated resources. It is used to clean up
//! the safe TRT graph after inference with safety-certified TensorRT engines.
//!
nvinfer1::ErrorCode destroySafeTRTGraph(nvinfer2::safe::ITRTGraph*& graph)
{
    if (!initNvinferSafe())
    {
        return nvinfer1::ErrorCode::kINTERNAL_ERROR;
    }
    ASSERT(pdestroyTRTGraphInternal != nullptr);
    return pdestroyTRTGraphInternal(graph);
}

//!
//! \brief Get the safe plugin registry for loading plugins
//!
//! This function retrieves the safe plugin registry used to load plugins with
//! safety-certified TensorRT engines.
//!
nvinfer2::safe::ISafePluginRegistry* getSafePluginRegistry(ISafeRecorder& recorder)
{
    if (!initNvinferSafe())
    {
        return nullptr;
    }
    ASSERT(pgetSafePluginRegistryInternal != nullptr);
    return pgetSafePluginRegistryInternal(recorder);
}

namespace
{
nvinfer2::safe::TypedArray createTypedArray(void* const ptr, DataType const type, uint64_t bufferSize)
{
    switch (type)
    {
    case DataType::kFLOAT: return nvinfer2::safe::TypedArray(static_cast<float*>(ptr), bufferSize);
    case DataType::kHALF: return nvinfer2::safe::TypedArray(static_cast<__half*>(ptr), bufferSize);
    case DataType::kINT32: return nvinfer2::safe::TypedArray(static_cast<int32_t*>(ptr), bufferSize);
    case DataType::kINT8: return nvinfer2::safe::TypedArray(static_cast<int8_t*>(ptr), bufferSize);
    default:
    {
        sample::gLogError << "Invalid tensor DataType encountered." << std::endl;
        return nvinfer2::safe::TypedArray{};
    }
    }
}
} // namespace
} // namespace safe
#endif

template <class TMapType, class TEngineType>
bool validateTensorNames(TMapType const& map, TEngineType const* engine, int32_t const endBindingIndex)
{
    // Check if the provided input tensor names match the input tensors of the engine.
    // Throw an error if a provided input tensor name cannot be found, because that implies a potential typo.
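    // Note: names given on the command line may contain a single "*" wildcard, so an entry such as
    // "input_*" (hypothetical) can match an engine tensor named "input_ids" via matchStringWithOneWildcard().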
    for (auto const& item : map)
    {
        bool tensorNameFound{false};
        for (int32_t b = 0; b < endBindingIndex; ++b)
        {
            auto const tensorName = engine->getIOTensorName(b);
            auto const tensorIOMode = engine->getTensorIOMode(tensorName);
            if (tensorIOMode == nvinfer1::TensorIOMode::kINPUT
                && matchStringWithOneWildcard(item.first, tensorName))
            {
                tensorNameFound = true;
                break;
            }
        }
        if (!tensorNameFound)
        {
            sample::gLogError << "Cannot find input tensor with name \"" << item.first << "\" in the engine bindings! "
                              << "Please make sure the input tensor names are correct." << std::endl;
            return false;
        }
    }
    return true;
}

template <class TEngineType>
class FillBindingClosure
{
private:
    using InputsMap = std::unordered_map<std::string, std::string>;
    using BindingsVector = std::vector<std::unique_ptr<BindingsBase>>;

    TEngineType const* mEngine;
    nvinfer1::IExecutionContext const* mContext;
    InputsMap const& inputs;
    BindingsVector& bindings;
    int32_t batch;
    int32_t endBindingIndex;
    int32_t profileIndex;

    void fillOneBinding(TensorInfo const& tensorInfo)
    {
        auto const name = tensorInfo.name;
        auto const* bindingInOutStr = tensorInfo.isInput ? "Input" : "Output";
        for (auto& binding : bindings)
        {
            auto const input = findPlausible(inputs, name);
            if (tensorInfo.isInput && input != inputs.end())
            {
                sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl;
                binding->addBinding(tensorInfo, input->second);
            }
            else
            {
                if (tensorInfo.isInput)
                {
                    sample::gLogInfo << "Using random values for input " << name << std::endl;
                }
                binding->addBinding(tensorInfo);
            }
            if (tensorInfo.isDynamic)
            {
                sample::gLogInfo << bindingInOutStr << " binding for " << name
                                 << " is dynamic and will be created during execution using OutputAllocator."
                                 << std::endl;
            }
            else
            {
                sample::gLogInfo << bindingInOutStr << " binding for " << name << " with dimensions " << tensorInfo.dims
                                 << " is created." << std::endl;
            }
        }
    }

    bool fillAllBindings(int32_t batch, int32_t endBindingIndex)
    {
        if (!validateTensorNames(inputs, mEngine, endBindingIndex))
        {
            sample::gLogError << "Invalid tensor names found in --loadInputs flag."
                              << std::endl;
            return false;
        }
        for (int32_t b = 0; b < endBindingIndex; b++)
        {
            TensorInfo tensorInfo;
            tensorInfo.bindingIndex = b;
            getTensorInfo(tensorInfo);
            tensorInfo.updateVolume(batch);
            fillOneBinding(tensorInfo);
        }
        return true;
    }

    void getTensorInfo(TensorInfo& tensorInfo);

public:
    FillBindingClosure(TEngineType const* _engine, nvinfer1::IExecutionContext const* _context,
        InputsMap const& _inputs, BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex,
        int32_t _profileIndex)
        : mEngine(_engine)
        , mContext(_context)
        , inputs(_inputs)
        , bindings(_bindings)
        , batch(_batch)
        , endBindingIndex(_endBindingIndex)
        , profileIndex(_profileIndex)
    {
    }

    bool operator()()
    {
        return fillAllBindings(batch, endBindingIndex);
    }
};

template <>
void FillBindingClosure<nvinfer1::ICudaEngine>::getTensorInfo(TensorInfo& tensorInfo)
{
    auto const b = tensorInfo.bindingIndex;
    auto const name = mEngine->getIOTensorName(b);
    tensorInfo.name = name;
    tensorInfo.dims = mContext->getTensorShape(name);
    tensorInfo.isDynamic = std::any_of(
        tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; });
    tensorInfo.comps = mEngine->getTensorComponentsPerElement(name, profileIndex);
    tensorInfo.strides = mContext->getTensorStrides(name);
    tensorInfo.vectorDimIndex = mEngine->getTensorVectorizedDim(name, profileIndex);
    tensorInfo.isInput = mEngine->getTensorIOMode(name) == TensorIOMode::kINPUT;
    tensorInfo.dataType = mEngine->getTensorDataType(name);
}

namespace
{

bool allocateContextMemory(InferenceEnvironmentStd& iEnv, InferenceOptions const& inference)
{
    auto* engine = iEnv.engine.get();
    iEnv.deviceMemory.resize(inference.infStreams);
    // Delay context memory allocation until input shapes are specified because runtime allocation would require actual
    // input shapes.
    for (int32_t i = 0; i < inference.infStreams; ++i)
    {
        auto const& ec = iEnv.contexts.at(i);
        if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC)
        {
            sample::gLogInfo << "Created execution context with device memory size: "
                             << (engine->getDeviceMemorySize() / 1.0_MiB) << " MiB" << std::endl;
        }
        else
        {
            size_t sizeToAlloc{0};
            const char* allocReason{nullptr};
            if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kPROFILE)
            {
                auto const p = inference.optProfileIndex;
                sizeToAlloc = engine->getDeviceMemorySizeForProfile(p);
                allocReason = "current profile";
            }
            else if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kRUNTIME)
            {
                sizeToAlloc = ec->updateDeviceMemorySizeForShapes();
                allocReason = "current input shapes";
            }
            else
            {
                sample::gLogError << "Unrecognizable memory allocation strategy." << std::endl;
                return false;
            }
            iEnv.deviceMemory.at(i) = TrtDeviceBuffer(sizeToAlloc);
            ec->setDeviceMemoryV2(iEnv.deviceMemory.at(i).get(), iEnv.deviceMemory.at(i).getSize());
            sample::gLogInfo << "Maximum device memory size across all profiles: "
                             << (engine->getDeviceMemorySizeV2() / 1.0_MiB) << " MiB" << std::endl;
            sample::gLogInfo << "Only allocated device memory enough for " << allocReason << ": "
                             << (sizeToAlloc / 1.0_MiB) << " MiB" << std::endl;
        }
    }
    return true;
}

//! \brief Transform shapeData so that it can be type-punned to an array of int32_t.
//!
//! Transform shapeData so that if data() is type-punned to (int32_t*), the sequence
//! of values is equal to the original elements of shapeData.
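//!
//! For example (all values fitting in int32_t): with shapeData = {640, 480, 3}, the storage is
//! rewritten so that reinterpret_cast<int32_t*>(shapeData.data()) yields {640, 480, 3}, and the
//! vector is resized to (3 + 1) / 2 == 2 remaining int64_t elements.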
void contractInt64ToInt32(std::vector<int64_t>& shapeData)
{
    int64_t const size = shapeData.size();
    for (int64_t const& val : shapeData)
    {
        ASSERT(val <= std::numeric_limits<int32_t>::max() && val >= std::numeric_limits<int32_t>::min()
            && "Value out of range for int32_t conversion");
    }
    int64_t const* src = shapeData.data();
    int32_t* dst = reinterpret_cast<int32_t*>(shapeData.data());
    std::copy(src, src + size, dst);
    shapeData.resize((size + 1) / 2);
}
} // namespace

bool setUpInference(InferenceEnvironmentBase& iEnv, InferenceOptions const& inference, SystemOptions const& system)
{
#if ENABLE_UNIFIED_BUILDER
    if (iEnv.safe)
    {
        return setUpSafeInference(static_cast<InferenceEnvironmentSafe&>(iEnv), inference, system);
    }
#endif
    return setUpStdInference(static_cast<InferenceEnvironmentStd&>(iEnv), inference, system);
}

#if ENABLE_UNIFIED_BUILDER
void getSafeTensorInfo(uint32_t profileIndex, nvinfer2::safe::ITRTGraph* safeGraph, TensorInfo& tensorInfo)
{
    nvinfer2::safe::TensorDescriptor desc;
    auto const b = tensorInfo.bindingIndex;
    const char* name = nullptr;
    safeGraph->getIOTensorName(name, b);
    tensorInfo.name = name;
    safeGraph->getIOTensorDescriptor(desc, name);
    tensorInfo.dims = desc.shape;
    tensorInfo.isDynamic = std::any_of(
        tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; });
    tensorInfo.strides = desc.stride;
    tensorInfo.isInput = desc.ioMode == TensorIOMode::kINPUT;
    tensorInfo.dataType = desc.dataType;
}

bool setUpSafeInference(InferenceEnvironmentSafe& iEnv, InferenceOptions const& inference, SystemOptions const& system)
{
    int32_t device{};
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp properties;
    CHECK(cudaGetDeviceProperties(&properties, device));
    int32_t const isIntegrated{properties.integrated};
    ASSERT(sample::hasSafeRuntime());
    ASSERT(sample::safe::initNvinferSafe());
    auto safeEngineBlob = iEnv.engine.getBlob();
    SMP_RETVAL_IF_FALSE(safeEngineBlob.data != nullptr, "Engine blob is empty.", false, sample::gLogError);
    SMP_RETVAL_IF_FALSE(iEnv.engine.checkDLASafe(),
        "A safe DLA engine built with kDLA_STANDALONE should not be used for inference in TRT!", false,
        sample::gLogError);

    std::unique_ptr<nvinfer2::safe::ITRTGraph> safeGraph;
    // Use managed memory on integrated devices when transfers are skipped
    // and when it is explicitly requested on the commandline.
    bool useManagedMemory{(inference.skipTransfers && isIntegrated) || inference.useManaged};

    nvinfer2::safe::ITRTGraph* tempGraph = nullptr;
    if (sample::safe::createSafeTRTGraph(tempGraph, safeEngineBlob.data, safeEngineBlob.size, *gSafeRecorder, true,
            nullptr)
        != nvinfer2::safe::ErrorCode::kSUCCESS)
    {
        sample::gLogError << "Create Safe TRT Graph Failed." << std::endl;
    }
    safeGraph.reset(tempGraph);

    // Release serialized blob to save memory space.
    iEnv.engine.releaseBlob();

    for (int32_t s = 0; s < inference.infStreams; ++s)
    {
        nvinfer2::safe::ITRTGraph* clonedGraph{nullptr};
        safeGraph->clone(clonedGraph, *gSafeRecorder); // Returns an error code, which is not checked here.
        iEnv.mClonedGraphs.emplace_back(clonedGraph);
        iEnv.bindings.emplace_back(std::make_unique<BindingsSafe>(useManagedMemory));
    }

    int64_t endBindingIndex = 0;
    safeGraph->getNbIOTensors(endBindingIndex);

    for (int32_t b = 0; b < endBindingIndex; b++)
    {
        TensorInfo tensorInfo;
        tensorInfo.bindingIndex = b;
        getSafeTensorInfo(inference.optProfileIndex, safeGraph.get(), tensorInfo);
        tensorInfo.updateVolume(1);

        auto const name = tensorInfo.name;
        auto const* bindingInOutStr = tensorInfo.isInput ? "Input" : "Output";
"Input" : "Output"; for (auto& binding : iEnv.bindings) { auto const input = findPlausible(inference.inputs, name); if (tensorInfo.isInput && input != inference.inputs.end()) { sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl; binding->addBinding(tensorInfo, input->second); } else { if (tensorInfo.isInput) { sample::gLogInfo << "Using random values for input " << name << std::endl; } binding->addBinding(tensorInfo); } if (tensorInfo.isDynamic) { sample::gLogInfo << bindingInOutStr << " binding for " << name << " is dynamic and will be created during execution using OutputAllocator." << std::endl; } else { sample::gLogInfo << bindingInOutStr << " binding for " << name << " with dimensions " << tensorInfo.dims << " is created." << std::endl; } } } return true; } #endif bool setUpStdInference(InferenceEnvironmentStd& iEnv, InferenceOptions const& inference, SystemOptions const& system) { int32_t device{}; CHECK(cudaGetDevice(&device)); cudaDeviceProp properties; CHECK(cudaGetDeviceProperties(&properties, device)); int32_t const isIntegrated{properties.integrated}; // Use managed memory on integrated devices when transfers are skipped // and when it is explicitly requested on the commandline. bool useManagedMemory{(inference.skipTransfers && isIntegrated) || inference.useManaged}; using FillStdBindings = FillBindingClosure; auto* engine = iEnv.engine.get(); SMP_RETVAL_IF_FALSE(engine != nullptr, "Got invalid engine!", false, sample::gLogError); // Release serialized blob to save memory space. iEnv.engine.releaseBlob(); // Setup weight streaming if enabled if (engine->getStreamableWeightsSize() > 0) { auto const& budget = inference.weightStreamingBudget; int64_t wsBudget = budget.bytes; if (budget.percent != 100.0) { double const percent = budget.percent; ASSERT(percent < 100.0); auto const max = engine->getStreamableWeightsSize(); wsBudget = (max >= 0) ? (percent / 100) * (max) : WeightStreamingBudget::kDISABLE; } if (wsBudget == WeightStreamingBudget::kDISABLE) { wsBudget = engine->getStreamableWeightsSize(); } else if (wsBudget == WeightStreamingBudget::kAUTOMATIC) { wsBudget = engine->getWeightStreamingAutomaticBudget(); } ASSERT(wsBudget >= 0); bool success = engine->setWeightStreamingBudgetV2(wsBudget); SMP_RETVAL_IF_FALSE(success, "Failed to set weight streaming limit!", false, sample::gLogError); switch (wsBudget) { case WeightStreamingBudget::kDISABLE: { sample::gLogInfo << "Weight streaming has been disabled at runtime." << std::endl; break; } case WeightStreamingBudget::kAUTOMATIC: { sample::gLogInfo << "The weight streaming budget will automatically be chosen by TensorRT." << std::endl; break; } default: { sample::gLogInfo << "Weight streaming is enabled with a device memory limit of " << wsBudget << " bytes." << std::endl; break; } } } int32_t const nbOptProfiles = engine->getNbOptimizationProfiles(); if (inference.optProfileIndex >= nbOptProfiles) { sample::gLogError << "Selected profile index " << inference.optProfileIndex << " exceeds the number of profiles that the engine holds. " << std::endl; return false; } if (nbOptProfiles > 1 && !inference.setOptProfile) { sample::gLogWarning << nbOptProfiles << " profiles detected but not set. Running with profile 0. Please use " "--dumpOptimizationProfile to see all available profiles." << std::endl; } cudaStream_t setOptProfileStream; CHECK(cudaStreamCreate(&setOptProfileStream)); for (int32_t s = 0; s < inference.infStreams; ++s) { IExecutionContext* ec{nullptr}; //! 
        //! \return the ExecutionContextAllocationStrategy to use for the given memory allocation strategy, \p s.
        auto getExecutionContextAllocationStrategy = [](MemoryAllocationStrategy s) {
            return s == MemoryAllocationStrategy::kSTATIC
                // Let TRT pre-allocate and manage the memory.
                ? ExecutionContextAllocationStrategy::kSTATIC
                // Allocate based on the current profile or runtime shapes.
                : ExecutionContextAllocationStrategy::kUSER_MANAGED;
        };
        ec = engine->createExecutionContext(getExecutionContextAllocationStrategy(inference.memoryAllocationStrategy));
        if (ec == nullptr)
        {
            sample::gLogError << "Unable to create execution context for stream " << s << "." << std::endl;
            return false;
        }
        ec->setNvtxVerbosity(inference.nvtxVerbosity);

        int32_t const persistentCacheLimit
            = samplesCommon::getMaxPersistentCacheSize() * inference.persistentCacheRatio;
        sample::gLogInfo << "Setting persistentCacheLimit to " << persistentCacheLimit << " bytes." << std::endl;
        ec->setPersistentCacheLimit(persistentCacheLimit);

        auto setProfile = ec->setOptimizationProfileAsync(inference.optProfileIndex, setOptProfileStream);
        CHECK(cudaStreamSynchronize(setOptProfileStream));

        if (!setProfile)
        {
            sample::gLogError << "Set optimization profile failed." << std::endl;
            if (inference.infStreams > 1)
            {
                sample::gLogError
                    << "Please ensure that the engine is built with preview feature profileSharing0806 enabled."
                    << std::endl;
            }
            return false;
        }

        iEnv.contexts.emplace_back(ec);
        iEnv.bindings.emplace_back(std::make_unique<BindingsStd>(useManagedMemory));
    }

    CHECK(cudaStreamDestroy(setOptProfileStream));

    if (iEnv.profiler)
    {
        iEnv.contexts.front()->setProfiler(iEnv.profiler.get());
        // Always run reportToProfiler() after enqueue launch
        iEnv.contexts.front()->setEnqueueEmitsProfile(false);
    }

    int32_t const endBindingIndex = engine->getNbIOTensors();

    // Make sure that the tensor names provided in command-line args actually exist in any of the engine bindings
    // to avoid silent typos.
    if (!validateTensorNames(inference.shapes, engine, endBindingIndex))
    {
        sample::gLogError << "Invalid tensor names found in --shapes flag." << std::endl;
        return false;
    }

    for (int32_t b = 0; b < endBindingIndex; ++b)
    {
        auto const& name = engine->getIOTensorName(b);
        auto const& mode = engine->getTensorIOMode(name);
        if (mode == TensorIOMode::kINPUT)
        {
            Dims const dims = iEnv.contexts.front()->getTensorShape(name);
            bool isShapeInferenceIO{false};
            isShapeInferenceIO = engine->isShapeInferenceIO(name);
            bool const hasRuntimeDim
                = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; });
            auto const shape = findPlausible(inference.shapes, name);

            if (hasRuntimeDim || isShapeInferenceIO)
            {
                // Set shapeData to either dimensions of the input (if it has a dynamic shape)
                // or set to values of the input (if it is an input shape tensor).
                std::vector<int64_t> shapeData;

                if (shape == inference.shapes.end())
                {
                    // No information provided. Use default value for missing data.
                    constexpr int32_t kDEFAULT_VALUE = 1;
                    if (isShapeInferenceIO)
                    {
                        // Set shape tensor to all ones.
                        shapeData.assign(volume(dims, 0, dims.nbDims), kDEFAULT_VALUE);
                        sample::gLogWarning << "Values missing for input shape tensor: " << name
                                            << ". Automatically setting values to: " << shapeData << std::endl;
                    }
                    else
                    {
                        // Use default value for unspecified runtime dimensions.
                        shapeData.resize(dims.nbDims);
                        std::transform(dims.d, dims.d + dims.nbDims, shapeData.begin(),
                            [&](int32_t dimension) { return dimension >= 0 ? dimension : kDEFAULT_VALUE; });
                        sample::gLogWarning << "Shape missing for input with dynamic shape: " << name
                                            << ". Automatically setting shape to: " << shapeData << std::endl;
                    }
                }
                else if (inference.inputs.count(shape->first) && isShapeInferenceIO)
                {
                    // Load shape tensor from file.
                    int64_t const size = volume(dims, 0, dims.nbDims);
                    shapeData.resize(size);
                    auto const& filename = inference.inputs.at(shape->first);
                    auto dst = reinterpret_cast<char*>(shapeData.data());
                    loadFromFile(filename, dst, size * sizeof(decltype(shapeData)::value_type));
                }
                else
                {
                    shapeData = shape->second;
                }

                int64_t* shapeTensorData{nullptr};
                if (isShapeInferenceIO)
                {
                    // Save the data in iEnv, in a way that its address does not change
                    // before enqueueV3 is called.
                    DataType const type = engine->getTensorDataType(name);
                    switch (type)
                    {
                    case DataType::kINT64: break;
                    case DataType::kINT32: contractInt64ToInt32(shapeData); break;
                    default:
                        sample::gLogError << "Shape tensor " << name << " has unexpected type " << type << std::endl;
                        return false;
                    }
                    iEnv.inputShapeTensorValues.emplace_back(shapeData);
                    shapeTensorData = iEnv.inputShapeTensorValues.back().data();
                }

                for (auto& c : iEnv.contexts)
                {
                    if (isShapeInferenceIO)
                    {
                        sample::gLogInfo << "Set input shape tensor " << name << " to: " << shapeData << std::endl;
                        if (!c->setTensorAddress(name, shapeTensorData))
                        {
                            return false;
                        }
                    }
                    else
                    {
                        sample::gLogInfo << "Set shape of input tensor " << name << " to: " << shapeData << std::endl;
                        if (!c->setInputShape(name, toDims(shapeData)))
                        {
                            return false;
                        }
                    }
                }
            }
            else if (nbOptProfiles && shape != inference.shapes.end())
            {
                // Check if the provided shape matches the static dimensions in the engine.
                for (auto& c : iEnv.contexts)
                {
                    if (!c->setInputShape(name, toDims(shape->second)))
                    {
                        sample::gLogError << "The engine was built with static shapes for input tensor " << name
                                          << " but the provided shapes do not match the static shapes!" << std::endl;
                        return false;
                    }
                }
            }
        }
    }

    // Create debug listener and turn on debug states if client requested dumping debug tensors.
    if (!inference.debugTensorFileNames.empty() || !inference.dumpAlldebugTensorFormats.empty())
    {
        iEnv.listener = std::make_unique<DebugTensorWriter>(
            inference.debugTensorFileNames, inference.dumpAlldebugTensorFormats, engine->getName(), iEnv.cmdline);
        iEnv.contexts.front()->setDebugListener(iEnv.listener.get());
        for (auto const& s : inference.debugTensorFileNames)
        {
            iEnv.contexts.front()->setTensorDebugState(s.first.c_str(), true);
        }
        if (!inference.dumpAlldebugTensorFormats.empty())
        {
            iEnv.contexts.front()->setUnfusedTensorsDebugState(true);
        }
    }

    if (!allocateContextMemory(iEnv, inference))
    {
        return false;
    }

    auto const* context = iEnv.contexts.front().get();
    bool fillBindingsSuccess = FillStdBindings(
        engine, context, inference.inputs, iEnv.bindings, 1, endBindingIndex, inference.optProfileIndex)();

    return fillBindingsSuccess;
}

TaskInferenceEnvironment::TaskInferenceEnvironment(std::string engineFile, InferenceOptions const& inference,
    ReportingOptions const& reporting, int32_t deviceId, int32_t DLACore, int32_t bs)
    : iOptions(inference)
    , rOptions(reporting)
    , device(deviceId)
    , batch(bs)
{
    BuildEnvironment bEnv(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults());
    loadEngineToBuildEnv(engineFile, bEnv, sample::gLogError, false);
    iEnv = std::make_unique<InferenceEnvironmentStd>(bEnv);

    CHECK(cudaSetDevice(device));
    SystemOptions system{};
    system.device = device;
    system.DLACore = DLACore;
    if (!setUpStdInference(*iEnv, iOptions, system))
    {
        sample::gLogError << "Inference set up failed" << std::endl;
    }
}

namespace
{

#if defined(__QNX__)
using TimePoint = double;
#else
using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
#endif

TimePoint getCurrentTime()
{
#if defined(__QNX__)
    uint64_t const currentCycles = ClockCycles();
    uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec;
    // Return current timestamp in ms.
    return static_cast<double>(currentCycles) * 1000. / cyclesPerSecond;
#else
    return std::chrono::high_resolution_clock::now();
#endif
}

//!
//! \struct SyncStruct
//! \brief Threads synchronization structure
//!
struct SyncStruct
{
    std::mutex mutex;
    TrtCudaStream mainStream;
    TrtCudaEvent gpuStart{cudaEventBlockingSync};
    TimePoint cpuStart{};
    float sleep{};
};

struct Enqueue
{
    explicit Enqueue(nvinfer1::IExecutionContext& context)
        : mContext(context)
    {
    }
    nvinfer1::IExecutionContext& mContext;
};

#if ENABLE_UNIFIED_BUILDER
struct SafeEnqueue
{
    explicit SafeEnqueue(nvinfer2::safe::ITRTGraph& graph)
        : mGraph(graph)
    {
    }
    nvinfer2::safe::ITRTGraph& mGraph;
};
#endif

//!
//! \class EnqueueExplicit
//! \brief Functor to enqueue inference with explicit batch
//!
class EnqueueExplicit : private Enqueue
{
public:
    explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, BindingsStd const& bindings)
        : Enqueue(context)
        , mBindings(bindings)
    {
        ASSERT(mBindings.setTensorAddresses(mContext));
    }

    bool operator()(TrtCudaStream& stream) const
    {
        try
        {
            bool const result = mContext.enqueueV3(stream.get());
            // Collecting layer timing info from current profile index of execution context, except under capturing
            // mode.
            if (!isStreamCapturing(stream) && mContext.getProfiler() && !mContext.getEnqueueEmitsProfile()
                && !mContext.reportToProfiler())
            {
                gLogWarning << "Failed to collect layer timing info from previous enqueueV3()" << std::endl;
            }
            return result;
        }
        catch (const std::exception&)
        {
            return false;
        }
        return false;
    }

private:
    // Helper function to check if a stream is in capturing mode.
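    // Note: profiling info is only collected when the enqueue actually executes work; during CUDA graph
    // capture, kernels are recorded rather than run, so reportToProfiler() is skipped (see operator() above).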
    bool isStreamCapturing(TrtCudaStream& stream) const
    {
        cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
        CHECK(cudaStreamIsCapturing(stream.get(), &status));
        return status != cudaStreamCaptureStatusNone;
    }

    BindingsStd const& mBindings;
};

#if ENABLE_UNIFIED_BUILDER
//!
//! \class EnqueueExplicitSafe
//! \brief Functor to enqueue safe inference with explicit batch
//!
class EnqueueExplicitSafe : private SafeEnqueue
{
public:
    explicit EnqueueExplicitSafe(nvinfer2::safe::ITRTGraph& graph, BindingsSafe const& bindings)
        : SafeEnqueue(graph)
        , mBindings(bindings)
    {
        ASSERT(mBindings.setTensorAddresses(graph));
    }

    bool operator()(TrtCudaStream& stream) const
    {
        try
        {
            bool const result = (mGraph.executeAsync(stream.get()) == nvinfer1::ErrorCode::kSUCCESS);
            return result;
        }
        catch (const std::exception&)
        {
            return false;
        }
        return false;
    }

private:
    BindingsSafe const& mBindings;
};
#endif

//!
//! \class EnqueueGraph
//! \brief Functor to enqueue inference from CUDA Graph
//!
class EnqueueGraph
{
public:
    explicit EnqueueGraph(nvinfer1::IExecutionContext& context, TrtCudaGraph& graph)
        : mGraph(graph)
        , mContext(context)
    {
    }

    bool operator()(TrtCudaStream& stream) const
    {
        if (mGraph.launch(stream))
        {
            // Collecting layer timing info from current profile index of execution context
            if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler())
            {
                gLogWarning << "Failed to collect layer timing info from previous CUDA graph launch" << std::endl;
            }
            return true;
        }
        return false;
    }

    TrtCudaGraph& mGraph;
    nvinfer1::IExecutionContext& mContext;
};

#if ENABLE_UNIFIED_BUILDER
//!
//! \class EnqueueGraphSafe
//! \brief Functor to enqueue inference from CUDA Graph
//!
class EnqueueGraphSafe
{
public:
    explicit EnqueueGraphSafe(nvinfer2::safe::ITRTGraph& graph)
        : mGraph(graph)
    {
    }

    bool operator()(TrtCudaStream& stream) const
    {
        return mGraph.executeAsync(stream.get()) == nvinfer1::ErrorCode::kSUCCESS;
    }

    nvinfer2::safe::ITRTGraph& mGraph;
};
#endif

using EnqueueFunction = std::function<bool(TrtCudaStream&)>;

enum class StreamType : int32_t
{
    kINPUT = 0,
    kCOMPUTE = 1,
    kOUTPUT = 2,
    kNUM = 3
};

enum class EventType : int32_t
{
    kINPUT_S = 0,
    kINPUT_E = 1,
    kCOMPUTE_S = 2,
    kCOMPUTE_E = 3,
    kOUTPUT_S = 4,
    kOUTPUT_E = 5,
    kNUM = 6
};

using MultiStream = std::array<TrtCudaStream, static_cast<int32_t>(StreamType::kNUM)>;
using MultiEvent = std::array<std::unique_ptr<TrtCudaEvent>, static_cast<int32_t>(EventType::kNUM)>;
using EnqueueTimes = std::array<TimePoint, 2>;

//!
//! \class IterationBase
//! \brief Inference iteration and streams management
//!
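//! Each iteration owns three CUDA streams (input H2D copies, compute, output D2H copies) plus one set
//! of start/end events per pipeline depth. mDepth = 1 + inference.overlap, so the default depth of 2
//! double-buffers iterations and hides DMA transfers behind compute.
//!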
class IterationBase
{
public:
    explicit IterationBase(int32_t id, InferenceOptions const& inference, BindingsBase& bindings)
        : mBindings(bindings)
        , mStreamId(id)
        , mDepth(1 + inference.overlap)
        , mActive(mDepth)
        , mEvents(mDepth)
        , mEnqueueTimes(mDepth)
    {
        for (auto& eventsAtDepth : mEvents)
        {
            std::generate(eventsAtDepth.begin(), eventsAtDepth.end(),
                [&] { return std::make_unique<TrtCudaEvent>(!inference.spin); });
        }
    }

    bool query(bool skipTransfers)
    {
        if (mActive[mNext])
        {
            return true;
        }

        if (!skipTransfers)
        {
            record(EventType::kINPUT_S, StreamType::kINPUT);
            setInputData(false);
            record(EventType::kINPUT_E, StreamType::kINPUT);
            wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute
        }

        record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE);
        recordEnqueueTime();
        if (!mEnqueue(getStream(StreamType::kCOMPUTE)))
        {
            return false;
        }
        recordEnqueueTime();
        record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE);

        if (!skipTransfers)
        {
            wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA
            record(EventType::kOUTPUT_S, StreamType::kOUTPUT);
            fetchOutputData(false);
            record(EventType::kOUTPUT_E, StreamType::kOUTPUT);
        }

        mActive[mNext] = true;
        moveNext();
        return true;
    }

    float sync(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector<InferenceTrace>& trace,
        bool skipTransfers)
    {
        if (mActive[mNext])
        {
            if (skipTransfers)
            {
                getEvent(EventType::kCOMPUTE_E).synchronize();
            }
            else
            {
                getEvent(EventType::kOUTPUT_E).synchronize();
            }
            trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers));
            mActive[mNext] = false;
            return getEvent(EventType::kCOMPUTE_S) - gpuStart;
        }
        return 0;
    }

    void syncAll(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector<InferenceTrace>& trace,
        bool skipTransfers)
    {
        for (int32_t d = 0; d < mDepth; ++d)
        {
            sync(cpuStart, gpuStart, trace, skipTransfers);
            moveNext();
        }
    }

    void wait(TrtCudaEvent& gpuStart)
    {
        getStream(StreamType::kINPUT).wait(gpuStart);
    }

    void setInputData(bool sync)
    {
        mBindings.transferInputToDevice(getStream(StreamType::kINPUT));
        // Additional sync to avoid overlapping with inference execution.
        if (sync)
        {
            getStream(StreamType::kINPUT).synchronize();
        }
    }

    void fetchOutputData(bool sync)
    {
        mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT));
        // Additional sync to avoid overlapping with inference execution.
        if (sync)
        {
            getStream(StreamType::kOUTPUT).synchronize();
        }
    }

protected:
    void moveNext()
    {
        mNext = mDepth - 1 - mNext;
    }

    TrtCudaStream& getStream(StreamType t)
    {
        return mStream[static_cast<int32_t>(t)];
    }

    TrtCudaEvent& getEvent(EventType t)
    {
        return *mEvents[mNext][static_cast<int32_t>(t)];
    }

    void record(EventType e, StreamType s)
    {
        getEvent(e).record(getStream(s));
    }

    void recordEnqueueTime()
    {
        mEnqueueTimes[mNext][enqueueStart] = getCurrentTime();
        enqueueStart = 1 - enqueueStart;
    }

    TimePoint getEnqueueTime(bool start)
    {
        return mEnqueueTimes[mNext][start ? 0 : 1];
    }

    void wait(EventType e, StreamType s)
    {
        getStream(s).wait(getEvent(e));
    }

    InferenceTrace getTrace(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, bool skipTransfers)
    {
        float is
            = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart;
        float ie
            = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart;
        float os
            = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart;
        float oe
            = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart;
        return InferenceTrace(mStreamId,
            std::chrono::duration<float, std::milli>(getEnqueueTime(true) - cpuStart).count(),
            std::chrono::duration<float, std::milli>(getEnqueueTime(false) - cpuStart).count(), is, ie,
            getEvent(EventType::kCOMPUTE_S) - gpuStart, getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe);
    }

    BindingsBase& mBindings;

    TrtCudaGraph mGraph;
    EnqueueFunction mEnqueue;

    int32_t mStreamId{0};
    int32_t mNext{0};
    int32_t mDepth{2}; // Default to double buffer to hide DMA transfers.

    std::vector<bool> mActive;
    MultiStream mStream;
    std::vector<MultiEvent> mEvents;

    int32_t enqueueStart{0};
    std::vector<EnqueueTimes> mEnqueueTimes;
};

//!
//! \class IterationStd
//! \brief Inference iteration and streams management for standard inference
//!
class IterationStd : public IterationBase
{
public:
    explicit IterationStd(
        int32_t id, InferenceOptions const& inference, nvinfer1::IExecutionContext& context, BindingsStd& bindings)
        : IterationBase(id, inference, bindings)
    {
        createEnqueueFunction(inference, context, bindings);
    }

private:
    void createEnqueueFunction(
        InferenceOptions const& inference, nvinfer1::IExecutionContext& context, BindingsStd& bindings)
    {
        mEnqueue = EnqueueFunction(EnqueueExplicit(context, bindings));
        if (inference.graph)
        {
            sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl;

            TrtCudaStream& stream = getStream(StreamType::kCOMPUTE);
            // Avoid capturing initialization calls by executing the enqueue function at least
            // once before starting CUDA graph capture.
            auto const ret = mEnqueue(stream);
            if (!ret)
            {
                throw std::runtime_error("Inference enqueue failed.");
            }
            stream.synchronize();

            mGraph.beginCapture(stream);
            // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode.
            // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails.
            if (mEnqueue(stream))
            {
                mGraph.endCapture(stream);
                mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph));
                sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl;
            }
            else
            {
                mGraph.endCaptureOnError(stream);
                // Ensure any CUDA error has been cleaned up.
                CHECK(cudaGetLastError());
                sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under "
                                       "CUDA graph capture mode."
                                    << std::endl;
                sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be "
                                       "launched without using CUDA graph launch."
                                    << std::endl;
            }
        }
    }
};

#if ENABLE_UNIFIED_BUILDER
//!
//! \class IterationSafe
//! \brief Inference iteration and streams management for safe inference
//!
class IterationSafe : public IterationBase
{
public:
    explicit IterationSafe(
        int32_t id, InferenceOptions const& inference, nvinfer2::safe::ITRTGraph& graph, BindingsSafe& bindings)
        : IterationBase(id, inference, bindings)
    {
        createEnqueueFunction(inference, graph, bindings);
    }

private:
    void createEnqueueFunction(
        InferenceOptions const& inference, nvinfer2::safe::ITRTGraph& graph, BindingsSafe& bindings)
    {
        mEnqueue = EnqueueFunction(EnqueueExplicitSafe(graph, bindings));
        if (inference.graph)
        {
            sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl;

            TrtCudaStream& stream = getStream(StreamType::kCOMPUTE);
            // Avoid capturing initialization calls by executing the enqueue function at least
            // once before starting CUDA graph capture.
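            // (Same warm-up-then-capture pattern as IterationStd above; presumably TrtCudaGraph::beginCapture()
            // and endCapture() wrap cudaStreamBeginCapture()/cudaStreamEndCapture(), so the second enqueue below
            // is recorded into the graph rather than executed.)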
            auto const ret = mEnqueue(stream);
            if (!ret)
            {
                throw std::runtime_error("Inference enqueue failed.");
            }
            stream.synchronize();

            mGraph.beginCapture(stream);
            // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode.
            // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails.
            if (mEnqueue(stream))
            {
                mGraph.endCapture(stream);
                mEnqueue = EnqueueFunction(EnqueueGraphSafe(graph));
                sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl;
            }
            else
            {
                mGraph.endCaptureOnError(stream);
                // Ensure any CUDA error has been cleaned up.
                CHECK(cudaGetLastError());
                sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under "
                                       "CUDA graph capture mode."
                                    << std::endl;
                sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be "
                                       "launched without using CUDA graph launch."
                                    << std::endl;
            }
        }
    }
};
#endif

bool inferenceLoop(std::vector<std::unique_ptr<IterationBase>>& iStreams, TimePoint const& cpuStart,
    TrtCudaEvent const& gpuStart, int iterations, float maxDurationMs, float warmupMs,
    std::vector<InferenceTrace>& trace, bool skipTransfers, float idleMs)
{
    float durationMs = 0;
    int32_t skip = 0;

    if (maxDurationMs == -1.F)
    {
        sample::gLogWarning << "--duration=-1 is specified, inference will run in an endless loop until"
                            << " aborted with CTRL-C (SIGINT)" << std::endl;
        while (true)
        {
            for (auto& s : iStreams)
            {
                if (!s->query(skipTransfers))
                {
                    return false;
                }
            }
            for (auto& s : iStreams)
            {
                s->sync(cpuStart, gpuStart, trace, skipTransfers);
            }
        }
    }

    for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs; ++i)
    {
        for (auto& s : iStreams)
        {
            if (!s->query(skipTransfers))
            {
                return false;
            }
        }
        for (auto& s : iStreams)
        {
            durationMs = std::max(durationMs, s->sync(cpuStart, gpuStart, trace, skipTransfers));
        }
        if (durationMs < warmupMs) // Warming up
        {
            if (durationMs) // Skip complete iterations
            {
                ++skip;
            }
            continue;
        }
        if (idleMs != 0.F)
        {
            std::this_thread::sleep_for(std::chrono::duration<float, std::milli>(idleMs));
        }
    }
    for (auto& s : iStreams)
    {
        s->syncAll(cpuStart, gpuStart, trace, skipTransfers);
    }
    return true;
}

void inferenceExecution(InferenceOptions const& inference, InferenceEnvironmentBase& iEnv, SyncStruct& sync,
    int32_t const threadIdx, int32_t const streamsPerThread, int32_t device, std::vector<InferenceTrace>& trace,
    ReportingOptions const& reporting) noexcept
{
    try
    {
        float warmupMs = inference.warmup;
        float durationMs = -1.F;
        if (inference.duration != -1.F)
        {
            durationMs = inference.duration * 1000.F + warmupMs;
        }

        CHECK(cudaSetDevice(device));

#if ENABLE_UNIFIED_BUILDER
        if (iEnv.safe)
        {
            //! Function to make one iteration:
            auto makeIteration = [&](int32_t s) -> std::unique_ptr<IterationBase> {
                int32_t const streamId{threadIdx * streamsPerThread + s};
                auto iteration = std::make_unique<IterationSafe>(streamId, inference,
                    *static_cast<InferenceEnvironmentSafe&>(iEnv).mClonedGraphs[streamId],
                    *static_cast<InferenceEnvironmentSafe&>(iEnv).bindings[streamId]);
                if (inference.skipTransfers)
                {
                    iteration->setInputData(true);
                }
                return iteration;
            };
            std::vector<std::unique_ptr<IterationBase>> iStreams;
            for (int32_t s = 0; s < streamsPerThread; ++s)
            {
                iStreams.emplace_back(makeIteration(s));
            }
            for (auto& s : iStreams)
            {
                s->wait(sync.gpuStart);
            }
            std::vector<InferenceTrace> localTrace;
            if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs,
                    localTrace, inference.skipTransfers, inference.idle))
            {
                std::lock_guard<std::mutex> lock{sync.mutex};
                iEnv.error = true;
            }
            if (inference.skipTransfers)
            {
                for (auto& s : iStreams)
                {
                    s->fetchOutputData(true);
                }
            }
            std::lock_guard<std::mutex> lock{sync.mutex};
            trace.insert(trace.end(), localTrace.begin(), localTrace.end());
            return;
        }
#endif
        //! Function to make one iteration:
        auto makeIteration = [&](int32_t s) -> std::unique_ptr<IterationBase> {
            int32_t const streamId{threadIdx * streamsPerThread + s};
            auto iteration = std::make_unique<IterationStd>(streamId, inference,
                *static_cast<InferenceEnvironmentStd&>(iEnv).getContext(streamId),
                *static_cast<InferenceEnvironmentStd&>(iEnv).bindings[streamId]);
            if (inference.skipTransfers)
            {
                iteration->setInputData(true);
            }
            return iteration;
        };
        std::vector<std::unique_ptr<IterationBase>> iStreams;
        for (int32_t s = 0; s < streamsPerThread; ++s)
        {
            iStreams.emplace_back(makeIteration(s));
        }
        for (auto& s : iStreams)
        {
            s->wait(sync.gpuStart);
        }
        std::vector<InferenceTrace> localTrace;
        if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs,
                localTrace, inference.skipTransfers, inference.idle))
        {
            std::lock_guard<std::mutex> lock{sync.mutex};
            iEnv.error = true;
        }

        auto const needOutput = reporting.output || !reporting.exportOutput.empty();
        if (inference.skipTransfers && needOutput)
        {
            for (auto& s : iStreams)
            {
                s->fetchOutputData(true);
            }
        }

        {
            std::lock_guard<std::mutex> lock{sync.mutex};
            trace.insert(trace.end(), localTrace.begin(), localTrace.end());
        }
    }
    catch (...)
    {
        std::lock_guard<std::mutex> lock{sync.mutex};
        iEnv.error = true;
    }
}

inline std::thread makeThread(InferenceOptions const& inference, InferenceEnvironmentBase& iEnv, SyncStruct& sync,
    int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector<InferenceTrace>& trace,
    ReportingOptions const& reporting)
{
    return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), std::ref(sync), threadIdx,
        streamsPerThread, device, std::ref(trace), std::cref(reporting));
}

} // namespace

bool runInference(InferenceOptions const& inference, InferenceEnvironmentBase& iEnv, int32_t device,
    std::vector<InferenceTrace>& trace, ReportingOptions const& reporting)
{
    CHECK(cudaProfilerStart());

    trace.resize(0);

    SyncStruct sync;
    sync.sleep = inference.sleep;
    sync.mainStream.sleep(&sync.sleep);
    sync.cpuStart = getCurrentTime();
    sync.gpuStart.record(sync.mainStream);

    // When multiple streams are used, trtexec can run inference in two modes:
    // (1) if inference.threads is true, then run each stream on each thread.
    // (2) if inference.threads is false, then run all streams on the same thread.
    int32_t const numThreads = inference.threads ? inference.infStreams : 1;
    int32_t const streamsPerThread = inference.threads ? 1 : inference.infStreams;
    std::vector<std::thread> threads;
    for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx)
    {
        threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx, streamsPerThread, device, trace, reporting));
    }
    for (auto& th : threads)
    {
        th.join();
    }

    CHECK(cudaProfilerStop());

    auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; };
    std::sort(trace.begin(), trace.end(), cmpTrace);

    return !iEnv.error;
}

bool runMultiTasksInference(std::vector<std::unique_ptr<TaskInferenceEnvironment>>& tEnvList)
{
    CHECK(cudaProfilerStart());
    cudaSetDeviceFlags(cudaDeviceScheduleSpin);

    SyncStruct sync;
    sync.sleep = 0;
    sync.mainStream.sleep(&sync.sleep);
    sync.cpuStart = getCurrentTime();
    sync.gpuStart.record(sync.mainStream);

    std::vector<std::thread> threads;
    for (size_t i = 0; i < tEnvList.size(); ++i)
    {
        auto& tEnv = tEnvList[i];
        threads.emplace_back(makeThread(tEnv->iOptions, *(tEnv->iEnv), sync, /*threadIdx*/ 0, /*streamsPerThread*/ 1,
            tEnv->device, tEnv->trace, tEnv->rOptions));
    }
    for (auto& th : threads)
    {
        th.join();
    }

    CHECK(cudaProfilerStop());

    auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; };
    for (auto& tEnv : tEnvList)
    {
        std::sort(tEnv->trace.begin(), tEnv->trace.end(), cmpTrace);
    }

    return std::none_of(tEnvList.begin(), tEnvList.end(),
        [](std::unique_ptr<TaskInferenceEnvironment>& tEnv) { return tEnv->iEnv->error; });
}

namespace
{
size_t reportGpuMemory()
{
    static size_t prevFree{0};
    size_t free{0};
    size_t total{0};
    size_t newlyAllocated{0};
    CHECK(cudaMemGetInfo(&free, &total));
    sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB";
    if (prevFree != 0)
    {
        newlyAllocated = (prevFree - free);
        sample::gLogInfo << ", newly allocated GPU memory = " << newlyAllocated / 1024.0_MiB << " GiB";
    }
    sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB" << std::endl;
    prevFree = free;
    return newlyAllocated;
}
} // namespace

//! Returns true if deserialization is slower than expected or fails.
bool timeDeserialize(InferenceEnvironmentBase& iEnv, SystemOptions const& sys)
{
    constexpr int32_t kNB_ITERS{20};
    std::unique_ptr<IRuntime> rt{createRuntime()};
    std::unique_ptr<ICudaEngine> engine;

    SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError);

    auto timeDeserializeFn = [&]() -> float {
        bool deserializeOK{false};
        engine.reset(nullptr);
        auto startClock = std::chrono::high_resolution_clock::now();
        SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError);
        for (auto const& pluginPath : sys.dynamicPlugins)
        {
            rt->getPluginRegistry().loadLibrary(pluginPath.c_str());
        }
        auto& reader = iEnv.engine.getFileReader();
        auto& asyncReader = iEnv.engine.getAsyncFileReader();
        ASSERT(reader.isOpen() || asyncReader.isOpen());
        if (asyncReader.isOpen())
        {
            asyncReader.reset();
            engine.reset(rt->deserializeCudaEngine(asyncReader));
        }
        else
        {
            reader.reset();
            engine.reset(rt->deserializeCudaEngine(reader));
        }
        deserializeOK = (engine != nullptr);
        auto endClock = std::chrono::high_resolution_clock::now();
        // Return NAN if deserialization failed.
        return deserializeOK ? std::chrono::duration<float, std::milli>(endClock - startClock).count() : NAN;
    };

    // Warmup the caches to make sure that cache thrashing isn't throwing off the results.
    {
        sample::gLogInfo << "Begin deserialization warmup..." << std::endl;
        for (int32_t i = 0, e = 2; i < e; ++i)
        {
            timeDeserializeFn();
        }
    }
    sample::gLogInfo << "Begin deserialization engine timing..." << std::endl;
    float const first = timeDeserializeFn();

    // Check if first deserialization succeeded.
    if (std::isnan(first))
    {
        sample::gLogError << "Engine deserialization failed." << std::endl;
        return true;
    }

    sample::gLogInfo << "First deserialization time = " << first << " milliseconds" << std::endl;

    // Record initial gpu memory state.
    reportGpuMemory();

    float totalTime{0.F};
    for (int32_t i = 0; i < kNB_ITERS; ++i)
    {
        totalTime += timeDeserializeFn();
    }
    auto const averageTime = totalTime / kNB_ITERS;
    // reportGpuMemory sometimes reports zero after a single deserialization of a small engine,
    // so use the size of memory for all the iterations.
    auto const totalEngineSizeGpu = reportGpuMemory();
    sample::gLogInfo << "Total deserialization time = " << totalTime << " milliseconds in " << kNB_ITERS
                     << " iterations, average time = " << averageTime << " milliseconds, first time = " << first
                     << " milliseconds." << std::endl;
    sample::gLogInfo << "Deserialization Bandwidth = " << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s" << std::endl;

    // If the first deserialization is more than tolerance slower than
    // the average deserialization, return true, which means an error occurred.
    // The tolerance is set to 2x since the deserialization time is quick and susceptible
    // to caching issues causing problems in the first timing.
    auto const tolerance = 2.0F;
    bool const isSlowerThanExpected = first > averageTime * tolerance;
    if (isSlowerThanExpected)
    {
        sample::gLogInfo << "First deserialization time divided by average time is " << (first / averageTime)
                         << ". Exceeds tolerance of " << tolerance << "x." << std::endl;
    }
    return isSlowerThanExpected;
}

std::string getLayerInformation(
    nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format)
{
    auto runtime = std::unique_ptr<IRuntime>{createRuntime()};
    auto inspector = std::unique_ptr<IEngineInspector>(engine->createEngineInspector());
    if (context != nullptr)
    {
        inspector->setExecutionContext(context);
    }
    std::string result = inspector->getEngineInformation(format);
    return result;
}

void Binding::fill(std::string const& fileName)
{
    loadFromFile(fileName, static_cast<char*>(buffer->getHostBuffer()), buffer->getSize());
}

void Binding::fill()
{
    switch (dataType)
    {
    case nvinfer1::DataType::kBOOL:
    {
        fillBuffer<bool>(buffer->getHostBuffer(), volume, 0, 1);
        break;
    }
    case nvinfer1::DataType::kINT32:
    {
        fillBuffer<int32_t>(buffer->getHostBuffer(), volume, -128, 127);
        break;
    }
    case nvinfer1::DataType::kINT64:
    {
        fillBuffer<int64_t>(buffer->getHostBuffer(), volume, -128, 127);
        break;
    }
    case nvinfer1::DataType::kINT8:
    {
        fillBuffer<int8_t>(buffer->getHostBuffer(), volume, -128, 127);
        break;
    }
    case nvinfer1::DataType::kFLOAT:
    {
        fillBuffer<float>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
        break;
    }
    case nvinfer1::DataType::kHALF:
    {
        fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
        break;
    }
    case nvinfer1::DataType::kBF16:
    {
        fillBuffer<BFloat16>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
        break;
    }
    case nvinfer1::DataType::kUINT8:
    {
        fillBuffer<uint8_t>(buffer->getHostBuffer(), volume, 0, 255);
        break;
    }
    case nvinfer1::DataType::kFP8:
#if CUDA_VERSION < 11060
        ASSERT(false && "FP8 is not supported");
#else
    {
        fillBuffer<__nv_fp8_e4m3>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
        break;
    }
#endif
    case nvinfer1::DataType::kINT4:
    {
        // int4 is implemented as packing two elements into a single byte,
        // so all possible bit patterns of the two int4 elements coincide with all possible bit patterns of
        // a uint8.
        fillBuffer<uint8_t>(buffer->getHostBuffer(), volume, 0, 255);
        break;
    }
    case DataType::kFP4: ASSERT(false && "FP4 is not supported");
    case DataType::kE8M0: ASSERT(false && "E8M0 is not supported");
    }
}

void Binding::dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, int32_t spv,
    std::string const separator /*= " "*/) const
{
    void* outputBuffer{};
    if (outputAllocator != nullptr)
    {
        outputBuffer = outputAllocator->getBuffer()->getHostBuffer();
        // Overwrite dimensions with those reported by the output allocator.
        dims = outputAllocator->getFinalDims();
        os << "Final shape is " << dims << " reported by the output allocator." << std::endl;
    }
    else
    {
        outputBuffer = buffer->getHostBuffer();
    }
    switch (dataType)
    {
    case nvinfer1::DataType::kBOOL:
    {
        dumpBuffer<bool>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kINT32:
    {
        dumpBuffer<int32_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kINT8:
    {
        dumpBuffer<int8_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kFLOAT:
    {
        dumpBuffer<float>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kHALF:
    {
        dumpBuffer<__half>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kBF16:
    {
        dumpBuffer<BFloat16>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kUINT8:
    {
        dumpBuffer<uint8_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kINT64:
    {
        dumpBuffer<int64_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kFP8:
#if CUDA_VERSION < 11060
        ASSERT(false && "FP8 is not supported");
#else
    {
        dumpBuffer<__nv_fp8_e4m3>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
#endif
    case nvinfer1::DataType::kINT4:
    {
        dumpInt4Buffer(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kFP4: ASSERT(false && "FP4 is not supported");
    case nvinfer1::DataType::kE8M0: ASSERT(false && "E8M0 is not supported");
    }
}

void BindingsBase::addBinding(TensorInfo const& tensorInfo, std::string const& fileName /*= ""*/)
{
    auto const b = tensorInfo.bindingIndex;
    while (mBindings.size() <= static_cast<size_t>(b))
    {
        mBindings.emplace_back();
        mDevicePointers.emplace_back();
    }
    mNames[tensorInfo.name] = b;
    mBindings[b].isInput = tensorInfo.isInput;
    mBindings[b].volume = tensorInfo.vol;
    mBindings[b].dataType = tensorInfo.dataType;
    //! Make a UnifiedMirroredBuffer if useManaged, or a DiscreteMirroredBuffer otherwise:
    auto makeBuffer = [](bool useManaged) -> std::unique_ptr<IMirroredBuffer> {
        if (useManaged)
        {
            return std::make_unique<UnifiedMirroredBuffer>();
        }
        else
        {
            return std::make_unique<DiscreteMirroredBuffer>();
        }
    };
    if (tensorInfo.isDynamic)
    {
        ASSERT(!tensorInfo.isInput); // Only output shape can be possibly unknown because of DDS.
        if (mBindings[b].outputAllocator == nullptr)
        {
            mBindings[b].outputAllocator = std::make_unique<OutputAllocator>(makeBuffer(mUseManaged));
        }
    }
    else
    {
        if (mBindings[b].buffer == nullptr)
        {
            mBindings[b].buffer = makeBuffer(mUseManaged);
        }
        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
        // even for empty tensors, so allocate a dummy byte.
        if (tensorInfo.vol == 0)
        {
            mBindings[b].buffer->allocate(1);
        }
        else
        {
            mBindings[b].buffer->allocate(samplesCommon::getNbBytes(tensorInfo.dataType, tensorInfo.vol));
        }
        mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer();
    }
    if (tensorInfo.isInput)
    {
        if (fileName.empty())
        {
            fill(b);
        }
        else
        {
            fill(b, fileName);
        }
    }
}

void** BindingsBase::getDeviceBuffers()
{
    return mDevicePointers.data();
}

void BindingsBase::transferInputToDevice(TrtCudaStream& stream)
{
    for (auto& b : mNames)
    {
        if (mBindings[b.second].isInput)
        {
            mBindings[b.second].buffer->hostToDevice(stream);
        }
    }
}

void BindingsBase::transferOutputToHost(TrtCudaStream& stream)
{
    for (auto& b : mNames)
    {
        if (!mBindings[b.second].isInput)
        {
            if (mBindings[b.second].outputAllocator != nullptr)
            {
                mBindings[b.second].outputAllocator->getBuffer()->deviceToHost(stream);
            }
            else
            {
                mBindings[b.second].buffer->deviceToHost(stream);
            }
        }
    }
}

void BindingsStd::dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os,
    std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const
{
    auto const tensorName = context.getEngine().getIOTensorName(binding);
    Dims dims = context.getTensorShape(tensorName);
    Dims strides = context.getTensorStrides(tensorName);
    int32_t vectorDim = context.getEngine().getTensorVectorizedDim(tensorName);
    int32_t const spv = context.getEngine().getTensorComponentsPerElement(tensorName);

    mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
}

namespace
{

Dims getBindingDimensions(nvinfer1::IExecutionContext const& context, std::string const& name)
{
    return context.getTensorShape(name.c_str());
}

} // namespace

void BindingsStd::dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const
{
    os << "Dumping I/O Bindings to RAW Files:" << std::endl;
    for (auto const& n : mNames)
    {
        auto name = n.first;
        auto bIndex = n.second;
        auto const& binding = mBindings[bIndex];
        void* outputBuffer{};
        if (binding.outputAllocator != nullptr)
        {
            outputBuffer = binding.outputAllocator->getBuffer()->getHostBuffer();
        }
        else
        {
            outputBuffer = binding.buffer->getHostBuffer();
        }

        Dims dims = getBindingDimensions(context, name);
        std::string dimsStr;
        std::string dotStr;

        for (int32_t i = 0; i < dims.nbDims; i++)
        {
            dimsStr += dotStr + std::to_string(dims.d[i]);
            dotStr = ".";
        }

        std::string const bindingTypeStr = (binding.isInput ? "input" : "output");

        std::stringstream fileNameStream;
        fileNameStream << name << "." << bindingTypeStr << "." << dimsStr << "." << binding.dataType << ".raw";
        std::string fileName = genFilenameSafeString(fileNameStream.str());
        os << "Writing file for " << bindingTypeStr << " binding " << name << " (with datatype " << binding.dataType
           << " and dimensions " << dimsStr << ") to " << fileName << std::endl;

        std::ofstream f(fileName, std::ios::out | std::ios::binary);
        ASSERT(f && "Cannot open file for write");
        f.write(static_cast<char*>(outputBuffer), samplesCommon::getNbBytes(binding.dataType, binding.volume));
        f.close();
    }
}

void BindingsStd::dumpBindingDimensions(
    std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const
{
    auto const dims = context.getTensorShape(name.c_str());
    // Do not add a newline terminator, because the caller may be outputting a JSON string.
    os << dims;
}

std::unordered_map<std::string, int32_t> BindingsBase::getBindings(
    std::function<bool(Binding const&)> predicate) const
{
    std::unordered_map<std::string, int32_t> bindings;
    for (auto const& n : mNames)
    {
        auto const binding = n.second;
        if (predicate(mBindings[binding]))
        {
            bindings.insert(n);
        }
    }
    return bindings;
}

bool BindingsStd::setTensorAddresses(nvinfer1::IExecutionContext& context) const
{
    for (auto const& b : mNames)
    {
        auto const name = b.first.c_str();
        auto const location = context.getEngine().getTensorLocation(name);
        if (location == TensorLocation::kDEVICE)
        {
            if (mBindings[b.second].outputAllocator != nullptr)
            {
                if (!context.setOutputAllocator(name, mBindings[b.second].outputAllocator.get()))
                {
                    return false;
                }
            }
            else
            {
                if (!context.setTensorAddress(name, mDevicePointers[b.second]))
                {
                    return false;
                }
            }
        }
    }
    return true;
}

#if ENABLE_UNIFIED_BUILDER
namespace
{
Dims getBindingDimensions(ITRTGraph& graph, std::string const& name)
{
    nvinfer2::safe::TensorDescriptor desc;
    graph.getIOTensorDescriptor(desc, name.c_str());
    return desc.shape;
}
} // namespace

void BindingsSafe::dumpBindingDimensions(std::string const& name, ITRTGraph const& graph, std::ostream& os) const
{
    // Do not add a newline terminator, because the caller may be outputting a JSON string.
    os << getBindingDimensions(const_cast<ITRTGraph&>(graph), name);
}

void BindingsSafe::dumpBindingValues(ITRTGraph const& graph, int32_t binding, std::ostream& os,
    std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const
{
    char const* tensorName;
    graph.getIOTensorName(tensorName, binding);
    nvinfer2::safe::TensorDescriptor desc;
    graph.getIOTensorDescriptor(desc, tensorName);
    Dims dims = desc.shape;
    Dims strides = desc.stride;

    // int32_t vectorDim = desc.vectorizedDim;
    // int32_t const spv = desc.componentsPerVector;

    mBindings[binding].dump(os, dims, strides, -1, -1, separator);
}

void BindingsSafe::dumpRawBindingToFiles(ITRTGraph& graph, std::ostream& os) const
{
    os << "Dumping I/O Bindings to RAW Files:" << std::endl;
    for (auto const& n : mNames)
    {
        auto name = n.first;
        auto bIndex = n.second;
        auto const& binding = mBindings[bIndex];
        void* outputBuffer{};
        if (binding.outputAllocator != nullptr)
        {
            outputBuffer = binding.outputAllocator->getBuffer()->getHostBuffer();
        }
        else
        {
            outputBuffer = binding.buffer->getHostBuffer();
        }

        Dims dims = getBindingDimensions(graph, name);
        std::string dimsStr;
        std::string dotStr;

        for (int32_t i = 0; i < dims.nbDims; i++)
        {
            dimsStr += dotStr + std::to_string(dims.d[i]);
            dotStr = ".";
        }

        std::string const bindingTypeStr = (binding.isInput ? "input" : "output");

        std::stringstream fileName;
        fileName << genFilenameSafeString(name) << "." << bindingTypeStr << "." << dimsStr << "."
<< binding.dataType << ".raw"; os << "Writing file for " << bindingTypeStr << " binding " << name << " (with datatype " << binding.dataType << " and dimensions " << dimsStr << ") to " << fileName.str() << std::endl; std::ofstream f(fileName.str(), std::ios::out | std::ios::binary); ASSERT(f && "Cannot open file for write"); f.write(static_cast(outputBuffer), samplesCommon::getNbBytes(binding.dataType, binding.volume)); f.close(); } } bool BindingsSafe::setTensorAddresses(ITRTGraph& graph) const { for (auto const& b : mNames) { auto const name = b.first.c_str(); nvinfer2::safe::TensorDescriptor desc; graph.getIOTensorDescriptor(desc, name); bool onGpu = desc.memPlacement == nvinfer2::safe::MemoryPlacement::kGPU || desc.memPlacement == nvinfer2::safe::MemoryPlacement::kNONE; if (onGpu) { if (mBindings[b.second].outputAllocator != nullptr) { nvinfer2::safe::TypedArray tensor = safe::createTypedArray( mBindings[b.second].outputAllocator->getBuffer(), desc.dataType, desc.sizeInBytes); graph.setIOTensorAddress(name, tensor); } else { nvinfer2::safe::TypedArray tensor = safe::createTypedArray(mDevicePointers[b.second], desc.dataType, desc.sizeInBytes); graph.setIOTensorAddress(name, tensor); } } } return true; } #endif } // namespace sample