whitbrunn's picture
1231: g0plus dockerfile
38fb1f6 verified
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TRT_SAMPLE_DEVICE_H
#define TRT_SAMPLE_DEVICE_H
#include <cassert>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <thread>
#include "common.h"
#include "sampleUtils.h"
namespace sample
{
//! Forward declaration: TrtCudaStream::wait() takes a TrtCudaEvent&, and TrtCudaEvent is only defined further below.
class TrtCudaEvent;
namespace
{
//!
//! \brief Block the calling thread for a caller-supplied number of milliseconds.
//!
//! The signature matches what cudaLaunchHostFunc() expects, so this can be enqueued on a stream.
//!
//! \param sleep Pointer to a float holding the duration in milliseconds. It is dereferenced when this function runs.
//!
void cudaSleep(void* sleep)
{
    using FloatMilliseconds = std::chrono::duration<float, std::milli>;
    float const durationMs = *static_cast<float*>(sleep);
    std::this_thread::sleep_for(FloatMilliseconds{durationMs});
}
} // namespace
//!
//! \class TrtCudaStream
//! \brief Managed CUDA stream: the stream is created in the constructor and destroyed in the destructor.
//!
class TrtCudaStream
{
public:
    //! Create the underlying stream with cudaStreamCreate().
    TrtCudaStream()
    {
        CHECK(cudaStreamCreate(&mStream));
    }

    //! Destroy the underlying stream with cudaStreamDestroy().
    ~TrtCudaStream()
    {
        CHECK(cudaStreamDestroy(mStream));
    }

    // Copying and moving are disabled, so exactly one object ever refers to a given stream handle.
    TrtCudaStream(const TrtCudaStream&) = delete;
    TrtCudaStream(TrtCudaStream&&) = delete;
    TrtCudaStream& operator=(const TrtCudaStream&) = delete;
    TrtCudaStream& operator=(TrtCudaStream&&) = delete;

    //! \return the raw stream handle; the handle stays owned by this object.
    cudaStream_t get() const
    {
        return mStream;
    }

    //! Block the calling host thread until cudaStreamSynchronize() returns for this stream.
    void synchronize()
    {
        CHECK(cudaStreamSynchronize(mStream));
    }

    //! Make this stream wait on \p event (defined after TrtCudaEvent).
    void wait(TrtCudaEvent& event);

    //!
    //! Enqueue a host function on this stream that sleeps for *ms milliseconds.
    //!
    //! \param ms Pointer to the sleep duration in milliseconds. It is read when the host function executes, not when
    //!        this call returns, so it must still be valid at that point.
    //!
    void sleep(float* ms)
    {
        CHECK(cudaLaunchHostFunc(mStream, cudaSleep, ms));
    }

private:
    cudaStream_t mStream{};
};
//!
//! \class TrtCudaEvent
//! \brief Managed CUDA event: the event is created in the constructor and destroyed in the destructor.
//!
class TrtCudaEvent
{
public:
    //!
    //! \param blocking When true the event is created with cudaEventBlockingSync, otherwise with cudaEventDefault.
    //!
    explicit TrtCudaEvent(bool blocking = true)
    {
        uint32_t creationFlags = cudaEventDefault;
        if (blocking)
        {
            creationFlags = cudaEventBlockingSync;
        }
        CHECK(cudaEventCreateWithFlags(&mEvent, creationFlags));
    }

    //! Destroy the underlying event with cudaEventDestroy().
    ~TrtCudaEvent()
    {
        CHECK(cudaEventDestroy(mEvent));
    }

    // Copying and moving are disabled, so exactly one object ever refers to a given event handle.
    TrtCudaEvent(const TrtCudaEvent&) = delete;
    TrtCudaEvent(TrtCudaEvent&&) = delete;
    TrtCudaEvent& operator=(const TrtCudaEvent&) = delete;
    TrtCudaEvent& operator=(TrtCudaEvent&&) = delete;

    //! \return the raw event handle; the handle stays owned by this object.
    cudaEvent_t get() const
    {
        return mEvent;
    }

    //! Record this event on \p stream.
    void record(const TrtCudaStream& stream)
    {
        CHECK(cudaEventRecord(mEvent, stream.get()));
    }

    //! Block the calling host thread until cudaEventSynchronize() returns for this event.
    void synchronize()
    {
        CHECK(cudaEventSynchronize(mEvent));
    }

    //!
    //! \brief Elapsed time in milliseconds between two events.
    //!
    //! \param e The start event; *this is used as the end event.
    //!
    //! \return the value reported by cudaEventElapsedTime(), in milliseconds.
    //!
    float operator-(const TrtCudaEvent& e) const
    {
        float elapsedMs{0};
        CHECK(cudaEventElapsedTime(&elapsedMs, e.get(), get()));
        return elapsedMs;
    }

private:
    cudaEvent_t mEvent{};
};
//! Enqueue a wait on \p event into this stream through cudaStreamWaitEvent(), using flags value 0.
inline void TrtCudaStream::wait(TrtCudaEvent& event)
{
    cudaEvent_t const awaitedEvent = event.get();
    CHECK(cudaStreamWaitEvent(mStream, awaitedEvent, 0));
}
//!
//! \class TrtCudaGraph
//! \brief Managed CUDA graph
//!
//! Wraps stream capture and owns the executable graph produced by endCapture(). The captured cudaGraph_t is only a
//! temporary: it is destroyed as soon as it has been instantiated (or as soon as capture is known to have failed).
//!
class TrtCudaGraph
{
public:
    explicit TrtCudaGraph() = default;

    TrtCudaGraph(const TrtCudaGraph&) = delete;

    TrtCudaGraph& operator=(const TrtCudaGraph&) = delete;

    TrtCudaGraph(TrtCudaGraph&&) = delete;

    TrtCudaGraph& operator=(TrtCudaGraph&&) = delete;

    //! Releases the executable graph, if one was instantiated.
    ~TrtCudaGraph()
    {
        destroyExec();
    }

    //! Start capturing work submitted to \p stream, using cudaStreamCaptureModeThreadLocal.
    void beginCapture(TrtCudaStream& stream)
    {
        CHECK(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal));
    }

    //!
    //! Launch the instantiated graph on \p stream.
    //!
    //! \return true if cudaGraphLaunch() succeeded; false if it failed or if no graph has been instantiated yet.
    //!
    bool launch(TrtCudaStream& stream)
    {
        // Without an executable graph there is nothing to launch; report failure without calling into CUDA.
        if (mGraphExec == nullptr)
        {
            return false;
        }
        return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess;
    }

    //!
    //! Finish a successful capture on \p stream and instantiate the captured graph.
    //!
    //! Any executable graph left over from an earlier capture is destroyed first, so calling
    //! beginCapture()/endCapture() repeatedly on the same object does not leak.
    //!
    void endCapture(TrtCudaStream& stream)
    {
        CHECK(cudaStreamEndCapture(stream.get(), &mGraph));
        // Drop the previous executable graph before its handle is overwritten below.
        destroyExec();
        CHECK(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0));
        CHECK(cudaGraphDestroy(mGraph));
        // The handle is dead after cudaGraphDestroy(); do not keep it around.
        mGraph = nullptr;
    }

    //!
    //! Finish a capture on \p stream that is known to have failed, discard whatever was captured, and clear the
    //! pending CUDA error.
    //!
    void endCaptureOnError(TrtCudaStream& stream)
    {
        // There are two possibilities why stream capture would fail:
        // (1) stream is in cudaErrorStreamCaptureInvalidated state.
        // (2) TRT reports a failure.
        // In case (1), the returning mGraph should be nullptr.
        // In case (2), the returning mGraph is not nullptr, but it should not be used.
        const auto ret = cudaStreamEndCapture(stream.get(), &mGraph);
        if (ret == cudaErrorStreamCaptureInvalidated)
        {
            assert(mGraph == nullptr);
        }
        else
        {
            CHECK(ret);
            assert(mGraph != nullptr);
            CHECK(cudaGraphDestroy(mGraph));
            mGraph = nullptr;
        }
        // Clean up any CUDA error.
        (void) cudaGetLastError();
        sample::gLogWarning << "The CUDA graph capture on the stream has failed." << std::endl;
    }

private:
    //! Destroy the executable graph if there is one. The result is deliberately ignored because this also runs from
    //! the destructor.
    void destroyExec() noexcept
    {
        if (mGraphExec != nullptr)
        {
            (void) cudaGraphExecDestroy(mGraphExec);
            mGraphExec = nullptr;
        }
    }

    cudaGraph_t mGraph{};
    cudaGraphExec_t mGraphExec{};
};
//!
//! \class TrtCudaBuffer
//! \brief Managed buffer for host and device
//!
//! Move-only owner of a single untyped allocation.
//!
//! \tparam A Default-constructible allocation functor, invoked as A()(void** ptr, size_t size).
//! \tparam D Default-constructible deallocation functor, invoked as D()(void* ptr).
//!
template <typename A, typename D>
class TrtCudaBuffer
{
public:
    //! Construct an empty buffer (null pointer, size 0).
    TrtCudaBuffer() = default;

    TrtCudaBuffer(const TrtCudaBuffer&) = delete;

    TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete;

    //! Take over the allocation of \p rhs, leaving \p rhs empty. Nothing is freed, so this cannot fail.
    TrtCudaBuffer(TrtCudaBuffer&& rhs) noexcept
        : mPtr{rhs.mPtr}
        , mSize{rhs.mSize}
    {
        rhs.mPtr = nullptr;
        rhs.mSize = 0;
    }

    //! Free the current allocation (if any) and take over the allocation of \p rhs, leaving \p rhs empty.
    TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs)
    {
        if (this != &rhs)
        {
            reset(rhs.mPtr, rhs.mSize);
            rhs.mPtr = nullptr;
            rhs.mSize = 0;
        }
        return *this;
    }

    //! Free the owned allocation, if any.
    ~TrtCudaBuffer()
    {
        reset();
    }

    //! Construct a buffer holding a fresh allocation of \p size bytes obtained from A.
    TrtCudaBuffer(size_t size)
    {
        A()(&mPtr, size);
        mSize = size;
    }

    //! Free the current allocation (if any) and replace it with a fresh allocation of \p size bytes obtained from A.
    void allocate(size_t size)
    {
        reset();
        A()(&mPtr, size);
        mSize = size;
    }

    //!
    //! Replace the owned allocation with \p ptr / \p size; the buffer takes ownership of \p ptr.
    //!
    //! \param ptr  New pointer to own, or nullptr to leave the buffer empty.
    //! \param size Size in bytes to record for \p ptr.
    //!
    void reset(void* ptr = nullptr, size_t size = 0)
    {
        // Only free when the pointer actually changes. Freeing on reset(get(), n) would leave mPtr dangling and the
        // destructor would then free the same block a second time.
        if (mPtr && mPtr != ptr)
        {
            D()(mPtr);
        }
        mPtr = ptr;
        mSize = size;
    }

    //! \return the owned pointer, or nullptr if the buffer is empty.
    void* get() const
    {
        return mPtr;
    }

    //! \return the size in bytes recorded for the owned pointer.
    size_t getSize() const
    {
        return mSize;
    }

private:
    void* mPtr{nullptr};
    size_t mSize{0};
};
//! Allocation functor that obtains memory with cudaMalloc().
struct DeviceAllocator
{
    //!
    //! \param ptr  Location that receives the address of the new allocation.
    //! \param size Number of bytes to allocate.
    //!
    void operator()(void** ptr, size_t size)
    {
        CHECK(cudaMalloc(ptr, size));
    }
};
//! Deallocation functor that releases memory with cudaFree().
struct DeviceDeallocator
{
    //!
    //! \param ptr Pointer to release.
    //!
    void operator()(void* ptr)
    {
        CHECK(cudaFree(ptr));
    }
};
//! Allocation functor that obtains managed memory with cudaMallocManaged().
struct ManagedAllocator
{
    //!
    //! \param ptr  Location that receives the address of the new allocation.
    //! \param size Number of bytes to allocate.
    //!
    void operator()(void** ptr, size_t size)
    {
        CHECK(cudaMallocManaged(ptr, size));
    }
};
struct HostAllocator
{
//! Attempts to allocate size bytes on host, pointing *ptr to the start.
//! First attempts to allocate pinned memory using cudaMallocHost(ptr, size), failing that, warns to gLogWarning and
//! falls back to ::operator new(size) to allocate pageable memory. If that still fails, an exception may be thrown.
void operator()(void** ptr, size_t size)
{
// Try allocating pinned host memory.
cudaError_t ret = cudaMallocHost(ptr, size);
// If we cannot allocate pinned host memory, allocate pageable host memory instead and print a warning.
if (ret != cudaSuccess)
{
// Clean up the last cuda error.
(void) cudaGetLastError();
sample::gLogWarning << "cudaMallocHost() call with ptr=" << ptr << " and size=" << size
<< " returns a cuda error: " << cudaGetErrorString(ret) << std::endl;
sample::gLogWarning << "Allocate pageable host memory instead of pinned host memory. H2D and D2H copy "
"latencies may become longer."
<< std::endl;
*ptr = ::operator new(size);
// Make sure there is no remaining cuda error at this point.
CHECK(cudaGetLastError());
}
}
};
//! Deallocation functor for host memory handed out by HostAllocator.
struct HostDeallocator
{
    //!
    //! Release \p ptr according to the memory type reported by cudaPointerGetAttributes():
    //! pinned host memory goes to cudaFreeHost(), unregistered (pageable) memory goes to ::operator delete().
    //! For any other memory type an error is logged and the program exits.
    //!
    void operator()(void* ptr)
    {
        cudaPointerAttributes attrs;
        CHECK(cudaPointerGetAttributes(&attrs, ptr));

        switch (attrs.type)
        {
        case cudaMemoryTypeHost:
            // Pinned host memory.
            CHECK(cudaFreeHost(ptr));
            return;
        case cudaMemoryTypeUnregistered:
            // Pageable host memory.
            ::operator delete(ptr);
            return;
        default:
            break;
        }

        // A host pointer is not expected to be of any other memory type.
        sample::gLogError << "Unexpected cuda memory type:" << static_cast<int32_t>(attrs.type) << std::endl;
        exit(EXIT_FAILURE);
    }
};
//! Buffer allocated with cudaMalloc() and released with cudaFree().
using TrtDeviceBuffer = TrtCudaBuffer<DeviceAllocator, DeviceDeallocator>;
//! Buffer allocated with cudaMallocManaged() and released with cudaFree().
using TrtManagedBuffer = TrtCudaBuffer<ManagedAllocator, DeviceDeallocator>;
//! Host buffer: pinned when cudaMallocHost() succeeds, pageable otherwise; released by HostDeallocator.
using TrtHostBuffer = TrtCudaBuffer<HostAllocator, HostDeallocator>;
//!
//! \class IMirroredBuffer
//! \brief Interface for coupled host and device buffers
//!
class IMirroredBuffer
{
public:
//!
//! Allocate memory for the mirrored buffer given the size
//! of the allocation.
//!
//! \param size the size of the allocation to make.
//!
virtual void allocate(size_t size) = 0;
//!
//! Get the pointer to the device side buffer.
//!
//! \return pointer to device memory or nullptr if uninitialized.
//!
virtual void* getDeviceBuffer() const = 0;
//!
//! Get the pointer to the host side buffer.
//!
//! \return pointer to host memory or nullptr if uninitialized.
//!
virtual void* getHostBuffer() const = 0;
//!
//! Copy the memory from host to device.
//!
//! \param stream the stream handed to the implementation for the copy.
//!
virtual void hostToDevice(TrtCudaStream& stream) = 0;
//!
//! Copy the memory from device to host.
//!
//! \param stream the stream handed to the implementation for the copy.
//!
virtual void deviceToHost(TrtCudaStream& stream) = 0;
//!
//! Interface to get the size of the memory
//!
//! \return the size of memory allocated.
//!
virtual size_t getSize() const = 0;
//!
//! Virtual destructor declaration, so implementations can be destroyed through an IMirroredBuffer pointer.
//!
virtual ~IMirroredBuffer() = default;
}; // class IMirroredBuffer
//!
//! Mirrored buffer backed by two separate allocations of equal size: one TrtHostBuffer and one TrtDeviceBuffer.
//!
class DiscreteMirroredBuffer : public IMirroredBuffer
{
public:
    //! Record \p size, then (re)allocate the host buffer followed by the device buffer with that many bytes.
    void allocate(size_t size) override
    {
        mSize = size;
        mHostBuffer.allocate(size);
        mDeviceBuffer.allocate(size);
    }

    //! \return the device-side pointer, as reported by the device buffer.
    void* getDeviceBuffer() const override
    {
        return mDeviceBuffer.get();
    }

    //! \return the host-side pointer, as reported by the host buffer.
    void* getHostBuffer() const override
    {
        return mHostBuffer.get();
    }

    //! Enqueue an asynchronous copy of getSize() bytes from the host buffer to the device buffer on \p stream.
    void hostToDevice(TrtCudaStream& stream) override
    {
        void* const destination = mDeviceBuffer.get();
        void const* const source = mHostBuffer.get();
        CHECK(cudaMemcpyAsync(destination, source, mSize, cudaMemcpyHostToDevice, stream.get()));
    }

    //! Enqueue an asynchronous copy of getSize() bytes from the device buffer to the host buffer on \p stream.
    void deviceToHost(TrtCudaStream& stream) override
    {
        void* const destination = mHostBuffer.get();
        void const* const source = mDeviceBuffer.get();
        CHECK(cudaMemcpyAsync(destination, source, mSize, cudaMemcpyDeviceToHost, stream.get()));
    }

    //! \return the size passed to the most recent allocate() call, or 0 if allocate() was never called.
    size_t getSize() const override
    {
        return mSize;
    }

private:
    size_t mSize{0};
    TrtHostBuffer mHostBuffer;
    TrtDeviceBuffer mDeviceBuffer;
}; // class DiscreteMirroredBuffer
//!
//! Mirrored buffer for embedded devices, backed by a single TrtManagedBuffer that serves as both the host and the
//! device side.
//!
class UnifiedMirroredBuffer : public IMirroredBuffer
{
public:
    //! Record \p size, then (re)allocate the managed buffer with that many bytes.
    void allocate(size_t size) override
    {
        mSize = size;
        mBuffer.allocate(size);
    }

    //! \return the managed pointer; identical to getHostBuffer().
    void* getDeviceBuffer() const override
    {
        return mBuffer.get();
    }

    //! \return the managed pointer; identical to getDeviceBuffer().
    void* getHostBuffer() const override
    {
        return mBuffer.get();
    }

    //! Intentionally empty: host and device share one unified allocation, so there is nothing to copy.
    void hostToDevice(TrtCudaStream& /*stream*/) override {}

    //! Intentionally empty: host and device share one unified allocation, so there is nothing to copy.
    void deviceToHost(TrtCudaStream& /*stream*/) override {}

    //! \return the size passed to the most recent allocate() call, or 0 if allocate() was never called.
    size_t getSize() const override
    {
        return mSize;
    }

private:
    size_t mSize{0};
    TrtManagedBuffer mBuffer;
}; // class UnifiedMirroredBuffer
//!
//! Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is
//! not possible.
//!
//! The backing IMirroredBuffer only ever grows: it is re-allocated when a request exceeds the largest size requested
//! so far, and reused otherwise.
//!
class OutputAllocator : public nvinfer1::IOutputAllocator
{
public:
    //! Construct, using buffer as the backing storage:
    explicit OutputAllocator(std::unique_ptr<IMirroredBuffer> buffer)
        : mBuffer{std::move(buffer)}
    {
        ASSERT(mBuffer);
    }

    ~OutputAllocator() override = default;

    //!
    //! Make sure the backing buffer can hold \p size bytes and return its device pointer.
    //!
    //! \param size      Number of bytes required; a request for 0 bytes is treated as a request for 1 byte.
    //! \param alignment Granularity the allocation size is rounded up to via roundUp().
    //!
    //! \return the device pointer of the backing buffer.
    //!
    void* reallocateOutput(
        char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, uint64_t alignment) noexcept override
    {
        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
        // even for empty tensors, so allocate a dummy byte.
        size = std::max(size, static_cast<uint64_t>(1));
        if (size > mSize)
        {
            mBuffer->allocate(roundUp(size, alignment));
            mSize = size;
        }
        return mBuffer->getDeviceBuffer();
    }

    //! IMirroredBuffer does not implement Async allocation, hence this is just a wrap around
    void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment,
        cudaStream_t /*stream*/) noexcept override
    {
        return reallocateOutput(tensorName, currentMemory, size, alignment);
    }

    //! Remember the dimensions reported for the output so they can be read back with getFinalDims().
    void notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept override
    {
        mFinalDims = dims;
    }

    //! \return non-owning pointer to the backing buffer.
    IMirroredBuffer* getBuffer()
    {
        return mBuffer.get();
    }

    //! \return the dimensions from the latest notifyShape() call, or value-initialized Dims if it was never called.
    nvinfer1::Dims getFinalDims()
    {
        return mFinalDims;
    }

private:
    std::unique_ptr<IMirroredBuffer> mBuffer;
    //! Largest size requested so far, in bytes (before rounding up to the alignment).
    uint64_t mSize{};
    //! Value-initialized so that getFinalDims() never returns indeterminate data before the first notifyShape().
    nvinfer1::Dims mFinalDims{};
};
//! Set the GPU to run the inference on.
//!
//! \param device Identifier of the GPU to select (presumably a CUDA device ordinal; the definition is not in this
//!        header, confirm there).
//! \param os Output stream handed to the implementation (presumably used to report the selected device; confirm
//!        against the definition).
void setCudaDevice(int32_t device, std::ostream& os);
//! Get the CUDA version of the current CUDA driver.
//!
//! \return the driver version as an integer; the encoding is decided by the definition, which is not in this header.
int32_t getCudaDriverVersion();
//! Get the CUDA version of the current CUDA runtime.
//!
//! \return the runtime version as an integer; the encoding is decided by the definition, which is not in this header.
int32_t getCudaRuntimeVersion();
} // namespace sample
#endif // TRT_SAMPLE_DEVICE_H