#ifndef TENSORRT_BUFFERS_H
#define TENSORRT_BUFFERS_H

#include "NvInfer.h"
#include "common.h"
#include "half.h"
#include <cassert>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <iostream>
#include <iterator>
#include <memory>
#include <new>
#include <numeric>
#include <string>
#include <unordered_map>
#include <vector>

namespace samplesCommon
{

//!
//! \brief  The GenericBuffer class is a templated class for buffers.
//!
//! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation,
//!          deallocation, and querying of buffers on both the device and the host.
//!          It can handle data of arbitrary types because it stores byte buffers.
//!          The template parameters AllocFunc and FreeFunc are used for the
//!          allocation and deallocation of the buffer.
//!          AllocFunc must be a functor that takes in (void** ptr, size_t size)
//!          and returns bool; ptr is an output parameter.
//!          FreeFunc must be a functor that takes in (void* ptr) and returns void;
//!          ptr is the buffer address. It should work with nullptr input.
//!
template <typename AllocFunc, typename FreeFunc>
class GenericBuffer
{
public:
    //!
    //! \brief Construct an empty buffer.
    //!
    GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT)
        : mSize(0)
        , mCapacity(0)
        , mType(type)
        , mBuffer(nullptr)
    {
    }

    //!
    //! \brief Construct a buffer with the specified allocation size in number of elements.
    //!
    GenericBuffer(size_t size, nvinfer1::DataType type)
        : mSize(size)
        , mCapacity(size)
        , mType(type)
    {
        if (!allocFn(&mBuffer, this->nbBytes()))
        {
            throw std::bad_alloc();
        }
    }

    GenericBuffer(GenericBuffer&& buf)
        : mSize(buf.mSize)
        , mCapacity(buf.mCapacity)
        , mType(buf.mType)
        , mBuffer(buf.mBuffer)
    {
        buf.mSize = 0;
        buf.mCapacity = 0;
        buf.mType = nvinfer1::DataType::kFLOAT;
        buf.mBuffer = nullptr;
    }

    GenericBuffer& operator=(GenericBuffer&& buf)
    {
        if (this != &buf)
        {
            freeFn(mBuffer);
            mSize = buf.mSize;
            mCapacity = buf.mCapacity;
            mType = buf.mType;
            mBuffer = buf.mBuffer;
            // Reset buf to its default state after the move.
            buf.mSize = 0;
            buf.mCapacity = 0;
            buf.mBuffer = nullptr;
        }
        return *this;
    }

    //!
    //! \brief Returns pointer to underlying array.
    //!
    void* data()
    {
        return mBuffer;
    }

    //!
    //! \brief Returns pointer to underlying array.
    //!
    const void* data() const
    {
        return mBuffer;
    }

    //!
    //! \brief Returns the size (in number of elements) of the buffer.
    //!
    size_t size() const
    {
        return mSize;
    }

    //!
    //! \brief Returns the size (in bytes) of the buffer.
    //!
    size_t nbBytes() const
    {
        return samplesCommon::getNbBytes(mType, size());
    }

    //!
    //! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
    //!
    void resize(size_t newSize)
    {
        mSize = newSize;
        if (mCapacity < newSize)
        {
            freeFn(mBuffer);
            if (!allocFn(&mBuffer, this->nbBytes()))
            {
                throw std::bad_alloc{};
            }
            mCapacity = newSize;
        }
    }

    //!
    //! \brief Overload of resize that accepts Dims.
    //!
    void resize(const nvinfer1::Dims& dims)
    {
        return this->resize(samplesCommon::volume(dims));
    }

    ~GenericBuffer()
    {
        freeFn(mBuffer);
    }

private:
    size_t mSize{0}, mCapacity{0};
    nvinfer1::DataType mType;
    void* mBuffer;
    AllocFunc allocFn;
    FreeFunc freeFn;
};

class DeviceAllocator
{
public:
    bool operator()(void** ptr, size_t size) const
    {
        return cudaMalloc(ptr, size) == cudaSuccess;
    }
};

class DeviceFree
{
public:
    void operator()(void* ptr) const
    {
        cudaFree(ptr);
    }
};

class HostAllocator
{
public:
    bool operator()(void** ptr, size_t size) const
    {
        *ptr = malloc(size);
        return *ptr != nullptr;
    }
};

class HostFree
{
public:
    void operator()(void* ptr) const
    {
        free(ptr);
    }
};
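
// A possible extension (not part of the original header): allocator functors for
// page-locked ("pinned") host memory, using cudaMallocHost/cudaFreeHost. Pinned
// memory allows truly asynchronous cudaMemcpyAsync transfers; this sketch follows
// the same functor contract as HostAllocator/HostFree above.
class PinnedHostAllocator
{
public:
    bool operator()(void** ptr, size_t size) const
    {
        return cudaMallocHost(ptr, size) == cudaSuccess;
    }
};

class PinnedHostFree
{
public:
    void operator()(void* ptr) const
    {
        cudaFreeHost(ptr);
    }
};

// A matching alias would mirror HostBuffer below, e.g.:
//     using PinnedHostBuffer = GenericBuffer<PinnedHostAllocator, PinnedHostFree>;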

using DeviceBuffer = GenericBuffer<DeviceAllocator, DeviceFree>;
using HostBuffer = GenericBuffer<HostAllocator, HostFree>;
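
// Usage sketch (illustrative, not part of the original header): allocate matching
// host and device buffers for 1024 floats, then resize the device buffer to an
// engine tensor's shape at runtime. "output" is a hypothetical tensor name.
//
//     samplesCommon::HostBuffer host(1024, nvinfer1::DataType::kFLOAT);
//     samplesCommon::DeviceBuffer device(1024, nvinfer1::DataType::kFLOAT);
//     device.resize(context->getTensorShape("output")); // no-op if capacity suffices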

//!
//! \brief  The ManagedBuffer class groups together a pair of corresponding device and host buffers.
//!
class ManagedBuffer
{
public:
    DeviceBuffer deviceBuffer;
    HostBuffer hostBuffer;
};

//!
//! \brief  The BufferManager class handles host and device buffer allocation and deallocation.
//!
//! \details This RAII class handles host and device buffer allocation and deallocation,
//!          memcpy between host and device buffers to aid with inference,
//!          and debugging dumps to validate inference. The BufferManager class is meant to be
//!          used to simplify buffer management and any interactions between buffers and the engine.
//!
class BufferManager
{
public:
    static const size_t kINVALID_SIZE_VALUE = ~size_t(0);

    //!
    //! \brief Create a BufferManager for handling buffer interactions with engine,
    //!        when the I/O tensor volumes are provided.
    //!
    BufferManager(
        std::shared_ptr<nvinfer1::ICudaEngine> engine, std::vector<int64_t> const& volumes, int32_t batchSize = 0)
        : mEngine(engine)
        , mBatchSize(batchSize)
    {
        // Create host and device buffers for each I/O tensor.
        for (int32_t i = 0; i < mEngine->getNbIOTensors(); i++)
        {
            auto const name = engine->getIOTensorName(i);
            mNames[name] = i;

            nvinfer1::DataType type = mEngine->getTensorDataType(name);

            std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
            manBuf->deviceBuffer = DeviceBuffer(volumes[i], type);
            manBuf->hostBuffer = HostBuffer(volumes[i], type);
            void* deviceBuffer = manBuf->deviceBuffer.data();
            mDeviceBindings.emplace_back(deviceBuffer);
            mManagedBuffers.emplace_back(std::move(manBuf));
        }
    }

    //!
    //! \brief Create a BufferManager for handling buffer interactions with engine.
    //!
    BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, int32_t const batchSize = 0,
        nvinfer1::IExecutionContext const* context = nullptr)
        : mEngine(engine)
        , mBatchSize(batchSize)
    {
        // Create host and device buffers for each I/O tensor.
        for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; i++)
        {
            auto const name = engine->getIOTensorName(i);
            mNames[name] = i;

            auto dims = context ? context->getTensorShape(name) : mEngine->getTensorShape(name);
            size_t vol = context || !mBatchSize ? 1 : static_cast<size_t>(mBatchSize);
            nvinfer1::DataType type = mEngine->getTensorDataType(name);
            int32_t vecDim = mEngine->getTensorVectorizedDim(name);
            if (-1 != vecDim) // the tensor is vectorized
            {
                int32_t scalarsPerVec = mEngine->getTensorComponentsPerElement(name);
                dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec);
                vol *= scalarsPerVec;
            }
            vol *= samplesCommon::volume(dims);
            std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
            manBuf->deviceBuffer = DeviceBuffer(vol, type);
            manBuf->hostBuffer = HostBuffer(vol, type);
            void* deviceBuffer = manBuf->deviceBuffer.data();
            mDeviceBindings.emplace_back(deviceBuffer);
            mManagedBuffers.emplace_back(std::move(manBuf));
        }
    }
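
    // Worked example of the vectorized-volume computation above (illustrative):
    // for a tensor in a 32-wide vectorized format with dims {3, 224, 224} and
    // vecDim == 0, dims.d[0] becomes divUp(3, 32) == 1 and vol is multiplied by
    // 32, so the buffer holds 1 * 224 * 224 * 32 elements, i.e. the padded size
    // the engine actually reads and writes.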

    //!
    //! \brief Returns a vector of device buffers that you can use directly as
    //!        bindings for the execute and enqueue methods of IExecutionContext.
    //!
    std::vector<void*>& getDeviceBindings()
    {
        return mDeviceBindings;
    }

    //!
    //! \brief Returns a vector of device buffers.
    //!
    std::vector<void*> const& getDeviceBindings() const
    {
        return mDeviceBindings;
    }

    //!
    //! \brief Returns the device buffer corresponding to tensorName.
    //!        Returns nullptr if no such tensor can be found.
    //!
    void* getDeviceBuffer(std::string const& tensorName) const
    {
        return getBuffer(false, tensorName);
    }

    //!
    //! \brief Returns the host buffer corresponding to tensorName.
    //!        Returns nullptr if no such tensor can be found.
    //!
    void* getHostBuffer(std::string const& tensorName) const
    {
        return getBuffer(true, tensorName);
    }

    //!
    //! \brief Returns the size of the host and device buffers that correspond to tensorName.
    //!        Returns kINVALID_SIZE_VALUE if no such tensor can be found.
    //!
    size_t size(std::string const& tensorName) const
    {
        auto record = mNames.find(tensorName);
        if (record == mNames.end())
            return kINVALID_SIZE_VALUE;
        return mManagedBuffers[record->second]->hostBuffer.nbBytes();
    }

    //!
    //! \brief Templated print function that dumps buffers of arbitrary type to std::ostream.
    //!        The rowCount parameter controls how many elements are on each line.
    //!        A rowCount of 1 means that there is only 1 element on each line.
    //!
    template <typename T>
    void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount)
    {
        assert(rowCount != 0);
        assert(bufSize % sizeof(T) == 0);
        T* typedBuf = static_cast<T*>(buf);
        size_t numItems = bufSize / sizeof(T);
        for (int32_t i = 0; i < static_cast<int>(numItems); i++)
        {
            // Handle rowCount == 1 case
            if (rowCount == 1 && i != static_cast<int>(numItems) - 1)
                os << typedBuf[i] << std::endl;
            else if (rowCount == 1)
                os << typedBuf[i];
            // Handle rowCount > 1 case
            else if (i % rowCount == 0)
                os << typedBuf[i];
            else if (i % rowCount == rowCount - 1)
                os << " " << typedBuf[i] << std::endl;
            else
                os << " " << typedBuf[i];
        }
    }
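
    // Usage sketch (illustrative, not in the original sample): dump a float output
    // buffer with 8 values per line; "output" is a hypothetical tensor name.
    //
    //     buffers.print<float>(std::cout, buffers.getHostBuffer("output"),
    //         buffers.size("output"), 8);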

    //!
    //! \brief Copy the contents of input host buffers to input device buffers synchronously.
    //!
    void copyInputToDevice()
    {
        memcpyBuffers(true, false, false);
    }

    //!
    //! \brief Copy the contents of output device buffers to output host buffers synchronously.
    //!
    void copyOutputToHost()
    {
        memcpyBuffers(false, true, false);
    }

    //!
    //! \brief Copy the contents of input host buffers to input device buffers asynchronously.
    //!
    void copyInputToDeviceAsync(cudaStream_t const& stream = 0)
    {
        memcpyBuffers(true, false, true, stream);
    }

    //!
    //! \brief Copy the contents of output device buffers to output host buffers asynchronously.
    //!
    void copyOutputToHostAsync(cudaStream_t const& stream = 0)
    {
        memcpyBuffers(false, true, true, stream);
    }

    ~BufferManager() = default;

private:
    void* getBuffer(bool const isHost, std::string const& tensorName) const
    {
        auto record = mNames.find(tensorName);
        if (record == mNames.end())
            return nullptr;
        return (isHost ? mManagedBuffers[record->second]->hostBuffer.data()
                       : mManagedBuffers[record->second]->deviceBuffer.data());
    }

    bool tensorIsInput(const std::string& tensorName) const
    {
        return mEngine->getTensorIOMode(tensorName.c_str()) == nvinfer1::TensorIOMode::kINPUT;
    }

    void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async, cudaStream_t const& stream = 0)
    {
        for (auto const& n : mNames)
        {
            void* dstPtr = deviceToHost ? mManagedBuffers[n.second]->hostBuffer.data()
                                        : mManagedBuffers[n.second]->deviceBuffer.data();
            void const* srcPtr = deviceToHost ? mManagedBuffers[n.second]->deviceBuffer.data()
                                              : mManagedBuffers[n.second]->hostBuffer.data();
            size_t const byteSize = mManagedBuffers[n.second]->hostBuffer.nbBytes();
            const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
            if ((copyInput && tensorIsInput(n.first)) || (!copyInput && !tensorIsInput(n.first)))
            {
                if (async)
                    CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream));
                else
                    CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType));
            }
        }
    }

    std::shared_ptr<nvinfer1::ICudaEngine> mEngine;              //!< The pointer to the engine
    int mBatchSize;                                              //!< The batch size for legacy networks, 0 otherwise
    std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers; //!< The vector of pointers to managed buffers
    std::vector<void*> mDeviceBindings;                          //!< The vector of device buffers needed for engine execution
    std::unordered_map<std::string, int32_t> mNames;             //!< The map of tensor name and index pairs
};
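
// Typical usage sketch (illustrative, not part of the original header). Assumes a
// deserialized engine and execution context; "input" and "output" are hypothetical
// tensor names.
//
//     samplesCommon::BufferManager buffers(engine);
//     auto* hostIn = static_cast<float*>(buffers.getHostBuffer("input"));
//     // ... fill hostIn with preprocessed data ...
//     buffers.copyInputToDevice();
//     context->executeV2(buffers.getDeviceBindings().data());
//     buffers.copyOutputToHost();
//     auto* hostOut = static_cast<float*>(buffers.getHostBuffer("output"));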

} // namespace samplesCommon

#endif // TENSORRT_BUFFERS_H