File size: 7,390 Bytes

3c0d3e1

#include <NvInfer.h>
#include <cuda_runtime_api.h>

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <cstring>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

namespace {
// ILogger implementation that forwards selected TensorRT messages to stderr.
class Logger final : public nvinfer1::ILogger {
 public:
  // Prints `msg` with a "[trt] " prefix when `severity` compares less than or
  // equal to Severity::kWARNING; every other message is dropped.
  void log(Severity severity, const char* msg) noexcept override {
    const bool suppressed = severity > Severity::kWARNING;
    if (suppressed) {
      return;
    }
    std::fprintf(stderr, "[trt] %s\n", msg);
  }
};

// File-local logger instance; passed to nvinfer1::createInferRuntime in trt_create.
Logger g_logger;

// Owning wrapper around one cudaMalloc allocation.
//
// The buffer only grows: reset(n) keeps the current allocation whenever it is
// already at least n bytes, and reallocates otherwise. reset(0) frees it.
struct DeviceBuffer {
  void* ptr{nullptr};  // Device pointer returned by cudaMalloc, or nullptr.
  size_t bytes{0};     // Capacity of `ptr` in bytes; 0 when nothing is held.

  DeviceBuffer() = default;
  // Non-copyable: a copy would share `ptr` and both destructors would call
  // cudaFree on the same allocation.
  DeviceBuffer(const DeviceBuffer&) = delete;
  DeviceBuffer& operator=(const DeviceBuffer&) = delete;

  ~DeviceBuffer() { release(); }

  // Frees the current allocation (if any) and returns to the empty state.
  void release() {
    if (ptr) {
      cudaFree(ptr);
      ptr = nullptr;
    }
    bytes = 0;
  }

  // Ensures the buffer can hold at least `nbytes`.
  // Returns true on success (including nbytes == 0, which frees the buffer)
  // and false when cudaMalloc fails, in which case the buffer is left empty.
  bool reset(size_t nbytes) {
    if (nbytes == 0) {
      release();
      return true;
    }
    if (ptr && bytes >= nbytes) return true;  // Existing allocation is big enough.
    release();
    if (cudaMalloc(&ptr, nbytes) != cudaSuccess) {
      // Do not rely on cudaMalloc leaving the out-parameter untouched on failure.
      ptr = nullptr;
      return false;
    }
    bytes = nbytes;
    return true;
  }
};

// Owning wrapper around one cudaHostAlloc (page-locked host) allocation.
//
// The buffer only grows: reset(n) keeps the current allocation whenever it is
// already at least n bytes, and reallocates otherwise. reset(0) frees it.
struct PinnedHostBuffer {
  void* ptr{nullptr};  // Host pointer returned by cudaHostAlloc, or nullptr.
  size_t bytes{0};     // Capacity of `ptr` in bytes; 0 when nothing is held.

  PinnedHostBuffer() = default;
  // Non-copyable: a copy would share `ptr` and both destructors would call
  // cudaFreeHost on the same allocation.
  PinnedHostBuffer(const PinnedHostBuffer&) = delete;
  PinnedHostBuffer& operator=(const PinnedHostBuffer&) = delete;

  ~PinnedHostBuffer() { release(); }

  // Frees the current allocation (if any) and returns to the empty state.
  void release() {
    if (ptr) {
      cudaFreeHost(ptr);
      ptr = nullptr;
    }
    bytes = 0;
  }

  // Ensures the buffer can hold at least `nbytes`.
  // Returns true on success (including nbytes == 0, which frees the buffer)
  // and false when cudaHostAlloc fails, in which case the buffer is left empty.
  bool reset(size_t nbytes) {
    if (nbytes == 0) {
      release();
      return true;
    }
    if (ptr && bytes >= nbytes) return true;  // Existing allocation is big enough.
    release();
    if (cudaHostAlloc(&ptr, nbytes, cudaHostAllocDefault) != cudaSuccess) {
      // Do not rely on cudaHostAlloc leaving the out-parameter untouched on failure.
      ptr = nullptr;
      return false;
    }
    bytes = nbytes;
    return true;
  }
};

// Reads the whole file at `path` into memory as raw bytes.
//
// Returns an empty vector when `path` is null, the file cannot be opened, its
// size cannot be determined, it is empty, or fewer bytes than expected could
// be read. Callers treat an empty result as failure.
std::vector<char> read_file(const char* path) {
  if (path == nullptr) return {};
  std::ifstream in(path, std::ios::binary);
  if (!in) return {};

  in.seekg(0, std::ios::end);
  // tellg() reports failure as -1; casting that straight to size_t would
  // request an enormous allocation, so validate it as a signed offset first.
  const std::streamoff end = in.tellg();
  if (!in || end <= 0) return {};
  in.seekg(0, std::ios::beg);
  if (!in) return {};

  std::vector<char> data(static_cast<size_t>(end));
  const std::streamsize wanted = static_cast<std::streamsize>(data.size());
  in.read(data.data(), wanted);
  // A short read would otherwise hand back a partially zero-filled buffer.
  if (in.gcount() != wanted) return {};
  return data;
}

// Scans the engine's I/O tensors for one whose mode equals `mode` and whose
// name equals `preferred`. Returns the engine-provided name pointer on a
// match, or nullptr when no such tensor exists.
const char* find_tensor(nvinfer1::ICudaEngine* engine, nvinfer1::TensorIOMode mode, const char* preferred) {
  int idx = 0;
  while (idx < engine->getNbIOTensors()) {
    const char* candidate = engine->getIOTensorName(idx);
    ++idx;
    if (engine->getTensorIOMode(candidate) != mode) {
      continue;
    }
    if (std::strcmp(candidate, preferred) == 0) {
      return candidate;
    }
  }
  return nullptr;
}

// Everything one loaded engine needs: the TensorRT objects, a CUDA stream,
// reusable device/host staging buffers, and the tensor names used for binding.
//
// Member order is significant: members are destroyed in reverse declaration
// order, so `context` is released before `engine`, and `engine` before
// `runtime`. The destructor body (stream teardown) runs before any of them.
struct TrtContext {
  std::unique_ptr<nvinfer1::IRuntime> runtime;
  std::unique_ptr<nvinfer1::ICudaEngine> engine;
  std::unique_ptr<nvinfer1::IExecutionContext> context;

  // Held by trt_infer for the duration of a call.
  std::mutex mu;
  cudaStream_t stream = nullptr;

  // Device-side buffers, grown on demand by trt_infer.
  DeviceBuffer d_image;
  DeviceBuffer d_im_shape;
  DeviceBuffer d_scale_factor;
  DeviceBuffer d_boxes;
  DeviceBuffer d_counts;
  DeviceBuffer d_masks;

  // Pinned host buffers that receive the box and count outputs.
  PinnedHostBuffer h_boxes;
  PinnedHostBuffer h_counts;

  // Tensor names looked up in the engine.
  std::string image_name = "image";
  std::string im_shape_name = "im_shape";
  std::string scale_factor_name = "scale_factor";
  std::string boxes_name = "fetch_name_0";
  std::string counts_name = "fetch_name_1";
  std::string masks_name = "fetch_name_2";

  // Upper bound on `batch` accepted by trt_infer; set by trt_create.
  int max_batch = 1;

  ~TrtContext() {
    if (stream == nullptr) {
      return;
    }
    cudaStreamDestroy(stream);
    stream = nullptr;
  }
};
}  // namespace

extern "C" {

// Loads a serialized TensorRT engine from `engine_path` and prepares it for
// inference.
//
// Returns a heap-allocated context that must be released with trt_destroy, or
// nullptr when `engine_path` is null, the file cannot be read, any TensorRT /
// CUDA object fails to be created, or one of the required I/O tensors
// (image, im_shape, scale_factor, fetch_name_0, fetch_name_1) is missing.
// The mask output name is not verified here.
//
// No C++ exception is allowed to cross this C boundary; any exception is
// reported as nullptr.
TrtContext* trt_create(const char* engine_path) {
  if (engine_path == nullptr) return nullptr;
  try {
    const std::vector<char> data = read_file(engine_path);
    if (data.empty()) return nullptr;

    // unique_ptr owns the context until the very end so every early return
    // (and any exception) cleans up without a manual delete.
    std::unique_ptr<TrtContext> ctx(new TrtContext());

    ctx->runtime.reset(nvinfer1::createInferRuntime(g_logger));
    if (!ctx->runtime) return nullptr;
    ctx->engine.reset(ctx->runtime->deserializeCudaEngine(data.data(), data.size()));
    if (!ctx->engine) return nullptr;
    ctx->context.reset(ctx->engine->createExecutionContext());
    if (!ctx->context) return nullptr;

    if (cudaStreamCreateWithFlags(&ctx->stream, cudaStreamNonBlocking) != cudaSuccess) {
      // Make sure the destructor never tries to destroy a handle that was not created.
      ctx->stream = nullptr;
      return nullptr;
    }

    nvinfer1::ICudaEngine* engine = ctx->engine.get();
    const bool has_required_tensors =
        find_tensor(engine, nvinfer1::TensorIOMode::kINPUT, ctx->image_name.c_str()) &&
        find_tensor(engine, nvinfer1::TensorIOMode::kINPUT, ctx->im_shape_name.c_str()) &&
        find_tensor(engine, nvinfer1::TensorIOMode::kINPUT, ctx->scale_factor_name.c_str()) &&
        find_tensor(engine, nvinfer1::TensorIOMode::kOUTPUT, ctx->boxes_name.c_str()) &&
        find_tensor(engine, nvinfer1::TensorIOMode::kOUTPUT, ctx->counts_name.c_str());
    if (!has_required_tensors) return nullptr;

    // The leading dimension of the image input's kMAX shape in optimization
    // profile 0 bounds the batch size; anything unusable falls back to 1.
    const auto profile =
        engine->getProfileShape(ctx->image_name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX);
    ctx->max_batch = profile.nbDims > 0 ? std::max(1, static_cast<int>(profile.d[0])) : 1;

    return ctx.release();
  } catch (...) {
    return nullptr;
  }
}

// Releases a context returned by trt_create. Passing nullptr is a no-op.
void trt_destroy(TrtContext* ctx) {
  delete ctx;
}

// Returns the context's max_batch value, or 0 when `ctx` is null.
int trt_max_batch(TrtContext* ctx) {
  if (ctx == nullptr) {
    return 0;
  }
  return ctx->max_batch;
}

// Runs one synchronous inference for `batch` images and copies the box and
// count outputs back to the caller.
//
// Buffer sizes are fixed by the constants below:
//   image        : batch * 3 * 800 * 800 floats   (host, read)
//   im_shape     : batch * 2 floats               (host, read)
//   scale_factor : batch * 2 floats               (host, read)
//   boxes_out    : batch * 300 * 7 floats         (host, written)
//   counts_out   : batch int32 values             (host, written)
//
// Calls on the same context are serialized by the context mutex.
//
// Returns 0 on success, otherwise:
//   -1  invalid argument (null ctx or pointer, batch out of [1, max_batch])
//   -2  device / pinned buffer allocation failed
//   -3  an input shape was rejected
//   -4  host-to-device copy could not be queued
//   -5  a required tensor address could not be bound, or enqueue failed
//   -6  device-to-host copy could not be queued
//   -7  stream synchronization failed
int trt_infer(
    TrtContext* ctx,
    const float* image,
    const float* im_shape,
    const float* scale_factor,
    int batch,
    float* boxes_out,
    int32_t* counts_out) {
  if (!ctx || batch <= 0 || batch > ctx->max_batch) return -1;
  // Reject null buffers up front rather than crashing in memcpy after a full
  // inference has already been run.
  if (!image || !im_shape || !scale_factor || !boxes_out || !counts_out) return -1;
  std::lock_guard<std::mutex> lock(ctx->mu);

  const size_t image_bytes = static_cast<size_t>(batch) * 3 * 800 * 800 * sizeof(float);
  const size_t meta_bytes = static_cast<size_t>(batch) * 2 * sizeof(float);
  const size_t boxes_bytes = static_cast<size_t>(batch) * 300 * 7 * sizeof(float);
  const size_t counts_bytes = static_cast<size_t>(batch) * sizeof(int32_t);
  const size_t masks_bytes = static_cast<size_t>(batch) * 300 * 200 * 200 * sizeof(int32_t);

  // Buffers only grow, so after the first call at a given batch size these
  // are no-ops.
  if (!ctx->d_image.reset(image_bytes) || !ctx->d_im_shape.reset(meta_bytes) ||
      !ctx->d_scale_factor.reset(meta_bytes) || !ctx->d_boxes.reset(boxes_bytes) ||
      !ctx->d_counts.reset(counts_bytes) || !ctx->d_masks.reset(masks_bytes) ||
      !ctx->h_boxes.reset(boxes_bytes) || !ctx->h_counts.reset(counts_bytes)) {
    return -2;
  }

  nvinfer1::IExecutionContext* exec = ctx->context.get();

  if (!exec->setInputShape(ctx->image_name.c_str(), nvinfer1::Dims4{batch, 3, 800, 800}) ||
      !exec->setInputShape(ctx->im_shape_name.c_str(), nvinfer1::Dims2{batch, 2}) ||
      !exec->setInputShape(ctx->scale_factor_name.c_str(), nvinfer1::Dims2{batch, 2})) {
    return -3;
  }

  // Bind before queuing any work so a binding failure needs no stream cleanup.
  if (!exec->setTensorAddress(ctx->image_name.c_str(), ctx->d_image.ptr) ||
      !exec->setTensorAddress(ctx->im_shape_name.c_str(), ctx->d_im_shape.ptr) ||
      !exec->setTensorAddress(ctx->scale_factor_name.c_str(), ctx->d_scale_factor.ptr) ||
      !exec->setTensorAddress(ctx->boxes_name.c_str(), ctx->d_boxes.ptr) ||
      !exec->setTensorAddress(ctx->counts_name.c_str(), ctx->d_counts.ptr)) {
    return -5;
  }
  // The mask output is required by the engine but not by layout consumers. Keep it on-device.
  // Its name is not verified by trt_create, so this binding stays best-effort:
  // if the engine really needs it, enqueueV3 below reports the failure.
  (void)exec->setTensorAddress(ctx->masks_name.c_str(), ctx->d_masks.ptr);

  // From here on work may already be queued on the stream. Drain it before
  // reporting an error so nothing is still touching the caller's buffers or
  // the staging buffers once this function has returned.
  auto drain_and_fail = [ctx](int code) -> int {
    (void)cudaStreamSynchronize(ctx->stream);
    return code;
  };

  if (cudaMemcpyAsync(ctx->d_image.ptr, image, image_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess ||
      cudaMemcpyAsync(ctx->d_im_shape.ptr, im_shape, meta_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess ||
      cudaMemcpyAsync(ctx->d_scale_factor.ptr, scale_factor, meta_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess) {
    return drain_and_fail(-4);
  }

  if (!exec->enqueueV3(ctx->stream)) return drain_and_fail(-5);

  // Stage the outputs in pinned memory on the same stream, then hand them to
  // the caller once the stream has finished.
  if (cudaMemcpyAsync(ctx->h_boxes.ptr, ctx->d_boxes.ptr, boxes_bytes, cudaMemcpyDeviceToHost, ctx->stream) != cudaSuccess ||
      cudaMemcpyAsync(ctx->h_counts.ptr, ctx->d_counts.ptr, counts_bytes, cudaMemcpyDeviceToHost, ctx->stream) != cudaSuccess) {
    return drain_and_fail(-6);
  }
  if (cudaStreamSynchronize(ctx->stream) != cudaSuccess) return -7;

  std::memcpy(boxes_out, ctx->h_boxes.ptr, boxes_bytes);
  std::memcpy(counts_out, ctx->h_counts.ptr, counts_bytes);
  return 0;
}

}  // extern "C"