#include #include #include #include #include #include #include #include #include #include #include namespace { class Logger final : public nvinfer1::ILogger { public: void log(Severity severity, const char* msg) noexcept override { if (severity <= Severity::kWARNING) { std::fprintf(stderr, "[trt] %s\n", msg); } } }; Logger g_logger; struct DeviceBuffer { void* ptr{nullptr}; size_t bytes{0}; ~DeviceBuffer() { reset(0); } bool reset(size_t nbytes) { if (nbytes == 0) { if (ptr) { cudaFree(ptr); ptr = nullptr; bytes = 0; } return true; } if (ptr && bytes >= nbytes) return true; if (ptr) { cudaFree(ptr); ptr = nullptr; bytes = 0; } if (cudaMalloc(&ptr, nbytes) != cudaSuccess) return false; bytes = nbytes; return true; } }; struct PinnedHostBuffer { void* ptr{nullptr}; size_t bytes{0}; ~PinnedHostBuffer() { reset(0); } bool reset(size_t nbytes) { if (nbytes == 0) { if (ptr) { cudaFreeHost(ptr); ptr = nullptr; bytes = 0; } return true; } if (ptr && bytes >= nbytes) return true; if (ptr) { cudaFreeHost(ptr); ptr = nullptr; bytes = 0; } if (cudaHostAlloc(&ptr, nbytes, cudaHostAllocDefault) != cudaSuccess) return false; bytes = nbytes; return true; } }; std::vector read_file(const char* path) { std::ifstream in(path, std::ios::binary); if (!in) return {}; in.seekg(0, std::ios::end); size_t size = static_cast(in.tellg()); in.seekg(0, std::ios::beg); std::vector data(size); in.read(data.data(), static_cast(size)); return data; } const char* find_tensor(nvinfer1::ICudaEngine* engine, nvinfer1::TensorIOMode mode, const char* preferred) { for (int i = 0; i < engine->getNbIOTensors(); ++i) { const char* name = engine->getIOTensorName(i); if (engine->getTensorIOMode(name) == mode && std::string(name) == preferred) return name; } return nullptr; } struct TrtContext { std::unique_ptr runtime; std::unique_ptr engine; std::unique_ptr context; std::mutex mu; cudaStream_t stream{nullptr}; DeviceBuffer d_image; DeviceBuffer d_im_shape; DeviceBuffer d_scale_factor; DeviceBuffer d_boxes; DeviceBuffer d_counts; DeviceBuffer d_masks; PinnedHostBuffer h_boxes; PinnedHostBuffer h_counts; std::string image_name{"image"}; std::string im_shape_name{"im_shape"}; std::string scale_factor_name{"scale_factor"}; std::string boxes_name{"fetch_name_0"}; std::string counts_name{"fetch_name_1"}; std::string masks_name{"fetch_name_2"}; int max_batch{1}; ~TrtContext() { if (stream) { cudaStreamDestroy(stream); stream = nullptr; } } }; } // namespace extern "C" { TrtContext* trt_create(const char* engine_path) { auto data = read_file(engine_path); if (data.empty()) return nullptr; auto* ctx = new TrtContext(); ctx->runtime.reset(nvinfer1::createInferRuntime(g_logger)); if (!ctx->runtime) { delete ctx; return nullptr; } ctx->engine.reset(ctx->runtime->deserializeCudaEngine(data.data(), data.size())); if (!ctx->engine) { delete ctx; return nullptr; } ctx->context.reset(ctx->engine->createExecutionContext()); if (!ctx->context) { delete ctx; return nullptr; } if (cudaStreamCreateWithFlags(&ctx->stream, cudaStreamNonBlocking) != cudaSuccess) { delete ctx; return nullptr; } if (!find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kINPUT, ctx->image_name.c_str()) || !find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kINPUT, ctx->im_shape_name.c_str()) || !find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kINPUT, ctx->scale_factor_name.c_str()) || !find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kOUTPUT, ctx->boxes_name.c_str()) || !find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kOUTPUT, ctx->counts_name.c_str())) { delete ctx; return nullptr; } auto profile = ctx->engine->getProfileShape(ctx->image_name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX); ctx->max_batch = std::max(1, static_cast(profile.d[0])); return ctx; } void trt_destroy(TrtContext* ctx) { delete ctx; } int trt_max_batch(TrtContext* ctx) { return ctx ? ctx->max_batch : 0; } int trt_infer( TrtContext* ctx, const float* image, const float* im_shape, const float* scale_factor, int batch, float* boxes_out, int32_t* counts_out) { if (!ctx || batch <= 0 || batch > ctx->max_batch) return -1; std::lock_guard lock(ctx->mu); const size_t image_bytes = static_cast(batch) * 3 * 800 * 800 * sizeof(float); const size_t meta_bytes = static_cast(batch) * 2 * sizeof(float); const size_t boxes_bytes = static_cast(batch) * 300 * 7 * sizeof(float); const size_t counts_bytes = static_cast(batch) * sizeof(int32_t); const size_t masks_bytes = static_cast(batch) * 300 * 200 * 200 * sizeof(int32_t); if (!ctx->d_image.reset(image_bytes) || !ctx->d_im_shape.reset(meta_bytes) || !ctx->d_scale_factor.reset(meta_bytes) || !ctx->d_boxes.reset(boxes_bytes) || !ctx->d_counts.reset(counts_bytes) || !ctx->d_masks.reset(masks_bytes) || !ctx->h_boxes.reset(boxes_bytes) || !ctx->h_counts.reset(counts_bytes)) { return -2; } if (!ctx->context->setInputShape(ctx->image_name.c_str(), nvinfer1::Dims4{batch, 3, 800, 800}) || !ctx->context->setInputShape(ctx->im_shape_name.c_str(), nvinfer1::Dims2{batch, 2}) || !ctx->context->setInputShape(ctx->scale_factor_name.c_str(), nvinfer1::Dims2{batch, 2})) { return -3; } if (cudaMemcpyAsync(ctx->d_image.ptr, image, image_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess || cudaMemcpyAsync(ctx->d_im_shape.ptr, im_shape, meta_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess || cudaMemcpyAsync(ctx->d_scale_factor.ptr, scale_factor, meta_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess) { return -4; } ctx->context->setTensorAddress(ctx->image_name.c_str(), ctx->d_image.ptr); ctx->context->setTensorAddress(ctx->im_shape_name.c_str(), ctx->d_im_shape.ptr); ctx->context->setTensorAddress(ctx->scale_factor_name.c_str(), ctx->d_scale_factor.ptr); ctx->context->setTensorAddress(ctx->boxes_name.c_str(), ctx->d_boxes.ptr); ctx->context->setTensorAddress(ctx->counts_name.c_str(), ctx->d_counts.ptr); // The mask output is required by the engine but not by layout consumers. Keep it on-device. ctx->context->setTensorAddress(ctx->masks_name.c_str(), ctx->d_masks.ptr); if (!ctx->context->enqueueV3(ctx->stream)) return -5; if (cudaMemcpyAsync(ctx->h_boxes.ptr, ctx->d_boxes.ptr, boxes_bytes, cudaMemcpyDeviceToHost, ctx->stream) != cudaSuccess || cudaMemcpyAsync(ctx->h_counts.ptr, ctx->d_counts.ptr, counts_bytes, cudaMemcpyDeviceToHost, ctx->stream) != cudaSuccess) { return -6; } if (cudaStreamSynchronize(ctx->stream) != cudaSuccess) return -7; std::memcpy(boxes_out, ctx->h_boxes.ptr, boxes_bytes); std::memcpy(counts_out, ctx->h_counts.ptr, counts_bytes); return 0; } } // extern "C"