// bndos's picture
// Add pp-doclayout server source with score threshold
// 3c0d3e1 verified
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <cstring>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
namespace {
// nvinfer1::ILogger implementation that writes selected TensorRT messages to stderr.
class Logger final : public nvinfer1::ILogger {
public:
    // Prints `msg` prefixed with "[trt] ". Messages whose severity compares
    // greater than Severity::kWARNING are skipped.
    void log(Severity severity, const char* msg) noexcept override {
        const bool filtered_out = severity > Severity::kWARNING;
        if (filtered_out) return;
        std::fprintf(stderr, "[trt] %s\n", msg);
    }
};
// File-local logger instance; passed to nvinfer1::createInferRuntime in trt_create.
Logger g_logger;
// Owning handle for a single cudaMalloc allocation that is reused while it is
// large enough and reallocated only when a bigger size is requested.
//
// The allocation is released with cudaFree when the object is destroyed.
// Copying is disabled: a copy would hold the same pointer and free it twice.
struct DeviceBuffer {
    void* ptr{nullptr};
    size_t bytes{0};

    DeviceBuffer() = default;
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
    ~DeviceBuffer() { release(); }

    // Makes the buffer hold at least `nbytes` bytes.
    //   nbytes == 0  -> frees the current allocation (if any) and returns true.
    //   nbytes <= current capacity -> keeps the existing allocation, returns true.
    //   otherwise    -> frees the old allocation and allocates `nbytes` bytes.
    // Returns false if cudaMalloc fails; the buffer is then empty
    // (ptr == nullptr, bytes == 0). Existing contents are never preserved
    // across a reallocation.
    bool reset(size_t nbytes) {
        if (nbytes == 0) {
            release();
            return true;
        }
        if (ptr && bytes >= nbytes) return true;
        release();
        // Allocate into a local so a failed call cannot leave `ptr` in an
        // unspecified state.
        void* fresh = nullptr;
        if (cudaMalloc(&fresh, nbytes) != cudaSuccess) return false;
        ptr = fresh;
        bytes = nbytes;
        return true;
    }

private:
    // Frees the current allocation, if any, and returns to the empty state.
    void release() {
        if (ptr) {
            cudaFree(ptr);
            ptr = nullptr;
        }
        bytes = 0;
    }
};
// Owning handle for a single cudaHostAlloc (cudaHostAllocDefault) allocation
// that is reused while it is large enough and reallocated only when a bigger
// size is requested.
//
// The allocation is released with cudaFreeHost when the object is destroyed.
// Copying is disabled: a copy would hold the same pointer and free it twice.
struct PinnedHostBuffer {
    void* ptr{nullptr};
    size_t bytes{0};

    PinnedHostBuffer() = default;
    PinnedHostBuffer(const PinnedHostBuffer&) = delete;
    PinnedHostBuffer& operator=(const PinnedHostBuffer&) = delete;
    ~PinnedHostBuffer() { release(); }

    // Makes the buffer hold at least `nbytes` bytes.
    //   nbytes == 0  -> frees the current allocation (if any) and returns true.
    //   nbytes <= current capacity -> keeps the existing allocation, returns true.
    //   otherwise    -> frees the old allocation and allocates `nbytes` bytes.
    // Returns false if cudaHostAlloc fails; the buffer is then empty
    // (ptr == nullptr, bytes == 0). Existing contents are never preserved
    // across a reallocation.
    bool reset(size_t nbytes) {
        if (nbytes == 0) {
            release();
            return true;
        }
        if (ptr && bytes >= nbytes) return true;
        release();
        // Allocate into a local so a failed call cannot leave `ptr` in an
        // unspecified state.
        void* fresh = nullptr;
        if (cudaHostAlloc(&fresh, nbytes, cudaHostAllocDefault) != cudaSuccess) return false;
        ptr = fresh;
        bytes = nbytes;
        return true;
    }

private:
    // Frees the current allocation, if any, and returns to the empty state.
    void release() {
        if (ptr) {
            cudaFreeHost(ptr);
            ptr = nullptr;
        }
        bytes = 0;
    }
};
// Reads the entire file at `path` in binary mode.
//
// Returns the file contents, or an empty vector when `path` is null, the file
// cannot be opened, its size cannot be determined, it is empty, the buffer
// cannot be allocated, or fewer bytes than the reported size could be read.
// The function does not let allocation exceptions escape, because its caller
// is an extern "C" entry point.
std::vector<char> read_file(const char* path) {
    if (!path) return {};
    std::ifstream in(path, std::ios::binary);
    if (!in) return {};

    in.seekg(0, std::ios::end);
    // tellg() yields -1 on failure; casting that to size_t would request an
    // enormous allocation, so validate it as a signed offset first.
    const std::streamoff end = in.tellg();
    if (!in || end <= 0) return {};
    in.seekg(0, std::ios::beg);
    if (!in) return {};

    std::vector<char> data;
    try {
        data.resize(static_cast<size_t>(end));
    } catch (...) {
        // Size too large to allocate (bad_alloc / length_error): treat as unreadable.
        return {};
    }

    const std::streamsize wanted = static_cast<std::streamsize>(data.size());
    in.read(data.data(), wanted);
    // A short read would otherwise hand back a partially zero-filled buffer.
    if (in.gcount() != wanted) return {};
    return data;
}
// Scans the engine's I/O tensors for one whose I/O mode equals `mode` and whose
// name equals `preferred`. Returns the name pointer obtained from
// getIOTensorName for the first match, or nullptr when nothing matches.
const char* find_tensor(nvinfer1::ICudaEngine* engine, nvinfer1::TensorIOMode mode, const char* preferred) {
    int index = 0;
    while (index < engine->getNbIOTensors()) {
        const char* candidate = engine->getIOTensorName(index);
        const bool mode_matches = engine->getTensorIOMode(candidate) == mode;
        // Names are only compared once the mode already matches.
        if (mode_matches && std::strcmp(candidate, preferred) == 0) {
            return candidate;
        }
        ++index;
    }
    return nullptr;
}
// All state for one loaded engine: the TensorRT objects, the CUDA stream, the
// reusable device/host buffers and the tensor names used by trt_create/trt_infer.
//
// Members are destroyed in reverse declaration order after the destructor body
// runs, so the stream is destroyed first, then the buffers are freed, then
// context, engine and runtime are released in that order.
struct TrtContext {
// TensorRT objects, created in trt_create in this order.
std::unique_ptr<nvinfer1::IRuntime> runtime;
std::unique_ptr<nvinfer1::ICudaEngine> engine;
std::unique_ptr<nvinfer1::IExecutionContext> context;
// Held by trt_infer for the whole call, so inference calls on one context are serialized.
std::mutex mu;
// Created in trt_create with cudaStreamNonBlocking; all copies and the enqueue in trt_infer use it.
cudaStream_t stream{nullptr};
// Device-side input buffers (sized per call in trt_infer).
DeviceBuffer d_image;
DeviceBuffer d_im_shape;
DeviceBuffer d_scale_factor;
// Device-side output buffers. d_masks is bound as an output but never copied back to the host.
DeviceBuffer d_boxes;
DeviceBuffer d_counts;
DeviceBuffer d_masks;
// Pinned host staging buffers for the two outputs that trt_infer returns.
PinnedHostBuffer h_boxes;
PinnedHostBuffer h_counts;
// Tensor names looked up in the engine and used for shape/address binding.
std::string image_name{"image"};
std::string im_shape_name{"im_shape"};
std::string scale_factor_name{"scale_factor"};
std::string boxes_name{"fetch_name_0"};
std::string counts_name{"fetch_name_1"};
std::string masks_name{"fetch_name_2"};
// Largest batch trt_infer accepts; set in trt_create from profile 0's kMAX shape of the image tensor.
int max_batch{1};
// Destroys the CUDA stream if one was created.
~TrtContext() {
if (stream) {
cudaStreamDestroy(stream);
stream = nullptr;
}
}
};
} // namespace
extern "C" {
// Loads a serialized TensorRT engine from `engine_path` and builds everything
// trt_infer needs: runtime, engine, execution context and a non-blocking stream.
//
// The engine must expose the three input tensors and the boxes/counts output
// tensors under the names stored in TrtContext; otherwise creation fails.
// max_batch is taken from dimension 0 of the image tensor's kMAX shape in
// optimization profile 0 (at least 1).
//
// Returns a heap-allocated context to be released with trt_destroy, or nullptr
// on any failure (null path, unreadable/empty file, TensorRT or CUDA error,
// missing tensor, or a C++ exception such as an allocation failure).
TrtContext* trt_create(const char* engine_path) {
    if (!engine_path) return nullptr;
    // No exception may cross the extern "C" boundary, so everything that can
    // throw is contained here and reported as a plain failure.
    try {
        const auto data = read_file(engine_path);
        if (data.empty()) return nullptr;

        // unique_ptr releases the partially built context on every early return.
        std::unique_ptr<TrtContext> ctx(new TrtContext());

        ctx->runtime.reset(nvinfer1::createInferRuntime(g_logger));
        if (!ctx->runtime) return nullptr;

        ctx->engine.reset(ctx->runtime->deserializeCudaEngine(data.data(), data.size()));
        if (!ctx->engine) return nullptr;

        ctx->context.reset(ctx->engine->createExecutionContext());
        if (!ctx->context) return nullptr;

        if (cudaStreamCreateWithFlags(&ctx->stream, cudaStreamNonBlocking) != cudaSuccess) return nullptr;

        nvinfer1::ICudaEngine* engine = ctx->engine.get();
        const auto has_tensor = [engine](nvinfer1::TensorIOMode mode, const std::string& name) {
            return find_tensor(engine, mode, name.c_str()) != nullptr;
        };
        const bool io_complete =
            has_tensor(nvinfer1::TensorIOMode::kINPUT, ctx->image_name) &&
            has_tensor(nvinfer1::TensorIOMode::kINPUT, ctx->im_shape_name) &&
            has_tensor(nvinfer1::TensorIOMode::kINPUT, ctx->scale_factor_name) &&
            has_tensor(nvinfer1::TensorIOMode::kOUTPUT, ctx->boxes_name) &&
            has_tensor(nvinfer1::TensorIOMode::kOUTPUT, ctx->counts_name);
        if (!io_complete) return nullptr;

        const auto profile = ctx->engine->getProfileShape(
            ctx->image_name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX);
        // Only trust d[0] when the query actually produced dimensions.
        const int profile_batch = profile.nbDims > 0 ? static_cast<int>(profile.d[0]) : 1;
        ctx->max_batch = std::max(1, profile_batch);

        return ctx.release();
    } catch (...) {
        return nullptr;
    }
}
// Deletes a context returned by trt_create. Passing nullptr does nothing.
void trt_destroy(TrtContext* ctx) {
    if (ctx == nullptr) return;
    delete ctx;
}
// Returns the context's max_batch value, or 0 when `ctx` is null.
int trt_max_batch(TrtContext* ctx) {
    if (ctx == nullptr) {
        return 0;
    }
    return ctx->max_batch;
}
// Runs one inference for `batch` images and copies the boxes and counts
// outputs back to the caller. Calls on the same context are serialized by
// the context mutex.
//
// Buffer sizes, in elements (all host pointers, must be non-null):
//   image        batch * 3 * 800 * 800 floats   (input)
//   im_shape     batch * 2 floats               (input)
//   scale_factor batch * 2 floats               (input)
//   boxes_out    batch * 300 * 7 floats         (output)
//   counts_out   batch int32 values             (output)
// The mask output is bound to a device scratch buffer and never copied back.
//
// Returns 0 on success, otherwise:
//   -1  invalid argument (null context/pointer, batch <= 0 or > max_batch)
//   -2  buffer allocation failed
//   -3  an input shape was rejected
//   -4  queuing a host-to-device copy failed
//   -5  binding a required tensor address or enqueueing the engine failed
//   -6  queuing a device-to-host copy failed
//   -7  stream synchronization failed
// The output buffers are only written when 0 is returned.
int trt_infer(
    TrtContext* ctx,
    const float* image,
    const float* im_shape,
    const float* scale_factor,
    int batch,
    float* boxes_out,
    int32_t* counts_out) {
    if (!ctx || batch <= 0 || batch > ctx->max_batch) return -1;
    if (!image || !im_shape || !scale_factor || !boxes_out || !counts_out) return -1;
    std::lock_guard<std::mutex> lock(ctx->mu);

    // Fixed tensor geometry used for sizing and shape binding.
    constexpr int kChannels = 3;
    constexpr int kImageSide = 800;
    constexpr int kMetaFields = 2;
    constexpr int kMaxBoxes = 300;
    constexpr int kBoxFields = 7;
    constexpr int kMaskSide = 200;

    const size_t n = static_cast<size_t>(batch);
    const size_t image_bytes = n * kChannels * kImageSide * kImageSide * sizeof(float);
    const size_t meta_bytes = n * kMetaFields * sizeof(float);
    const size_t boxes_bytes = n * kMaxBoxes * kBoxFields * sizeof(float);
    const size_t counts_bytes = n * sizeof(int32_t);
    const size_t masks_bytes = n * kMaxBoxes * kMaskSide * kMaskSide * sizeof(int32_t);

    const bool buffers_ready =
        ctx->d_image.reset(image_bytes) && ctx->d_im_shape.reset(meta_bytes) &&
        ctx->d_scale_factor.reset(meta_bytes) && ctx->d_boxes.reset(boxes_bytes) &&
        ctx->d_counts.reset(counts_bytes) && ctx->d_masks.reset(masks_bytes) &&
        ctx->h_boxes.reset(boxes_bytes) && ctx->h_counts.reset(counts_bytes);
    if (!buffers_ready) return -2;

    nvinfer1::IExecutionContext& exec = *ctx->context;
    const bool shapes_set =
        exec.setInputShape(ctx->image_name.c_str(), nvinfer1::Dims4{batch, kChannels, kImageSide, kImageSide}) &&
        exec.setInputShape(ctx->im_shape_name.c_str(), nvinfer1::Dims2{batch, kMetaFields}) &&
        exec.setInputShape(ctx->scale_factor_name.c_str(), nvinfer1::Dims2{batch, kMetaFields});
    if (!shapes_set) return -3;

    // From here on work may already be queued on the stream. Before reporting
    // a failure, wait for it so nothing is still reading the caller's input
    // buffers or the context's buffers after this function returns.
    cudaStream_t stream = ctx->stream;
    const auto fail_after_drain = [stream](int code) {
        cudaStreamSynchronize(stream);
        return code;
    };

    if (cudaMemcpyAsync(ctx->d_image.ptr, image, image_bytes, cudaMemcpyHostToDevice, stream) != cudaSuccess ||
        cudaMemcpyAsync(ctx->d_im_shape.ptr, im_shape, meta_bytes, cudaMemcpyHostToDevice, stream) != cudaSuccess ||
        cudaMemcpyAsync(ctx->d_scale_factor.ptr, scale_factor, meta_bytes, cudaMemcpyHostToDevice, stream) != cudaSuccess) {
        return fail_after_drain(-4);
    }

    // These five tensors were verified to exist in trt_create, so a failed
    // binding is a real error rather than something to ignore.
    const bool required_bound =
        exec.setTensorAddress(ctx->image_name.c_str(), ctx->d_image.ptr) &&
        exec.setTensorAddress(ctx->im_shape_name.c_str(), ctx->d_im_shape.ptr) &&
        exec.setTensorAddress(ctx->scale_factor_name.c_str(), ctx->d_scale_factor.ptr) &&
        exec.setTensorAddress(ctx->boxes_name.c_str(), ctx->d_boxes.ptr) &&
        exec.setTensorAddress(ctx->counts_name.c_str(), ctx->d_counts.ptr);
    if (!required_bound) return fail_after_drain(-5);

    // The mask output is required by the engine but not by layout consumers. Keep it on-device.
    // Its presence is not checked in trt_create, so the result stays unchecked here as before.
    (void)exec.setTensorAddress(ctx->masks_name.c_str(), ctx->d_masks.ptr);

    if (!exec.enqueueV3(stream)) return fail_after_drain(-5);

    if (cudaMemcpyAsync(ctx->h_boxes.ptr, ctx->d_boxes.ptr, boxes_bytes, cudaMemcpyDeviceToHost, stream) != cudaSuccess ||
        cudaMemcpyAsync(ctx->h_counts.ptr, ctx->d_counts.ptr, counts_bytes, cudaMemcpyDeviceToHost, stream) != cudaSuccess) {
        return fail_after_drain(-6);
    }
    if (cudaStreamSynchronize(stream) != cudaSuccess) return -7;

    // Results are staged in pinned memory; hand them to the caller's buffers.
    std::memcpy(boxes_out, ctx->h_boxes.ptr, boxes_bytes);
    std::memcpy(counts_out, ctx->h_counts.ptr, counts_bytes);
    return 0;
}
} // extern "C"