Add pp-doclayout server source with score threshold

3c0d3e1 verified 7 days ago

7.39 kB

	#include <NvInfer.h>
	#include <cuda_runtime_api.h>

	#include <algorithm>
	#include <cstdint>
	#include <cstdio>
	#include <fstream>
	#include <cstring>
	#include <memory>
	#include <mutex>
	#include <string>
	#include <vector>

	namespace {
	// TensorRT logger that forwards the more severe diagnostics to stderr.
	class Logger final : public nvinfer1::ILogger {
	public:
		// ILogger callback. Prints `msg` prefixed with "[trt]" when `severity`
		// compares <= Severity::kWARNING; everything else is dropped.
		void log(Severity severity, const char* msg) noexcept override {
			const bool below_threshold = severity > Severity::kWARNING;
			if (below_threshold) return;
			std::fprintf(stderr, "[trt] %s\n", msg);
		}
	};

	// Logger instance passed to nvinfer1::createInferRuntime() in trt_create().
	Logger g_logger;

	// Owning handle for a block of device memory obtained from cudaMalloc.
	// reset() reuses the existing allocation when it already holds at least the
	// requested number of bytes, so repeated calls with the same size do not reallocate.
	struct DeviceBuffer {
		void* ptr{nullptr};
		size_t bytes{0};

		DeviceBuffer() = default;
		// Non-copyable: a copy would make two handles cudaFree the same pointer.
		DeviceBuffer(const DeviceBuffer&) = delete;
		DeviceBuffer& operator=(const DeviceBuffer&) = delete;

		~DeviceBuffer() { release(); }

		// Ensures the buffer can hold `nbytes`.
		//   nbytes == 0 : frees any current allocation and returns true.
		//   nbytes  > 0 : keeps the current allocation if bytes >= nbytes, otherwise frees it
		//                 and allocates exactly `nbytes` (old contents are not carried over).
		// Returns false when cudaMalloc fails; the buffer is then empty (ptr == nullptr, bytes == 0).
		bool reset(size_t nbytes) {
			if (nbytes == 0) {
				release();
				return true;
			}
			if (ptr && bytes >= nbytes) return true;
			release();
			if (cudaMalloc(&ptr, nbytes) != cudaSuccess) {
				// Do not rely on cudaMalloc leaving the out-parameter untouched on failure.
				ptr = nullptr;
				return false;
			}
			bytes = nbytes;
			return true;
		}

	private:
		// Frees the current allocation, if any, and returns the handle to the empty state.
		void release() {
			if (!ptr) return;
			cudaFree(ptr);
			ptr = nullptr;
			bytes = 0;
		}
	};

	// Owning handle for host memory obtained from cudaHostAlloc (cudaHostAllocDefault).
	// reset() reuses the existing allocation when it already holds at least the
	// requested number of bytes, so repeated calls with the same size do not reallocate.
	struct PinnedHostBuffer {
		void* ptr{nullptr};
		size_t bytes{0};

		PinnedHostBuffer() = default;
		// Non-copyable: a copy would make two handles cudaFreeHost the same pointer.
		PinnedHostBuffer(const PinnedHostBuffer&) = delete;
		PinnedHostBuffer& operator=(const PinnedHostBuffer&) = delete;

		~PinnedHostBuffer() { release(); }

		// Ensures the buffer can hold `nbytes`.
		//   nbytes == 0 : frees any current allocation and returns true.
		//   nbytes  > 0 : keeps the current allocation if bytes >= nbytes, otherwise frees it
		//                 and allocates exactly `nbytes` (old contents are not carried over).
		// Returns false when cudaHostAlloc fails; the buffer is then empty (ptr == nullptr, bytes == 0).
		bool reset(size_t nbytes) {
			if (nbytes == 0) {
				release();
				return true;
			}
			if (ptr && bytes >= nbytes) return true;
			release();
			if (cudaHostAlloc(&ptr, nbytes, cudaHostAllocDefault) != cudaSuccess) {
				// Do not rely on cudaHostAlloc leaving the out-parameter untouched on failure.
				ptr = nullptr;
				return false;
			}
			bytes = nbytes;
			return true;
		}

	private:
		// Frees the current allocation, if any, and returns the handle to the empty state.
		void release() {
			if (!ptr) return;
			cudaFreeHost(ptr);
			ptr = nullptr;
			bytes = 0;
		}
	};

	// Reads the entire file at `path` into memory as raw bytes.
	// Returns an empty vector when `path` is null, the file cannot be opened, its size
	// cannot be determined, or fewer bytes than the reported size could be read.
	// An empty file also yields an empty vector.
	std::vector<char> read_file(const char* path) {
		if (!path) return {};
		std::ifstream in(path, std::ios::binary);
		if (!in) return {};
		in.seekg(0, std::ios::end);
		const std::streamoff end = in.tellg();
		// tellg() signals failure with -1; casting that to size_t would request a huge allocation.
		if (!in || end <= 0) return {};
		in.seekg(0, std::ios::beg);
		std::vector<char> data(static_cast<size_t>(end));
		const std::streamsize wanted = static_cast<std::streamsize>(data.size());
		in.read(data.data(), wanted);
		// A short read would leave zero-filled bytes at the tail; report it as a failed load.
		if (in.gcount() != wanted) return {};
		return data;
	}

	// Searches the engine's I/O tensors for one whose direction equals `mode` and whose
	// name equals `preferred`. Returns the name pointer reported by the engine for the
	// first match, or nullptr when no I/O tensor matches.
	const char* find_tensor(nvinfer1::ICudaEngine* engine, nvinfer1::TensorIOMode mode, const char* preferred) {
		for (int index = 0; index < engine->getNbIOTensors(); ++index) {
			const char* candidate = engine->getIOTensorName(index);
			// Direction is checked first; the name is only compared for tensors of the right mode.
			if (engine->getTensorIOMode(candidate) != mode) continue;
			if (std::string(candidate).compare(preferred) == 0) return candidate;
		}
		return nullptr;
	}

	// Everything one loaded engine needs: TensorRT objects, a CUDA stream, reusable
	// device/host buffers and the tensor names used for binding.
	//
	// Member order is significant: members are destroyed in reverse declaration order,
	// so the execution context is released before the engine, and the engine before the runtime.
	struct TrtContext {
		std::unique_ptr<nvinfer1::IRuntime> runtime;
		std::unique_ptr<nvinfer1::ICudaEngine> engine;
		std::unique_ptr<nvinfer1::IExecutionContext> context;

		// Held by trt_infer() for the duration of a call.
		std::mutex mu;
		// Created in trt_create(); null until then.
		cudaStream_t stream = nullptr;

		// Device-side storage for the three inputs and three outputs.
		DeviceBuffer d_image;
		DeviceBuffer d_im_shape;
		DeviceBuffer d_scale_factor;
		DeviceBuffer d_boxes;
		DeviceBuffer d_counts;
		DeviceBuffer d_masks;

		// Host staging buffers for the two outputs that are copied back.
		PinnedHostBuffer h_boxes;
		PinnedHostBuffer h_counts;

		// Tensor names looked up in the engine and used when binding addresses.
		std::string image_name = "image";
		std::string im_shape_name = "im_shape";
		std::string scale_factor_name = "scale_factor";
		std::string boxes_name = "fetch_name_0";
		std::string counts_name = "fetch_name_1";
		std::string masks_name = "fetch_name_2";

		// Upper bound on `batch` accepted by trt_infer(); set from the engine profile in trt_create().
		int max_batch = 1;

		// Destroys the stream if one was created. The destructor body runs before any
		// member is destroyed, so the stream goes first.
		~TrtContext() {
			if (!stream) return;
			cudaStreamDestroy(stream);
			stream = nullptr;
		}
	};
	} // namespace

	extern "C" {

	// Loads a serialized TensorRT engine from `engine_path` and prepares it for trt_infer().
	//
	// Returns a heap-allocated context that must be released with trt_destroy(), or nullptr when:
	//   - `engine_path` is null or the file is missing/empty/unreadable,
	//   - the runtime, engine or execution context cannot be created,
	//   - the CUDA stream cannot be created,
	//   - the engine lacks any of the inputs "image", "im_shape", "scale_factor"
	//     or the outputs "fetch_name_0", "fetch_name_1",
	//   - an exception (e.g. allocation failure) is raised along the way.
	//
	// max_batch is taken from dimension 0 of the kMAX shape of optimization profile 0 for the
	// image input, clamped to at least 1.
	TrtContext* trt_create(const char* engine_path) {
		if (!engine_path) return nullptr;
		try {
			const std::vector<char> data = read_file(engine_path);
			if (data.empty()) return nullptr;

			// unique_ptr owns the context until every step has succeeded, so each early
			// return below cleans up without a manual delete.
			std::unique_ptr<TrtContext> ctx(new TrtContext());

			ctx->runtime.reset(nvinfer1::createInferRuntime(g_logger));
			if (!ctx->runtime) return nullptr;
			ctx->engine.reset(ctx->runtime->deserializeCudaEngine(data.data(), data.size()));
			if (!ctx->engine) return nullptr;
			ctx->context.reset(ctx->engine->createExecutionContext());
			if (!ctx->context) return nullptr;

			if (cudaStreamCreateWithFlags(&ctx->stream, cudaStreamNonBlocking) != cudaSuccess) {
				// Make sure the destructor does not try to destroy a handle that was never created.
				ctx->stream = nullptr;
				return nullptr;
			}

			nvinfer1::ICudaEngine* engine = ctx->engine.get();
			const bool has_required_io =
				find_tensor(engine, nvinfer1::TensorIOMode::kINPUT, ctx->image_name.c_str()) &&
				find_tensor(engine, nvinfer1::TensorIOMode::kINPUT, ctx->im_shape_name.c_str()) &&
				find_tensor(engine, nvinfer1::TensorIOMode::kINPUT, ctx->scale_factor_name.c_str()) &&
				find_tensor(engine, nvinfer1::TensorIOMode::kOUTPUT, ctx->boxes_name.c_str()) &&
				find_tensor(engine, nvinfer1::TensorIOMode::kOUTPUT, ctx->counts_name.c_str());
			if (!has_required_io) return nullptr;

			const nvinfer1::Dims max_shape =
				engine->getProfileShape(ctx->image_name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX);
			// Only trust d[0] when the query actually produced a shape.
			const int profile_batch = max_shape.nbDims > 0 ? static_cast<int>(max_shape.d[0]) : 1;
			ctx->max_batch = std::max(1, profile_batch);

			return ctx.release();
		} catch (...) {
			// C ABI entry point: exceptions must not propagate to the caller.
			return nullptr;
		}
	}

	// Releases a context returned by trt_create(). A null `ctx` is ignored.
	void trt_destroy(TrtContext* ctx) {
		if (ctx == nullptr) return;
		delete ctx;
	}

	// Returns the largest `batch` value trt_infer() accepts for `ctx`, or 0 when `ctx` is null.
	int trt_max_batch(TrtContext* ctx) {
		if (!ctx) return 0;
		return ctx->max_batch;
	}

	// Runs one synchronous inference for `batch` images and copies the box and count
	// outputs back to the caller. Calls on the same context are serialized by ctx->mu.
	//
	// Parameters (all host pointers, sized by `batch`):
	//   image        : batch * 3 * 800 * 800 floats
	//   im_shape     : batch * 2 floats
	//   scale_factor : batch * 2 floats
	//   boxes_out    : receives batch * 300 * 7 floats
	//   counts_out   : receives batch int32 values
	//
	// Returns 0 on success, otherwise:
	//   -1  null context/pointer, or batch outside [1, ctx->max_batch]
	//   -2  a device or pinned-host buffer could not be allocated
	//   -3  the execution context rejected an input shape
	//   -4  a host-to-device copy could not be enqueued
	//   -5  a required tensor address could not be bound, or enqueueV3 failed
	//   -6  a device-to-host copy could not be enqueued
	//   -7  stream synchronization failed
	int trt_infer(
		TrtContext* ctx,
		const float* image,
		const float* im_shape,
		const float* scale_factor,
		int batch,
		float* boxes_out,
		int32_t* counts_out) {
		// Tensor geometry hard-wired into this wrapper.
		// NOTE(review): presumably 300 candidate detections of 7 values each and 200x200
		// masks per detection — confirm against the exported model.
		constexpr int kImageChannels = 3;
		constexpr int kImageSide = 800;
		constexpr int kMetaFields = 2;
		constexpr int kBoxRows = 300;
		constexpr int kBoxFields = 7;
		constexpr int kMaskSide = 200;

		if (!ctx || batch <= 0 || batch > ctx->max_batch) return -1;
		if (!image || !im_shape || !scale_factor || !boxes_out || !counts_out) return -1;
		std::lock_guard<std::mutex> lock(ctx->mu);

		const size_t n = static_cast<size_t>(batch);
		const size_t image_bytes = n * kImageChannels * kImageSide * kImageSide * sizeof(float);
		const size_t meta_bytes = n * kMetaFields * sizeof(float);
		const size_t boxes_bytes = n * kBoxRows * kBoxFields * sizeof(float);
		const size_t counts_bytes = n * sizeof(int32_t);
		const size_t masks_bytes = n * kBoxRows * kMaskSide * kMaskSide * sizeof(int32_t);

		// Buffers are reused across calls and only reallocated when a larger batch arrives.
		if (!ctx->d_image.reset(image_bytes) || !ctx->d_im_shape.reset(meta_bytes) ||
			!ctx->d_scale_factor.reset(meta_bytes) || !ctx->d_boxes.reset(boxes_bytes) ||
			!ctx->d_counts.reset(counts_bytes) || !ctx->d_masks.reset(masks_bytes) ||
			!ctx->h_boxes.reset(boxes_bytes) || !ctx->h_counts.reset(counts_bytes)) {
			return -2;
		}

		nvinfer1::IExecutionContext* exec = ctx->context.get();
		const char* image_name = ctx->image_name.c_str();
		const char* im_shape_name = ctx->im_shape_name.c_str();
		const char* scale_factor_name = ctx->scale_factor_name.c_str();
		const char* boxes_name = ctx->boxes_name.c_str();
		const char* counts_name = ctx->counts_name.c_str();

		if (!exec->setInputShape(image_name, nvinfer1::Dims4{batch, kImageChannels, kImageSide, kImageSide}) ||
			!exec->setInputShape(im_shape_name, nvinfer1::Dims2{batch, kMetaFields}) ||
			!exec->setInputShape(scale_factor_name, nvinfer1::Dims2{batch, kMetaFields})) {
			return -3;
		}

		if (cudaMemcpyAsync(ctx->d_image.ptr, image, image_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess ||
			cudaMemcpyAsync(ctx->d_im_shape.ptr, im_shape, meta_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess ||
			cudaMemcpyAsync(ctx->d_scale_factor.ptr, scale_factor, meta_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess) {
			return -4;
		}

		// A failed bind must not be ignored: the context could otherwise keep an address
		// from an earlier call that has since been freed by a buffer reallocation.
		if (!exec->setTensorAddress(image_name, ctx->d_image.ptr) ||
			!exec->setTensorAddress(im_shape_name, ctx->d_im_shape.ptr) ||
			!exec->setTensorAddress(scale_factor_name, ctx->d_scale_factor.ptr) ||
			!exec->setTensorAddress(boxes_name, ctx->d_boxes.ptr) ||
			!exec->setTensorAddress(counts_name, ctx->d_counts.ptr)) {
			return -5;
		}
		// The mask output is required by the engine but not by layout consumers. Keep it on-device.
		// trt_create() does not verify that this output exists, so the result of this bind is
		// deliberately not treated as fatal.
		(void)exec->setTensorAddress(ctx->masks_name.c_str(), ctx->d_masks.ptr);

		if (!exec->enqueueV3(ctx->stream)) return -5;

		// Stage results in pinned memory on the same stream, then wait for everything at once.
		if (cudaMemcpyAsync(ctx->h_boxes.ptr, ctx->d_boxes.ptr, boxes_bytes, cudaMemcpyDeviceToHost, ctx->stream) != cudaSuccess ||
			cudaMemcpyAsync(ctx->h_counts.ptr, ctx->d_counts.ptr, counts_bytes, cudaMemcpyDeviceToHost, ctx->stream) != cudaSuccess) {
			return -6;
		}
		if (cudaStreamSynchronize(ctx->stream) != cudaSuccess) return -7;

		std::memcpy(boxes_out, ctx->h_boxes.ptr, boxes_bytes);
		std::memcpy(counts_out, ctx->h_counts.ptr, counts_bytes);
		return 0;
	}

	} // extern "C"