Instructions to use bndos/pp-doclayout-v3-trt with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- TensorRT
How to use bndos/pp-doclayout-v3-trt with TensorRT:
# No code snippets are available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| namespace { | |
| class Logger final : public nvinfer1::ILogger { | |
| public: | |
| void log(Severity severity, const char* msg) noexcept override { | |
| if (severity <= Severity::kWARNING) { | |
| std::fprintf(stderr, "[trt] %s\n", msg); | |
| } | |
| } | |
| }; | |
| Logger g_logger; | |
| struct DeviceBuffer { | |
| void* ptr{nullptr}; | |
| size_t bytes{0}; | |
| ~DeviceBuffer() { reset(0); } | |
| bool reset(size_t nbytes) { | |
| if (nbytes == 0) { | |
| if (ptr) { | |
| cudaFree(ptr); | |
| ptr = nullptr; | |
| bytes = 0; | |
| } | |
| return true; | |
| } | |
| if (ptr && bytes >= nbytes) return true; | |
| if (ptr) { | |
| cudaFree(ptr); | |
| ptr = nullptr; | |
| bytes = 0; | |
| } | |
| if (cudaMalloc(&ptr, nbytes) != cudaSuccess) return false; | |
| bytes = nbytes; | |
| return true; | |
| } | |
| }; | |
| struct PinnedHostBuffer { | |
| void* ptr{nullptr}; | |
| size_t bytes{0}; | |
| ~PinnedHostBuffer() { reset(0); } | |
| bool reset(size_t nbytes) { | |
| if (nbytes == 0) { | |
| if (ptr) { | |
| cudaFreeHost(ptr); | |
| ptr = nullptr; | |
| bytes = 0; | |
| } | |
| return true; | |
| } | |
| if (ptr && bytes >= nbytes) return true; | |
| if (ptr) { | |
| cudaFreeHost(ptr); | |
| ptr = nullptr; | |
| bytes = 0; | |
| } | |
| if (cudaHostAlloc(&ptr, nbytes, cudaHostAllocDefault) != cudaSuccess) return false; | |
| bytes = nbytes; | |
| return true; | |
| } | |
| }; | |
// Reads the whole file at `path` in binary mode.
//
// Returns the file contents, or an empty vector when `path` is null, the file
// cannot be opened, its size cannot be determined, it is empty, or fewer bytes
// than expected could be read.
std::vector<char> read_file(const char* path) {
    if (path == nullptr) return {};
    std::ifstream in(path, std::ios::binary);
    if (!in) return {};

    in.seekg(0, std::ios::end);
    // tellg() reports failure as -1; casting that straight to size_t would
    // request an enormous allocation.
    const std::streamoff end = in.tellg();
    if (end <= 0) return {};
    in.seekg(0, std::ios::beg);

    std::vector<char> data(static_cast<size_t>(end));
    // A short or failed read must not be returned as if it were the full file.
    if (!in.read(data.data(), static_cast<std::streamsize>(data.size()))) return {};
    return data;
}
| const char* find_tensor(nvinfer1::ICudaEngine* engine, nvinfer1::TensorIOMode mode, const char* preferred) { | |
| for (int i = 0; i < engine->getNbIOTensors(); ++i) { | |
| const char* name = engine->getIOTensorName(i); | |
| if (engine->getTensorIOMode(name) == mode && std::string(name) == preferred) return name; | |
| } | |
| return nullptr; | |
| } | |
| struct TrtContext { | |
| std::unique_ptr<nvinfer1::IRuntime> runtime; | |
| std::unique_ptr<nvinfer1::ICudaEngine> engine; | |
| std::unique_ptr<nvinfer1::IExecutionContext> context; | |
| std::mutex mu; | |
| cudaStream_t stream{nullptr}; | |
| DeviceBuffer d_image; | |
| DeviceBuffer d_im_shape; | |
| DeviceBuffer d_scale_factor; | |
| DeviceBuffer d_boxes; | |
| DeviceBuffer d_counts; | |
| DeviceBuffer d_masks; | |
| PinnedHostBuffer h_boxes; | |
| PinnedHostBuffer h_counts; | |
| std::string image_name{"image"}; | |
| std::string im_shape_name{"im_shape"}; | |
| std::string scale_factor_name{"scale_factor"}; | |
| std::string boxes_name{"fetch_name_0"}; | |
| std::string counts_name{"fetch_name_1"}; | |
| std::string masks_name{"fetch_name_2"}; | |
| int max_batch{1}; | |
| ~TrtContext() { | |
| if (stream) { | |
| cudaStreamDestroy(stream); | |
| stream = nullptr; | |
| } | |
| } | |
| }; | |
| } // namespace | |
| extern "C" { | |
| TrtContext* trt_create(const char* engine_path) { | |
| auto data = read_file(engine_path); | |
| if (data.empty()) return nullptr; | |
| auto* ctx = new TrtContext(); | |
| ctx->runtime.reset(nvinfer1::createInferRuntime(g_logger)); | |
| if (!ctx->runtime) { delete ctx; return nullptr; } | |
| ctx->engine.reset(ctx->runtime->deserializeCudaEngine(data.data(), data.size())); | |
| if (!ctx->engine) { delete ctx; return nullptr; } | |
| ctx->context.reset(ctx->engine->createExecutionContext()); | |
| if (!ctx->context) { delete ctx; return nullptr; } | |
| if (cudaStreamCreateWithFlags(&ctx->stream, cudaStreamNonBlocking) != cudaSuccess) { delete ctx; return nullptr; } | |
| if (!find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kINPUT, ctx->image_name.c_str()) || | |
| !find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kINPUT, ctx->im_shape_name.c_str()) || | |
| !find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kINPUT, ctx->scale_factor_name.c_str()) || | |
| !find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kOUTPUT, ctx->boxes_name.c_str()) || | |
| !find_tensor(ctx->engine.get(), nvinfer1::TensorIOMode::kOUTPUT, ctx->counts_name.c_str())) { | |
| delete ctx; | |
| return nullptr; | |
| } | |
| auto profile = ctx->engine->getProfileShape(ctx->image_name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX); | |
| ctx->max_batch = std::max(1, static_cast<int>(profile.d[0])); | |
| return ctx; | |
| } | |
| void trt_destroy(TrtContext* ctx) { delete ctx; } | |
| int trt_max_batch(TrtContext* ctx) { return ctx ? ctx->max_batch : 0; } | |
| int trt_infer( | |
| TrtContext* ctx, | |
| const float* image, | |
| const float* im_shape, | |
| const float* scale_factor, | |
| int batch, | |
| float* boxes_out, | |
| int32_t* counts_out) { | |
| if (!ctx || batch <= 0 || batch > ctx->max_batch) return -1; | |
| std::lock_guard<std::mutex> lock(ctx->mu); | |
| const size_t image_bytes = static_cast<size_t>(batch) * 3 * 800 * 800 * sizeof(float); | |
| const size_t meta_bytes = static_cast<size_t>(batch) * 2 * sizeof(float); | |
| const size_t boxes_bytes = static_cast<size_t>(batch) * 300 * 7 * sizeof(float); | |
| const size_t counts_bytes = static_cast<size_t>(batch) * sizeof(int32_t); | |
| const size_t masks_bytes = static_cast<size_t>(batch) * 300 * 200 * 200 * sizeof(int32_t); | |
| if (!ctx->d_image.reset(image_bytes) || !ctx->d_im_shape.reset(meta_bytes) || | |
| !ctx->d_scale_factor.reset(meta_bytes) || !ctx->d_boxes.reset(boxes_bytes) || | |
| !ctx->d_counts.reset(counts_bytes) || !ctx->d_masks.reset(masks_bytes) || | |
| !ctx->h_boxes.reset(boxes_bytes) || !ctx->h_counts.reset(counts_bytes)) { | |
| return -2; | |
| } | |
| if (!ctx->context->setInputShape(ctx->image_name.c_str(), nvinfer1::Dims4{batch, 3, 800, 800}) || | |
| !ctx->context->setInputShape(ctx->im_shape_name.c_str(), nvinfer1::Dims2{batch, 2}) || | |
| !ctx->context->setInputShape(ctx->scale_factor_name.c_str(), nvinfer1::Dims2{batch, 2})) { | |
| return -3; | |
| } | |
| if (cudaMemcpyAsync(ctx->d_image.ptr, image, image_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess || | |
| cudaMemcpyAsync(ctx->d_im_shape.ptr, im_shape, meta_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess || | |
| cudaMemcpyAsync(ctx->d_scale_factor.ptr, scale_factor, meta_bytes, cudaMemcpyHostToDevice, ctx->stream) != cudaSuccess) { | |
| return -4; | |
| } | |
| ctx->context->setTensorAddress(ctx->image_name.c_str(), ctx->d_image.ptr); | |
| ctx->context->setTensorAddress(ctx->im_shape_name.c_str(), ctx->d_im_shape.ptr); | |
| ctx->context->setTensorAddress(ctx->scale_factor_name.c_str(), ctx->d_scale_factor.ptr); | |
| ctx->context->setTensorAddress(ctx->boxes_name.c_str(), ctx->d_boxes.ptr); | |
| ctx->context->setTensorAddress(ctx->counts_name.c_str(), ctx->d_counts.ptr); | |
| // The mask output is required by the engine but not by layout consumers. Keep it on-device. | |
| ctx->context->setTensorAddress(ctx->masks_name.c_str(), ctx->d_masks.ptr); | |
| if (!ctx->context->enqueueV3(ctx->stream)) return -5; | |
| if (cudaMemcpyAsync(ctx->h_boxes.ptr, ctx->d_boxes.ptr, boxes_bytes, cudaMemcpyDeviceToHost, ctx->stream) != cudaSuccess || | |
| cudaMemcpyAsync(ctx->h_counts.ptr, ctx->d_counts.ptr, counts_bytes, cudaMemcpyDeviceToHost, ctx->stream) != cudaSuccess) { | |
| return -6; | |
| } | |
| if (cudaStreamSynchronize(ctx->stream) != cudaSuccess) return -7; | |
| std::memcpy(boxes_out, ctx->h_boxes.ptr, boxes_bytes); | |
| std::memcpy(counts_out, ctx->h_counts.ptr, counts_bytes); | |
| return 0; | |
| } | |
| } // extern "C" | |