Instructions to use bndos/pp-doclayout-v3-trt with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- TensorRT
How to use bndos/pp-doclayout-v3-trt with TensorRT:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import argparse | |
| import concurrent.futures as cf | |
| import json | |
| import statistics | |
| import time | |
| from pathlib import Path | |
| import requests | |
def call(url: str) -> tuple[float, int, float]:
    """Send one inference request to *url* and time the round trip.

    Posts ``{"return_boxes": True}`` as JSON with a 30-second timeout and
    reads the first entry of the ``"results"`` list in the JSON reply.

    Args:
        url: Endpoint that receives the POST request.

    Returns:
        A ``(latency_ms, batch_size, infer_ms)`` tuple: the wall-clock
        duration of the POST in milliseconds, the entry's ``"batch_size"``
        as an int (0 when the key is absent), and the entry's
        ``"infer_us"`` value divided by 1000 (0.0 when the key is absent).

    Raises:
        requests.HTTPError: If the response carries an error status.
    """
    began = time.perf_counter()
    response = requests.post(url, json={"return_boxes": True}, timeout=30)
    response.raise_for_status()
    # The clock stops before the body is decoded, so JSON parsing is not
    # part of the reported latency.
    elapsed_ms = (time.perf_counter() - began) * 1000.0

    first = response.json()["results"][0]
    observed_batch = int(first.get("batch_size", 0))
    engine_ms = float(first.get("infer_us", 0.0)) / 1000.0
    return elapsed_ms, observed_batch, engine_ms
def _nearest_rank(ordered: list[float], percent: int) -> float:
    """Return the nearest-rank *percent*-th percentile of ascending *ordered*."""
    # Integer ceiling division gives rank = ceil(len * percent / 100) without
    # float rounding; flooring instead would under-report small samples
    # (e.g. p95 of two values would be the minimum).
    rank = (len(ordered) * percent + 99) // 100
    return ordered[max(rank, 1) - 1]


def run(url: str, concurrency: int, requests_count: int, warmup: int) -> dict:
    """Benchmark *url* with concurrent requests and summarize the results.

    Issues ``warmup`` sequential calls whose results are discarded, then
    submits ``requests_count`` calls to a thread pool of ``concurrency``
    workers and collects each call's latency, reported batch size and
    reported inference time as the calls complete.

    Args:
        url: Endpoint passed to every ``call``.
        concurrency: Number of worker threads in the pool.
        requests_count: Number of timed requests to submit.
        warmup: Number of untimed sequential requests sent first.

    Returns:
        A dict with the fixed ``"server"`` label, the run parameters, the
        elapsed wall-clock seconds, throughput in pages per second, latency
        statistics in milliseconds (median, nearest-rank p95/p99, min, max),
        and the means of the reported batch sizes and inference times.

    Raises:
        statistics.StatisticsError: If no timed request was made
            (``requests_count`` below 1), since there is nothing to summarize.

    Any exception raised by an individual ``call`` propagates unchanged.
    """
    for _ in range(warmup):
        call(url)

    latencies: list[float] = []
    batch_sizes: list[int] = []
    infer_ms: list[float] = []

    start = time.perf_counter()
    with cf.ThreadPoolExecutor(max_workers=concurrency) as ex:
        futs = [ex.submit(call, url) for _ in range(requests_count)]
        for fut in cf.as_completed(futs):
            # result() re-raises whatever the worker raised.
            lat, batch_size, infer = fut.result()
            latencies.append(lat)
            batch_sizes.append(batch_size)
            infer_ms.append(infer)
    elapsed = time.perf_counter() - start

    latencies.sort()
    # The median is computed first so an empty sample fails with
    # StatisticsError before any indexing is attempted.
    p50 = statistics.median(latencies)
    return {
        "server": "rust_dynamic_batcher",
        "concurrency": concurrency,
        "requests": requests_count,
        "pages": requests_count,
        "elapsed_s": elapsed,
        "pages_per_s": requests_count / elapsed,
        "p50_ms": p50,
        "p95_ms": _nearest_rank(latencies, 95),
        "p99_ms": _nearest_rank(latencies, 99),
        "min_ms": latencies[0],
        "max_ms": latencies[-1],
        "avg_observed_batch": sum(batch_sizes) / len(batch_sizes),
        "avg_engine_infer_ms_per_batch": sum(infer_ms) / len(infer_ms),
    }
def main() -> None:
    """Parse command-line options, run the benchmark, and report the summary.

    The summary is printed to stdout as indented JSON. When ``--output`` is
    given, the same JSON plus a trailing newline is written to that path as
    UTF-8, creating missing parent directories first.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", default="http://localhost:18082/v1/infer")
    parser.add_argument("--concurrency", type=int, required=True)
    parser.add_argument("--requests", type=int, default=200)
    parser.add_argument("--warmup", type=int, default=10)
    parser.add_argument("--output", type=Path)
    opts = parser.parse_args()

    summary = run(opts.url, opts.concurrency, opts.requests, opts.warmup)
    # Render once; the same text goes to stdout and to the optional file.
    rendered = json.dumps(summary, indent=2)
    print(rendered)

    destination = opts.output
    if not destination:
        return
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(rendered + "\n", encoding="utf-8")


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()