add files

Browse files

Files changed (9) hide show

data/input_tensor_fp32.dat +0 -0
data/shape.bin +3 -0
encoder-epoch-9999-avg-1.onnx +3 -0
export_trt_8_6.log +0 -0
export_trt_8_6.sh +34 -0
export_trt_9_0.log +0 -0
export_trt_9_0_0_2.sh +35 -0
infer_trt_8_6.log +246 -0
infer_trt_9_0.log +247 -0

data/input_tensor_fp32.dat ADDED Viewed

Binary file (212 kB). View file

data/shape.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:576053992b4dd1d4a98dda6772f43515220a0414a1794b37f4928bb775f6a729
+size 8

encoder-epoch-9999-avg-1.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5f8ee5a0f1c61882e9124aee50b24069af0553cb626a55a78456bd02cdd4c6c
+size 330493901

export_trt_8_6.log ADDED Viewed

The diff for this file is too large to render. See raw diff

export_trt_8_6.sh ADDED Viewed

	@@ -0,0 +1,34 @@

+# Builds an FP16 TensorRT engine from the encoder ONNX model with the trtexec
+# found on PATH, then runs trtexec inference on the saved inputs in ./data.
+# Container image: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+# Abort on the first failing command or unset variable, so the inference step
+# never loads a stale or missing engine after a failed build.
+set -eu
+export CUDA_VISIBLE_DEVICES="2" # A10
+onnx_model=./encoder-epoch-9999-avg-1.onnx
+trt_model=./encoder.trt86.plan
+# Batch dimension range for the optimization profile: min, opt, max.
+MIN_BATCH=1
+OPT_BATCH=4
+MAX_BATCH=8
+# Range of the second dimension of input x: min, opt, max.
+ENC_MIN_LEN=16
+ENC_OPT_LEN=512
+ENC_MAX_LEN=2000
+# Step 1: build and serialize the engine only, stdout goes to the export log.
+trtexec \
+--onnx="$onnx_model" \
+--minShapes="x:${MIN_BATCH}x${ENC_MIN_LEN}x80,x_lens:${MIN_BATCH}" \
+--optShapes="x:${OPT_BATCH}x${ENC_OPT_LEN}x80,x_lens:${OPT_BATCH}" \
+--maxShapes="x:${MAX_BATCH}x${ENC_MAX_LEN}x80,x_lens:${MAX_BATCH}" \
+--fp16 \
+--skipInference \
+--verbose \
+--saveEngine="$trt_model" > export_trt_8_6.log
+# Step 2: load the serialized engine and run it on the recorded inputs.
+data=./data
+trtexec \
+--fp16 \
+--loadInputs="x:${data}/input_tensor_fp32.dat,x_lens:${data}/shape.bin" \
+--shapes=x:1x663x80,x_lens:1 \
+--verbose \
+--loadEngine="$trt_model" > infer_trt_8_6.log

export_trt_9_0.log ADDED Viewed

The diff for this file is too large to render. See raw diff

export_trt_9_0_0_2.sh ADDED Viewed

	@@ -0,0 +1,35 @@

+# Builds an FP16 TensorRT engine from the encoder ONNX model with the
+# standalone TensorRT 9.0.0.2 trtexec, then runs inference on the saved
+# inputs in ./data.
+# Container image: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+# Abort on the first failing command or unset variable, so the inference step
+# never loads a stale or missing engine after a failed build.
+set -eu
+export CUDA_VISIBLE_DEVICES="2" # A10
+trtexec_path=../TensorRT-9.0.0.2/bin/trtexec
+# Export the 9.0.0.2 library directory so the child trtexec process sees it,
+# and prepend it so that any library path already set is preserved.
+export LD_LIBRARY_PATH="/tmpdisk/yuekaiz/quant/TensorRT-9.0.0.2/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+onnx_model=./encoder-epoch-9999-avg-1.onnx
+trt_model=./encoder.trt90.plan
+# Batch dimension range for the optimization profile: min, opt, max.
+MIN_BATCH=1
+OPT_BATCH=4
+MAX_BATCH=8
+# Range of the second dimension of input x: min, opt, max.
+ENC_MIN_LEN=16
+ENC_OPT_LEN=512
+ENC_MAX_LEN=2000
+# Step 1: build and serialize the engine only, stdout goes to the export log.
+# Call the 9.0.0.2 binary explicitly, a bare trtexec would resolve to
+# whichever build is first on PATH.
+"$trtexec_path" \
+--onnx="$onnx_model" \
+--minShapes="x:${MIN_BATCH}x${ENC_MIN_LEN}x80,x_lens:${MIN_BATCH}" \
+--optShapes="x:${OPT_BATCH}x${ENC_OPT_LEN}x80,x_lens:${OPT_BATCH}" \
+--maxShapes="x:${MAX_BATCH}x${ENC_MAX_LEN}x80,x_lens:${MAX_BATCH}" \
+--fp16 \
+--skipInference \
+--verbose \
+--saveEngine="$trt_model" > export_trt_9_0.log
+# Step 2: load the serialized engine with the same binary that built it and
+# run it on the recorded inputs.
+data=./data
+"$trtexec_path" \
+--fp16 \
+--loadInputs="x:${data}/input_tensor_fp32.dat,x_lens:${data}/shape.bin" \
+--shapes=x:1x663x80,x_lens:1 \
+--verbose \
+--loadEngine="$trt_model" > infer_trt_9_0.log

infer_trt_8_6.log ADDED Viewed

	@@ -0,0 +1,246 @@

+&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --fp16 --loadInputs=x:./data/input_tensor_fp32.dat,x_lens:./data/shape.bin --shapes=x:1x663x80,x_lens:1 --verbose --loadEngine=./encoder.trt86.plan
+[07/20/2023-06:34:15] [I] === Model Options ===
+[07/20/2023-06:34:15] [I] Format: *
+[07/20/2023-06:34:15] [I] Model:
+[07/20/2023-06:34:15] [I] Output:
+[07/20/2023-06:34:15] [I] === Build Options ===
+[07/20/2023-06:34:15] [I] Max batch: explicit batch
+[07/20/2023-06:34:15] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
+[07/20/2023-06:34:15] [I] minTiming: 1
+[07/20/2023-06:34:15] [I] avgTiming: 8
+[07/20/2023-06:34:15] [I] Precision: FP32+FP16
+[07/20/2023-06:34:15] [I] LayerPrecisions:
+[07/20/2023-06:34:15] [I] Layer Device Types:
+[07/20/2023-06:34:15] [I] Calibration:
+[07/20/2023-06:34:15] [I] Refit: Disabled
+[07/20/2023-06:34:15] [I] Version Compatible: Disabled
+[07/20/2023-06:34:15] [I] TensorRT runtime: full
+[07/20/2023-06:34:15] [I] Lean DLL Path:
+[07/20/2023-06:34:15] [I] Tempfile Controls: { in_memory: allow, temporary: allow }
+[07/20/2023-06:34:15] [I] Exclude Lean Runtime: Disabled
+[07/20/2023-06:34:15] [I] Sparsity: Disabled
+[07/20/2023-06:34:15] [I] Safe mode: Disabled
+[07/20/2023-06:34:15] [I] Build DLA standalone loadable: Disabled
+[07/20/2023-06:34:15] [I] Allow GPU fallback for DLA: Disabled
+[07/20/2023-06:34:15] [I] DirectIO mode: Disabled
+[07/20/2023-06:34:15] [I] Restricted mode: Disabled
+[07/20/2023-06:34:15] [I] Skip inference: Disabled
+[07/20/2023-06:34:15] [I] Save engine:
+[07/20/2023-06:34:15] [I] Load engine: ./encoder.trt86.plan
+[07/20/2023-06:34:15] [I] Profiling verbosity: 0
+[07/20/2023-06:34:15] [I] Tactic sources: Using default tactic sources
+[07/20/2023-06:34:15] [I] timingCacheMode: local
+[07/20/2023-06:34:15] [I] timingCacheFile:
+[07/20/2023-06:34:15] [I] Heuristic: Disabled
+[07/20/2023-06:34:15] [I] Preview Features: Use default preview flags.
+[07/20/2023-06:34:15] [I] MaxAuxStreams: -1
+[07/20/2023-06:34:15] [I] BuilderOptimizationLevel: -1
+[07/20/2023-06:34:15] [I] Input(s)s format: fp32:CHW
+[07/20/2023-06:34:15] [I] Output(s)s format: fp32:CHW
+[07/20/2023-06:34:15] [I] Input build shape: x=1x663x80+1x663x80+1x663x80
+[07/20/2023-06:34:15] [I] Input build shape: x_lens=1+1+1
+[07/20/2023-06:34:15] [I] Input calibration shapes: model
+[07/20/2023-06:34:15] [I] === System Options ===
+[07/20/2023-06:34:15] [I] Device: 0
+[07/20/2023-06:34:15] [I] DLACore:
+[07/20/2023-06:34:15] [I] Plugins:
+[07/20/2023-06:34:15] [I] setPluginsToSerialize:
+[07/20/2023-06:34:15] [I] dynamicPlugins:
+[07/20/2023-06:34:15] [I] ignoreParsedPluginLibs: 0
+[07/20/2023-06:34:15] [I]
+[07/20/2023-06:34:15] [I] === Inference Options ===
+[07/20/2023-06:34:15] [I] Batch: Explicit
+[07/20/2023-06:34:15] [I] Input inference shape: x_lens=1
+[07/20/2023-06:34:15] [I] Input inference shape: x=1x663x80
+[07/20/2023-06:34:15] [I] Iterations: 10
+[07/20/2023-06:34:15] [I] Duration: 3s (+ 200ms warm up)
+[07/20/2023-06:34:15] [I] Sleep time: 0ms
+[07/20/2023-06:34:15] [I] Idle time: 0ms
+[07/20/2023-06:34:15] [I] Inference Streams: 1
+[07/20/2023-06:34:15] [I] ExposeDMA: Disabled
+[07/20/2023-06:34:15] [I] Data transfers: Enabled
+[07/20/2023-06:34:15] [I] Spin-wait: Disabled
+[07/20/2023-06:34:15] [I] Multithreading: Disabled
+[07/20/2023-06:34:15] [I] CUDA Graph: Disabled
+[07/20/2023-06:34:15] [I] Separate profiling: Disabled
+[07/20/2023-06:34:15] [I] Time Deserialize: Disabled
+[07/20/2023-06:34:15] [I] Time Refit: Disabled
+[07/20/2023-06:34:15] [I] NVTX verbosity: 0
+[07/20/2023-06:34:15] [I] Persistent Cache Ratio: 0
+[07/20/2023-06:34:15] [I] Inputs:
+[07/20/2023-06:34:15] [I] x_lens<-./data/shape.bin
+[07/20/2023-06:34:15] [I] x<-./data/input_tensor_fp32.dat
+[07/20/2023-06:34:15] [I] === Reporting Options ===
+[07/20/2023-06:34:15] [I] Verbose: Enabled
+[07/20/2023-06:34:15] [I] Averages: 10 inferences
+[07/20/2023-06:34:15] [I] Percentiles: 90,95,99
+[07/20/2023-06:34:15] [I] Dump refittable layers:Disabled
+[07/20/2023-06:34:15] [I] Dump output: Disabled
+[07/20/2023-06:34:15] [I] Profile: Disabled
+[07/20/2023-06:34:15] [I] Export timing to JSON file:
+[07/20/2023-06:34:15] [I] Export output to JSON file:
+[07/20/2023-06:34:15] [I] Export profile to JSON file:
+[07/20/2023-06:34:15] [I]
+[07/20/2023-06:34:15] [I] === Device Information ===
+[07/20/2023-06:34:15] [I] Selected Device: NVIDIA A10
+[07/20/2023-06:34:15] [I] Compute Capability: 8.6
+[07/20/2023-06:34:15] [I] SMs: 72
+[07/20/2023-06:34:15] [I] Device Global Memory: 22723 MiB
+[07/20/2023-06:34:15] [I] Shared Memory per SM: 100 KiB
+[07/20/2023-06:34:15] [I] Memory Bus Width: 384 bits (ECC enabled)
+[07/20/2023-06:34:15] [I] Application Compute Clock Rate: 1.695 GHz
+[07/20/2023-06:34:15] [I] Application Memory Clock Rate: 6.251 GHz
+[07/20/2023-06:34:15] [I]
+[07/20/2023-06:34:15] [I] Note: The application clock rates do not reflect the actual clock rates that the GPU is currently running at.
+[07/20/2023-06:34:15] [I]
+[07/20/2023-06:34:15] [I] TensorRT version: 8.6.1
+[07/20/2023-06:34:15] [I] Loading standard plugins
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::BatchedNMSDynamic_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::BatchedNMS_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::BatchTilePlugin_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::Clip_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::CoordConvAC version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::CropAndResizeDynamic version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::CropAndResize version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::DecodeBbox3DPlugin version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::DetectionLayer_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::EfficientNMS_Explicit_TF_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::EfficientNMS_Implicit_TF_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::EfficientNMS_ONNX_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::EfficientNMS_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::FlattenConcat_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::GenerateDetection_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::GridAnchor_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::GridAnchorRect_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::InstanceNormalization_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::InstanceNormalization_TRT version 2
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::LReLU_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::ModulatedDeformConv2d version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::MultilevelCropAndResize_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::MultilevelProposeROI_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::MultiscaleDeformableAttnPlugin_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::NMSDynamic_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::NMS_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::Normalize_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::PillarScatterPlugin version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::PriorBox_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::ProposalDynamic version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::ProposalLayer_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::Proposal version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::PyramidROIAlign_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::Region_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::Reorg_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::ResizeNearest_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::ROIAlign_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::RPROI_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::ScatterND version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::SpecialSlice_TRT version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::Split version 1
+[07/20/2023-06:34:15] [V] [TRT] Registered plugin creator - ::VoxelGeneratorPlugin version 1
+[07/20/2023-06:34:16] [I] Engine loaded in 0.384222 sec.
+[07/20/2023-06:34:16] [I] [TRT] Loaded engine size: 202 MiB
+[07/20/2023-06:34:16] [V] [TRT] Deserialization required 182242 microseconds.
+[07/20/2023-06:34:16] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +187, now: CPU 0, GPU 187 (MiB)
+[07/20/2023-06:34:16] [I] Engine deserialized in 0.192446 sec.
+[07/20/2023-06:34:16] [I] [TRT] [MS] Running engine with multi stream info
+[07/20/2023-06:34:16] [I] [TRT] [MS] Number of aux streams is 1
+[07/20/2023-06:34:16] [I] [TRT] [MS] Number of total worker streams is 2
+[07/20/2023-06:34:16] [I] [TRT] [MS] The main stream provided by execute/enqueue calls is the first worker stream
+[07/20/2023-06:34:16] [V] [TRT] Total per-runner device persistent memory is 26624
+[07/20/2023-06:34:16] [V] [TRT] Total per-runner host persistent memory is 236944
+[07/20/2023-06:34:16] [V] [TRT] Allocated activation device memory of size 358747136
+[07/20/2023-06:34:17] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +342, now: CPU 0, GPU 529 (MiB)
+[07/20/2023-06:34:17] [V] [TRT] CUDA lazy loading is enabled.
+[07/20/2023-06:34:17] [I] Setting persistentCacheLimit to 0 bytes.
+[07/20/2023-06:34:17] [V] Using enqueueV3.
+[07/20/2023-06:34:17] [I] Using values loaded from ./data/input_tensor_fp32.dat for input x
+[07/20/2023-06:34:17] [I] Input binding for x with dimensions 1x663x80 is created.
+[07/20/2023-06:34:17] [I] Using values loaded from ./data/shape.bin for input x_lens
+[07/20/2023-06:34:17] [I] Input binding for x_lens with dimensions 1 is created.
+[07/20/2023-06:34:17] [I] Output binding for encoder_out_lens with dimensions 1 is created.
+[07/20/2023-06:34:17] [I] Output binding for encoder_out with dimensions 1x165x512 is created.
+[07/20/2023-06:34:17] [I] Starting inference
+[07/20/2023-06:34:20] [I] Warmup completed 1 queries over 200 ms
+[07/20/2023-06:34:20] [I] Timing trace has 567 queries over 2.6078 s
+[07/20/2023-06:34:20] [I]
+[07/20/2023-06:34:20] [I] === Trace details ===
+[07/20/2023-06:34:20] [I] Trace averages of 10 runs:
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.90053 ms - Host latency: 4.9611 ms (enqueue 4.91876 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.66885 ms - Host latency: 4.72742 ms (enqueue 4.68766 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.66589 ms - Host latency: 4.7231 ms (enqueue 4.68474 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.51881 ms - Host latency: 4.57554 ms (enqueue 4.5366 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.49072 ms - Host latency: 4.54744 ms (enqueue 4.50952 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.49101 ms - Host latency: 4.54795 ms (enqueue 4.50985 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.48234 ms - Host latency: 4.54015 ms (enqueue 4.49977 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.48087 ms - Host latency: 4.53997 ms (enqueue 4.49955 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 5.3283 ms - Host latency: 5.38591 ms (enqueue 5.34688 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46555 ms - Host latency: 4.52213 ms (enqueue 4.48347 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46613 ms - Host latency: 4.524 ms (enqueue 4.48506 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.4629 ms - Host latency: 4.51975 ms (enqueue 4.48197 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.47987 ms - Host latency: 4.53892 ms (enqueue 4.49877 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.47563 ms - Host latency: 4.53213 ms (enqueue 4.49468 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.53678 ms - Host latency: 4.59436 ms (enqueue 4.55585 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.48102 ms - Host latency: 4.53719 ms (enqueue 4.49963 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.5464 ms - Host latency: 4.60463 ms (enqueue 4.56405 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.52257 ms - Host latency: 4.5788 ms (enqueue 4.54126 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.49714 ms - Host latency: 4.55417 ms (enqueue 4.51604 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.48652 ms - Host latency: 4.54661 ms (enqueue 4.50435 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46458 ms - Host latency: 4.52255 ms (enqueue 4.48344 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.58135 ms - Host latency: 4.63997 ms (enqueue 4.6 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.54404 ms - Host latency: 4.60205 ms (enqueue 4.561 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.57368 ms - Host latency: 4.63359 ms (enqueue 4.59261 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.53652 ms - Host latency: 4.59491 ms (enqueue 4.55535 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.51669 ms - Host latency: 4.57617 ms (enqueue 4.5356 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.47855 ms - Host latency: 4.53638 ms (enqueue 4.49764 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.45673 ms - Host latency: 4.51466 ms (enqueue 4.4757 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.47465 ms - Host latency: 4.53146 ms (enqueue 4.49337 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.45446 ms - Host latency: 4.51252 ms (enqueue 4.47321 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 5.01163 ms - Host latency: 5.06942 ms (enqueue 5.03057 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.47191 ms - Host latency: 4.53047 ms (enqueue 4.48738 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.43701 ms - Host latency: 4.49416 ms (enqueue 4.45591 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.44302 ms - Host latency: 4.50129 ms (enqueue 4.46169 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.4696 ms - Host latency: 4.52693 ms (enqueue 4.4885 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.4438 ms - Host latency: 4.50142 ms (enqueue 4.46267 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46218 ms - Host latency: 4.52253 ms (enqueue 4.48193 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.44719 ms - Host latency: 4.50408 ms (enqueue 4.46602 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.43398 ms - Host latency: 4.49001 ms (enqueue 4.45293 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.48477 ms - Host latency: 4.54153 ms (enqueue 4.50374 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.50164 ms - Host latency: 4.55869 ms (enqueue 4.51868 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46689 ms - Host latency: 4.52322 ms (enqueue 4.48577 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46245 ms - Host latency: 4.51956 ms (enqueue 4.48162 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.50642 ms - Host latency: 4.56414 ms (enqueue 4.52544 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.44534 ms - Host latency: 4.50247 ms (enqueue 4.46443 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.53066 ms - Host latency: 4.59226 ms (enqueue 4.54941 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.61397 ms - Host latency: 4.67122 ms (enqueue 4.63289 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.54465 ms - Host latency: 4.60376 ms (enqueue 4.56128 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.57368 ms - Host latency: 4.63081 ms (enqueue 4.59238 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.52886 ms - Host latency: 4.58838 ms (enqueue 4.54763 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.54368 ms - Host latency: 4.60066 ms (enqueue 4.5627 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46804 ms - Host latency: 4.52537 ms (enqueue 4.48708 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.45071 ms - Host latency: 4.50845 ms (enqueue 4.46965 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.50767 ms - Host latency: 4.5646 ms (enqueue 4.5262 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46208 ms - Host latency: 4.51965 ms (enqueue 4.48098 ms)
+[07/20/2023-06:34:20] [I] Average on 10 runs - GPU latency: 4.46499 ms - Host latency: 4.52129 ms (enqueue 4.48401 ms)
+[07/20/2023-06:34:20] [I]
+[07/20/2023-06:34:20] [I] === Performance summary ===
+[07/20/2023-06:34:20] [I] Throughput: 217.425 qps
+[07/20/2023-06:34:20] [I] Latency: min = 4.43164 ms, max = 9.96777 ms, mean = 4.58715 ms, median = 4.5459 ms, percentile(90%) = 4.67822 ms, percentile(95%) = 4.72693 ms, percentile(99%) = 4.99823 ms
+[07/20/2023-06:34:20] [I] Enqueue Time: min = 4.39209 ms, max = 9.92981 ms, mean = 4.54799 ms, median = 4.50757 ms, percentile(90%) = 4.64062 ms, percentile(95%) = 4.68896 ms, percentile(99%) = 4.95935 ms
+[07/20/2023-06:34:20] [I] H2D Latency: min = 0.0234375 ms, max = 0.0705566 ms, mean = 0.0249881 ms, median = 0.0246582 ms, percentile(90%) = 0.0251465 ms, percentile(95%) = 0.0253906 ms, percentile(99%) = 0.036499 ms
+[07/20/2023-06:34:20] [I] GPU Compute Time: min = 4.37329 ms, max = 9.91211 ms, mean = 4.52938 ms, median = 4.48889 ms, percentile(90%) = 4.62195 ms, percentile(95%) = 4.67029 ms, percentile(99%) = 4.94055 ms
+[07/20/2023-06:34:20] [I] D2H Latency: min = 0.0300293 ms, max = 0.0609131 ms, mean = 0.0327816 ms, median = 0.0314941 ms, percentile(90%) = 0.0368652 ms, percentile(95%) = 0.0383301 ms, percentile(99%) = 0.0430908 ms
+[07/20/2023-06:34:20] [I] Total Host Walltime: 2.6078 s
+[07/20/2023-06:34:20] [I] Total GPU Compute Time: 2.56816 s
+[07/20/2023-06:34:20] [I] Explanations of the performance metrics are printed in the verbose logs.
+[07/20/2023-06:34:20] [V]
+[07/20/2023-06:34:20] [V] === Explanations of the performance metrics ===
+[07/20/2023-06:34:20] [V] Total Host Walltime: the host walltime from when the first query (after warmups) is enqueued to when the last query is completed.
+[07/20/2023-06:34:20] [V] GPU Compute Time: the GPU latency to execute the kernels for a query.
+[07/20/2023-06:34:20] [V] Total GPU Compute Time: the summation of the GPU Compute Time of all the queries. If this is significantly shorter than Total Host Walltime, the GPU may be under-utilized because of host-side overheads or data transfers.
+[07/20/2023-06:34:20] [V] Throughput: the observed throughput computed by dividing the number of queries by the Total Host Walltime. If this is significantly lower than the reciprocal of GPU Compute Time, the GPU may be under-utilized because of host-side overheads or data transfers.
+[07/20/2023-06:34:20] [V] Enqueue Time: the host latency to enqueue a query. If this is longer than GPU Compute Time, the GPU may be under-utilized.
+[07/20/2023-06:34:20] [V] H2D Latency: the latency for host-to-device data transfers for input tensors of a single query.
+[07/20/2023-06:34:20] [V] D2H Latency: the latency for device-to-host data transfers for output tensors of a single query.
+[07/20/2023-06:34:20] [V] Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a single query.
+[07/20/2023-06:34:20] [I]
+&&&& PASSED TensorRT.trtexec [TensorRT v8601] # trtexec --fp16 --loadInputs=x:./data/input_tensor_fp32.dat,x_lens:./data/shape.bin --shapes=x:1x663x80,x_lens:1 --verbose --loadEngine=./encoder.trt86.plan

infer_trt_9_0.log ADDED Viewed

	@@ -0,0 +1,247 @@

+&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --fp16 --loadInputs=x:./data/input_tensor_fp32.dat,x_lens:./data/shape.bin --shapes=x:1x663x80,x_lens:1 --verbose --loadEngine=./encoder.trt90.plan
+[07/20/2023-06:39:32] [I] === Model Options ===
+[07/20/2023-06:39:32] [I] Format: *
+[07/20/2023-06:39:32] [I] Model:
+[07/20/2023-06:39:32] [I] Output:
+[07/20/2023-06:39:32] [I] === Build Options ===
+[07/20/2023-06:39:32] [I] Max batch: explicit batch
+[07/20/2023-06:39:32] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
+[07/20/2023-06:39:32] [I] minTiming: 1
+[07/20/2023-06:39:32] [I] avgTiming: 8
+[07/20/2023-06:39:32] [I] Precision: FP32+FP16
+[07/20/2023-06:39:32] [I] LayerPrecisions:
+[07/20/2023-06:39:32] [I] Layer Device Types:
+[07/20/2023-06:39:32] [I] Calibration:
+[07/20/2023-06:39:32] [I] Refit: Disabled
+[07/20/2023-06:39:32] [I] Version Compatible: Disabled
+[07/20/2023-06:39:32] [I] TensorRT runtime: full
+[07/20/2023-06:39:32] [I] Lean DLL Path:
+[07/20/2023-06:39:32] [I] Tempfile Controls: { in_memory: allow, temporary: allow }
+[07/20/2023-06:39:32] [I] Exclude Lean Runtime: Disabled
+[07/20/2023-06:39:32] [I] Sparsity: Disabled
+[07/20/2023-06:39:32] [I] Safe mode: Disabled
+[07/20/2023-06:39:32] [I] Build DLA standalone loadable: Disabled
+[07/20/2023-06:39:32] [I] Allow GPU fallback for DLA: Disabled
+[07/20/2023-06:39:32] [I] DirectIO mode: Disabled
+[07/20/2023-06:39:32] [I] Restricted mode: Disabled
+[07/20/2023-06:39:32] [I] Skip inference: Disabled
+[07/20/2023-06:39:32] [I] Save engine:
+[07/20/2023-06:39:32] [I] Load engine: ./encoder.trt90.plan
+[07/20/2023-06:39:32] [I] Profiling verbosity: 0
+[07/20/2023-06:39:32] [I] Tactic sources: Using default tactic sources
+[07/20/2023-06:39:32] [I] timingCacheMode: local
+[07/20/2023-06:39:32] [I] timingCacheFile:
+[07/20/2023-06:39:32] [I] Heuristic: Disabled
+[07/20/2023-06:39:32] [I] Preview Features: Use default preview flags.
+[07/20/2023-06:39:32] [I] MaxAuxStreams: -1
+[07/20/2023-06:39:32] [I] BuilderOptimizationLevel: -1
+[07/20/2023-06:39:32] [I] Input(s)s format: fp32:CHW
+[07/20/2023-06:39:32] [I] Output(s)s format: fp32:CHW
+[07/20/2023-06:39:32] [I] Input build shape: x=1x663x80+1x663x80+1x663x80
+[07/20/2023-06:39:32] [I] Input build shape: x_lens=1+1+1
+[07/20/2023-06:39:32] [I] Input calibration shapes: model
+[07/20/2023-06:39:32] [I] === System Options ===
+[07/20/2023-06:39:32] [I] Device: 0
+[07/20/2023-06:39:32] [I] DLACore:
+[07/20/2023-06:39:32] [I] Plugins:
+[07/20/2023-06:39:32] [I] setPluginsToSerialize:
+[07/20/2023-06:39:32] [I] dynamicPlugins:
+[07/20/2023-06:39:32] [I] ignoreParsedPluginLibs: 0
+[07/20/2023-06:39:32] [I]
+[07/20/2023-06:39:32] [I] === Inference Options ===
+[07/20/2023-06:39:32] [I] Batch: Explicit
+[07/20/2023-06:39:32] [I] Input inference shape: x_lens=1
+[07/20/2023-06:39:32] [I] Input inference shape: x=1x663x80
+[07/20/2023-06:39:32] [I] Iterations: 10
+[07/20/2023-06:39:32] [I] Duration: 3s (+ 200ms warm up)
+[07/20/2023-06:39:32] [I] Sleep time: 0ms
+[07/20/2023-06:39:32] [I] Idle time: 0ms
+[07/20/2023-06:39:32] [I] Inference Streams: 1
+[07/20/2023-06:39:32] [I] ExposeDMA: Disabled
+[07/20/2023-06:39:32] [I] Data transfers: Enabled
+[07/20/2023-06:39:32] [I] Spin-wait: Disabled
+[07/20/2023-06:39:32] [I] Multithreading: Disabled
+[07/20/2023-06:39:32] [I] CUDA Graph: Disabled
+[07/20/2023-06:39:32] [I] Separate profiling: Disabled
+[07/20/2023-06:39:32] [I] Time Deserialize: Disabled
+[07/20/2023-06:39:32] [I] Time Refit: Disabled
+[07/20/2023-06:39:32] [I] NVTX verbosity: 0
+[07/20/2023-06:39:32] [I] Persistent Cache Ratio: 0
+[07/20/2023-06:39:32] [I] Inputs:
+[07/20/2023-06:39:32] [I] x_lens<-./data/shape.bin
+[07/20/2023-06:39:32] [I] x<-./data/input_tensor_fp32.dat
+[07/20/2023-06:39:32] [I] === Reporting Options ===
+[07/20/2023-06:39:32] [I] Verbose: Enabled
+[07/20/2023-06:39:32] [I] Averages: 10 inferences
+[07/20/2023-06:39:32] [I] Percentiles: 90,95,99
+[07/20/2023-06:39:32] [I] Dump refittable layers:Disabled
+[07/20/2023-06:39:32] [I] Dump output: Disabled
+[07/20/2023-06:39:32] [I] Profile: Disabled
+[07/20/2023-06:39:32] [I] Export timing to JSON file:
+[07/20/2023-06:39:32] [I] Export output to JSON file:
+[07/20/2023-06:39:32] [I] Export profile to JSON file:
+[07/20/2023-06:39:32] [I]
+[07/20/2023-06:39:32] [I] === Device Information ===
+[07/20/2023-06:39:32] [I] Selected Device: NVIDIA A10
+[07/20/2023-06:39:32] [I] Compute Capability: 8.6
+[07/20/2023-06:39:32] [I] SMs: 72
+[07/20/2023-06:39:32] [I] Device Global Memory: 22723 MiB
+[07/20/2023-06:39:32] [I] Shared Memory per SM: 100 KiB
+[07/20/2023-06:39:32] [I] Memory Bus Width: 384 bits (ECC enabled)
+[07/20/2023-06:39:32] [I] Application Compute Clock Rate: 1.695 GHz
+[07/20/2023-06:39:32] [I] Application Memory Clock Rate: 6.251 GHz
+[07/20/2023-06:39:32] [I]
+[07/20/2023-06:39:32] [I] Note: The application clock rates do not reflect the actual clock rates that the GPU is currently running at.
+[07/20/2023-06:39:32] [I]
+[07/20/2023-06:39:32] [I] TensorRT version: 8.6.1
+[07/20/2023-06:39:32] [I] Loading standard plugins
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::BatchedNMSDynamic_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::BatchedNMS_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::BatchTilePlugin_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::Clip_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::CoordConvAC version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::CropAndResizeDynamic version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::CropAndResize version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::DecodeBbox3DPlugin version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::DetectionLayer_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::EfficientNMS_Explicit_TF_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::EfficientNMS_Implicit_TF_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::EfficientNMS_ONNX_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::EfficientNMS_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::FlattenConcat_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::GenerateDetection_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::GridAnchor_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::GridAnchorRect_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::InstanceNormalization_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::InstanceNormalization_TRT version 2
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::LReLU_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::ModulatedDeformConv2d version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::MultilevelCropAndResize_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::MultilevelProposeROI_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::MultiscaleDeformableAttnPlugin_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::NMSDynamic_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::NMS_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::Normalize_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::PillarScatterPlugin version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::PriorBox_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::ProposalDynamic version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::ProposalLayer_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::Proposal version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::PyramidROIAlign_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::Region_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::Reorg_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::ResizeNearest_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::ROIAlign_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::RPROI_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::ScatterND version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::SpecialSlice_TRT version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::Split version 1
+[07/20/2023-06:39:32] [V] [TRT] Registered plugin creator - ::VoxelGeneratorPlugin version 1
+[07/20/2023-06:39:32] [I] Engine loaded in 0.299739 sec.
+[07/20/2023-06:39:32] [I] [TRT] Loaded engine size: 172 MiB
+[07/20/2023-06:39:32] [V] [TRT] Deserialization required 168596 microseconds.
+[07/20/2023-06:39:32] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +157, now: CPU 0, GPU 157 (MiB)
+[07/20/2023-06:39:32] [I] Engine deserialized in 0.178316 sec.
+[07/20/2023-06:39:32] [I] [TRT] [MS] Running engine with multi stream info
+[07/20/2023-06:39:32] [I] [TRT] [MS] Number of aux streams is 1
+[07/20/2023-06:39:32] [I] [TRT] [MS] Number of total worker streams is 2
+[07/20/2023-06:39:32] [I] [TRT] [MS] The main stream provided by execute/enqueue calls is the first worker stream
+[07/20/2023-06:39:32] [V] [TRT] Total per-runner device persistent memory is 26624
+[07/20/2023-06:39:32] [V] [TRT] Total per-runner host persistent memory is 236944
+[07/20/2023-06:39:32] [V] [TRT] Allocated activation device memory of size 195747840
+[07/20/2023-06:39:34] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +187, now: CPU 0, GPU 344 (MiB)
+[07/20/2023-06:39:34] [V] [TRT] CUDA lazy loading is enabled.
+[07/20/2023-06:39:34] [I] Setting persistentCacheLimit to 0 bytes.
+[07/20/2023-06:39:34] [V] Using enqueueV3.
+[07/20/2023-06:39:34] [I] Using values loaded from ./data/input_tensor_fp32.dat for input x
+[07/20/2023-06:39:34] [I] Input binding for x with dimensions 1x663x80 is created.
+[07/20/2023-06:39:34] [I] Using values loaded from ./data/shape.bin for input x_lens
+[07/20/2023-06:39:34] [I] Input binding for x_lens with dimensions 1 is created.
+[07/20/2023-06:39:34] [I] Output binding for encoder_out_lens with dimensions 1 is created.
+[07/20/2023-06:39:34] [I] Output binding for encoder_out with dimensions 1x165x512 is created.
+[07/20/2023-06:39:34] [I] Starting inference
+[07/20/2023-06:39:37] [I] Warmup completed 1 queries over 200 ms
+[07/20/2023-06:39:37] [I] Timing trace has 574 queries over 2.61978 s
+[07/20/2023-06:39:37] [I]
+[07/20/2023-06:39:37] [I] === Trace details ===
+[07/20/2023-06:39:37] [I] Trace averages of 10 runs:
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.62081 ms - Host latency: 4.67862 ms (enqueue 4.63986 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42862 ms - Host latency: 4.48674 ms (enqueue 4.44771 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.44559 ms - Host latency: 4.5021 ms (enqueue 4.46495 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.78253 ms - Host latency: 4.84121 ms (enqueue 4.80099 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 5.10478 ms - Host latency: 5.1652 ms (enqueue 5.1234 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.47202 ms - Host latency: 4.52838 ms (enqueue 4.49141 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.53895 ms - Host latency: 4.59928 ms (enqueue 4.55797 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.43728 ms - Host latency: 4.49832 ms (enqueue 4.45474 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42999 ms - Host latency: 4.48735 ms (enqueue 4.44911 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 5.26001 ms - Host latency: 5.31818 ms (enqueue 5.27854 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.39064 ms - Host latency: 4.44811 ms (enqueue 4.40873 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.43865 ms - Host latency: 4.49565 ms (enqueue 4.45797 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42271 ms - Host latency: 4.47957 ms (enqueue 4.44207 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.38179 ms - Host latency: 4.44067 ms (enqueue 4.40121 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.45331 ms - Host latency: 4.51089 ms (enqueue 4.47168 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.41946 ms - Host latency: 4.47736 ms (enqueue 4.43855 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.39794 ms - Host latency: 4.45735 ms (enqueue 4.41689 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.37582 ms - Host latency: 4.43264 ms (enqueue 4.3953 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.46124 ms - Host latency: 4.51959 ms (enqueue 4.48035 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.48876 ms - Host latency: 4.54752 ms (enqueue 4.5079 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.51276 ms - Host latency: 4.57158 ms (enqueue 4.53197 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.48157 ms - Host latency: 4.53878 ms (enqueue 4.50073 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.49991 ms - Host latency: 4.55793 ms (enqueue 4.51823 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42782 ms - Host latency: 4.48536 ms (enqueue 4.44718 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.40978 ms - Host latency: 4.46761 ms (enqueue 4.42886 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.46381 ms - Host latency: 4.52454 ms (enqueue 4.48304 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.46398 ms - Host latency: 4.52181 ms (enqueue 4.48431 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.40571 ms - Host latency: 4.4624 ms (enqueue 4.42491 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42034 ms - Host latency: 4.47875 ms (enqueue 4.43923 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.48055 ms - Host latency: 4.5373 ms (enqueue 4.49954 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42739 ms - Host latency: 4.48442 ms (enqueue 4.44679 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.80045 ms - Host latency: 4.85796 ms (enqueue 4.82057 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.44644 ms - Host latency: 4.50496 ms (enqueue 4.46082 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.46069 ms - Host latency: 4.51868 ms (enqueue 4.47988 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.404 ms - Host latency: 4.46167 ms (enqueue 4.42393 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.39226 ms - Host latency: 4.44932 ms (enqueue 4.41165 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.51948 ms - Host latency: 4.57761 ms (enqueue 4.53865 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.53506 ms - Host latency: 4.5928 ms (enqueue 4.55435 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.53059 ms - Host latency: 4.58679 ms (enqueue 4.54939 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.81067 ms - Host latency: 4.8678 ms (enqueue 4.82961 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.50547 ms - Host latency: 4.56389 ms (enqueue 4.52473 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.47329 ms - Host latency: 4.53115 ms (enqueue 4.49106 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.39207 ms - Host latency: 4.45146 ms (enqueue 4.41189 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42327 ms - Host latency: 4.48164 ms (enqueue 4.44353 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.50684 ms - Host latency: 4.56575 ms (enqueue 4.52615 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.4865 ms - Host latency: 4.54316 ms (enqueue 4.50479 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.4781 ms - Host latency: 4.53464 ms (enqueue 4.49729 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.54763 ms - Host latency: 4.60559 ms (enqueue 4.5679 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.45374 ms - Host latency: 4.51267 ms (enqueue 4.47231 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.40134 ms - Host latency: 4.45923 ms (enqueue 4.42114 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.4249 ms - Host latency: 4.48562 ms (enqueue 4.44421 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42776 ms - Host latency: 4.48767 ms (enqueue 4.447 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.42134 ms - Host latency: 4.47893 ms (enqueue 4.44067 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.53555 ms - Host latency: 4.59202 ms (enqueue 4.55483 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.44817 ms - Host latency: 4.5072 ms (enqueue 4.46709 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.41675 ms - Host latency: 4.47527 ms (enqueue 4.43608 ms)
+[07/20/2023-06:39:37] [I] Average on 10 runs - GPU latency: 4.37573 ms - Host latency: 4.43433 ms (enqueue 4.39592 ms)
+[07/20/2023-06:39:37] [I]
+[07/20/2023-06:39:37] [I] === Performance summary ===
+[07/20/2023-06:39:37] [I] Throughput: 219.102 qps
+[07/20/2023-06:39:37] [I] Latency: min = 4.35315 ms, max = 12.0612 ms, mean = 4.55479 ms, median = 4.50323 ms, percentile(90%) = 4.63 ms, percentile(95%) = 4.76953 ms, percentile(99%) = 5.70691 ms
+[07/20/2023-06:39:37] [I] Enqueue Time: min = 4.31763 ms, max = 12.0179 ms, mean = 4.51579 ms, median = 4.46381 ms, percentile(90%) = 4.59314 ms, percentile(95%) = 4.73193 ms, percentile(99%) = 5.66431 ms
+[07/20/2023-06:39:37] [I] H2D Latency: min = 0.0230713 ms, max = 0.0419922 ms, mean = 0.0246631 ms, median = 0.0244141 ms, percentile(90%) = 0.0249023 ms, percentile(95%) = 0.0251465 ms, percentile(99%) = 0.0393066 ms
+[07/20/2023-06:39:37] [I] GPU Compute Time: min = 4.29761 ms, max = 12.0051 ms, mean = 4.49672 ms, median = 4.4447 ms, percentile(90%) = 4.57495 ms, percentile(95%) = 4.71344 ms, percentile(99%) = 5.64911 ms
+[07/20/2023-06:39:37] [I] D2H Latency: min = 0.0305176 ms, max = 0.0598145 ms, mean = 0.033407 ms, median = 0.0324707 ms, percentile(90%) = 0.0371094 ms, percentile(95%) = 0.0386963 ms, percentile(99%) = 0.0437012 ms
+[07/20/2023-06:39:37] [I] Total Host Walltime: 2.61978 s
+[07/20/2023-06:39:37] [I] Total GPU Compute Time: 2.58112 s
+[07/20/2023-06:39:37] [I] Explanations of the performance metrics are printed in the verbose logs.
+[07/20/2023-06:39:37] [V]
+[07/20/2023-06:39:37] [V] === Explanations of the performance metrics ===
+[07/20/2023-06:39:37] [V] Total Host Walltime: the host walltime from when the first query (after warmups) is enqueued to when the last query is completed.
+[07/20/2023-06:39:37] [V] GPU Compute Time: the GPU latency to execute the kernels for a query.
+[07/20/2023-06:39:37] [V] Total GPU Compute Time: the summation of the GPU Compute Time of all the queries. If this is significantly shorter than Total Host Walltime, the GPU may be under-utilized because of host-side overheads or data transfers.
+[07/20/2023-06:39:37] [V] Throughput: the observed throughput computed by dividing the number of queries by the Total Host Walltime. If this is significantly lower than the reciprocal of GPU Compute Time, the GPU may be under-utilized because of host-side overheads or data transfers.
+[07/20/2023-06:39:37] [V] Enqueue Time: the host latency to enqueue a query. If this is longer than GPU Compute Time, the GPU may be under-utilized.
+[07/20/2023-06:39:37] [V] H2D Latency: the latency for host-to-device data transfers for input tensors of a single query.
+[07/20/2023-06:39:37] [V] D2H Latency: the latency for device-to-host data transfers for output tensors of a single query.
+[07/20/2023-06:39:37] [V] Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a single query.
+[07/20/2023-06:39:37] [I]
+&&&& PASSED TensorRT.trtexec [TensorRT v8601] # trtexec --fp16 --loadInputs=x:./data/input_tensor_fp32.dat,x_lens:./data/shape.bin --shapes=x:1x663x80,x_lens:1 --verbose --loadEngine=./encoder.trt90.plan