#
# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
import time
import argparse
import numpy as np
import tensorrt as trt
from cuda import cudart
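# Add the parent directory to the import path so shared sample helpers
# (such as common.py) can be imported below.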
sys.path.insert(1, os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
import common
from image_batcher import ImageBatcher
from visualize import visualize_detections
class TensorRTInfer:
"""
Implements inference for the EfficientDet TensorRT engine.
"""
def __init__(self, engine_path):
"""
:param engine_path: The path to the serialized engine to load from disk.
"""
# Load TRT engine
self.logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(self.logger, namespace="")
with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
assert runtime
self.engine = runtime.deserialize_cuda_engine(f.read())
assert self.engine
self.context = self.engine.create_execution_context()
assert self.context
# Setup I/O bindings
self.inputs = []
self.outputs = []
        self.allocations = []
        self.batch_size = 0  # Set from the input shape below; asserted > 0 afterwards
for i in range(self.engine.num_io_tensors):
name = self.engine.get_tensor_name(i)
            is_input = self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
dtype = np.dtype(trt.nptype(self.engine.get_tensor_dtype(name)))
shape = self.context.get_tensor_shape(name)
if is_input and shape[0] < 0:
assert self.engine.num_optimization_profiles > 0
profile_shape = self.engine.get_tensor_profile_shape(name, 0)
assert len(profile_shape) == 3 # min,opt,max
# Set the *max* profile as binding shape
self.context.set_input_shape(name, profile_shape[2])
shape = self.context.get_tensor_shape(name)
if is_input:
self.batch_size = shape[0]
size = dtype.itemsize
for s in shape:
size *= s
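            # Allocate device memory for this tensor; common.cuda_call checks the
            # CUDA error code and returns the device pointer.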
allocation = common.cuda_call(cudart.cudaMalloc(size))
host_allocation = None if is_input else np.zeros(shape, dtype)
binding = {
"index": i,
"name": name,
"dtype": dtype,
"shape": list(shape),
"allocation": allocation,
"host_allocation": host_allocation,
}
self.allocations.append(allocation)
if is_input:
self.inputs.append(binding)
else:
self.outputs.append(binding)
print(
"{} '{}' with shape {} and dtype {}".format(
"Input" if is_input else "Output",
binding["name"],
binding["shape"],
binding["dtype"],
)
)
assert self.batch_size > 0
assert len(self.inputs) > 0
assert len(self.outputs) > 0
assert len(self.allocations) > 0
def input_spec(self):
"""
Get the specs for the input tensor of the network. Useful to prepare memory allocations.
:return: Two items, the shape of the input tensor and its (numpy) datatype.
"""
return self.inputs[0]["shape"], self.inputs[0]["dtype"]
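
    # A minimal usage sketch (the engine path is hypothetical): the input spec
    # can drive host-side allocation of a correctly shaped batch buffer.
    #
    #   trt_infer = TensorRTInfer("model.engine")
    #   shape, dtype = trt_infer.input_spec()
    #   batch = np.zeros(shape, dtype=dtype)
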
def output_spec(self):
"""
Get the specs for the output tensors of the network. Useful to prepare memory allocations.
:return: A list with two items per element, the shape and (numpy) datatype of each output tensor.
"""
specs = []
for o in self.outputs:
specs.append((o["shape"], o["dtype"]))
return specs
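
    # Similarly, output_spec() can pre-allocate host buffers for every output
    # (sketch, reusing the hypothetical trt_infer above):
    #
    #   out_buffers = [np.zeros(s, dtype=d) for s, d in trt_infer.output_spec()]
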
def infer(self, batch):
"""
Execute inference on a batch of images.
:param batch: A numpy array holding the image batch.
        :return: A list of outputs as numpy arrays.
"""
# Copy I/O and Execute
common.memcpy_host_to_device(self.inputs[0]["allocation"], batch)
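        # execute_v2() runs inference synchronously over the flat list of device
        # pointers, which was built in engine I/O order in __init__.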
self.context.execute_v2(self.allocations)
for o in range(len(self.outputs)):
common.memcpy_device_to_host(
self.outputs[o]["host_allocation"], self.outputs[o]["allocation"]
)
return [o["host_allocation"] for o in self.outputs]
def process(self, batch, scales=None, nms_threshold=None):
"""
Execute inference on a batch of images. The images should already be batched and preprocessed, as prepared by
the ImageBatcher class. Memory copying to and from the GPU device will be performed here.
:param batch: A numpy array holding the image batch.
        :param scales: The image resize scales for each image in this batch. Default: no scale postprocessing applied.
        :param nms_threshold: Optional score threshold; detections scoring below it are dropped.
        :return: A nested list of detection dictionaries, one list per image in the batch.
"""
# Run inference
outputs = self.infer(batch)
# Process the results
nums = outputs[0]
boxes = outputs[1]
scores = outputs[2]
classes = outputs[3]
detections = []
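        # Heuristic: if every box coordinate is below 2.0, assume the boxes are
        # normalized to [0, 1] and rescale them to the input resolution below.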
normalized = np.max(boxes) < 2.0
for i in range(self.batch_size):
detections.append([])
for n in range(int(nums[i])):
scale = self.inputs[0]["shape"][2] if normalized else 1.0
if scales and i < len(scales):
scale /= scales[i]
if nms_threshold and scores[i][n] < nms_threshold:
continue
detections[i].append(
{
"ymin": boxes[i][n][0] * scale,
"xmin": boxes[i][n][1] * scale,
"ymax": boxes[i][n][2] * scale,
"xmax": boxes[i][n][3] * scale,
"score": scores[i][n],
"class": int(classes[i][n]),
}
)
return detections
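
# Each detection produced by TensorRTInfer.process() is a dict with box
# coordinates "ymin"/"xmin"/"ymax"/"xmax", a "score", and a "class" index, e.g.
# (sketch, reusing the hypothetical trt_infer and batch above):
#
#   for image_detections in trt_infer.process(batch):
#       for d in image_detections:
#           print(d["class"], d["score"], d["xmin"], d["ymin"])
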
def main(args):
if args.output:
output_dir = os.path.realpath(args.output)
os.makedirs(output_dir, exist_ok=True)
labels = []
if args.labels:
with open(args.labels) as f:
for i, label in enumerate(f):
labels.append(label.strip())
trt_infer = TensorRTInfer(args.engine)
if args.input:
print("Inferring data in {}".format(args.input))
batcher = ImageBatcher(args.input, *trt_infer.input_spec())
for batch, images, scales in batcher.get_batch():
print(
"Processing Image {} / {}".format(
batcher.image_index, batcher.num_images
),
end="\r",
)
detections = trt_infer.process(batch, scales, args.nms_threshold)
if args.output:
for i in range(len(images)):
basename = os.path.splitext(os.path.basename(images[i]))[0]
# Image Visualizations
output_path = os.path.join(output_dir, "{}.png".format(basename))
visualize_detections(images[i], output_path, detections[i], labels)
# Text Results
output_results = ""
for d in detections[i]:
line = [
d["xmin"],
d["ymin"],
d["xmax"],
d["ymax"],
d["score"],
d["class"],
]
output_results += "\t".join([str(f) for f in line]) + "\n"
with open(
os.path.join(output_dir, "{}.txt".format(basename)), "w"
) as f:
f.write(output_results)
else:
print("No input provided, running in benchmark mode")
spec = trt_infer.input_spec()
        # Scale before casting so integer input dtypes do not truncate to zero
        batch = (255 * np.random.rand(*spec[0])).astype(spec[1])
iterations = 200
times = []
        for _ in range(20):  # GPU warmup iterations
trt_infer.infer(batch)
for i in range(iterations):
start = time.time()
trt_infer.infer(batch)
times.append(time.time() - start)
print("Iteration {} / {}".format(i + 1, iterations), end="\r")
print("Benchmark results include time for H2D and D2H memory copies")
print("Average Latency: {:.3f} ms".format(1000 * np.average(times)))
print(
"Average Throughput: {:.1f} ips".format(
trt_infer.batch_size / np.average(times)
)
)
print()
print("Finished Processing")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-e",
"--engine",
default=None,
required=True,
help="The serialized TensorRT engine",
)
parser.add_argument(
"-i", "--input", default=None, help="Path to the image or directory to process"
)
parser.add_argument(
"-o",
"--output",
default=None,
help="Directory where to save the visualization results",
)
parser.add_argument(
"-l",
"--labels",
default="./labels_coco.txt",
help="File to use for reading the class labels from, default: ./labels_coco.txt",
)
parser.add_argument(
"-t",
"--nms_threshold",
type=float,
help="Override the score threshold for the NMS operation, if higher than the built-in threshold",
)
args = parser.parse_args()
main(args)
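
# Example invocation (assuming this file is saved as infer.py; paths are
# hypothetical):
#
#   python3 infer.py --engine model.engine --input ./images --output ./results
#
# Omitting --input runs the benchmark mode on random data instead.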