1231: g0plus dockerfile

38fb1f6 verified about 2 months ago

6.76 kB

	#
	# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	import os
	import sys
	import argparse

	import numpy as np
	import tensorrt as trt
	from cuda import cudart

	sys.path.insert(1, os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
	import common

	from image_batcher import ImageBatcher


	class TensorRTInfer:
	    """
	    Implements inference for the EfficientNet TensorRT engine.

	    One device buffer per engine I/O tensor is allocated at construction time and reused by every
	    call to infer(). Call close(), or use the instance as a context manager, to release the buffers.
	    """

	    def __init__(self, engine_path):
	        """
	        :param engine_path: The path to the serialized engine to load from disk.
	        :raises RuntimeError: If the runtime, engine or execution context cannot be created, if an
	            I/O tensor has a dynamic (negative) dimension, or if the engine has no usable input/output.
	        """
	        # Load TRT engine. Explicit raises are used instead of asserts so the checks survive `python -O`.
	        self.logger = trt.Logger(trt.Logger.ERROR)
	        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
	            if not runtime:
	                raise RuntimeError("Failed to create the TensorRT runtime")
	            self.engine = runtime.deserialize_cuda_engine(f.read())
	        if not self.engine:
	            raise RuntimeError("Failed to deserialize the TensorRT engine: {}".format(engine_path))
	        self.context = self.engine.create_execution_context()
	        if not self.context:
	            raise RuntimeError("Failed to create the TensorRT execution context")

	        # Setup I/O bindings
	        self.inputs = []
	        self.outputs = []
	        self.allocations = []
	        # Initialised up front so an engine without inputs fails the sanity check below
	        # instead of raising AttributeError.
	        self.batch_size = 0
	        try:
	            for i in range(self.engine.num_io_tensors):
	                name = self.engine.get_tensor_name(i)
	                is_input = self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
	                dtype = np.dtype(trt.nptype(self.engine.get_tensor_dtype(name)))
	                shape = list(self.engine.get_tensor_shape(name))
	                if any(dim < 0 for dim in shape):
	                    # A negative dimension would turn the byte count below into a negative malloc size.
	                    raise RuntimeError(
	                        "Tensor '{}' has a dynamic shape {}, which is not supported".format(name, shape)
	                    )
	                if is_input:
	                    self.batch_size = shape[0]
	                # Buffer size in bytes: element size times the number of elements.
	                size = dtype.itemsize
	                for dim in shape:
	                    size *= dim
	                allocation = common.cuda_call(cudart.cudaMalloc(size))
	                self.allocations.append(allocation)
	                binding = {
	                    "index": i,
	                    "name": name,
	                    "dtype": dtype,
	                    "shape": shape,
	                    "allocation": allocation,
	                    "size": size,
	                }
	                if is_input:
	                    self.inputs.append(binding)
	                else:
	                    self.outputs.append(binding)
	        except Exception:
	            # Do not leak the buffers that were already allocated when setup fails part-way.
	            self.close()
	            raise

	        if self.batch_size <= 0 or not self.inputs or not self.outputs:
	            self.close()
	            raise RuntimeError("The engine must expose at least one input and one output tensor")

	    def __enter__(self):
	        """Allow `with TensorRTInfer(path) as trt_infer:` usage."""
	        return self

	    def __exit__(self, exc_type, exc_value, traceback):
	        """Release the device buffers when leaving the `with` block; exceptions are not suppressed."""
	        self.close()
	        return False

	    def close(self):
	        """
	        Free the device buffers allocated by the constructor. Safe to call more than once. The instance
	        must not be used for inference afterwards.
	        """
	        allocations = getattr(self, "allocations", [])
	        # Detach the list first so a failure while freeing cannot cause a double free on retry.
	        self.allocations = []
	        for allocation in allocations:
	            # NOTE(review): assumes common.cuda_call accepts cudaFree's error-only result - confirm.
	            common.cuda_call(cudart.cudaFree(allocation))

	    def input_spec(self):
	        """
	        Get the specs for the input tensor of the network. Useful to prepare memory allocations.
	        :return: Two items, the shape of the input tensor and its (numpy) datatype.
	        """
	        return self.inputs[0]["shape"], self.inputs[0]["dtype"]

	    def output_spec(self):
	        """
	        Get the specs for the output tensor of the network. Useful to prepare memory allocations.
	        :return: Two items, the shape of the output tensor and its (numpy) datatype.
	        """
	        return self.outputs[0]["shape"], self.outputs[0]["dtype"]

	    @staticmethod
	    def _rank_scores(output, top):
	        """
	        Turn a 2D score array (one row per image, one column per class) into predictions.
	        :param output: The numpy score array.
	        :param top: How many best classes to keep per row; clamped to the range [0, number of classes].
	        :return: Three items: the best class per row, its score, and a list holding the top class indices
	        and the top scores, both in descending score order.
	        """
	        num_classes = output.shape[1]
	        top = max(0, min(top, num_classes))
	        # Descending order where tied scores keep the lowest class index first, which is the same
	        # tie-break np.argmax uses. This guarantees top_classes[:, 0] == classes. It is done by
	        # stable-sorting the column-reversed scores and reversing the result, which works for any dtype.
	        reversed_order = np.argsort(output[:, ::-1], axis=1, kind="stable")
	        order = (num_classes - 1 - reversed_order)[:, ::-1]
	        top_classes = order[:, :top]
	        top_scores = np.take_along_axis(output, top_classes, axis=1)
	        classes = np.argmax(output, axis=1)
	        scores = np.max(output, axis=1)
	        return classes, scores, [top_classes, top_scores]

	    def infer(self, batch, top=1):
	        """
	        Execute inference on a batch of images. The images should already be batched and preprocessed, as prepared by
	        the ImageBatcher class. Memory copying to and from the GPU device will be performed here.
	        :param batch: A numpy array holding the image batch.
	        :param top: The number of classes to return as top_predictions, in descending order by their score. By default,
	        setting to one will return the same as the maximum score class. Useful for Top-5 accuracy metrics in validation.
	        :return: Three items, as numpy arrays for each batch image: The maximum score class, the corresponding maximum
	        score, and a list of the top N classes and scores.
	        :raises ValueError: If the batch holds more bytes than the input buffer allocated for the engine.
	        """
	        # Prepare the output data
	        out_shape, out_dtype = self.output_spec()
	        output = np.zeros(out_shape, dtype=out_dtype)

	        # The host-to-device copy is sized by the host array, so make sure the array has the engine's
	        # input dtype and cannot overrun the device buffer.
	        spec = self.inputs[0]
	        batch = np.ascontiguousarray(batch, dtype=spec["dtype"])
	        if batch.nbytes > spec["size"]:
	            raise ValueError(
	                "Batch of {} bytes does not fit the engine input buffer of {} bytes".format(
	                    batch.nbytes, spec["size"]
	                )
	            )

	        # Process I/O and execute the network
	        common.memcpy_host_to_device(spec["allocation"], batch)
	        self.context.execute_v2(self.allocations)
	        common.memcpy_device_to_host(output, self.outputs[0]["allocation"])

	        # Process the results
	        return self._rank_scores(output, top)


	def main(args):
	    """
	    Run inference over every image batch and print one result line per image.

	    With args.top == 1 each line holds the image, its best class and the best score. Otherwise each
	    line holds the image, followed by the args.top best classes and then their scores, all joined
	    with args.separator.

	    :param args: Parsed command line arguments providing engine, input, preprocessor, top and separator.
	    :raises ValueError: If args.top exceeds the number of classes returned by the engine.
	    """
	    trt_infer = TensorRTInfer(args.engine)
	    batcher = ImageBatcher(
	        args.input, *trt_infer.input_spec(), preprocessor=args.preprocessor
	    )
	    for batch, images in batcher.get_batch():
	        # Forward the requested top-N. Without it infer() only returns a single column and any
	        # --top value above 1 could never be satisfied.
	        classes, scores, top = trt_infer.infer(batch, top=args.top)
	        top_classes, top_scores = top
	        if args.top > top_classes.shape[1]:
	            raise ValueError(
	                "Requested top {} classes but the engine only provides {}".format(
	                    args.top, top_classes.shape[1]
	                )
	            )
	        for i, image in enumerate(images):
	            if args.top == 1:
	                print(image, classes[i], scores[i], sep=args.separator)
	            else:
	                line = [image]
	                line.extend(str(top_classes[i][t]) for t in range(args.top))
	                line.extend(str(top_scores[i][t]) for t in range(args.top))
	                print(args.separator.join(line))


	if __name__ == "__main__":
	    # Command line interface: the engine and the input are mandatory, everything else has a default.
	    cli = argparse.ArgumentParser()
	    cli.add_argument("-e", "--engine", help="The TensorRT engine to infer with")
	    cli.add_argument(
	        "-i", "--input", help="The input to infer, either a single image path, or a directory of images"
	    )
	    cli.add_argument(
	        "-t", "--top", default=1, type=int,
	        help="The amount of top classes and scores to output per image, default: 1",
	    )
	    cli.add_argument(
	        "-s", "--separator", default="\t",
	        help="Separator to use between columns when printing the results, default: \\t",
	    )
	    cli.add_argument(
	        "-p", "--preprocessor", default="V2", choices=["V1", "V1MS", "V2"],
	        help="Select the image preprocessor to use, either 'V2', 'V1' or 'V1MS', default: V2",
	    )
	    args = cli.parse_args()

	    # Checked by hand rather than with required=True so the full help text is shown when one is missing.
	    if not (args.engine and args.input):
	        cli.print_help()
	        print("\nThese arguments are required: --engine and --input")
	        sys.exit(1)
	    main(args)