thanks to ncnn ❤

be903e2 over 2 years ago

6.38 kB

	# Tencent is pleased to support the open source community by making ncnn available.
	#
	# Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
	#
	# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
	# in compliance with the License. You may obtain a copy of the License at
	#
	# https://opensource.org/licenses/BSD-3-Clause
	#
	# Unless required by applicable law or agreed to in writing, software distributed
	# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
	# CONDITIONS OF ANY KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations under the License.

	import sys
	import time
	import ncnn

	param_root = "../benchmark/"

	g_warmup_loop_count = 8
	g_loop_count = 4
	g_enable_cooling_down = True

	g_vkdev = None
	g_blob_vkallocator = None
	g_staging_vkallocator = None

	g_blob_pool_allocator = ncnn.UnlockedPoolAllocator()
	g_workspace_pool_allocator = ncnn.PoolAllocator()


	def benchmark(comment, _in, opt):
	_in.fill(0.01)

	g_blob_pool_allocator.clear()
	g_workspace_pool_allocator.clear()

	if opt.use_vulkan_compute:
	g_blob_vkallocator.clear()
	g_staging_vkallocator.clear()

	net = ncnn.Net()
	net.opt = opt

	if net.opt.use_vulkan_compute:
	net.set_vulkan_device(g_vkdev)

	net.load_param(param_root + comment + ".param")

	dr = ncnn.DataReaderFromEmpty()
	net.load_model(dr)

	input_names = net.input_names()
	output_names = net.output_names()

	if g_enable_cooling_down:
	time.sleep(10)

	# warm up
	for i in range(g_warmup_loop_count):
	# test with statement
	with net.create_extractor() as ex:
	ex.input(input_names[0], _in)
	ex.extract(output_names[0])

	time_min = sys.float_info.max
	time_max = -sys.float_info.max
	time_avg = 0.0

	for i in range(g_loop_count):
	start = time.time()

	# test net keep alive until ex freed
	ex = net.create_extractor()
	ex.input(input_names[0], _in)
	ex.extract(output_names[0])

	end = time.time()

	timespan = end - start

	time_min = timespan if timespan < time_min else time_min
	time_max = timespan if timespan > time_max else time_max
	time_avg += timespan

	time_avg /= g_loop_count

	print(
	"%20s min = %7.2f max = %7.2f avg = %7.2f"
	% (comment, time_min * 1000, time_max * 1000, time_avg * 1000)
	)


	if __name__ == "__main__":
	loop_count = 4
	num_threads = ncnn.get_cpu_count()
	powersave = 0
	gpu_device = -1
	cooling_down = 1

	argc = len(sys.argv)
	if argc >= 2:
	loop_count = int(sys.argv[1])
	if argc >= 3:
	num_threads = int(sys.argv[2])
	if argc >= 4:
	powersave = int(sys.argv[3])
	if argc >= 5:
	gpu_device = int(sys.argv[4])
	if argc >= 6:
	cooling_down = int(sys.argv[5])

	use_vulkan_compute = gpu_device != -1

	g_enable_cooling_down = cooling_down != 0

	g_loop_count = loop_count

	g_blob_pool_allocator.set_size_compare_ratio(0.0)
	g_workspace_pool_allocator.set_size_compare_ratio(0.5)

	if use_vulkan_compute:
	g_warmup_loop_count = 10

	g_vkdev = ncnn.get_gpu_device(gpu_device)

	g_blob_vkallocator = ncnn.VkBlobAllocator(g_vkdev)
	g_staging_vkallocator = ncnn.VkStagingAllocator(g_vkdev)

	opt = ncnn.Option()
	opt.lightmode = True
	opt.num_threads = num_threads
	opt.blob_allocator = g_blob_pool_allocator
	opt.workspace_allocator = g_workspace_pool_allocator
	if use_vulkan_compute:
	opt.blob_vkallocator = g_blob_vkallocator
	opt.workspace_vkallocator = g_blob_vkallocator
	opt.staging_vkallocator = g_staging_vkallocator
	opt.use_winograd_convolution = True
	opt.use_sgemm_convolution = True
	opt.use_int8_inference = True
	opt.use_vulkan_compute = use_vulkan_compute
	opt.use_fp16_packed = True
	opt.use_fp16_storage = True
	opt.use_fp16_arithmetic = True
	opt.use_int8_storage = True
	opt.use_int8_arithmetic = True
	opt.use_packing_layout = True
	opt.use_shader_pack8 = False
	opt.use_image_storage = False

	ncnn.set_cpu_powersave(powersave)
	ncnn.set_omp_dynamic(0)
	ncnn.set_omp_num_threads(num_threads)

	print("loop_count =", loop_count)
	print("num_threads =", num_threads)
	print("powersave =", ncnn.get_cpu_powersave())
	print("gpu_device =", gpu_device)
	print("cooling_down =", g_enable_cooling_down)

	benchmark("squeezenet", ncnn.Mat((227, 227, 3)), opt)
	benchmark("squeezenet_int8", ncnn.Mat((227, 227, 3)), opt)
	benchmark("mobilenet", ncnn.Mat((224, 224, 3)), opt)
	benchmark("mobilenet_int8", ncnn.Mat((224, 224, 3)), opt)
	benchmark("mobilenet_v2", ncnn.Mat((224, 224, 3)), opt)
	# benchmark("mobilenet_v2_int8", ncnn.Mat(w=224, h=224, c=3), opt)
	benchmark("mobilenet_v3", ncnn.Mat((224, 224, 3)), opt)
	benchmark("shufflenet", ncnn.Mat((224, 224, 3)), opt)
	benchmark("shufflenet_v2", ncnn.Mat((224, 224, 3)), opt)
	benchmark("mnasnet", ncnn.Mat((224, 224, 3)), opt)
	benchmark("proxylessnasnet", ncnn.Mat((224, 224, 3)), opt)
	benchmark("efficientnet_b0", ncnn.Mat((224, 224, 3)), opt)
	benchmark("regnety_400m", ncnn.Mat((224, 224, 3)), opt)
	benchmark("blazeface", ncnn.Mat((128, 128, 3)), opt)
	benchmark("googlenet", ncnn.Mat((224, 224, 3)), opt)
	benchmark("googlenet_int8", ncnn.Mat((224, 224, 3)), opt)
	benchmark("resnet18", ncnn.Mat((224, 224, 3)), opt)
	benchmark("resnet18_int8", ncnn.Mat((224, 224, 3)), opt)
	benchmark("alexnet", ncnn.Mat((227, 227, 3)), opt)
	benchmark("vgg16", ncnn.Mat((224, 224, 3)), opt)
	benchmark("vgg16_int8", ncnn.Mat((224, 224, 3)), opt)
	benchmark("resnet50", ncnn.Mat((224, 224, 3)), opt)
	benchmark("resnet50_int8", ncnn.Mat((224, 224, 3)), opt)
	benchmark("squeezenet_ssd", ncnn.Mat((300, 300, 3)), opt)
	benchmark("squeezenet_ssd_int8", ncnn.Mat((300, 300, 3)), opt)
	benchmark("mobilenet_ssd", ncnn.Mat((300, 300, 3)), opt)
	benchmark("mobilenet_ssd_int8", ncnn.Mat((300, 300, 3)), opt)
	benchmark("mobilenet_yolo", ncnn.Mat((416, 416, 3)), opt)
	benchmark("mobilenetv2_yolov3", ncnn.Mat((352, 352, 3)), opt)
	benchmark("yolov4-tiny", ncnn.Mat((416, 416, 3)), opt)