// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "net.h"
#include "cpu.h"
#include "datareader.h"
#include "layer_type.h"
#include "modelbin.h"
#include "paramdict.h"
#include <stdarg.h>
#include <stdint.h>
#include <string.h>
#if NCNN_BENCHMARK
#include "benchmark.h"
#endif // NCNN_BENCHMARK
#if NCNN_VULKAN
#include "command.h"
#include "pipelinecache.h"
#endif // NCNN_VULKAN
namespace ncnn {
class NetPrivate
{
public:
NetPrivate(Option& _opt);
Option& opt;
#if NCNN_VULKAN
int upload_model();
#endif // NCNN_VULKAN
friend class Extractor;
int forward_layer(int layer_index, std::vector<Mat>& blob_mats, const Option& opt) const;
#if NCNN_VULKAN
int forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const;
int forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, std::vector<VkImageMat>& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN
int convert_layout(Mat& bottom_blob, const Layer* layer, const Option& opt) const;
int do_forward_layer(const Layer* layer, std::vector<Mat>& blob_mats, const Option& opt) const;
#if NCNN_VULKAN
int do_forward_layer(const Layer* layer, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const;
int do_forward_layer(const Layer* layer, std::vector<VkImageMat>& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN
void update_input_output_indexes();
#if NCNN_STRING
void update_input_output_names();
#endif // NCNN_STRING
std::vector<Blob> blobs;
std::vector<Layer*> layers;
std::vector<int> input_blob_indexes;
std::vector<int> output_blob_indexes;
#if NCNN_STRING
std::vector<const char*> input_blob_names;
std::vector<const char*> output_blob_names;
#endif // NCNN_STRING
std::vector<custom_layer_registry_entry> custom_layer_registry;
std::vector<overwrite_builtin_layer_registry_entry> overwrite_builtin_layer_registry;
PoolAllocator* local_blob_allocator;
PoolAllocator* local_workspace_allocator;
#if NCNN_VULKAN
const VulkanDevice* vkdev;
VkAllocator* weight_vkallocator;
VkAllocator* weight_staging_vkallocator;
PipelineCache* pipeline_cache;
#endif // NCNN_VULKAN
};
NetPrivate::NetPrivate(Option& _opt)
: opt(_opt)
{
local_blob_allocator = 0;
local_workspace_allocator = 0;
#if NCNN_VULKAN
vkdev = 0;
weight_vkallocator = 0;
weight_staging_vkallocator = 0;
pipeline_cache = 0;
#endif // NCNN_VULKAN
}
static Option get_masked_option(const Option& opt, int featmask)
{
// mask out option flags according to the layer-specific featmask
Option opt1 = opt;
opt1.use_fp16_arithmetic = opt1.use_fp16_arithmetic && !(featmask & (1 << 0));
opt1.use_fp16_storage = opt1.use_fp16_storage && !(featmask & (1 << 1));
opt1.use_fp16_packed = opt1.use_fp16_packed && !(featmask & (1 << 1));
opt1.use_bf16_storage = opt1.use_bf16_storage && !(featmask & (1 << 2));
opt1.use_int8_packed = opt1.use_int8_packed && !(featmask & (1 << 3));
opt1.use_int8_storage = opt1.use_int8_storage && !(featmask & (1 << 3));
opt1.use_int8_arithmetic = opt1.use_int8_arithmetic && !(featmask & (1 << 3));
opt1.use_vulkan_compute = opt1.use_vulkan_compute && !(featmask & (1 << 4));
opt1.use_image_storage = opt1.use_image_storage && !(featmask & (1 << 4));
opt1.use_tensor_storage = opt1.use_tensor_storage && !(featmask & (1 << 4));
opt1.use_sgemm_convolution = opt1.use_sgemm_convolution && !(featmask & (1 << 5));
opt1.use_winograd_convolution = opt1.use_winograd_convolution && !(featmask & (1 << 6));
return opt1;
}
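// How featmask is meant to be read (bit positions taken from the masking
// above; the value itself is loaded from param id 31 further below):
// bit 0 -> disable fp16 arithmetic
// bit 1 -> disable fp16 storage / fp16 packed
// bit 2 -> disable bf16 storage
// bit 3 -> disable int8 packed / storage / arithmetic
// bit 4 -> disable vulkan compute and image / tensor storage
// bit 5 -> disable sgemm convolution
// bit 6 -> disable winograd convolution
// e.g. a layer with featmask = (1 << 4) | (1 << 6) is forced onto cpu
// without winograd, while the rest of the net keeps the global options.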
#if NCNN_VULKAN
int NetPrivate::upload_model()
{
ncnn::VkTransfer cmd(vkdev);
// create gpu weight allocators if not provided
if (!weight_vkallocator)
{
weight_vkallocator = new VkWeightAllocator(vkdev);
}
if (!weight_staging_vkallocator)
{
weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev);
}
Option opt_upload = opt;
opt_upload.blob_vkallocator = weight_vkallocator;
opt_upload.workspace_vkallocator = weight_vkallocator;
opt_upload.staging_vkallocator = weight_staging_vkallocator;
for (size_t i = 0; i < layers.size(); i++)
{
if (layers[i]->support_vulkan)
{
int uret = layers[i]->upload_model(cmd, get_masked_option(opt_upload, layers[i]->featmask));
if (uret != 0)
{
NCNN_LOGE("layer upload_model %d failed", (int)i);
return -1;
}
}
}
return cmd.submit_and_wait();
}
#endif // NCNN_VULKAN
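// Demand-driven execution: forward_layer() walks the graph backwards from the
// requested layer, recursively resolving every bottom blob that has not been
// computed yet (dims == 0 marks an empty Mat), then runs the layer itself.
// For a linear net the recursion is simply, as a sketch:
// extract(blob n) -> forward_layer(n) -> forward_layer(n-1) -> ... -> forward_layer(0)
// and each frame returns once its producer has filled blob_mats[bottom_blob_index].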
int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, const Option& opt) const
{
const Layer* layer = layers[layer_index];
// NCNN_LOGE("forward_layer %d %s", layer_index, layer->name.c_str());
// load bottom blobs
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
if (blob_mats[bottom_blob_index].dims == 0)
{
int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt);
if (ret != 0)
return ret;
}
}
#if NCNN_BENCHMARK
double start = get_current_time();
Mat bottom_blob;
if (layer->one_blob_only)
{
int bottom_blob_index = layer->bottoms[0];
bottom_blob.dims = blob_mats[bottom_blob_index].dims;
bottom_blob.w = blob_mats[bottom_blob_index].w;
bottom_blob.h = blob_mats[bottom_blob_index].h;
bottom_blob.c = blob_mats[bottom_blob_index].c;
bottom_blob.elempack = blob_mats[bottom_blob_index].elempack;
bottom_blob.elemsize = blob_mats[bottom_blob_index].elemsize;
}
#endif
int ret = 0;
if (layer->featmask)
{
ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask));
}
else
{
ret = do_forward_layer(layer, blob_mats, opt);
}
#if NCNN_BENCHMARK
double end = get_current_time();
if (layer->one_blob_only)
{
int top_blob_index = layer->tops[0];
benchmark(layer, bottom_blob, blob_mats[top_blob_index], start, end);
}
else
{
benchmark(layer, start, end);
}
#endif
if (ret != 0)
return ret;
// NCNN_LOGE("forward_layer %d %s done", layer_index, layer->name.c_str());
// const Mat& blob = blob_mats[layer->tops[0]];
// NCNN_LOGE("[%-2d %-16s %-16s] %d blobs count = %-3d size = %-3d x %-3d", layer_index, layer->type.c_str(), layer->name.c_str(), layer->tops[0], blob.c, blob.h, blob.w);
return 0;
}
#if NCNN_VULKAN
int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const
{
const Layer* layer = layers[layer_index];
// NCNN_LOGE("forward_layer %d %d %s", layer->support_vulkan, layer_index, layer->name.c_str());
bool cmd_submit_and_wait = false;
// load bottom blobs
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
if (blob_mats_gpu[bottom_blob_index].dims == 0 && blob_mats[bottom_blob_index].dims == 0)
{
int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, cmd, opt);
if (ret != 0)
return ret;
}
if (layer->support_vulkan)
{
if (blob_mats_gpu[bottom_blob_index].dims == 0)
{
// host to buffer
cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt);
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats[bottom_blob_index].release();
}
}
}
else
{
if (blob_mats[bottom_blob_index].dims == 0)
{
Option opt_download = opt;
opt_download.use_packing_layout = layer->support_packing;
// buffer to host
cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt_download);
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats_gpu[bottom_blob_index].release();
}
cmd_submit_and_wait = true;
}
}
}
int ret;
if (cmd_submit_and_wait)
{
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(layer_index * 2);
cmd.get_query_pool_results(0, layer_index * 2, results);
for (int i = 0; i < layer_index; i++)
{
uint64_t start = results[i * 2];
uint64_t end = results[i * 2 + 1];
if (start == 0 || end == 0)
continue;
double duration_us = (end - start) * vkdev->info.timestamp_period() / 1000;
NCNN_LOGE("%-24s %-30s %8.2lfus |", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us);
}
#endif // NCNN_BENCHMARK
cmd.reset();
if (ret != 0)
return ret;
}
if (layer->support_vulkan)
{
#if NCNN_BENCHMARK
cmd.record_write_timestamp(layer_index * 2);
#endif
if (layer->featmask)
{
ret = do_forward_layer(layer, blob_mats_gpu, cmd, get_masked_option(opt, layer->featmask));
}
else
{
ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt);
}
#if NCNN_BENCHMARK
cmd.record_write_timestamp(layer_index * 2 + 1);
#endif
}
else
{
#if NCNN_BENCHMARK
double start = get_current_time();
Mat bottom_blob;
if (layer->one_blob_only)
{
int bottom_blob_index = layer->bottoms[0];
bottom_blob = blob_mats[bottom_blob_index].shape();
}
#endif
if (layer->featmask)
{
ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask));
}
else
{
ret = do_forward_layer(layer, blob_mats, opt);
}
#if NCNN_BENCHMARK
double end = get_current_time();
if (layer->one_blob_only)
{
int top_blob_index = layer->tops[0];
benchmark(layer, bottom_blob, blob_mats[top_blob_index], start, end);
}
else
{
benchmark(layer, start, end);
}
#endif
}
if (ret != 0)
return ret;
// NCNN_LOGE("forward_layer %d %d %s done", layer->support_vulkan, layer_index, layer->name.c_str());
return 0;
}
int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, std::vector<VkImageMat>& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const
{
const Layer* layer = layers[layer_index];
// NCNN_LOGE("forward_layer %d %d %s", layer->support_vulkan, layer_index, layer->name.c_str());
bool cmd_submit_and_wait = false;
bool image_allocation_failed = false;
IMAGE_ALLOCATION_FAILED:
if (image_allocation_failed)
{
#if NCNN_STRING
NCNN_LOGE("forward_layer %d %s image allocation failed, fallback to cpu", layer_index, layer->name.c_str());
#else
NCNN_LOGE("forward_layer %d image allocation failed, fallback to cpu", layer_index);
#endif
}
// load bottom blobs
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
if (blob_mats_gpu_image[bottom_blob_index].dims == 0 && blob_mats_gpu[bottom_blob_index].dims == 0 && blob_mats[bottom_blob_index].dims == 0)
{
int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt);
if (ret != 0)
return ret;
}
if (layer->support_vulkan && !image_allocation_failed)
{
if (layer->support_image_storage)
{
if (blob_mats_gpu_image[bottom_blob_index].dims == 0)
{
if (blob_mats_gpu[bottom_blob_index].dims == 0)
{
// host to image
cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt);
if (blob_mats_gpu_image[bottom_blob_index].empty())
{
image_allocation_failed = true;
goto IMAGE_ALLOCATION_FAILED;
}
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats[bottom_blob_index].release();
}
}
else
{
// buffer to image
cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt);
if (blob_mats_gpu_image[bottom_blob_index].empty())
{
image_allocation_failed = true;
goto IMAGE_ALLOCATION_FAILED;
}
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats_gpu[bottom_blob_index].release();
}
}
}
}
else
{
if (blob_mats_gpu[bottom_blob_index].dims == 0)
{
if (blob_mats_gpu_image[bottom_blob_index].dims == 0)
{
// host to buffer
cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt);
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats[bottom_blob_index].release();
}
}
else
{
// image to buffer
cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt);
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats_gpu_image[bottom_blob_index].release();
}
}
}
}
}
else
{
if (blob_mats[bottom_blob_index].dims == 0)
{
if (blob_mats_gpu_image[bottom_blob_index].dims == 0)
{
// buffer to host
cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt);
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats_gpu[bottom_blob_index].release();
}
cmd_submit_and_wait = true;
}
else
{
// image to host
cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt);
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats_gpu_image[bottom_blob_index].release();
}
cmd_submit_and_wait = true;
}
}
}
}
int ret;
if (cmd_submit_and_wait)
{
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(layer_index * 2);
cmd.get_query_pool_results(0, layer_index * 2, results);
for (int i = 0; i < layer_index; i++)
{
uint64_t start = results[i * 2];
uint64_t end = results[i * 2 + 1];
if (start == 0 || end == 0)
continue;
double duration_us = (end - start) * vkdev->info.timestamp_period() / 1000;
NCNN_LOGE("%-24s %-30s %8.2lfus |", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us);
}
#endif // NCNN_BENCHMARK
cmd.reset();
if (ret != 0)
return ret;
}
if (layer->support_vulkan && !image_allocation_failed)
{
#if NCNN_BENCHMARK
cmd.record_write_timestamp(layer_index * 2);
#endif
if (layer->support_image_storage)
{
if (layer->featmask)
{
ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, get_masked_option(opt, layer->featmask));
}
else
{
ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, opt);
}
if (ret == -100)
{
image_allocation_failed = true;
goto IMAGE_ALLOCATION_FAILED;
}
}
else
{
if (layer->featmask)
{
ret = do_forward_layer(layer, blob_mats_gpu, cmd, get_masked_option(opt, layer->featmask));
}
else
{
ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt);
}
}
#if NCNN_BENCHMARK
cmd.record_write_timestamp(layer_index * 2 + 1);
#endif
}
else
{
#if NCNN_BENCHMARK
double start = get_current_time();
Mat bottom_blob;
if (layer->one_blob_only)
{
int bottom_blob_index = layer->bottoms[0];
bottom_blob = blob_mats[bottom_blob_index].shape();
}
#endif
if (layer->featmask)
{
ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask));
}
else
{
ret = do_forward_layer(layer, blob_mats, opt);
}
#if NCNN_BENCHMARK
double end = get_current_time();
if (layer->one_blob_only)
{
int top_blob_index = layer->tops[0];
benchmark(layer, bottom_blob, blob_mats[top_blob_index], start, end);
}
else
{
benchmark(layer, start, end);
}
#endif
}
if (ret != 0)
return ret;
// NCNN_LOGE("forward_layer %d %d %s done", layer->support_vulkan, layer_index, layer->name.c_str());
return 0;
}
#endif // NCNN_VULKAN
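// Note on the vulkan paths above: a blob can live in up to three places, host
// memory (blob_mats), a device buffer (blob_mats_gpu) and a device image
// (blob_mats_gpu_image). The recorded transfers cover each pair:
// host <-> buffer : cmd.record_upload / cmd.record_download
// host <-> image  : cmd.record_upload / cmd.record_download
// buffer <-> image : cmd.record_buffer_to_image / cmd.record_image_to_buffer
// A download back to host must complete before the consuming cpu layer runs,
// which is why cmd_submit_and_wait is tracked and the command buffer is reset.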
int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Option& opt) const
{
// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp())
{
if (bottom_blob.elembits() == 32 && layer->support_fp16_storage)
{
Mat bottom_blob_fp16;
cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
bottom_blob = bottom_blob_fp16;
}
if (bottom_blob.elembits() == 16 && !layer->support_fp16_storage)
{
Mat bottom_blob_fp32;
cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
}
else
#endif // NCNN_ARM82
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh())
{
if (bottom_blob.elembits() == 32 && layer->support_fp16_storage)
{
Mat bottom_blob_fp16;
cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
bottom_blob = bottom_blob_fp16;
}
if (bottom_blob.elembits() == 16 && !layer->support_fp16_storage)
{
Mat bottom_blob_fp32;
cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
}
else
#endif // NCNN_RVV
#if NCNN_BF16
if (opt.use_bf16_storage)
{
if (bottom_blob.elembits() == 32 && layer->support_bf16_storage)
{
Mat bottom_blob_bf16;
cast_float32_to_bfloat16(bottom_blob, bottom_blob_bf16, opt);
bottom_blob = bottom_blob_bf16;
}
if (bottom_blob.elembits() == 16 && !layer->support_bf16_storage)
{
Mat bottom_blob_fp32;
cast_bfloat16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
}
else
#endif // NCNN_BF16
{
// no type conversion
}
// *INDENT-ON*
// clang-format on
int dst_elempack = 1;
if (opt.use_packing_layout)
{
// resolve dst_elempack
int dims = bottom_blob.dims;
int elemcount = 0;
if (dims == 1) elemcount = bottom_blob.elempack * bottom_blob.w;
if (dims == 2) elemcount = bottom_blob.elempack * bottom_blob.h;
if (dims == 3 || dims == 4) elemcount = bottom_blob.elempack * bottom_blob.c;
int elembits = bottom_blob.elembits();
if (layer->support_packing)
{
if (elembits == 32)
{
#if NCNN_AVX512
if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512())
dst_elempack = 16;
else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
dst_elempack = 8;
else if (elemcount % 4 == 0)
dst_elempack = 4;
#elif NCNN_AVX
if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
dst_elempack = 8;
else if (elemcount % 4 == 0)
dst_elempack = 4;
#elif NCNN_RVV
const int packn = ncnn::cpu_riscv_vlenb() / 4;
if (elemcount % packn == 0)
dst_elempack = packn;
#else
if (elemcount % 4 == 0)
dst_elempack = 4;
#endif
}
if (elembits == 16)
{
#if NCNN_ARM82
if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
dst_elempack = 8;
else if (elemcount % 4 == 0)
dst_elempack = 4;
#elif NCNN_RVV
const int packn = ncnn::cpu_riscv_vlenb() / 2;
if (elemcount % packn == 0)
dst_elempack = packn;
#else
if (elemcount % 4 == 0)
dst_elempack = 4;
#endif
}
if (elembits == 8)
{
#if NCNN_RVV
const int packn = ncnn::cpu_riscv_vlenb() / 1;
if (elemcount % packn == 0)
dst_elempack = packn;
#else
if (elemcount % 8 == 0)
dst_elempack = 8;
#endif
}
}
}
if (bottom_blob.elempack != dst_elempack)
{
Mat bottom_blob_packed;
convert_packing(bottom_blob, bottom_blob_packed, dst_elempack, opt);
bottom_blob = bottom_blob_packed;
}
return 0;
}
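// A sketch of what dst_elempack means, following the resolution logic above:
// elempack is the number of elements interleaved into one packed unit. With
// use_packing_layout enabled, a fp32 blob with 8 channels (c=8, elempack=1)
// becomes c=2 elempack=4 on a plain build, or c=1 elempack=8 when AVX is
// available, so one SIMD register holds a whole packed element group.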
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<Mat>& blob_mats, const Option& opt) const
{
if (layer->one_blob_only)
{
int bottom_blob_index = layer->bottoms[0];
int top_blob_index = layer->tops[0];
Mat& bottom_blob_ref = blob_mats[bottom_blob_index];
Mat bottom_blob;
if (opt.lightmode)
{
// deep copy for inplace forward if data is shared
if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
{
bottom_blob = bottom_blob_ref.clone(opt.blob_allocator);
}
}
if (bottom_blob.dims == 0)
{
bottom_blob = bottom_blob_ref;
}
convert_layout(bottom_blob, layer, opt);
// forward
if (opt.lightmode && layer->support_inplace)
{
Mat& bottom_top_blob = bottom_blob;
int ret = layer->forward_inplace(bottom_top_blob, opt);
if (ret != 0)
return ret;
// store top blob
blob_mats[top_blob_index] = bottom_top_blob;
}
else
{
Mat top_blob;
int ret = layer->forward(bottom_blob, top_blob, opt);
if (ret != 0)
return ret;
// store top blob
blob_mats[top_blob_index] = top_blob;
}
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats[bottom_blob_index].release();
}
}
else
{
std::vector<Mat> bottom_blobs(layer->bottoms.size());
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
Mat& bottom_blob_ref = blob_mats[bottom_blob_index];
bottom_blobs[i].release();
if (opt.lightmode)
{
// deep copy for inplace forward if data is shared
if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
{
bottom_blobs[i] = bottom_blob_ref.clone(opt.blob_allocator);
}
}
if (bottom_blobs[i].dims == 0)
{
bottom_blobs[i] = bottom_blob_ref;
}
convert_layout(bottom_blobs[i], layer, opt);
}
// forward
if (opt.lightmode && layer->support_inplace)
{
std::vector<Mat>& bottom_top_blobs = bottom_blobs;
int ret = layer->forward_inplace(bottom_top_blobs, opt);
if (ret != 0)
return ret;
// store top blobs
for (size_t i = 0; i < layer->tops.size(); i++)
{
int top_blob_index = layer->tops[i];
blob_mats[top_blob_index] = bottom_top_blobs[i];
}
}
else
{
std::vector<Mat> top_blobs(layer->tops.size());
int ret = layer->forward(bottom_blobs, top_blobs, opt);
if (ret != 0)
return ret;
// store top blobs
for (size_t i = 0; i < layer->tops.size(); i++)
{
int top_blob_index = layer->tops[i];
blob_mats[top_blob_index] = top_blobs[i];
}
}
if (opt.lightmode)
{
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
// delete once consumed in light mode
blob_mats[bottom_blob_index].release();
}
}
}
return 0;
}
#if NCNN_VULKAN
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const
{
if (layer->one_blob_only)
{
// load bottom blob
int bottom_blob_index = layer->bottoms[0];
int top_blob_index = layer->tops[0];
VkMat& bottom_blob_ref = blob_mats_gpu[bottom_blob_index];
VkMat bottom_blob;
if (opt.lightmode)
{
// deep copy for inplace forward if data is shared
if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
{
cmd.record_clone(bottom_blob_ref, bottom_blob, opt);
// NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob_ref.buffer(), bottom_blob_ref.buffer_offset(), bottom_blob.buffer(), bottom_blob.buffer_offset());
}
}
if (bottom_blob.dims == 0)
{
bottom_blob = bottom_blob_ref;
}
// forward
if (opt.lightmode && layer->support_inplace)
{
VkMat& bottom_top_blob = bottom_blob;
int ret = layer->forward_inplace(bottom_top_blob, cmd, opt);
if (ret != 0)
return ret;
// store top blob
blob_mats_gpu[top_blob_index] = bottom_top_blob;
}
else
{
VkMat top_blob;
int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
if (ret != 0)
return ret;
// store top blob
blob_mats_gpu[top_blob_index] = top_blob;
}
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats_gpu[bottom_blob_index].release();
}
}
else
{
// load bottom blobs
std::vector<VkMat> bottom_blobs(layer->bottoms.size());
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
VkMat& bottom_blob_ref = blob_mats_gpu[bottom_blob_index];
bottom_blobs[i].release();
if (opt.lightmode)
{
// deep copy for inplace forward if data is shared
if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
{
cmd.record_clone(bottom_blob_ref, bottom_blobs[i], opt);
// NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob_ref.buffer(), bottom_blob_ref.buffer_offset(), bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset());
}
}
if (bottom_blobs[i].dims == 0)
{
bottom_blobs[i] = bottom_blob_ref;
}
}
// forward
if (opt.lightmode && layer->support_inplace)
{
std::vector<VkMat>& bottom_top_blobs = bottom_blobs;
int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt);
if (ret != 0)
return ret;
// store top blobs
for (size_t i = 0; i < layer->tops.size(); i++)
{
int top_blob_index = layer->tops[i];
blob_mats_gpu[top_blob_index] = bottom_top_blobs[i];
}
}
else
{
std::vector<VkMat> top_blobs(layer->tops.size());
int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
if (ret != 0)
return ret;
// store top blobs
for (size_t i = 0; i < layer->tops.size(); i++)
{
int top_blob_index = layer->tops[i];
blob_mats_gpu[top_blob_index] = top_blobs[i];
}
}
if (opt.lightmode)
{
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
// delete once consumed in light mode
blob_mats_gpu[bottom_blob_index].release();
}
}
}
return 0;
}
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<VkImageMat>& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const
{
if (layer->one_blob_only)
{
// load bottom blob
int bottom_blob_index = layer->bottoms[0];
int top_blob_index = layer->tops[0];
VkImageMat& bottom_blob_ref = blob_mats_gpu_image[bottom_blob_index];
VkImageMat bottom_blob;
if (opt.lightmode)
{
// deep copy for inplace forward if data is shared
if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
{
cmd.record_clone(bottom_blob_ref, bottom_blob, opt);
// NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob_ref.buffer(), bottom_blob_ref.buffer_offset(), bottom_blob.buffer(), bottom_blob.buffer_offset());
}
}
if (bottom_blob.dims == 0)
{
bottom_blob = bottom_blob_ref;
}
// forward
if (opt.lightmode && layer->support_inplace)
{
VkImageMat& bottom_top_blob = bottom_blob;
int ret = layer->forward_inplace(bottom_top_blob, cmd, opt);
if (ret != 0)
return ret;
// store top blob
blob_mats_gpu_image[top_blob_index] = bottom_top_blob;
}
else
{
VkImageMat top_blob;
int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
if (ret != 0)
return ret;
// store top blob
blob_mats_gpu_image[top_blob_index] = top_blob;
}
if (opt.lightmode)
{
// delete once consumed in light mode
blob_mats_gpu_image[bottom_blob_index].release();
}
}
else
{
// load bottom blobs
std::vector<VkImageMat> bottom_blobs(layer->bottoms.size());
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
VkImageMat& bottom_blob_ref = blob_mats_gpu_image[bottom_blob_index];
if (opt.lightmode)
{
// deep copy for inplace forward if data is shared
if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
{
cmd.record_clone(bottom_blob_ref, bottom_blobs[i], opt);
// NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob_ref.buffer(), bottom_blob_ref.buffer_offset(), bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset());
}
}
if (bottom_blobs[i].dims == 0)
{
bottom_blobs[i] = bottom_blob_ref;
}
}
// forward
if (opt.lightmode && layer->support_inplace)
{
std::vector<VkImageMat>& bottom_top_blobs = bottom_blobs;
int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt);
if (ret != 0)
return ret;
// store top blobs
for (size_t i = 0; i < layer->tops.size(); i++)
{
int top_blob_index = layer->tops[i];
blob_mats_gpu_image[top_blob_index] = bottom_top_blobs[i];
}
}
else
{
std::vector<VkImageMat> top_blobs(layer->tops.size());
int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
if (ret != 0)
return ret;
// store top blobs
for (size_t i = 0; i < layer->tops.size(); i++)
{
int top_blob_index = layer->tops[i];
blob_mats_gpu_image[top_blob_index] = top_blobs[i];
}
}
if (opt.lightmode)
{
for (size_t i = 0; i < layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
// delete once consumed in light mode
blob_mats_gpu_image[bottom_blob_index].release();
}
}
}
return 0;
}
#endif // NCNN_VULKAN
void NetPrivate::update_input_output_indexes()
{
input_blob_indexes.clear();
output_blob_indexes.clear();
for (size_t i = 0; i < layers.size(); i++)
{
if (layers[i]->typeindex == LayerType::Input)
{
int blob_index = layers[i]->tops[0];
input_blob_indexes.push_back(blob_index);
}
}
for (size_t i = 0; i < blobs.size(); i++)
{
if (blobs[i].producer != -1 && blobs[i].consumer == -1)
{
output_blob_indexes.push_back(i);
}
}
}
#if NCNN_STRING
void NetPrivate::update_input_output_names()
{
input_blob_names.clear();
output_blob_names.clear();
for (size_t i = 0; i < input_blob_indexes.size(); i++)
{
int blob_index = input_blob_indexes[i];
input_blob_names.push_back(blobs[blob_index].name.c_str());
}
for (size_t i = 0; i < output_blob_indexes.size(); i++)
{
int blob_index = output_blob_indexes[i];
output_blob_names.push_back(blobs[blob_index].name.c_str());
}
}
#endif // NCNN_STRING
Net::Net()
: d(new NetPrivate(opt))
{
}
Net::~Net()
{
clear();
delete d;
}
Net::Net(const Net&)
: d(0)
{
}
Net& Net::operator=(const Net&)
{
return *this;
}
#if NCNN_STRING
int Net::register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer, void* userdata)
{
int typeindex = layer_to_index(type);
if (typeindex != -1)
{
NCNN_LOGE("overwrite built-in layer type %s", type);
for (size_t i = 0; i < d->overwrite_builtin_layer_registry.size(); i++)
{
if (d->overwrite_builtin_layer_registry[i].typeindex == typeindex)
{
NCNN_LOGE("overwrite existing overwritten built-in layer index %d", typeindex);
d->overwrite_builtin_layer_registry[i].creator = creator;
d->overwrite_builtin_layer_registry[i].destroyer = destroyer;
d->overwrite_builtin_layer_registry[i].userdata = userdata;
return 0;
}
}
struct overwrite_builtin_layer_registry_entry entry = {typeindex, creator, destroyer, userdata};
d->overwrite_builtin_layer_registry.push_back(entry);
return 0;
}
int custom_index = custom_layer_to_index(type);
if (custom_index == -1)
{
struct custom_layer_registry_entry entry = {type, creator, destroyer, userdata};
d->custom_layer_registry.push_back(entry);
}
else
{
NCNN_LOGE("overwrite existing custom layer type %s", type);
d->custom_layer_registry[custom_index].name = type;
d->custom_layer_registry[custom_index].creator = creator;
d->custom_layer_registry[custom_index].destroyer = destroyer;
d->custom_layer_registry[custom_index].userdata = userdata;
}
return 0;
}
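// Usage sketch (MyLayer and the function names are illustrative):
// ncnn::Layer* MyLayer_creator(void* /*userdata*/) { return new MyLayer; }
// void MyLayer_destroyer(ncnn::Layer* layer, void* /*userdata*/) { delete layer; }
// net.register_custom_layer("MyLayer", MyLayer_creator, MyLayer_destroyer);
// Registering a name that matches a built-in type overwrites the built-in
// implementation, as the typeindex branch above shows.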
#endif // NCNN_STRING
int Net::register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer, void* userdata)
{
int custom_index = index & ~LayerType::CustomBit;
if (index == custom_index)
{
NCNN_LOGE("overwrite built-in layer type %d", index);
for (size_t i = 0; i < d->overwrite_builtin_layer_registry.size(); i++)
{
if (d->overwrite_builtin_layer_registry[i].typeindex == index)
{
NCNN_LOGE("overwrite existing overwritten built-in layer index %d", index);
d->overwrite_builtin_layer_registry[i].creator = creator;
d->overwrite_builtin_layer_registry[i].destroyer = destroyer;
d->overwrite_builtin_layer_registry[i].userdata = userdata;
return 0;
}
}
struct overwrite_builtin_layer_registry_entry entry = {index, creator, destroyer, userdata};
d->overwrite_builtin_layer_registry.push_back(entry);
return 0;
}
if ((int)d->custom_layer_registry.size() <= custom_index)
{
#if NCNN_STRING
struct custom_layer_registry_entry dummy = {"", 0, 0, 0};
#else
struct custom_layer_registry_entry dummy = {0, 0, 0};
#endif // NCNN_STRING
d->custom_layer_registry.resize(custom_index + 1, dummy);
}
if (d->custom_layer_registry[custom_index].creator)
{
NCNN_LOGE("overwrite existing custom layer index %d", custom_index);
}
d->custom_layer_registry[custom_index].creator = creator;
d->custom_layer_registry[custom_index].destroyer = destroyer;
d->custom_layer_registry[custom_index].userdata = userdata;
return 0;
}
#if NCNN_STRING
int Net::load_param(const DataReader& dr)
{
#define SCAN_VALUE(fmt, v) \
if (dr.scan(fmt, &v) != 1) \
{ \
NCNN_LOGE("parse " #v " failed"); \
return -1; \
}
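// The plain-text param format parsed below, as an illustrative example:
// 7767517
// 2 2
// Input data 0 1 data 0=227 1=227 2=3
// Softmax prob 1 1 data prob
// line 1: magic, line 2: layer_count blob_count, then one line per layer with
// type name bottom_count top_count [bottom names] [top names] [id=value params]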
int magic = 0;
SCAN_VALUE("%d", magic)
if (magic != 7767517)
{
NCNN_LOGE("param is too old, please regenerate");
return -1;
}
// parse
int layer_count = 0;
int blob_count = 0;
SCAN_VALUE("%d", layer_count)
SCAN_VALUE("%d", blob_count)
if (layer_count <= 0 || blob_count <= 0)
{
NCNN_LOGE("invalid layer_count or blob_count");
return -1;
}
d->layers.resize((size_t)layer_count);
d->blobs.resize((size_t)blob_count);
#if NCNN_VULKAN
// TODO enable gpu when bf16 conversion implemented
if (opt.use_bf16_storage)
opt.use_vulkan_compute = false;
if (opt.use_vulkan_compute)
{
if (!d->vkdev) d->vkdev = get_gpu_device();
if (!d->vkdev) opt.use_vulkan_compute = false; // no vulkan device, fallback to cpu
}
if (opt.use_vulkan_compute)
{
// sanitize use options
if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
if (d->vkdev->info.bug_buffer_image_load_zero()) opt.use_image_storage = false;
// enable local memory optimization on discrete gpu only
if (d->vkdev->info.type() != 0) opt.use_shader_local_memory = false;
// fp16 arithmetic makes no sense when fp16 storage is disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
}
else
{
// fp16 arithmetic makes no sense when fp16 storage is disabled
if (!opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
}
#endif // NCNN_VULKAN
ParamDict pd;
int blob_index = 0;
for (int i = 0; i < layer_count; i++)
{
char layer_type[256];
char layer_name[256];
int bottom_count = 0;
int top_count = 0;
SCAN_VALUE("%255s", layer_type)
SCAN_VALUE("%255s", layer_name)
SCAN_VALUE("%d", bottom_count)
SCAN_VALUE("%d", top_count)
Layer* layer = create_overwrite_builtin_layer(layer_type);
if (!layer)
{
layer = create_layer(layer_type);
}
if (!layer)
{
layer = create_custom_layer(layer_type);
}
if (!layer)
{
NCNN_LOGE("layer %s not exists or registered", layer_type);
clear();
return -1;
}
#if NCNN_VULKAN
if (opt.use_vulkan_compute)
layer->vkdev = d->vkdev;
#endif // NCNN_VULKAN
layer->type = std::string(layer_type);
layer->name = std::string(layer_name);
// NCNN_LOGE("new layer %d %s", i, layer_name);
layer->bottoms.resize(bottom_count);
for (int j = 0; j < bottom_count; j++)
{
char bottom_name[256];
SCAN_VALUE("%255s", bottom_name)
int bottom_blob_index = find_blob_index_by_name(bottom_name);
if (bottom_blob_index == -1)
{
Blob& blob = d->blobs[blob_index];
bottom_blob_index = blob_index;
blob.name = std::string(bottom_name);
// NCNN_LOGE("new blob %s", bottom_name);
blob_index++;
}
Blob& blob = d->blobs[bottom_blob_index];
blob.consumer = i;
layer->bottoms[j] = bottom_blob_index;
}
layer->tops.resize(top_count);
for (int j = 0; j < top_count; j++)
{
Blob& blob = d->blobs[blob_index];
char blob_name[256];
SCAN_VALUE("%255s", blob_name)
blob.name = std::string(blob_name);
// NCNN_LOGE("new blob %s", blob_name);
blob.producer = i;
layer->tops[j] = blob_index;
blob_index++;
}
// layer specific params
int pdlr = pd.load_param(dr);
if (pdlr != 0)
{
NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str());
continue;
}
if (layer->support_int8_storage)
{
// no int8 gpu support yet
opt.use_vulkan_compute = false;
}
// pull out top shape hints
Mat shape_hints = pd.get(30, Mat());
if (!shape_hints.empty())
{
const int* psh = shape_hints;
for (int j = 0; j < top_count; j++)
{
Blob& blob = d->blobs[layer->tops[j]];
int dims = psh[0];
if (dims == 1)
{
blob.shape = Mat(psh[1], (void*)0, 4u, 1);
}
if (dims == 2)
{
blob.shape = Mat(psh[1], psh[2], (void*)0, 4u, 1);
}
if (dims == 3)
{
blob.shape = Mat(psh[1], psh[2], psh[3], (void*)0, 4u, 1);
}
psh += 4;
}
}
// set bottom and top shape hints
layer->bottom_shapes.resize(bottom_count);
for (int j = 0; j < bottom_count; j++)
{
layer->bottom_shapes[j] = d->blobs[layer->bottoms[j]].shape;
}
layer->top_shapes.resize(top_count);
for (int j = 0; j < top_count; j++)
{
layer->top_shapes[j] = d->blobs[layer->tops[j]].shape;
}
// pull out the layer-specific set of disabled features
layer->featmask = pd.get(31, 0);
int lr = layer->load_param(pd);
if (lr != 0)
{
NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str());
continue;
}
d->layers[i] = layer;
}
d->update_input_output_indexes();
d->update_input_output_names();
#undef SCAN_VALUE
return 0;
}
#endif // NCNN_STRING
int Net::load_param_bin(const DataReader& dr)
{
#define READ_VALUE(buf) \
if (dr.read(&buf, sizeof(buf)) != sizeof(buf)) \
{ \
NCNN_LOGE("read " #buf " failed"); \
return -1; \
}
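// The binary param layout read below mirrors the text format: native-endian
// int32 magic, layer_count and blob_count, then per layer a typeindex,
// bottom_count, top_count, the bottom/top blob indexes, and a binary ParamDict.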
int magic = 0;
READ_VALUE(magic)
if (magic != 7767517)
{
NCNN_LOGE("param is too old, please regenerate");
return -1;
}
int layer_count = 0;
int blob_count = 0;
READ_VALUE(layer_count)
READ_VALUE(blob_count)
if (layer_count <= 0 || blob_count <= 0)
{
NCNN_LOGE("invalid layer_count or blob_count");
return -1;
}
d->layers.resize(layer_count);
d->blobs.resize(blob_count);
#if NCNN_VULKAN
// TODO enable gpu when bf16 conversion implemented
if (opt.use_bf16_storage)
opt.use_vulkan_compute = false;
if (opt.use_vulkan_compute)
{
if (!d->vkdev) d->vkdev = get_gpu_device();
if (!d->vkdev) opt.use_vulkan_compute = false; // no vulkan device, fallback to cpu
}
if (opt.use_vulkan_compute)
{
// sanitize use options
if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
if (d->vkdev->info.bug_buffer_image_load_zero()) opt.use_image_storage = false;
// enable local memory optimization on discrete gpu only
if (d->vkdev->info.type() != 0) opt.use_shader_local_memory = false;
// fp16 arithmetic makes no sense when fp16 storage is disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
}
else
{
// fp16 arithmetic makes no sense when fp16 storage is disabled
if (!opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
}
#endif // NCNN_VULKAN
ParamDict pd;
for (int i = 0; i < layer_count; i++)
{
int typeindex;
int bottom_count;
int top_count;
READ_VALUE(typeindex)
READ_VALUE(bottom_count)
READ_VALUE(top_count)
Layer* layer = create_overwrite_builtin_layer(typeindex);
if (!layer)
{
layer = create_layer(typeindex);
}
if (!layer)
{
int custom_index = typeindex & ~LayerType::CustomBit;
layer = create_custom_layer(custom_index);
}
if (!layer)
{
NCNN_LOGE("layer %d not exists or registered", typeindex);
clear();
return -1;
}
#if NCNN_VULKAN
if (opt.use_vulkan_compute)
layer->vkdev = d->vkdev;
#endif // NCNN_VULKAN
// layer->type = std::string(layer_type);
// layer->name = std::string(layer_name);
// NCNN_LOGE("new layer %d", typeindex);
layer->bottoms.resize(bottom_count);
for (int j = 0; j < bottom_count; j++)
{
int bottom_blob_index;
READ_VALUE(bottom_blob_index)
Blob& blob = d->blobs[bottom_blob_index];
blob.consumer = i;
layer->bottoms[j] = bottom_blob_index;
}
layer->tops.resize(top_count);
for (int j = 0; j < top_count; j++)
{
int top_blob_index;
READ_VALUE(top_blob_index)
Blob& blob = d->blobs[top_blob_index];
// blob.name = std::string(blob_name);
// NCNN_LOGE("new blob %s", blob_name);
blob.producer = i;
layer->tops[j] = top_blob_index;
}
// layer specific params
int pdlr = pd.load_param_bin(dr);
if (pdlr != 0)
{
#if NCNN_STRING
NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str());
#else
NCNN_LOGE("ParamDict load_param %d failed", i);
#endif
continue;
}
if (layer->support_int8_storage)
{
// no int8 gpu support yet
opt.use_vulkan_compute = false;
}
// pull out top blob shape hints
Mat shape_hints = pd.get(30, Mat());
if (!shape_hints.empty())
{
const int* psh = shape_hints;
for (int j = 0; j < top_count; j++)
{
Blob& blob = d->blobs[layer->tops[j]];
int dims = psh[0];
if (dims == 1)
{
blob.shape = Mat(psh[1], (void*)0, 4u, 1);
}
if (dims == 2)
{
blob.shape = Mat(psh[1], psh[2], (void*)0, 4u, 1);
}
if (dims == 3)
{
blob.shape = Mat(psh[1], psh[2], psh[3], (void*)0, 4u, 1);
}
psh += 4;
}
}
// set bottom and top shape hints
layer->bottom_shapes.resize(bottom_count);
for (int j = 0; j < bottom_count; j++)
{
layer->bottom_shapes[j] = d->blobs[layer->bottoms[j]].shape;
}
layer->top_shapes.resize(top_count);
for (int j = 0; j < top_count; j++)
{
layer->top_shapes[j] = d->blobs[layer->tops[j]].shape;
}
// pull out the layer-specific set of disabled features
layer->featmask = pd.get(31, 0);
int lr = layer->load_param(pd);
if (lr != 0)
{
#if NCNN_STRING
NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str());
#else
NCNN_LOGE("layer load_param %d failed", i);
#endif
continue;
}
d->layers[i] = layer;
}
d->update_input_output_indexes();
#undef READ_VALUE
return 0;
}
int Net::load_model(const DataReader& dr)
{
if (d->layers.empty())
{
NCNN_LOGE("network graph not ready");
return -1;
}
int layer_count = (int)d->layers.size();
// load file
int ret = 0;
#if NCNN_VULKAN
if (opt.use_vulkan_compute)
{
if (!opt.pipeline_cache)
{
if (!d->pipeline_cache)
d->pipeline_cache = new PipelineCache(d->vkdev);
opt.pipeline_cache = d->pipeline_cache;
}
}
#endif // NCNN_VULKAN
ModelBinFromDataReader mb(dr);
for (int i = 0; i < layer_count; i++)
{
Layer* layer = d->layers[i];
// a null layer entry here means the parameter file had inconsistent content
if (!layer)
{
NCNN_LOGE("load_model error at layer %d, parameter file has inconsistent content.", i);
ret = -1;
break;
}
int lret = layer->load_model(mb);
if (lret != 0)
{
#if NCNN_STRING
NCNN_LOGE("layer load_model %d %s failed", i, layer->name.c_str());
#else
NCNN_LOGE("layer load_model %d failed", i);
#endif
ret = -1;
break;
}
if (layer->support_int8_storage)
{
// no int8 gpu support yet
opt.use_vulkan_compute = false;
}
Option opt1 = get_masked_option(opt, layer->featmask);
#if NCNN_VULKAN
if (opt1.use_vulkan_compute)
{
if (!layer->support_image_storage) opt1.use_image_storage = false;
}
else
{
layer->vkdev = 0;
layer->support_vulkan = false;
}
#endif // NCNN_VULKAN
int cret = layer->create_pipeline(opt1);
if (cret != 0)
{
#if NCNN_STRING
NCNN_LOGE("layer create_pipeline %d %s failed", i, layer->name.c_str());
#else
NCNN_LOGE("layer create_pipeline %d failed", i);
#endif
ret = -1;
break;
}
}
if (opt.use_local_pool_allocator)
{
if (opt.blob_allocator == 0)
{
if (!d->local_blob_allocator)
{
d->local_blob_allocator = new PoolAllocator;
d->local_blob_allocator->set_size_compare_ratio(0.f);
}
}
if (opt.workspace_allocator == 0)
{
if (!d->local_workspace_allocator)
{
d->local_workspace_allocator = new PoolAllocator;
d->local_workspace_allocator->set_size_compare_ratio(0.f);
}
}
}
#if NCNN_VULKAN
if (ret == 0 && opt.use_vulkan_compute)
{
ret = d->upload_model();
}
#endif // NCNN_VULKAN
return ret;
}
#if NCNN_STDIO
#if NCNN_STRING
int Net::load_param(FILE* fp)
{
DataReaderFromStdio dr(fp);
return load_param(dr);
}
int Net::load_param_mem(const char* _mem)
{
const unsigned char* mem = (const unsigned char*)_mem;
DataReaderFromMemory dr(mem);
return load_param(dr);
}
int Net::load_param(const char* protopath)
{
FILE* fp = fopen(protopath, "rb");
if (!fp)
{
NCNN_LOGE("fopen %s failed", protopath);
return -1;
}
int ret = load_param(fp);
fclose(fp);
return ret;
}
#endif // NCNN_STRING
int Net::load_param_bin(FILE* fp)
{
DataReaderFromStdio dr(fp);
return load_param_bin(dr);
}
int Net::load_param_bin(const char* protopath)
{
FILE* fp = fopen(protopath, "rb");
if (!fp)
{
NCNN_LOGE("fopen %s failed", protopath);
return -1;
}
int ret = load_param_bin(fp);
fclose(fp);
return ret;
}
int Net::load_model(FILE* fp)
{
DataReaderFromStdio dr(fp);
return load_model(dr);
}
int Net::load_model(const char* modelpath)
{
FILE* fp = fopen(modelpath, "rb");
if (!fp)
{
NCNN_LOGE("fopen %s failed", modelpath);
return -1;
}
int ret = load_model(fp);
fclose(fp);
return ret;
}
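// Typical file-based loading sequence (paths are illustrative):
// ncnn::Net net;
// net.opt.use_vulkan_compute = true; // optional, set before load_param
// net.load_param("model.param");
// net.load_model("model.bin");
// load_param must come first; load_model walks the layer graph it created and
// fails with "network graph not ready" otherwise.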
#endif // NCNN_STDIO
int Net::load_param(const unsigned char* _mem)
{
const unsigned char* mem = _mem;
DataReaderFromMemory dr(mem);
load_param_bin(dr);
return static_cast<int>(mem - _mem);
}
int Net::load_model(const unsigned char* _mem)
{
const unsigned char* mem = _mem;
DataReaderFromMemory dr(mem);
load_model(dr);
return static_cast<int>(mem - _mem);
}
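// Both memory overloads return the number of bytes consumed, so one buffer
// holding the binary param directly followed by the weights can be loaded in
// two calls (a sketch; building such a buffer is the caller's responsibility):
// const unsigned char* p = buffer;
// p += net.load_param(p);
// p += net.load_model(p);
// The weight data is typically referenced in place rather than copied, so the
// buffer must stay alive for the lifetime of the Net.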
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
#if NCNN_STRING
int Net::load_param(AAsset* asset)
{
DataReaderFromAndroidAsset dr(asset);
return load_param(dr);
}
int Net::load_param(AAssetManager* mgr, const char* assetpath)
{
AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_BUFFER);
if (!asset)
{
NCNN_LOGE("AAssetManager_open %s failed", assetpath);
return -1;
}
int ret = load_param(asset);
AAsset_close(asset);
return ret;
}
#endif // NCNN_STRING
int Net::load_param_bin(AAsset* asset)
{
DataReaderFromAndroidAsset dr(asset);
return load_param_bin(dr);
}
int Net::load_param_bin(AAssetManager* mgr, const char* assetpath)
{
AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_BUFFER);
if (!asset)
{
NCNN_LOGE("AAssetManager_open %s failed", assetpath);
return -1;
}
int ret = load_param_bin(asset);
AAsset_close(asset);
return ret;
}
int Net::load_model(AAsset* asset)
{
DataReaderFromAndroidAsset dr(asset);
return load_model(dr);
}
int Net::load_model(AAssetManager* mgr, const char* assetpath)
{
AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_STREAMING);
if (!asset)
{
NCNN_LOGE("AAssetManager_open %s failed", assetpath);
return -1;
}
int ret = load_model(asset);
AAsset_close(asset);
return ret;
}
#endif // __ANDROID_API__ >= 9
#endif // NCNN_PLATFORM_API
void Net::clear()
{
d->blobs.clear();
for (size_t i = 0; i < d->layers.size(); i++)
{
Layer* layer = d->layers[i];
Option opt1 = get_masked_option(opt, layer->featmask);
#if NCNN_VULKAN
if (!layer->support_image_storage)
{
opt1.use_image_storage = false;
}
#endif // NCNN_VULKAN
int dret = layer->destroy_pipeline(opt1);
if (dret != 0)
{
NCNN_LOGE("layer destroy_pipeline failed");
// ignore anyway
}
if (layer->typeindex & ncnn::LayerType::CustomBit)
{
int custom_index = layer->typeindex & ~ncnn::LayerType::CustomBit;
if (d->custom_layer_registry[custom_index].destroyer)
{
d->custom_layer_registry[custom_index].destroyer(layer, d->custom_layer_registry[custom_index].userdata);
}
else
{
delete layer;
}
}
else
{
// check overwrite builtin layer destroyer
int index = -1;
const size_t overwrite_builtin_layer_registry_entry_count = d->overwrite_builtin_layer_registry.size();
for (size_t i = 0; i < overwrite_builtin_layer_registry_entry_count; i++)
{
if (d->overwrite_builtin_layer_registry[i].typeindex == layer->typeindex)
{
index = i;
break;
}
}
if (index != -1 && d->overwrite_builtin_layer_registry[index].destroyer)
{
d->overwrite_builtin_layer_registry[index].destroyer(layer, d->overwrite_builtin_layer_registry[index].userdata);
}
else
{
delete layer;
}
}
}
d->layers.clear();
if (d->local_blob_allocator)
{
delete d->local_blob_allocator;
d->local_blob_allocator = 0;
}
if (d->local_workspace_allocator)
{
delete d->local_workspace_allocator;
d->local_workspace_allocator = 0;
}
#if NCNN_VULKAN
if (d->weight_vkallocator)
{
delete d->weight_vkallocator;
d->weight_vkallocator = 0;
}
if (d->weight_staging_vkallocator)
{
delete d->weight_staging_vkallocator;
d->weight_staging_vkallocator = 0;
}
if (d->pipeline_cache)
{
delete d->pipeline_cache;
d->pipeline_cache = 0;
opt.pipeline_cache = 0;
}
#endif // NCNN_VULKAN
}
Extractor Net::create_extractor() const
{
return Extractor(this, d->blobs.size());
}
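// Typical inference sketch ("data" and "prob" are illustrative blob names):
// ncnn::Extractor ex = net.create_extractor();
// ex.input("data", in);
// ncnn::Mat out;
// ex.extract("prob", out);
// Each Extractor owns its blob_mats, so one loaded Net can serve several
// extractors, and any intermediate blob can be pulled with another extract().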
const std::vector<int>& Net::input_indexes() const
{
return d->input_blob_indexes;
}
const std::vector<int>& Net::output_indexes() const
{
return d->output_blob_indexes;
}
#if NCNN_STRING
const std::vector<const char*>& Net::input_names() const
{
return d->input_blob_names;
}
const std::vector<const char*>& Net::output_names() const
{
return d->output_blob_names;
}
#endif
const std::vector<Blob>& Net::blobs() const
{
return d->blobs;
}
const std::vector<Layer*>& Net::layers() const
{
return d->layers;
}
std::vector<Blob>& Net::mutable_blobs()
{
return d->blobs;
}
std::vector<Layer*>& Net::mutable_layers()
{
return d->layers;
}
#if NCNN_VULKAN
void Net::set_vulkan_device(int device_index)
{
d->vkdev = get_gpu_device(device_index);
}
void Net::set_vulkan_device(const VulkanDevice* _vkdev)
{
d->vkdev = _vkdev;
}
const VulkanDevice* Net::vulkan_device() const
{
return d->vkdev;
}
#endif // NCNN_VULKAN
#if NCNN_STRING
int Net::find_blob_index_by_name(const char* name) const
{
for (size_t i = 0; i < d->blobs.size(); i++)
{
const Blob& blob = d->blobs[i];
if (blob.name == name)
{
return static_cast<int>(i);
}
}
NCNN_LOGE("find_blob_index_by_name %s failed", name);
return -1;
}
int Net::find_layer_index_by_name(const char* name) const
{
for (size_t i = 0; i < d->layers.size(); i++)
{
const Layer* layer = d->layers[i];
if (layer->name == name)
{
return static_cast<int>(i);
}
}
NCNN_LOGE("find_layer_index_by_name %s failed", name);
return -1;
}
int Net::custom_layer_to_index(const char* type)
{
const size_t custom_layer_registry_entry_count = d->custom_layer_registry.size();
for (size_t i = 0; i < custom_layer_registry_entry_count; i++)
{
if (strcmp(type, d->custom_layer_registry[i].name) == 0)
return static_cast<int>(i);
}
return -1;
}
Layer* Net::create_custom_layer(const char* type)
{
int index = custom_layer_to_index(type);
if (index == -1)
return 0;
return create_custom_layer(index);
}
Layer* Net::create_overwrite_builtin_layer(const char* type)
{
int typeindex = layer_to_index(type);
if (typeindex == -1)
return 0;
return create_overwrite_builtin_layer(typeindex);
}
#endif // NCNN_STRING
Layer* Net::create_custom_layer(int index)
{
const size_t custom_layer_registry_entry_count = d->custom_layer_registry.size();
if (index < 0 || static_cast<unsigned int>(index) >= custom_layer_registry_entry_count)
return 0;
layer_creator_func layer_creator = d->custom_layer_registry[index].creator;
if (!layer_creator)
return 0;
Layer* layer = layer_creator(d->custom_layer_registry[index].userdata);
layer->typeindex = ncnn::LayerType::CustomBit | index;
return layer;
}
Layer* Net::create_overwrite_builtin_layer(int typeindex)
{
int index = -1;
const size_t overwrite_builtin_layer_registry_entry_count = d->overwrite_builtin_layer_registry.size();
for (size_t i = 0; i < overwrite_builtin_layer_registry_entry_count; i++)
{
if (d->overwrite_builtin_layer_registry[i].typeindex == typeindex)
{
index = i;
break;
}
}
if (index == -1)
return 0;
layer_creator_func layer_creator = d->overwrite_builtin_layer_registry[index].creator;
if (!layer_creator)
return 0;
Layer* layer = layer_creator(d->overwrite_builtin_layer_registry[index].userdata);
layer->typeindex = typeindex;
return layer;
}
class ExtractorPrivate
{
public:
ExtractorPrivate(const Net* _net)
: net(_net)
{
}
const Net* net;
std::vector<Mat> blob_mats;
Option opt;
#if NCNN_VULKAN
VkAllocator* local_blob_vkallocator;
VkAllocator* local_staging_vkallocator;
std::vector<VkMat> blob_mats_gpu;
std::vector<VkImageMat> blob_mats_gpu_image;
#endif // NCNN_VULKAN
};
Extractor::Extractor(const Net* _net, size_t blob_count)
: d(new ExtractorPrivate(_net))
{
d->blob_mats.resize(blob_count);
d->opt = d->net->opt;
#if NCNN_VULKAN
if (d->net->opt.use_vulkan_compute)
{
d->local_blob_vkallocator = 0;
d->local_staging_vkallocator = 0;
d->blob_mats_gpu.resize(blob_count);
d->blob_mats_gpu_image.resize(blob_count);
}
#endif // NCNN_VULKAN
}
Extractor::~Extractor()
{
clear();
delete d;
}
Extractor::Extractor(const Extractor& rhs)
: d(new ExtractorPrivate(0))
{
d->net = rhs.d->net;
d->blob_mats = rhs.d->blob_mats;
d->opt = rhs.d->opt;
#if NCNN_VULKAN
d->local_blob_vkallocator = 0;
d->local_staging_vkallocator = 0;
d->blob_mats_gpu = rhs.d->blob_mats_gpu;
d->blob_mats_gpu_image = rhs.d->blob_mats_gpu_image;
#endif // NCNN_VULKAN
}
Extractor& Extractor::operator=(const Extractor& rhs)
{
if (this == &rhs)
return *this;
d->net = rhs.d->net;
d->blob_mats = rhs.d->blob_mats;
d->opt = rhs.d->opt;
#if NCNN_VULKAN
d->local_blob_vkallocator = 0;
d->local_staging_vkallocator = 0;
d->blob_mats_gpu = rhs.d->blob_mats_gpu;
d->blob_mats_gpu_image = rhs.d->blob_mats_gpu_image;
#endif // NCNN_VULKAN
return *this;
}
void Extractor::clear()
{
d->blob_mats.clear();
#if NCNN_VULKAN
if (d->opt.use_vulkan_compute)
{
d->blob_mats_gpu.clear();
d->blob_mats_gpu_image.clear();
if (d->local_blob_vkallocator)
{
d->net->vulkan_device()->reclaim_blob_allocator(d->local_blob_vkallocator);
}
if (d->local_staging_vkallocator)
{
d->net->vulkan_device()->reclaim_staging_allocator(d->local_staging_vkallocator);
}
}
#endif // NCNN_VULKAN
}
void Extractor::set_light_mode(bool enable)
{
d->opt.lightmode = enable;
}
void Extractor::set_num_threads(int num_threads)
{
d->opt.num_threads = num_threads;
}
void Extractor::set_blob_allocator(Allocator* allocator)
{
d->opt.blob_allocator = allocator;
}
void Extractor::set_workspace_allocator(Allocator* allocator)
{
d->opt.workspace_allocator = allocator;
}
#if NCNN_VULKAN
void Extractor::set_vulkan_compute(bool enable)
{
if (d->net->d->opt.use_vulkan_compute)
{
d->opt.use_vulkan_compute = enable;
}
else
{
NCNN_LOGE("set_vulkan_compute failed, network use_vulkan_compute disabled");
}
}
void Extractor::set_blob_vkallocator(VkAllocator* allocator)
{
d->opt.blob_vkallocator = allocator;
}
void Extractor::set_workspace_vkallocator(VkAllocator* allocator)
{
d->opt.workspace_vkallocator = allocator;
}
void Extractor::set_staging_vkallocator(VkAllocator* allocator)
{
d->opt.staging_vkallocator = allocator;
}
#endif // NCNN_VULKAN
#if NCNN_STRING
int Extractor::input(const char* blob_name, const Mat& in)
{
int blob_index = d->net->find_blob_index_by_name(blob_name);
if (blob_index == -1)
{
NCNN_LOGE("Try");
const std::vector<const char*>& input_names = d->net->input_names();
for (size_t i = 0; i < input_names.size(); i++)
{
NCNN_LOGE(" ex.input(\"%s\", in%d);", input_names[i], (int)i);
}
return -1;
}
return input(blob_index, in);
}
int Extractor::extract(const char* blob_name, Mat& feat, int type)
{
int blob_index = d->net->find_blob_index_by_name(blob_name);
if (blob_index == -1)
{
NCNN_LOGE("Try");
const std::vector<const char*>& output_names = d->net->output_names();
for (size_t i = 0; i < output_names.size(); i++)
{
NCNN_LOGE(" ex.extract(\"%s\", out%d);", output_names[i], (int)i);
}
return -1;
}
return extract(blob_index, feat, type);
}
#endif // NCNN_STRING
int Extractor::input(int blob_index, const Mat& in)
{
if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
return -1;
d->blob_mats[blob_index] = in;
return 0;
}
int Extractor::extract(int blob_index, Mat& feat, int type)
{
if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
return -1;
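// type semantics as implemented below: type == 0 converts the result to the
// canonical representation (fp32, elempack 1) before returning; a non-zero
// type skips those conversions and returns the internal storage as-is.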
int old_blocktime = get_kmp_blocktime();
set_kmp_blocktime(d->opt.openmp_blocktime);
int old_flush_denormals = get_flush_denormals();
set_flush_denormals(d->opt.flush_denormals);
int ret = 0;
if (d->blob_mats[blob_index].dims == 0)
{
int layer_index = d->net->blobs()[blob_index].producer;
// use local allocator
if (d->opt.use_local_pool_allocator)
{
if (!d->opt.blob_allocator)
{
d->opt.blob_allocator = d->net->d->local_blob_allocator;
}
if (!d->opt.workspace_allocator)
{
d->opt.workspace_allocator = d->net->d->local_workspace_allocator;
}
}
#if NCNN_VULKAN
if (d->opt.use_vulkan_compute)
{
// use local allocator
if (!d->opt.blob_vkallocator)
{
d->local_blob_vkallocator = d->net->vulkan_device()->acquire_blob_allocator();
d->opt.blob_vkallocator = d->local_blob_vkallocator;
}
if (!d->opt.workspace_vkallocator)
{
d->opt.workspace_vkallocator = d->opt.blob_vkallocator;
}
if (!d->opt.staging_vkallocator)
{
d->local_staging_vkallocator = d->net->vulkan_device()->acquire_staging_allocator();
d->opt.staging_vkallocator = d->local_staging_vkallocator;
}
ncnn::VkCompute cmd(d->net->vulkan_device());
#if NCNN_BENCHMARK
cmd.create_query_pool(d->net->layers().size() * 2);
#endif // NCNN_BENCHMARK
// TODO vkimagemat for adreno
if (d->opt.use_image_storage)
{
VkImageMat feat_gpu;
ret = extract(blob_index, feat_gpu, cmd);
if (ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
{
cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(d->net->layers().size() * 2);
cmd.get_query_pool_results(0, d->net->layers().size() * 2, results);
for (size_t i = 0; i < d->net->layers().size(); i++)
{
uint64_t start = results[i * 2];
uint64_t end = results[i * 2 + 1];
if (start == 0 || end == 0)
continue;
double duration_us = (end - start) * d->net->vulkan_device()->info.timestamp_period() / 1000;
NCNN_LOGE("%-24s %-30s %8.2lfus |", d->net->layers()[i]->type.c_str(), d->net->layers()[i]->name.c_str(), duration_us);
}
#endif // NCNN_BENCHMARK
}
}
else
{
VkMat feat_gpu;
ret = extract(blob_index, feat_gpu, cmd);
if (ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
{
cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(d->net->layers().size() * 2);
cmd.get_query_pool_results(0, d->net->layers().size() * 2, results);
for (size_t i = 0; i < d->net->layers().size(); i++)
{
uint64_t start = results[i * 2];
uint64_t end = results[i * 2 + 1];
if (start == 0 || end == 0)
continue;
double duration_us = (end - start) * d->net->vulkan_device()->info.timestamp_period() / 1000;
NCNN_LOGE("%-24s %-30s %8.2lfus |", d->net->layers()[i]->type.c_str(), d->net->layers()[i]->name.c_str(), duration_us);
}
#endif // NCNN_BENCHMARK
}
}
}
else
{
ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->opt);
}
#else
ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->opt);
#endif // NCNN_VULKAN
}
feat = d->blob_mats[blob_index];
if (d->opt.use_packing_layout && (type == 0) && feat.elempack != 1)
{
Mat bottom_blob_unpacked;
convert_packing(feat, bottom_blob_unpacked, 1, d->opt);
feat = bottom_blob_unpacked;
}
// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (d->opt.use_fp16_storage && cpu_support_arm_asimdhp() && (type == 0))
{
if (feat.elembits() == 16)
{
Mat feat_fp32;
cast_float16_to_float32(feat, feat_fp32, d->opt);
feat = feat_fp32;
}
}
else
#endif // NCNN_ARM82
#if NCNN_BF16
if (d->opt.use_bf16_storage && (type == 0))
{
if (feat.elembits() == 16)
{
Mat feat_fp32;
cast_bfloat16_to_float32(feat, feat_fp32, d->opt);
feat = feat_fp32;
}
}
else
#endif // NCNN_BF16
if (feat.elembits() == 8 && (type == 0))
{
Mat feat_fp32;
cast_int8_to_float32(feat, feat_fp32, d->opt);
feat = feat_fp32;
}
// *INDENT-ON*
// clang-format on
if (d->opt.use_local_pool_allocator && feat.allocator == d->net->d->local_blob_allocator)
{
// detach the returned mat from local pool allocator
// so the net instance can be destroyed much earlier
feat = feat.clone();
}
set_kmp_blocktime(old_blocktime);
set_flush_denormals(old_flush_denormals);
return ret;
}
#if NCNN_VULKAN
#if NCNN_STRING
int Extractor::input(const char* blob_name, const VkMat& in)
{
int blob_index = d->net->find_blob_index_by_name(blob_name);
if (blob_index == -1)
{
NCNN_LOGE("Try");
const std::vector<const char*>& input_names = d->net->input_names();
for (size_t i = 0; i < input_names.size(); i++)
{
NCNN_LOGE(" ex.input(\"%s\", in%d);", input_names[i], (int)i);
}
return -1;
}
return input(blob_index, in);
}
int Extractor::extract(const char* blob_name, VkMat& feat, VkCompute& cmd)
{
int blob_index = d->net->find_blob_index_by_name(blob_name);
if (blob_index == -1)
{
NCNN_LOGE("Try");
const std::vector<const char*>& output_names = d->net->output_names();
for (size_t i = 0; i < output_names.size(); i++)
{
NCNN_LOGE(" ex.extract(\"%s\", out%d);", output_names[i], (int)i);
}
return -1;
}
return extract(blob_index, feat, cmd);
}
int Extractor::input(const char* blob_name, const VkImageMat& in)
{
int blob_index = d->net->find_blob_index_by_name(blob_name);
if (blob_index == -1)
{
NCNN_LOGE("Try");
const std::vector<const char*>& input_names = d->net->input_names();
for (size_t i = 0; i < input_names.size(); i++)
{
NCNN_LOGE(" ex.input(\"%s\", in%d);", input_names[i], (int)i);
}
return -1;
}
return input(blob_index, in);
}
int Extractor::extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd)
{
int blob_index = d->net->find_blob_index_by_name(blob_name);
if (blob_index == -1)
{
NCNN_LOGE("Try");
const std::vector<const char*>& output_names = d->net->output_names();
for (size_t i = 0; i < output_names.size(); i++)
{
NCNN_LOGE(" ex.extract(\"%s\", out%d);", output_names[i], (int)i);
}
return -1;
}
return extract(blob_index, feat, cmd);
}
#endif // NCNN_STRING
int Extractor::input(int blob_index, const VkMat& in)
{
if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
return -1;
d->blob_mats_gpu[blob_index] = in;
return 0;
}
int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd)
{
if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
return -1;
int old_blocktime = get_kmp_blocktime();
set_kmp_blocktime(d->opt.openmp_blocktime);
int old_flush_denormals = get_flush_denormals();
set_flush_denormals(d->opt.flush_denormals);
int ret = 0;
if (d->blob_mats_gpu[blob_index].dims == 0)
{
if (d->blob_mats_gpu_image[blob_index].dims != 0)
{
// image to buffer
cmd.record_image_to_buffer(d->blob_mats_gpu_image[blob_index], d->blob_mats_gpu[blob_index], d->opt);
}
else if (d->blob_mats[blob_index].dims != 0)
{
// host to buffer
cmd.record_upload(d->blob_mats[blob_index], d->blob_mats_gpu[blob_index], d->opt);
}
else
{
int layer_index = d->net->blobs()[blob_index].producer;
ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->blob_mats_gpu, cmd, d->opt);
}
}
feat = d->blob_mats_gpu[blob_index];
set_kmp_blocktime(old_blocktime);
set_flush_denormals(old_flush_denormals);
return ret;
}
int Extractor::input(int blob_index, const VkImageMat& in)
{
if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
return -1;
d->blob_mats_gpu_image[blob_index] = in;
return 0;
}
int Extractor::extract(int blob_index, VkImageMat& feat, VkCompute& cmd)
{
if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
return -1;
int old_blocktime = get_kmp_blocktime();
set_kmp_blocktime(d->opt.openmp_blocktime);
int old_flush_denormals = get_flush_denormals();
set_flush_denormals(d->opt.flush_denormals);
int ret = 0;
if (d->blob_mats_gpu_image[blob_index].dims == 0)
{
if (d->blob_mats_gpu[blob_index].dims != 0)
{
// buffer to image
cmd.record_buffer_to_image(d->blob_mats_gpu[blob_index], d->blob_mats_gpu_image[blob_index], d->opt);
}
else if (d->blob_mats[blob_index].dims != 0)
{
// host to image
cmd.record_upload(d->blob_mats[blob_index], d->blob_mats_gpu_image[blob_index], d->opt);
}
else
{
int layer_index = d->net->blobs()[blob_index].producer;
ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->blob_mats_gpu, d->blob_mats_gpu_image, cmd, d->opt);
}
}
feat = d->blob_mats_gpu_image[blob_index];
if (feat.empty())
{
NCNN_LOGE("extract %d image allocation failed", blob_index);
ret = -100;
}
set_kmp_blocktime(old_blocktime);
set_flush_denormals(old_flush_denormals);
return ret;
}
#endif // NCNN_VULKAN
} // namespace ncnn