// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "net.h"

#include "cpu.h"
#include "datareader.h"
#include "layer_type.h"
#include "modelbin.h"
#include "paramdict.h"

#include <stdarg.h>
#include <stdint.h>
#include <string.h>

#if NCNN_BENCHMARK
#include "benchmark.h"
#endif // NCNN_BENCHMARK

#if NCNN_VULKAN
#include "command.h"
#include "pipelinecache.h"
#endif // NCNN_VULKAN

namespace ncnn {

class NetPrivate
{
public:
    NetPrivate(Option& _opt);

    Option& opt;

#if NCNN_VULKAN
    int upload_model();
#endif // NCNN_VULKAN

    friend class Extractor;
    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, const Option& opt) const;

#if NCNN_VULKAN
    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const;
    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, std::vector<VkImageMat>& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN

    int convert_layout(Mat& bottom_blob, const Layer* layer, const Option& opt) const;

    int do_forward_layer(const Layer* layer, std::vector<Mat>& blob_mats, const Option& opt) const;

#if NCNN_VULKAN
    int do_forward_layer(const Layer* layer, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const;
    int do_forward_layer(const Layer* layer, std::vector<VkImageMat>& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN

    void update_input_output_indexes();
#if NCNN_STRING
    void update_input_output_names();
#endif // NCNN_STRING

    std::vector<Blob> blobs;
    std::vector<Layer*> layers;

    std::vector<int> input_blob_indexes;
    std::vector<int> output_blob_indexes;
#if NCNN_STRING
    std::vector<const char*> input_blob_names;
    std::vector<const char*> output_blob_names;
#endif // NCNN_STRING

    std::vector<custom_layer_registry_entry> custom_layer_registry;
    std::vector<overwrite_builtin_layer_registry_entry> overwrite_builtin_layer_registry;

    PoolAllocator* local_blob_allocator;
    PoolAllocator* local_workspace_allocator;

#if NCNN_VULKAN
    const VulkanDevice* vkdev;

    VkAllocator* weight_vkallocator;
    VkAllocator* weight_staging_vkallocator;

    PipelineCache* pipeline_cache;
#endif // NCNN_VULKAN
};

NetPrivate::NetPrivate(Option& _opt)
    : opt(_opt)
{
    local_blob_allocator = 0;
    local_workspace_allocator = 0;

#if NCNN_VULKAN
    vkdev = 0;
    weight_vkallocator = 0;
    weight_staging_vkallocator = 0;
    pipeline_cache = 0;
#endif // NCNN_VULKAN
}

static Option get_masked_option(const Option& opt, int featmask)
{
    // mask option usage as layer specific featmask
    Option opt1 = opt;
    opt1.use_fp16_arithmetic = opt1.use_fp16_arithmetic && !(featmask & (1 << 0));
    opt1.use_fp16_storage = opt1.use_fp16_storage && !(featmask & (1 << 1));
    opt1.use_fp16_packed = opt1.use_fp16_packed && !(featmask & (1 << 1));
    opt1.use_bf16_storage = opt1.use_bf16_storage && !(featmask & (1 << 2));
    opt1.use_int8_packed = opt1.use_int8_packed && !(featmask & (1 << 3));
    opt1.use_int8_storage = opt1.use_int8_storage && !(featmask & (1 << 3));
    opt1.use_int8_arithmetic = opt1.use_int8_arithmetic && !(featmask & (1 << 3));
    opt1.use_vulkan_compute = opt1.use_vulkan_compute && !(featmask & (1 << 4));
    opt1.use_image_storage = opt1.use_image_storage && !(featmask & (1 << 4));
    opt1.use_tensor_storage = opt1.use_tensor_storage && !(featmask & (1 << 4));
    opt1.use_sgemm_convolution = opt1.use_sgemm_convolution && !(featmask & (1 << 5));
    opt1.use_winograd_convolution = opt1.use_winograd_convolution && !(featmask & (1 << 6));

    return opt1;
}
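// Illustrative sketch (not upstream code): how a per-layer featmask interacts
// with get_masked_option(). The bit assignments follow the code above; the
// param file stores this mask as layer-specific id 31.
//
//     Option opt;
//     opt.use_fp16_storage = true;
//     opt.use_vulkan_compute = true;
//
//     // hypothetical mask: bit 1 disables fp16 storage/packed, bit 4 disables vulkan
//     int featmask = (1 << 1) | (1 << 4);
//
//     Option opt1 = get_masked_option(opt, featmask);
//     // opt1.use_fp16_storage == false, opt1.use_vulkan_compute == false,
//     // every other option is inherited from opt unchanged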
#if NCNN_VULKAN
int NetPrivate::upload_model()
{
    ncnn::VkTransfer cmd(vkdev);

    // create gpu device allocator if null
    if (!weight_vkallocator)
    {
        weight_vkallocator = new VkWeightAllocator(vkdev);
    }
    if (!weight_staging_vkallocator)
    {
        weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev);
    }

    Option opt_upload = opt;
    opt_upload.blob_vkallocator = weight_vkallocator;
    opt_upload.workspace_vkallocator = weight_vkallocator;
    opt_upload.staging_vkallocator = weight_staging_vkallocator;

    for (size_t i = 0; i < layers.size(); i++)
    {
        if (layers[i]->support_vulkan)
        {
            int uret = layers[i]->upload_model(cmd, get_masked_option(opt_upload, layers[i]->featmask));
            if (uret != 0)
            {
                NCNN_LOGE("layer upload_model %d failed", (int)i);
                return -1;
            }
        }
    }

    return cmd.submit_and_wait();
}
#endif // NCNN_VULKAN

int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, const Option& opt) const
{
    const Layer* layer = layers[layer_index];

    // NCNN_LOGE("forward_layer %d %s", layer_index, layer->name.c_str());

    // load bottom blobs
    for (size_t i = 0; i < layer->bottoms.size(); i++)
    {
        int bottom_blob_index = layer->bottoms[i];

        if (blob_mats[bottom_blob_index].dims == 0)
        {
            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt);
            if (ret != 0)
                return ret;
        }
    }

#if NCNN_BENCHMARK
    double start = get_current_time();
    Mat bottom_blob;
    if (layer->one_blob_only)
    {
        int bottom_blob_index = layer->bottoms[0];
        bottom_blob.dims = blob_mats[bottom_blob_index].dims;
        bottom_blob.w = blob_mats[bottom_blob_index].w;
        bottom_blob.h = blob_mats[bottom_blob_index].h;
        bottom_blob.c = blob_mats[bottom_blob_index].c;
        bottom_blob.elempack = blob_mats[bottom_blob_index].elempack;
        bottom_blob.elemsize = blob_mats[bottom_blob_index].elemsize;
    }
#endif

    int ret = 0;
    if (layer->featmask)
    {
        ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask));
    }
    else
    {
        ret = do_forward_layer(layer, blob_mats, opt);
    }

#if NCNN_BENCHMARK
    double end = get_current_time();
    if (layer->one_blob_only)
    {
        int top_blob_index = layer->tops[0];
        benchmark(layer, bottom_blob, blob_mats[top_blob_index], start, end);
    }
    else
    {
        benchmark(layer, start, end);
    }
#endif

    if (ret != 0)
        return ret;

    // NCNN_LOGE("forward_layer %d %s done", layer_index, layer->name.c_str());
    // const Mat& blob = blob_mats[layer->tops[0]];
    // NCNN_LOGE("[%-2d %-16s %-16s] %d blobs count = %-3d size = %-3d x %-3d", layer_index, layer->type.c_str(), layer->name.c_str(), layer->tops[0], blob.c, blob.h, blob.w);

    return 0;
}
#if NCNN_VULKAN
int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const
{
    const Layer* layer = layers[layer_index];

    // NCNN_LOGE("forward_layer %d %d %s", layer->support_vulkan, layer_index, layer->name.c_str());

    bool cmd_submit_and_wait = false;

    // load bottom blobs
    for (size_t i = 0; i < layer->bottoms.size(); i++)
    {
        int bottom_blob_index = layer->bottoms[i];

        if (blob_mats_gpu[bottom_blob_index].dims == 0 && blob_mats[bottom_blob_index].dims == 0)
        {
            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, cmd, opt);
            if (ret != 0)
                return ret;
        }

        if (layer->support_vulkan)
        {
            if (blob_mats_gpu[bottom_blob_index].dims == 0)
            {
                // host to buffer
                cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt);

                if (opt.lightmode)
                {
                    // delete after taken in light mode
                    blob_mats[bottom_blob_index].release();
                }
            }
        }
        else
        {
            if (blob_mats[bottom_blob_index].dims == 0)
            {
                Option opt_download = opt;
                opt_download.use_packing_layout = layer->support_packing;

                // buffer to host
                cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt_download);

                if (opt.lightmode)
                {
                    // delete after taken in light mode
                    blob_mats_gpu[bottom_blob_index].release();
                }

                cmd_submit_and_wait = true;
            }
        }
    }

    int ret;
    if (cmd_submit_and_wait)
    {
        ret = cmd.submit_and_wait();

#if NCNN_BENCHMARK
        std::vector<uint64_t> results(layer_index * 2);
        cmd.get_query_pool_results(0, layer_index * 2, results);
        for (int i = 0; i < layer_index; i++)
        {
            uint64_t start = results[i * 2];
            uint64_t end = results[i * 2 + 1];
            if (start == 0 || end == 0)
                continue;

            double duration_us = (end - start) * vkdev->info.timestamp_period() / 1000;
            NCNN_LOGE("%-24s %-30s %8.2lfus    |", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us);
        }
#endif // NCNN_BENCHMARK

        cmd.reset();

        if (ret != 0)
            return ret;
    }

    if (layer->support_vulkan)
    {
#if NCNN_BENCHMARK
        cmd.record_write_timestamp(layer_index * 2);
#endif
        if (layer->featmask)
        {
            ret = do_forward_layer(layer, blob_mats_gpu, cmd, get_masked_option(opt, layer->featmask));
        }
        else
        {
            ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt);
        }
#if NCNN_BENCHMARK
        cmd.record_write_timestamp(layer_index * 2 + 1);
#endif
    }
    else
    {
#if NCNN_BENCHMARK
        double start = get_current_time();
        Mat bottom_blob;
        if (layer->one_blob_only)
        {
            int bottom_blob_index = layer->bottoms[0];
            bottom_blob = blob_mats[bottom_blob_index].shape();
        }
#endif
        if (layer->featmask)
        {
            ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask));
        }
        else
        {
            ret = do_forward_layer(layer, blob_mats, opt);
        }
#if NCNN_BENCHMARK
        double end = get_current_time();
        if (layer->one_blob_only)
        {
            int top_blob_index = layer->tops[0];
            benchmark(layer, bottom_blob, blob_mats[top_blob_index], start, end);
        }
        else
        {
            benchmark(layer, start, end);
        }
#endif
    }

    if (ret != 0)
        return ret;

    // NCNN_LOGE("forward_layer %d %d %s done", layer->support_vulkan, layer_index, layer->name.c_str());

    return 0;
}
int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, std::vector<VkImageMat>& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const
{
    const Layer* layer = layers[layer_index];

    // NCNN_LOGE("forward_layer %d %d %s", layer->support_vulkan, layer_index, layer->name.c_str());

    bool cmd_submit_and_wait = false;
    bool image_allocation_failed = false;

IMAGE_ALLOCATION_FAILED:

    if (image_allocation_failed)
    {
#if NCNN_STRING
        NCNN_LOGE("forward_layer %d %s image allocation failed, fallback to cpu", layer_index, layer->name.c_str());
#else
        NCNN_LOGE("forward_layer %d image allocation failed, fallback to cpu", layer_index);
#endif
    }

    // load bottom blobs
    for (size_t i = 0; i < layer->bottoms.size(); i++)
    {
        int bottom_blob_index = layer->bottoms[i];

        if (blob_mats_gpu_image[bottom_blob_index].dims == 0 && blob_mats_gpu[bottom_blob_index].dims == 0 && blob_mats[bottom_blob_index].dims == 0)
        {
            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt);
            if (ret != 0)
                return ret;
        }

        if (layer->support_vulkan && !image_allocation_failed)
        {
            if (layer->support_image_storage)
            {
                if (blob_mats_gpu_image[bottom_blob_index].dims == 0)
                {
                    if (blob_mats_gpu[bottom_blob_index].dims == 0)
                    {
                        // host to image
                        cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt);

                        if (blob_mats_gpu_image[bottom_blob_index].empty())
                        {
                            image_allocation_failed = true;
                            goto IMAGE_ALLOCATION_FAILED;
                        }

                        if (opt.lightmode)
                        {
                            // delete after taken in light mode
                            blob_mats[bottom_blob_index].release();
                        }
                    }
                    else
                    {
                        // buffer to image
                        cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt);

                        if (blob_mats_gpu_image[bottom_blob_index].empty())
                        {
                            image_allocation_failed = true;
                            goto IMAGE_ALLOCATION_FAILED;
                        }

                        if (opt.lightmode)
                        {
                            // delete after taken in light mode
                            blob_mats_gpu[bottom_blob_index].release();
                        }
                    }
                }
            }
            else
            {
                if (blob_mats_gpu[bottom_blob_index].dims == 0)
                {
                    if (blob_mats_gpu_image[bottom_blob_index].dims == 0)
                    {
                        // host to buffer
                        cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt);

                        if (opt.lightmode)
                        {
                            // delete after taken in light mode
                            blob_mats[bottom_blob_index].release();
                        }
                    }
                    else
                    {
                        // image to buffer
                        cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt);

                        if (opt.lightmode)
                        {
                            // delete after taken in light mode
                            blob_mats_gpu_image[bottom_blob_index].release();
                        }
                    }
                }
            }
        }
        else
        {
            if (blob_mats[bottom_blob_index].dims == 0)
            {
                if (blob_mats_gpu_image[bottom_blob_index].dims == 0)
                {
                    // buffer to host
                    cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt);

                    if (opt.lightmode)
                    {
                        // delete after taken in light mode
                        blob_mats_gpu[bottom_blob_index].release();
                    }

                    cmd_submit_and_wait = true;
                }
                else
                {
                    // image to host
                    cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt);

                    if (opt.lightmode)
                    {
                        // delete after taken in light mode
                        blob_mats_gpu_image[bottom_blob_index].release();
                    }

                    cmd_submit_and_wait = true;
                }
            }
        }
    }

    int ret;
    if (cmd_submit_and_wait)
    {
        ret = cmd.submit_and_wait();

#if NCNN_BENCHMARK
        std::vector<uint64_t> results(layer_index * 2);
        cmd.get_query_pool_results(0, layer_index * 2, results);
        for (int i = 0; i < layer_index; i++)
        {
            uint64_t start = results[i * 2];
            uint64_t end = results[i * 2 + 1];
            if (start == 0 || end == 0)
                continue;

            double duration_us = (end - start) * vkdev->info.timestamp_period() / 1000;
            NCNN_LOGE("%-24s %-30s %8.2lfus    |", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us);
        }
#endif // NCNN_BENCHMARK

        cmd.reset();

        if (ret != 0)
            return ret;
    }

    if (layer->support_vulkan && !image_allocation_failed)
    {
#if NCNN_BENCHMARK
        cmd.record_write_timestamp(layer_index * 2);
#endif
        if (layer->support_image_storage)
        {
            if (layer->featmask)
            {
                ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, get_masked_option(opt, layer->featmask));
            }
            else
            {
                ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, opt);
            }
            if (ret == -100)
            {
                image_allocation_failed = true;
                goto IMAGE_ALLOCATION_FAILED;
            }
        }
        else
        {
            if (layer->featmask)
            {
                ret = do_forward_layer(layer, blob_mats_gpu, cmd, get_masked_option(opt, layer->featmask));
            }
            else
            {
                ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt);
            }
        }
#if NCNN_BENCHMARK
        cmd.record_write_timestamp(layer_index * 2 + 1);
#endif
    }
    else
    {
#if NCNN_BENCHMARK
        double start = get_current_time();
        Mat bottom_blob;
        if (layer->one_blob_only)
        {
            int bottom_blob_index = layer->bottoms[0];
            bottom_blob = blob_mats[bottom_blob_index].shape();
        }
#endif
        if (layer->featmask)
        {
            ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask));
        }
        else
        {
            ret = do_forward_layer(layer, blob_mats, opt);
        }
#if NCNN_BENCHMARK
        double end = get_current_time();
        if (layer->one_blob_only)
        {
            int top_blob_index = layer->tops[0];
            benchmark(layer, bottom_blob, blob_mats[top_blob_index], start, end);
        }
        else
        {
            benchmark(layer, start, end);
        }
#endif
    }

    if (ret != 0)
        return ret;

    // NCNN_LOGE("forward_layer %d %d %s done", layer->support_vulkan, layer_index, layer->name.c_str());

    return 0;
}
#endif // NCNN_VULKAN

int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Option& opt) const
{
    // clang-format off
    // *INDENT-OFF*
#if NCNN_ARM82
    if (opt.use_fp16_storage && cpu_support_arm_asimdhp())
    {
        if (bottom_blob.elembits() == 32 && layer->support_fp16_storage)
        {
            Mat bottom_blob_fp16;
            cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
            bottom_blob = bottom_blob_fp16;
        }
        if (bottom_blob.elembits() == 16 && !layer->support_fp16_storage)
        {
            Mat bottom_blob_fp32;
            cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
            bottom_blob = bottom_blob_fp32;
        }
    }
    else
#endif // NCNN_ARM82
#if NCNN_RVV
    if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh())
    {
        if (bottom_blob.elembits() == 32 && layer->support_fp16_storage)
        {
            Mat bottom_blob_fp16;
            cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
            bottom_blob = bottom_blob_fp16;
        }
        if (bottom_blob.elembits() == 16 && !layer->support_fp16_storage)
        {
            Mat bottom_blob_fp32;
            cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
            bottom_blob = bottom_blob_fp32;
        }
    }
    else
#endif // NCNN_RVV
#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        if (bottom_blob.elembits() == 32 && layer->support_bf16_storage)
        {
            Mat bottom_blob_bf16;
            cast_float32_to_bfloat16(bottom_blob, bottom_blob_bf16, opt);
            bottom_blob = bottom_blob_bf16;
        }
        if (bottom_blob.elembits() == 16 && !layer->support_bf16_storage)
        {
            Mat bottom_blob_fp32;
            cast_bfloat16_to_float32(bottom_blob, bottom_blob_fp32, opt);
            bottom_blob = bottom_blob_fp32;
        }
    }
    else
#endif // NCNN_BF16
    {
        // no type conversion
    }
    // *INDENT-ON*
    // clang-format on

    int dst_elempack = 1;
    if (opt.use_packing_layout)
    {
        // resolve dst_elempack
        int dims = bottom_blob.dims;
        int elemcount = 0;
        if (dims == 1) elemcount = bottom_blob.elempack * bottom_blob.w;
        if (dims == 2) elemcount = bottom_blob.elempack * bottom_blob.h;
        if (dims == 3 || dims == 4) elemcount = bottom_blob.elempack * bottom_blob.c;

        int elembits = bottom_blob.elembits();

        if (layer->support_packing)
        {
            if (elembits == 32)
            {
#if NCNN_AVX512
                if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512())
                    dst_elempack = 16;
                else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
                    dst_elempack = 8;
                else if (elemcount % 4 == 0)
                    dst_elempack = 4;
#elif NCNN_AVX
                if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
                    dst_elempack = 8;
                else if (elemcount % 4 == 0)
                    dst_elempack = 4;
#elif NCNN_RVV
                const int packn = ncnn::cpu_riscv_vlenb() / 4;
                if (elemcount % packn == 0)
                    dst_elempack = packn;
#else
                if (elemcount % 4 == 0)
                    dst_elempack = 4;
#endif
            }
            if (elembits == 16)
            {
#if NCNN_ARM82
                if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
                    dst_elempack = 8;
                else if (elemcount % 4 == 0)
                    dst_elempack = 4;
#elif NCNN_RVV
                const int packn = ncnn::cpu_riscv_vlenb() / 2;
                if (elemcount % packn == 0)
                    dst_elempack = packn;
#else
                if (elemcount % 4 == 0)
                    dst_elempack = 4;
#endif
            }
            if (elembits == 8)
            {
#if NCNN_RVV
                const int packn = ncnn::cpu_riscv_vlenb() / 1;
                if (elemcount % packn == 0)
                    dst_elempack = packn;
#else
                if (elemcount % 8 == 0)
                    dst_elempack = 8;
#endif
            }
        }
    }

    if (bottom_blob.elempack != dst_elempack)
    {
        Mat bottom_blob_packed;
        convert_packing(bottom_blob, bottom_blob_packed, dst_elempack, opt);
        bottom_blob = bottom_blob_packed;
    }

    return 0;
}
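// Illustrative note (assumption, not upstream code): what convert_layout()
// does to an fp32 blob on an AVX-capable x86 core when opt.use_packing_layout
// is enabled.
//
//     Mat blob(224, 224, 32); // fp32, elempack=1, 32 channels
//     // elemcount = 1 * 32, divisible by 8 -> dst_elempack = 8 on AVX
//     // convert_packing() then yields c=4 with elempack=8, i.e. eight
//     // channel values interleaved per element for vectorized kernels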
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<Mat>& blob_mats, const Option& opt) const
{
    if (layer->one_blob_only)
    {
        int bottom_blob_index = layer->bottoms[0];
        int top_blob_index = layer->tops[0];

        Mat& bottom_blob_ref = blob_mats[bottom_blob_index];
        Mat bottom_blob;

        if (opt.lightmode)
        {
            // deep copy for inplace forward if data is shared
            if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
            {
                bottom_blob = bottom_blob_ref.clone(opt.blob_allocator);
            }
        }
        if (bottom_blob.dims == 0)
        {
            bottom_blob = bottom_blob_ref;
        }

        convert_layout(bottom_blob, layer, opt);

        // forward
        if (opt.lightmode && layer->support_inplace)
        {
            Mat& bottom_top_blob = bottom_blob;
            int ret = layer->forward_inplace(bottom_top_blob, opt);
            if (ret != 0)
                return ret;

            // store top blob
            blob_mats[top_blob_index] = bottom_top_blob;
        }
        else
        {
            Mat top_blob;
            int ret = layer->forward(bottom_blob, top_blob, opt);
            if (ret != 0)
                return ret;

            // store top blob
            blob_mats[top_blob_index] = top_blob;
        }

        if (opt.lightmode)
        {
            // delete after taken in light mode
            blob_mats[bottom_blob_index].release();
        }
    }
    else
    {
        std::vector<Mat> bottom_blobs(layer->bottoms.size());
        for (size_t i = 0; i < layer->bottoms.size(); i++)
        {
            int bottom_blob_index = layer->bottoms[i];

            Mat& bottom_blob_ref = blob_mats[bottom_blob_index];
            bottom_blobs[i].release();

            if (opt.lightmode)
            {
                // deep copy for inplace forward if data is shared
                if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
                {
                    bottom_blobs[i] = bottom_blob_ref.clone(opt.blob_allocator);
                }
            }
            if (bottom_blobs[i].dims == 0)
            {
                bottom_blobs[i] = bottom_blob_ref;
            }

            convert_layout(bottom_blobs[i], layer, opt);
        }

        // forward
        if (opt.lightmode && layer->support_inplace)
        {
            std::vector<Mat>& bottom_top_blobs = bottom_blobs;
            int ret = layer->forward_inplace(bottom_top_blobs, opt);
            if (ret != 0)
                return ret;

            // store top blobs
            for (size_t i = 0; i < layer->tops.size(); i++)
            {
                int top_blob_index = layer->tops[i];
                blob_mats[top_blob_index] = bottom_top_blobs[i];
            }
        }
        else
        {
            std::vector<Mat> top_blobs(layer->tops.size());
            int ret = layer->forward(bottom_blobs, top_blobs, opt);
            if (ret != 0)
                return ret;

            // store top blobs
            for (size_t i = 0; i < layer->tops.size(); i++)
            {
                int top_blob_index = layer->tops[i];
                blob_mats[top_blob_index] = top_blobs[i];
            }
        }

        if (opt.lightmode)
        {
            for (size_t i = 0; i < layer->bottoms.size(); i++)
            {
                int bottom_blob_index = layer->bottoms[i];

                // delete after taken in light mode
                blob_mats[bottom_blob_index].release();
            }
        }
    }

    return 0;
}
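// Illustrative note (assumption, not upstream code): why the refcount check
// above matters. In light mode an inplace-capable layer overwrites its input,
// so a bottom blob whose data is still shared elsewhere (e.g. held by the
// caller or by another branch of the graph) must be deep-copied first.
//
//     Mat a = some_blob;  // refcount 2: blob_mats[] and 'a' share the data
//     // *bottom_blob_ref.refcount != 1 -> clone() before forward_inplace(),
//     // leaving the caller's 'a' untouched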
#if NCNN_VULKAN
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const
{
    if (layer->one_blob_only)
    {
        // load bottom blob
        int bottom_blob_index = layer->bottoms[0];
        int top_blob_index = layer->tops[0];

        VkMat& bottom_blob_ref = blob_mats_gpu[bottom_blob_index];
        VkMat bottom_blob;

        if (opt.lightmode)
        {
            // deep copy for inplace forward if data is shared
            if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
            {
                cmd.record_clone(bottom_blob_ref, bottom_blob, opt);
                // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob_ref.buffer(), bottom_blob_ref.buffer_offset(), bottom_blob.buffer(), bottom_blob.buffer_offset());
            }
        }
        if (bottom_blob.dims == 0)
        {
            bottom_blob = bottom_blob_ref;
        }

        // forward
        if (opt.lightmode && layer->support_inplace)
        {
            VkMat& bottom_top_blob = bottom_blob;
            int ret = layer->forward_inplace(bottom_top_blob, cmd, opt);
            if (ret != 0)
                return ret;

            // store top blob
            blob_mats_gpu[top_blob_index] = bottom_top_blob;
        }
        else
        {
            VkMat top_blob;
            int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
            if (ret != 0)
                return ret;

            // store top blob
            blob_mats_gpu[top_blob_index] = top_blob;
        }

        if (opt.lightmode)
        {
            // delete after taken in light mode
            blob_mats_gpu[bottom_blob_index].release();
        }
    }
    else
    {
        // load bottom blobs
        std::vector<VkMat> bottom_blobs(layer->bottoms.size());
        for (size_t i = 0; i < layer->bottoms.size(); i++)
        {
            int bottom_blob_index = layer->bottoms[i];

            VkMat& bottom_blob_ref = blob_mats_gpu[bottom_blob_index];
            bottom_blobs[i].release();

            if (opt.lightmode)
            {
                // deep copy for inplace forward if data is shared
                if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
                {
                    cmd.record_clone(bottom_blob_ref, bottom_blobs[i], opt);
                    // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob_ref.buffer(), bottom_blob_ref.buffer_offset(), bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset());
                }
            }
            if (bottom_blobs[i].dims == 0)
            {
                bottom_blobs[i] = bottom_blob_ref;
            }
        }

        // forward
        if (opt.lightmode && layer->support_inplace)
        {
            std::vector<VkMat>& bottom_top_blobs = bottom_blobs;
            int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt);
            if (ret != 0)
                return ret;

            // store top blobs
            for (size_t i = 0; i < layer->tops.size(); i++)
            {
                int top_blob_index = layer->tops[i];
                blob_mats_gpu[top_blob_index] = bottom_top_blobs[i];
            }
        }
        else
        {
            std::vector<VkMat> top_blobs(layer->tops.size());
            int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
            if (ret != 0)
                return ret;

            // store top blobs
            for (size_t i = 0; i < layer->tops.size(); i++)
            {
                int top_blob_index = layer->tops[i];
                blob_mats_gpu[top_blob_index] = top_blobs[i];
            }
        }

        if (opt.lightmode)
        {
            for (size_t i = 0; i < layer->bottoms.size(); i++)
            {
                int bottom_blob_index = layer->bottoms[i];

                // delete after taken in light mode
                blob_mats_gpu[bottom_blob_index].release();
            }
        }
    }

    return 0;
}
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<VkImageMat>& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const
{
    if (layer->one_blob_only)
    {
        // load bottom blob
        int bottom_blob_index = layer->bottoms[0];
        int top_blob_index = layer->tops[0];

        VkImageMat& bottom_blob_ref = blob_mats_gpu_image[bottom_blob_index];
        VkImageMat bottom_blob;

        if (opt.lightmode)
        {
            // deep copy for inplace forward if data is shared
            if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
            {
                cmd.record_clone(bottom_blob_ref, bottom_blob, opt);
                // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob_ref.buffer(), bottom_blob_ref.buffer_offset(), bottom_blob.buffer(), bottom_blob.buffer_offset());
            }
        }
        if (bottom_blob.dims == 0)
        {
            bottom_blob = bottom_blob_ref;
        }

        // forward
        if (opt.lightmode && layer->support_inplace)
        {
            VkImageMat& bottom_top_blob = bottom_blob;
            int ret = layer->forward_inplace(bottom_top_blob, cmd, opt);
            if (ret != 0)
                return ret;

            // store top blob
            blob_mats_gpu_image[top_blob_index] = bottom_top_blob;
        }
        else
        {
            VkImageMat top_blob;
            int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
            if (ret != 0)
                return ret;

            // store top blob
            blob_mats_gpu_image[top_blob_index] = top_blob;
        }

        if (opt.lightmode)
        {
            // delete after taken in light mode
            blob_mats_gpu_image[bottom_blob_index].release();
        }
    }
    else
    {
        // load bottom blobs
        std::vector<VkImageMat> bottom_blobs(layer->bottoms.size());
        for (size_t i = 0; i < layer->bottoms.size(); i++)
        {
            int bottom_blob_index = layer->bottoms[i];

            VkImageMat& bottom_blob_ref = blob_mats_gpu_image[bottom_blob_index];

            if (opt.lightmode)
            {
                // deep copy for inplace forward if data is shared
                if (layer->support_inplace && *bottom_blob_ref.refcount != 1)
                {
                    cmd.record_clone(bottom_blob_ref, bottom_blobs[i], opt);
                    // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob_ref.buffer(), bottom_blob_ref.buffer_offset(), bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset());
                }
            }
            if (bottom_blobs[i].dims == 0)
            {
                bottom_blobs[i] = bottom_blob_ref;
            }
        }

        // forward
        if (opt.lightmode && layer->support_inplace)
        {
            std::vector<VkImageMat>& bottom_top_blobs = bottom_blobs;
            int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt);
            if (ret != 0)
                return ret;

            // store top blobs
            for (size_t i = 0; i < layer->tops.size(); i++)
            {
                int top_blob_index = layer->tops[i];
                blob_mats_gpu_image[top_blob_index] = bottom_top_blobs[i];
            }
        }
        else
        {
            std::vector<VkImageMat> top_blobs(layer->tops.size());
            int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
            if (ret != 0)
                return ret;

            // store top blobs
            for (size_t i = 0; i < layer->tops.size(); i++)
            {
                int top_blob_index = layer->tops[i];
                blob_mats_gpu_image[top_blob_index] = top_blobs[i];
            }
        }

        if (opt.lightmode)
        {
            for (size_t i = 0; i < layer->bottoms.size(); i++)
            {
                int bottom_blob_index = layer->bottoms[i];

                // delete after taken in light mode
                blob_mats_gpu_image[bottom_blob_index].release();
            }
        }
    }

    return 0;
}
#endif // NCNN_VULKAN

void NetPrivate::update_input_output_indexes()
{
    input_blob_indexes.clear();
    output_blob_indexes.clear();

    for (size_t i = 0; i < layers.size(); i++)
    {
        if (layers[i]->typeindex == LayerType::Input)
        {
            int blob_index = layers[i]->tops[0];
            input_blob_indexes.push_back(blob_index);
        }
    }

    for (size_t i = 0; i < blobs.size(); i++)
    {
        if (blobs[i].producer != -1 && blobs[i].consumer == -1)
        {
            output_blob_indexes.push_back(i);
        }
    }
}

#if NCNN_STRING
void NetPrivate::update_input_output_names()
{
    input_blob_names.clear();
    output_blob_names.clear();

    for (size_t i = 0; i < input_blob_indexes.size(); i++)
    {
        int blob_index = input_blob_indexes[i];
        input_blob_names.push_back(blobs[blob_index].name.c_str());
    }

    for (size_t i = 0; i < output_blob_indexes.size(); i++)
    {
        int blob_index = output_blob_indexes[i];
        output_blob_names.push_back(blobs[blob_index].name.c_str());
    }
}
#endif // NCNN_STRING

Net::Net()
    : d(new NetPrivate(opt))
{
}

Net::~Net()
{
    clear();

    delete d;
}

Net::Net(const Net&)
    : d(0)
{
}

Net& Net::operator=(const Net&)
{
    return *this;
}

#if NCNN_STRING
int Net::register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer, void* userdata)
{
    int typeindex = layer_to_index(type);
    if (typeindex != -1)
    {
        NCNN_LOGE("overwrite built-in layer type %s", type);

        for (size_t i = 0; i < d->overwrite_builtin_layer_registry.size(); i++)
        {
            if (d->overwrite_builtin_layer_registry[i].typeindex == typeindex)
            {
                NCNN_LOGE("overwrite existing overwritten built-in layer index %d", typeindex);
                d->overwrite_builtin_layer_registry[i].creator = creator;
                d->overwrite_builtin_layer_registry[i].destroyer = destroyer;
                d->overwrite_builtin_layer_registry[i].userdata = userdata;
                return 0;
            }
        }

        struct overwrite_builtin_layer_registry_entry entry = {typeindex, creator, destroyer, userdata};
        d->overwrite_builtin_layer_registry.push_back(entry);
        return 0;
    }

    int custom_index = custom_layer_to_index(type);
    if (custom_index == -1)
    {
        struct custom_layer_registry_entry entry = {type, creator, destroyer, userdata};
        d->custom_layer_registry.push_back(entry);
    }
    else
    {
        NCNN_LOGE("overwrite existing custom layer type %s", type);
        d->custom_layer_registry[custom_index].name = type;
        d->custom_layer_registry[custom_index].creator = creator;
        d->custom_layer_registry[custom_index].destroyer = destroyer;
        d->custom_layer_registry[custom_index].userdata = userdata;
    }

    return 0;
}
#endif // NCNN_STRING
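// Usage sketch (illustrative, not upstream code): registering a custom layer
// before load_param. MyLayer, MyLayer_layer_creator and MyLayer_layer_destroyer
// are hypothetical user-side names following the DEFINE_LAYER_CREATOR /
// DEFINE_LAYER_DESTROYER convention.
//
//     ncnn::Net net;
//     net.register_custom_layer("MyLayer", MyLayer_layer_creator, MyLayer_layer_destroyer);
//     net.load_param("model.param"); // param may now reference type MyLayer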
int Net::register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer, void* userdata)
{
    int custom_index = index & ~LayerType::CustomBit;
    if (index == custom_index)
    {
        NCNN_LOGE("overwrite built-in layer type %d", index);

        for (size_t i = 0; i < d->overwrite_builtin_layer_registry.size(); i++)
        {
            if (d->overwrite_builtin_layer_registry[i].typeindex == index)
            {
                NCNN_LOGE("overwrite existing overwritten built-in layer index %d", index);
                d->overwrite_builtin_layer_registry[i].creator = creator;
                d->overwrite_builtin_layer_registry[i].destroyer = destroyer;
                d->overwrite_builtin_layer_registry[i].userdata = userdata;
                return 0;
            }
        }

        struct overwrite_builtin_layer_registry_entry entry = {index, creator, destroyer, userdata};
        d->overwrite_builtin_layer_registry.push_back(entry);
        return 0;
    }

    if ((int)d->custom_layer_registry.size() <= custom_index)
    {
#if NCNN_STRING
        struct custom_layer_registry_entry dummy = {"", 0, 0, 0};
#else
        struct custom_layer_registry_entry dummy = {0, 0, 0};
#endif // NCNN_STRING
        d->custom_layer_registry.resize(custom_index + 1, dummy);
    }

    if (d->custom_layer_registry[custom_index].creator)
    {
        NCNN_LOGE("overwrite existing custom layer index %d", custom_index);
    }

    d->custom_layer_registry[custom_index].creator = creator;
    d->custom_layer_registry[custom_index].destroyer = destroyer;
    d->custom_layer_registry[custom_index].userdata = userdata;
    return 0;
}

#if NCNN_STRING
int Net::load_param(const DataReader& dr)
{
#define SCAN_VALUE(fmt, v)                \
    if (dr.scan(fmt, &v) != 1)            \
    {                                     \
        NCNN_LOGE("parse " #v " failed"); \
        return -1;                        \
    }

    int magic = 0;
    SCAN_VALUE("%d", magic)
    if (magic != 7767517)
    {
        NCNN_LOGE("param is too old, please regenerate");
        return -1;
    }

    // parse
    int layer_count = 0;
    int blob_count = 0;
    SCAN_VALUE("%d", layer_count)
    SCAN_VALUE("%d", blob_count)
    if (layer_count <= 0 || blob_count <= 0)
    {
        NCNN_LOGE("invalid layer_count or blob_count");
        return -1;
    }

    d->layers.resize((size_t)layer_count);
    d->blobs.resize((size_t)blob_count);

#if NCNN_VULKAN
    // TODO enable gpu when bf16 conversion implemented
    if (opt.use_bf16_storage)
        opt.use_vulkan_compute = false;

    if (opt.use_vulkan_compute)
    {
        if (!d->vkdev) d->vkdev = get_gpu_device();
        if (!d->vkdev) opt.use_vulkan_compute = false; // no vulkan device, fallback to cpu
    }
    if (opt.use_vulkan_compute)
    {
        // sanitize use options
        if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
        if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
        if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
        if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
        if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
        if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;

        if (d->vkdev->info.bug_buffer_image_load_zero()) opt.use_image_storage = false;

        // enable local memory optimization on discrete gpu only
        if (d->vkdev->info.type() != 0) opt.use_shader_local_memory = false;

        // fp16a makes no sense when fp16 storage disabled
        if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
    }
    else
    {
        // fp16a makes no sense when fp16 storage disabled
        if (!opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
    }
#endif // NCNN_VULKAN
    ParamDict pd;

    int blob_index = 0;
    for (int i = 0; i < layer_count; i++)
    {
        char layer_type[256];
        char layer_name[256];
        int bottom_count = 0;
        int top_count = 0;
        SCAN_VALUE("%255s", layer_type)
        SCAN_VALUE("%255s", layer_name)
        SCAN_VALUE("%d", bottom_count)
        SCAN_VALUE("%d", top_count)

        Layer* layer = create_overwrite_builtin_layer(layer_type);
        if (!layer)
        {
            layer = create_layer(layer_type);
        }
        if (!layer)
        {
            layer = create_custom_layer(layer_type);
        }
        if (!layer)
        {
            NCNN_LOGE("layer %s does not exist or is not registered", layer_type);
            clear();
            return -1;
        }

#if NCNN_VULKAN
        if (opt.use_vulkan_compute)
            layer->vkdev = d->vkdev;
#endif // NCNN_VULKAN

        layer->type = std::string(layer_type);
        layer->name = std::string(layer_name);
        // NCNN_LOGE("new layer %d %s", i, layer_name);

        layer->bottoms.resize(bottom_count);
        for (int j = 0; j < bottom_count; j++)
        {
            char bottom_name[256];
            SCAN_VALUE("%255s", bottom_name)

            int bottom_blob_index = find_blob_index_by_name(bottom_name);
            if (bottom_blob_index == -1)
            {
                Blob& blob = d->blobs[blob_index];
                bottom_blob_index = blob_index;
                blob.name = std::string(bottom_name);
                // NCNN_LOGE("new blob %s", bottom_name);
                blob_index++;
            }

            Blob& blob = d->blobs[bottom_blob_index];
            blob.consumer = i;

            layer->bottoms[j] = bottom_blob_index;
        }

        layer->tops.resize(top_count);
        for (int j = 0; j < top_count; j++)
        {
            Blob& blob = d->blobs[blob_index];

            char blob_name[256];
            SCAN_VALUE("%255s", blob_name)

            blob.name = std::string(blob_name);
            // NCNN_LOGE("new blob %s", blob_name);

            blob.producer = i;

            layer->tops[j] = blob_index;

            blob_index++;
        }

        // layer specific params
        int pdlr = pd.load_param(dr);
        if (pdlr != 0)
        {
            NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str());
            continue;
        }

        if (layer->support_int8_storage)
        {
            // no int8 gpu support yet
            opt.use_vulkan_compute = false;
        }

        // pull out top shape hints
        Mat shape_hints = pd.get(30, Mat());
        if (!shape_hints.empty())
        {
            const int* psh = shape_hints;
            for (int j = 0; j < top_count; j++)
            {
                Blob& blob = d->blobs[layer->tops[j]];

                int dims = psh[0];
                if (dims == 1)
                {
                    blob.shape = Mat(psh[1], (void*)0, 4u, 1);
                }
                if (dims == 2)
                {
                    blob.shape = Mat(psh[1], psh[2], (void*)0, 4u, 1);
                }
                if (dims == 3)
                {
                    blob.shape = Mat(psh[1], psh[2], psh[3], (void*)0, 4u, 1);
                }

                psh += 4;
            }
        }

        // set bottom and top shape hints
        layer->bottom_shapes.resize(bottom_count);
        for (int j = 0; j < bottom_count; j++)
        {
            layer->bottom_shapes[j] = d->blobs[layer->bottoms[j]].shape;
        }

        layer->top_shapes.resize(top_count);
        for (int j = 0; j < top_count; j++)
        {
            layer->top_shapes[j] = d->blobs[layer->tops[j]].shape;
        }

        // pull out layer specific feature disabled set
        layer->featmask = pd.get(31, 0);

        int lr = layer->load_param(pd);
        if (lr != 0)
        {
            NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str());
            continue;
        }

        d->layers[i] = layer;
    }

    d->update_input_output_indexes();
    d->update_input_output_names();

#undef SCAN_VALUE
    return 0;
}
#endif // NCNN_STRING
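// Usage sketch (illustrative): the plain-text param loader above is normally
// driven through the FILE*/path overloads defined further below. "model.param"
// and "model.bin" are hypothetical file names.
//
//     ncnn::Net net;
//     if (net.load_param("model.param"))
//         return -1; // non-zero means parse failure
//     if (net.load_model("model.bin"))
//         return -1;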
int Net::load_param_bin(const DataReader& dr)
{
#define READ_VALUE(buf)                            \
    if (dr.read(&buf, sizeof(buf)) != sizeof(buf)) \
    {                                              \
        NCNN_LOGE("read " #buf " failed");         \
        return -1;                                 \
    }

    int magic = 0;
    READ_VALUE(magic)
    if (magic != 7767517)
    {
        NCNN_LOGE("param is too old, please regenerate");
        return -1;
    }

    int layer_count = 0;
    int blob_count = 0;
    READ_VALUE(layer_count)
    READ_VALUE(blob_count)
    if (layer_count <= 0 || blob_count <= 0)
    {
        NCNN_LOGE("invalid layer_count or blob_count");
        return -1;
    }

    d->layers.resize(layer_count);
    d->blobs.resize(blob_count);

#if NCNN_VULKAN
    // TODO enable gpu when bf16 conversion implemented
    if (opt.use_bf16_storage)
        opt.use_vulkan_compute = false;

    if (opt.use_vulkan_compute)
    {
        if (!d->vkdev) d->vkdev = get_gpu_device();
        if (!d->vkdev) opt.use_vulkan_compute = false; // no vulkan device, fallback to cpu
    }
    if (opt.use_vulkan_compute)
    {
        // sanitize use options
        if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
        if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
        if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
        if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
        if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
        if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;

        if (d->vkdev->info.bug_buffer_image_load_zero()) opt.use_image_storage = false;

        // enable local memory optimization on discrete gpu only
        if (d->vkdev->info.type() != 0) opt.use_shader_local_memory = false;

        // fp16a makes no sense when fp16 storage disabled
        if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
    }
    else
    {
        // fp16a makes no sense when fp16 storage disabled
        if (!opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
    }
#endif // NCNN_VULKAN

    ParamDict pd;

    for (int i = 0; i < layer_count; i++)
    {
        int typeindex;
        int bottom_count;
        int top_count;
        READ_VALUE(typeindex)
        READ_VALUE(bottom_count)
        READ_VALUE(top_count)

        Layer* layer = create_overwrite_builtin_layer(typeindex);
        if (!layer)
        {
            layer = create_layer(typeindex);
        }
        if (!layer)
        {
            int custom_index = typeindex & ~LayerType::CustomBit;
            layer = create_custom_layer(custom_index);
        }
        if (!layer)
        {
            NCNN_LOGE("layer %d does not exist or is not registered", typeindex);
            clear();
            return -1;
        }

#if NCNN_VULKAN
        if (opt.use_vulkan_compute)
            layer->vkdev = d->vkdev;
#endif // NCNN_VULKAN

        // layer->type = std::string(layer_type);
        // layer->name = std::string(layer_name);
        // NCNN_LOGE("new layer %d", typeindex);

        layer->bottoms.resize(bottom_count);
        for (int j = 0; j < bottom_count; j++)
        {
            int bottom_blob_index;
            READ_VALUE(bottom_blob_index)

            Blob& blob = d->blobs[bottom_blob_index];
            blob.consumer = i;

            layer->bottoms[j] = bottom_blob_index;
        }

        layer->tops.resize(top_count);
        for (int j = 0; j < top_count; j++)
        {
            int top_blob_index;
            READ_VALUE(top_blob_index)

            Blob& blob = d->blobs[top_blob_index];
            // blob.name = std::string(blob_name);
            // NCNN_LOGE("new blob %s", blob_name);

            blob.producer = i;

            layer->tops[j] = top_blob_index;
        }

        // layer specific params
        int pdlr = pd.load_param_bin(dr);
        if (pdlr != 0)
        {
#if NCNN_STRING
            NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str());
#else
            NCNN_LOGE("ParamDict load_param %d failed", i);
#endif
            continue;
        }

        if (layer->support_int8_storage)
        {
            // no int8 gpu support yet
            opt.use_vulkan_compute = false;
        }

        // pull out top blob shape hints
        Mat shape_hints = pd.get(30, Mat());
        if (!shape_hints.empty())
        {
            const int* psh = shape_hints;
            for (int j = 0; j < top_count; j++)
            {
                Blob& blob = d->blobs[layer->tops[j]];

                int dims = psh[0];
                if (dims == 1)
                {
                    blob.shape = Mat(psh[1], (void*)0, 4u, 1);
                }
                if (dims == 2)
                {
                    blob.shape = Mat(psh[1], psh[2], (void*)0, 4u, 1);
                }
                if (dims == 3)
                {
                    blob.shape = Mat(psh[1], psh[2], psh[3], (void*)0, 4u, 1);
                }

                psh += 4;
            }
        }

        // set bottom and top shape hints
        layer->bottom_shapes.resize(bottom_count);
        for (int j = 0; j < bottom_count; j++)
        {
            layer->bottom_shapes[j] = d->blobs[layer->bottoms[j]].shape;
        }

        layer->top_shapes.resize(top_count);
        for (int j = 0; j < top_count; j++)
        {
            layer->top_shapes[j] = d->blobs[layer->tops[j]].shape;
        }
        // pull out layer specific feature disabled set
        layer->featmask = pd.get(31, 0);

        int lr = layer->load_param(pd);
        if (lr != 0)
        {
#if NCNN_STRING
            NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str());
#else
            NCNN_LOGE("layer load_param %d failed", i);
#endif
            continue;
        }

        d->layers[i] = layer;
    }

    d->update_input_output_indexes();

#undef READ_VALUE
    return 0;
}

int Net::load_model(const DataReader& dr)
{
    if (d->layers.empty())
    {
        NCNN_LOGE("network graph not ready");
        return -1;
    }

    int layer_count = (int)d->layers.size();

    // load file
    int ret = 0;

#if NCNN_VULKAN
    if (opt.use_vulkan_compute)
    {
        if (!opt.pipeline_cache)
        {
            if (!d->pipeline_cache)
                d->pipeline_cache = new PipelineCache(d->vkdev);
            opt.pipeline_cache = d->pipeline_cache;
        }
    }
#endif // NCNN_VULKAN

    ModelBinFromDataReader mb(dr);
    for (int i = 0; i < layer_count; i++)
    {
        Layer* layer = d->layers[i];

        // Here we found inconsistent content in the parameter file.
        if (!layer)
        {
            NCNN_LOGE("load_model error at layer %d, parameter file has inconsistent content.", i);
            ret = -1;
            break;
        }

        int lret = layer->load_model(mb);
        if (lret != 0)
        {
#if NCNN_STRING
            NCNN_LOGE("layer load_model %d %s failed", i, layer->name.c_str());
#else
            NCNN_LOGE("layer load_model %d failed", i);
#endif
            ret = -1;
            break;
        }

        if (layer->support_int8_storage)
        {
            // no int8 gpu support yet
            opt.use_vulkan_compute = false;
        }

        Option opt1 = get_masked_option(opt, layer->featmask);
#if NCNN_VULKAN
        if (opt1.use_vulkan_compute)
        {
            if (!layer->support_image_storage) opt1.use_image_storage = false;
        }
        else
        {
            layer->vkdev = 0;
            layer->support_vulkan = false;
        }
#endif // NCNN_VULKAN

        int cret = layer->create_pipeline(opt1);
        if (cret != 0)
        {
#if NCNN_STRING
            NCNN_LOGE("layer create_pipeline %d %s failed", i, layer->name.c_str());
#else
            NCNN_LOGE("layer create_pipeline %d failed", i);
#endif
            ret = -1;
            break;
        }
    }

    if (opt.use_local_pool_allocator)
    {
        if (opt.blob_allocator == 0)
        {
            if (!d->local_blob_allocator)
            {
                d->local_blob_allocator = new PoolAllocator;
                d->local_blob_allocator->set_size_compare_ratio(0.f);
            }
        }
        if (opt.workspace_allocator == 0)
        {
            if (!d->local_workspace_allocator)
            {
                d->local_workspace_allocator = new PoolAllocator;
                d->local_workspace_allocator->set_size_compare_ratio(0.f);
            }
        }
    }

#if NCNN_VULKAN
    if (ret == 0 && opt.use_vulkan_compute)
    {
        ret = d->upload_model();
    }
#endif // NCNN_VULKAN

    return ret;
}

#if NCNN_STDIO
#if NCNN_STRING
int Net::load_param(FILE* fp)
{
    DataReaderFromStdio dr(fp);
    return load_param(dr);
}

int Net::load_param_mem(const char* _mem)
{
    const unsigned char* mem = (const unsigned char*)_mem;
    DataReaderFromMemory dr(mem);
    return load_param(dr);
}

int Net::load_param(const char* protopath)
{
    FILE* fp = fopen(protopath, "rb");
    if (!fp)
    {
        NCNN_LOGE("fopen %s failed", protopath);
        return -1;
    }

    int ret = load_param(fp);
    fclose(fp);
    return ret;
}
#endif // NCNN_STRING

int Net::load_param_bin(FILE* fp)
{
    DataReaderFromStdio dr(fp);
    return load_param_bin(dr);
}

int Net::load_param_bin(const char* protopath)
{
    FILE* fp = fopen(protopath, "rb");
    if (!fp)
    {
        NCNN_LOGE("fopen %s failed", protopath);
        return -1;
    }

    int ret = load_param_bin(fp);
    fclose(fp);
    return ret;
}

int Net::load_model(FILE* fp)
{
    DataReaderFromStdio dr(fp);
    return load_model(dr);
}

int Net::load_model(const char* modelpath)
{
    FILE* fp = fopen(modelpath, "rb");
    if (!fp)
    {
        NCNN_LOGE("fopen %s failed", modelpath);
        return -1;
    }

    int ret = load_model(fp);
    fclose(fp);
    return ret;
}
#endif // NCNN_STDIO
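// Usage sketch (illustrative): the memory-based loaders below parse the binary
// param format and return the number of bytes consumed rather than an error
// code. To my understanding the buffer passed to load_model() may be
// referenced directly by layer weights, so it should stay alive while the net
// is in use; param_bin_data and model_data are hypothetical in-memory buffers.
//
//     int consumed = net.load_param(param_bin_data);
//     consumed = net.load_model(model_data);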
int Net::load_param(const unsigned char* _mem)
{
    const unsigned char* mem = _mem;
    DataReaderFromMemory dr(mem);
    load_param_bin(dr);
    return static_cast<int>(mem - _mem);
}

int Net::load_model(const unsigned char* _mem)
{
    const unsigned char* mem = _mem;
    DataReaderFromMemory dr(mem);
    load_model(dr);
    return static_cast<int>(mem - _mem);
}

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
#if NCNN_STRING
int Net::load_param(AAsset* asset)
{
    DataReaderFromAndroidAsset dr(asset);
    return load_param(dr);
}

int Net::load_param(AAssetManager* mgr, const char* assetpath)
{
    AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_BUFFER);
    if (!asset)
    {
        NCNN_LOGE("AAssetManager_open %s failed", assetpath);
        return -1;
    }

    int ret = load_param(asset);
    AAsset_close(asset);
    return ret;
}
#endif // NCNN_STRING

int Net::load_param_bin(AAsset* asset)
{
    DataReaderFromAndroidAsset dr(asset);
    return load_param_bin(dr);
}

int Net::load_param_bin(AAssetManager* mgr, const char* assetpath)
{
    AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_BUFFER);
    if (!asset)
    {
        NCNN_LOGE("AAssetManager_open %s failed", assetpath);
        return -1;
    }

    int ret = load_param_bin(asset);
    AAsset_close(asset);
    return ret;
}

int Net::load_model(AAsset* asset)
{
    DataReaderFromAndroidAsset dr(asset);
    return load_model(dr);
}

int Net::load_model(AAssetManager* mgr, const char* assetpath)
{
    AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_STREAMING);
    if (!asset)
    {
        NCNN_LOGE("AAssetManager_open %s failed", assetpath);
        return -1;
    }

    int ret = load_model(asset);
    AAsset_close(asset);
    return ret;
}
#endif // __ANDROID_API__ >= 9
#endif // NCNN_PLATFORM_API

void Net::clear()
{
    d->blobs.clear();
    for (size_t i = 0; i < d->layers.size(); i++)
    {
        Layer* layer = d->layers[i];

        Option opt1 = get_masked_option(opt, layer->featmask);
#if NCNN_VULKAN
        if (!layer->support_image_storage)
        {
            opt1.use_image_storage = false;
        }
#endif // NCNN_VULKAN

        int dret = layer->destroy_pipeline(opt1);
        if (dret != 0)
        {
            NCNN_LOGE("layer destroy_pipeline failed");
            // ignore anyway
        }

        if (layer->typeindex & ncnn::LayerType::CustomBit)
        {
            int custom_index = layer->typeindex & ~ncnn::LayerType::CustomBit;
            if (d->custom_layer_registry[custom_index].destroyer)
            {
                d->custom_layer_registry[custom_index].destroyer(layer, d->custom_layer_registry[custom_index].userdata);
            }
            else
            {
                delete layer;
            }
        }
        else
        {
            // check overwrite builtin layer destroyer
            int index = -1;
            const size_t overwrite_builtin_layer_registry_entry_count = d->overwrite_builtin_layer_registry.size();
            for (size_t j = 0; j < overwrite_builtin_layer_registry_entry_count; j++)
            {
                if (d->overwrite_builtin_layer_registry[j].typeindex == layer->typeindex)
                {
                    index = (int)j;
                    break;
                }
            }

            if (index != -1 && d->overwrite_builtin_layer_registry[index].destroyer)
            {
                d->overwrite_builtin_layer_registry[index].destroyer(layer, d->overwrite_builtin_layer_registry[index].userdata);
            }
            else
            {
                delete layer;
            }
        }
    }
    d->layers.clear();

    if (d->local_blob_allocator)
    {
        delete d->local_blob_allocator;
        d->local_blob_allocator = 0;
    }
    if (d->local_workspace_allocator)
    {
        delete d->local_workspace_allocator;
        d->local_workspace_allocator = 0;
    }

#if NCNN_VULKAN
    if (d->weight_vkallocator)
    {
        delete d->weight_vkallocator;
        d->weight_vkallocator = 0;
    }
    if (d->weight_staging_vkallocator)
    {
        delete d->weight_staging_vkallocator;
        d->weight_staging_vkallocator = 0;
    }
    if (d->pipeline_cache)
    {
        delete d->pipeline_cache;
        d->pipeline_cache = 0;
        opt.pipeline_cache = 0;
    }
#endif // NCNN_VULKAN
}

Extractor Net::create_extractor() const
{
    return Extractor(this, d->blobs.size());
}

const std::vector<int>& Net::input_indexes() const
{
    return d->input_blob_indexes;
}
const std::vector<int>& Net::output_indexes() const
{
    return d->output_blob_indexes;
}

#if NCNN_STRING
const std::vector<const char*>& Net::input_names() const
{
    return d->input_blob_names;
}

const std::vector<const char*>& Net::output_names() const
{
    return d->output_blob_names;
}
#endif

const std::vector<Blob>& Net::blobs() const
{
    return d->blobs;
}

const std::vector<Layer*>& Net::layers() const
{
    return d->layers;
}

std::vector<Blob>& Net::mutable_blobs()
{
    return d->blobs;
}

std::vector<Layer*>& Net::mutable_layers()
{
    return d->layers;
}

#if NCNN_VULKAN
void Net::set_vulkan_device(int device_index)
{
    d->vkdev = get_gpu_device(device_index);
}

void Net::set_vulkan_device(const VulkanDevice* _vkdev)
{
    d->vkdev = _vkdev;
}

const VulkanDevice* Net::vulkan_device() const
{
    return d->vkdev;
}
#endif // NCNN_VULKAN

#if NCNN_STRING
int Net::find_blob_index_by_name(const char* name) const
{
    for (size_t i = 0; i < d->blobs.size(); i++)
    {
        const Blob& blob = d->blobs[i];
        if (blob.name == name)
        {
            return static_cast<int>(i);
        }
    }

    NCNN_LOGE("find_blob_index_by_name %s failed", name);
    return -1;
}

int Net::find_layer_index_by_name(const char* name) const
{
    for (size_t i = 0; i < d->layers.size(); i++)
    {
        const Layer* layer = d->layers[i];
        if (layer->name == name)
        {
            return static_cast<int>(i);
        }
    }

    NCNN_LOGE("find_layer_index_by_name %s failed", name);
    return -1;
}

int Net::custom_layer_to_index(const char* type)
{
    const size_t custom_layer_registry_entry_count = d->custom_layer_registry.size();
    for (size_t i = 0; i < custom_layer_registry_entry_count; i++)
    {
        if (strcmp(type, d->custom_layer_registry[i].name) == 0)
            return static_cast<int>(i);
    }

    return -1;
}

Layer* Net::create_custom_layer(const char* type)
{
    int index = custom_layer_to_index(type);
    if (index == -1)
        return 0;

    return create_custom_layer(index);
}

Layer* Net::create_overwrite_builtin_layer(const char* type)
{
    int typeindex = layer_to_index(type);
    if (typeindex == -1)
        return 0;

    return create_overwrite_builtin_layer(typeindex);
}
#endif // NCNN_STRING

Layer* Net::create_custom_layer(int index)
{
    const size_t custom_layer_registry_entry_count = d->custom_layer_registry.size();
    if (index < 0 || static_cast<size_t>(index) >= custom_layer_registry_entry_count)
        return 0;

    layer_creator_func layer_creator = d->custom_layer_registry[index].creator;
    if (!layer_creator)
        return 0;

    Layer* layer = layer_creator(d->custom_layer_registry[index].userdata);
    layer->typeindex = ncnn::LayerType::CustomBit | index;
    return layer;
}

Layer* Net::create_overwrite_builtin_layer(int typeindex)
{
    int index = -1;
    const size_t overwrite_builtin_layer_registry_entry_count = d->overwrite_builtin_layer_registry.size();
    for (size_t i = 0; i < overwrite_builtin_layer_registry_entry_count; i++)
    {
        if (d->overwrite_builtin_layer_registry[i].typeindex == typeindex)
        {
            index = (int)i;
            break;
        }
    }

    if (index == -1)
        return 0;

    layer_creator_func layer_creator = d->overwrite_builtin_layer_registry[index].creator;
    if (!layer_creator)
        return 0;

    Layer* layer = layer_creator(d->overwrite_builtin_layer_registry[index].userdata);
    layer->typeindex = typeindex;
    return layer;
}
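// Illustrative note (derived from the code above): custom layer type indexes
// are distinguished from built-in ones by LayerType::CustomBit, so for a
// registry slot 'index'
//
//     int typeindex = ncnn::LayerType::CustomBit | index; // mark as custom
//     int slot = typeindex & ~ncnn::LayerType::CustomBit; // recover the index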
class ExtractorPrivate
{
public:
    ExtractorPrivate(const Net* _net)
        : net(_net)
    {
    }

    const Net* net;
    std::vector<Mat> blob_mats;
    Option opt;

#if NCNN_VULKAN
    VkAllocator* local_blob_vkallocator;
    VkAllocator* local_staging_vkallocator;

    std::vector<VkMat> blob_mats_gpu;
    std::vector<VkImageMat> blob_mats_gpu_image;
#endif // NCNN_VULKAN
};

Extractor::Extractor(const Net* _net, size_t blob_count)
    : d(new ExtractorPrivate(_net))
{
    d->blob_mats.resize(blob_count);
    d->opt = d->net->opt;

#if NCNN_VULKAN
    if (d->net->opt.use_vulkan_compute)
    {
        d->local_blob_vkallocator = 0;
        d->local_staging_vkallocator = 0;

        d->blob_mats_gpu.resize(blob_count);
        d->blob_mats_gpu_image.resize(blob_count);
    }
#endif // NCNN_VULKAN
}

Extractor::~Extractor()
{
    clear();

    delete d;
}

Extractor::Extractor(const Extractor& rhs)
    : d(new ExtractorPrivate(0))
{
    d->net = rhs.d->net;
    d->blob_mats = rhs.d->blob_mats;
    d->opt = rhs.d->opt;

#if NCNN_VULKAN
    d->local_blob_vkallocator = 0;
    d->local_staging_vkallocator = 0;

    d->blob_mats_gpu = rhs.d->blob_mats_gpu;
    d->blob_mats_gpu_image = rhs.d->blob_mats_gpu_image;
#endif // NCNN_VULKAN
}

Extractor& Extractor::operator=(const Extractor& rhs)
{
    if (this == &rhs)
        return *this;

    d->net = rhs.d->net;
    d->blob_mats = rhs.d->blob_mats;
    d->opt = rhs.d->opt;

#if NCNN_VULKAN
    d->local_blob_vkallocator = 0;
    d->local_staging_vkallocator = 0;

    d->blob_mats_gpu = rhs.d->blob_mats_gpu;
    d->blob_mats_gpu_image = rhs.d->blob_mats_gpu_image;
#endif // NCNN_VULKAN

    return *this;
}

void Extractor::clear()
{
    d->blob_mats.clear();

#if NCNN_VULKAN
    if (d->opt.use_vulkan_compute)
    {
        d->blob_mats_gpu.clear();
        d->blob_mats_gpu_image.clear();

        if (d->local_blob_vkallocator)
        {
            d->net->vulkan_device()->reclaim_blob_allocator(d->local_blob_vkallocator);
        }
        if (d->local_staging_vkallocator)
        {
            d->net->vulkan_device()->reclaim_staging_allocator(d->local_staging_vkallocator);
        }
    }
#endif // NCNN_VULKAN
}

void Extractor::set_light_mode(bool enable)
{
    d->opt.lightmode = enable;
}

void Extractor::set_num_threads(int num_threads)
{
    d->opt.num_threads = num_threads;
}

void Extractor::set_blob_allocator(Allocator* allocator)
{
    d->opt.blob_allocator = allocator;
}

void Extractor::set_workspace_allocator(Allocator* allocator)
{
    d->opt.workspace_allocator = allocator;
}

#if NCNN_VULKAN
void Extractor::set_vulkan_compute(bool enable)
{
    if (d->net->d->opt.use_vulkan_compute)
    {
        d->opt.use_vulkan_compute = enable;
    }
    else
    {
        NCNN_LOGE("set_vulkan_compute failed, network use_vulkan_compute disabled");
    }
}

void Extractor::set_blob_vkallocator(VkAllocator* allocator)
{
    d->opt.blob_vkallocator = allocator;
}

void Extractor::set_workspace_vkallocator(VkAllocator* allocator)
{
    d->opt.workspace_vkallocator = allocator;
}

void Extractor::set_staging_vkallocator(VkAllocator* allocator)
{
    d->opt.staging_vkallocator = allocator;
}
#endif // NCNN_VULKAN

#if NCNN_STRING
int Extractor::input(const char* blob_name, const Mat& in)
{
    int blob_index = d->net->find_blob_index_by_name(blob_name);
    if (blob_index == -1)
    {
        NCNN_LOGE("Try");
        const std::vector<const char*>& input_names = d->net->input_names();
        for (size_t i = 0; i < input_names.size(); i++)
        {
            NCNN_LOGE("    ex.input(\"%s\", in%d);", input_names[i], (int)i);
        }

        return -1;
    }

    return input(blob_index, in);
}

int Extractor::extract(const char* blob_name, Mat& feat, int type)
{
    int blob_index = d->net->find_blob_index_by_name(blob_name);
    if (blob_index == -1)
    {
        NCNN_LOGE("Try");
        const std::vector<const char*>& output_names = d->net->output_names();
        for (size_t i = 0; i < output_names.size(); i++)
        {
            NCNN_LOGE("    ex.extract(\"%s\", out%d);", output_names[i], (int)i);
        }

        return -1;
    }

    return extract(blob_index, feat, type);
}
#endif // NCNN_STRING

int Extractor::input(int blob_index, const Mat& in)
{
    if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
        return -1;

    d->blob_mats[blob_index] = in;

    return 0;
}
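// Usage sketch (illustrative): the typical extraction flow served by the
// methods below. Blob names "data" and "prob" are hypothetical.
//
//     ncnn::Extractor ex = net.create_extractor();
//     ex.input("data", in);    // in is a prepared ncnn::Mat
//     ncnn::Mat out;
//     ex.extract("prob", out); // runs forward_layer up to the producing layer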
int Extractor::extract(int blob_index, Mat& feat, int type)
{
    if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
        return -1;

    int old_blocktime = get_kmp_blocktime();
    set_kmp_blocktime(d->opt.openmp_blocktime);

    int old_flush_denormals = get_flush_denormals();
    set_flush_denormals(d->opt.flush_denormals);

    int ret = 0;

    if (d->blob_mats[blob_index].dims == 0)
    {
        int layer_index = d->net->blobs()[blob_index].producer;

        // use local allocator
        if (d->opt.use_local_pool_allocator)
        {
            if (!d->opt.blob_allocator)
            {
                d->opt.blob_allocator = d->net->d->local_blob_allocator;
            }
            if (!d->opt.workspace_allocator)
            {
                d->opt.workspace_allocator = d->net->d->local_workspace_allocator;
            }
        }

#if NCNN_VULKAN
        if (d->opt.use_vulkan_compute)
        {
            // use local allocator
            if (!d->opt.blob_vkallocator)
            {
                d->local_blob_vkallocator = d->net->vulkan_device()->acquire_blob_allocator();
                d->opt.blob_vkallocator = d->local_blob_vkallocator;
            }
            if (!d->opt.workspace_vkallocator)
            {
                d->opt.workspace_vkallocator = d->opt.blob_vkallocator;
            }
            if (!d->opt.staging_vkallocator)
            {
                d->local_staging_vkallocator = d->net->vulkan_device()->acquire_staging_allocator();
                d->opt.staging_vkallocator = d->local_staging_vkallocator;
            }

            ncnn::VkCompute cmd(d->net->vulkan_device());
#if NCNN_BENCHMARK
            cmd.create_query_pool(d->net->layers().size() * 2);
#endif // NCNN_BENCHMARK

            // TODO vkimagemat for adreno
            if (d->opt.use_image_storage)
            {
                VkImageMat feat_gpu;
                ret = extract(blob_index, feat_gpu, cmd);

                if (ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
                {
                    cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);

                    ret = cmd.submit_and_wait();

#if NCNN_BENCHMARK
                    std::vector<uint64_t> results(d->net->layers().size() * 2);
                    cmd.get_query_pool_results(0, d->net->layers().size() * 2, results);
                    for (size_t i = 0; i < d->net->layers().size(); i++)
                    {
                        uint64_t start = results[i * 2];
                        uint64_t end = results[i * 2 + 1];
                        if (start == 0 || end == 0)
                            continue;

                        double duration_us = (end - start) * d->net->vulkan_device()->info.timestamp_period() / 1000;
                        NCNN_LOGE("%-24s %-30s %8.2lfus    |", d->net->layers()[i]->type.c_str(), d->net->layers()[i]->name.c_str(), duration_us);
                    }
#endif // NCNN_BENCHMARK
                }
            }
            else
            {
                VkMat feat_gpu;
                ret = extract(blob_index, feat_gpu, cmd);

                if (ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
                {
                    cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);

                    ret = cmd.submit_and_wait();

#if NCNN_BENCHMARK
                    std::vector<uint64_t> results(d->net->layers().size() * 2);
                    cmd.get_query_pool_results(0, d->net->layers().size() * 2, results);
                    for (size_t i = 0; i < d->net->layers().size(); i++)
                    {
                        uint64_t start = results[i * 2];
                        uint64_t end = results[i * 2 + 1];
                        if (start == 0 || end == 0)
                            continue;

                        double duration_us = (end - start) * d->net->vulkan_device()->info.timestamp_period() / 1000;
                        NCNN_LOGE("%-24s %-30s %8.2lfus    |", d->net->layers()[i]->type.c_str(), d->net->layers()[i]->name.c_str(), duration_us);
                    }
#endif // NCNN_BENCHMARK
                }
            }
        }
        else
        {
            ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->opt);
        }
#else
        ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->opt);
#endif // NCNN_VULKAN
    }

    feat = d->blob_mats[blob_index];

    if (d->opt.use_packing_layout && (type == 0) && feat.elempack != 1)
    {
        Mat bottom_blob_unpacked;
        convert_packing(feat, bottom_blob_unpacked, 1, d->opt);
        feat = bottom_blob_unpacked;
    }

    // clang-format off
    // *INDENT-OFF*
#if NCNN_ARM82
    if (d->opt.use_fp16_storage && cpu_support_arm_asimdhp() && (type == 0))
    {
        if (feat.elembits() == 16)
        {
            Mat feat_fp32;
            cast_float16_to_float32(feat, feat_fp32, d->opt);
            feat = feat_fp32;
        }
    }
    else
#endif // NCNN_ARM82
#if NCNN_BF16
    if (d->opt.use_bf16_storage && (type == 0))
    {
        if (feat.elembits() == 16)
        {
            Mat feat_fp32;
            cast_bfloat16_to_float32(feat, feat_fp32, d->opt);
            feat = feat_fp32;
        }
    }
    else
#endif // NCNN_BF16
    if (feat.elembits() == 8 && (type == 0))
    {
        Mat feat_fp32;
        cast_int8_to_float32(feat, feat_fp32, d->opt);
        feat = feat_fp32;
    }
    // *INDENT-ON*
    // clang-format on

    if (d->opt.use_local_pool_allocator && feat.allocator == d->net->d->local_blob_allocator)
    {
        // detach the returned mat from local pool allocator
        // so we could destroy net instance much earlier
        feat = feat.clone();
    }

    set_kmp_blocktime(old_blocktime);
    set_flush_denormals(old_flush_denormals);

    return ret;
}
#if NCNN_VULKAN
#if NCNN_STRING
int Extractor::input(const char* blob_name, const VkMat& in)
{
    int blob_index = d->net->find_blob_index_by_name(blob_name);
    if (blob_index == -1)
    {
        NCNN_LOGE("Try");
        const std::vector<const char*>& input_names = d->net->input_names();
        for (size_t i = 0; i < input_names.size(); i++)
        {
            NCNN_LOGE("    ex.input(\"%s\", in%d);", input_names[i], (int)i);
        }

        return -1;
    }

    return input(blob_index, in);
}

int Extractor::extract(const char* blob_name, VkMat& feat, VkCompute& cmd)
{
    int blob_index = d->net->find_blob_index_by_name(blob_name);
    if (blob_index == -1)
    {
        NCNN_LOGE("Try");
        const std::vector<const char*>& output_names = d->net->output_names();
        for (size_t i = 0; i < output_names.size(); i++)
        {
            NCNN_LOGE("    ex.extract(\"%s\", out%d);", output_names[i], (int)i);
        }

        return -1;
    }

    return extract(blob_index, feat, cmd);
}

int Extractor::input(const char* blob_name, const VkImageMat& in)
{
    int blob_index = d->net->find_blob_index_by_name(blob_name);
    if (blob_index == -1)
    {
        NCNN_LOGE("Try");
        const std::vector<const char*>& input_names = d->net->input_names();
        for (size_t i = 0; i < input_names.size(); i++)
        {
            NCNN_LOGE("    ex.input(\"%s\", in%d);", input_names[i], (int)i);
        }

        return -1;
    }

    return input(blob_index, in);
}

int Extractor::extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd)
{
    int blob_index = d->net->find_blob_index_by_name(blob_name);
    if (blob_index == -1)
    {
        NCNN_LOGE("Try");
        const std::vector<const char*>& output_names = d->net->output_names();
        for (size_t i = 0; i < output_names.size(); i++)
        {
            NCNN_LOGE("    ex.extract(\"%s\", out%d);", output_names[i], (int)i);
        }

        return -1;
    }

    return extract(blob_index, feat, cmd);
}
#endif // NCNN_STRING

int Extractor::input(int blob_index, const VkMat& in)
{
    if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
        return -1;

    d->blob_mats_gpu[blob_index] = in;

    return 0;
}

int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd)
{
    if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
        return -1;

    int old_blocktime = get_kmp_blocktime();
    set_kmp_blocktime(d->opt.openmp_blocktime);

    int old_flush_denormals = get_flush_denormals();
    set_flush_denormals(d->opt.flush_denormals);

    int ret = 0;

    if (d->blob_mats_gpu[blob_index].dims == 0)
    {
        if (d->blob_mats_gpu_image[blob_index].dims != 0)
        {
            // image to buffer
            cmd.record_image_to_buffer(d->blob_mats_gpu_image[blob_index], d->blob_mats_gpu[blob_index], d->opt);
        }
        else if (d->blob_mats[blob_index].dims != 0)
        {
            // host to buffer
            cmd.record_upload(d->blob_mats[blob_index], d->blob_mats_gpu[blob_index], d->opt);
        }
        else
        {
            int layer_index = d->net->blobs()[blob_index].producer;
            ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->blob_mats_gpu, cmd, d->opt);
        }
    }

    feat = d->blob_mats_gpu[blob_index];

    set_kmp_blocktime(old_blocktime);
    set_flush_denormals(old_flush_denormals);

    return ret;
}

int Extractor::input(int blob_index, const VkImageMat& in)
{
    if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
        return -1;

    d->blob_mats_gpu_image[blob_index] = in;

    return 0;
}
int Extractor::extract(int blob_index, VkImageMat& feat, VkCompute& cmd)
{
    if (blob_index < 0 || blob_index >= (int)d->blob_mats.size())
        return -1;

    int old_blocktime = get_kmp_blocktime();
    set_kmp_blocktime(d->opt.openmp_blocktime);

    int old_flush_denormals = get_flush_denormals();
    set_flush_denormals(d->opt.flush_denormals);

    int ret = 0;

    if (d->blob_mats_gpu_image[blob_index].dims == 0)
    {
        if (d->blob_mats_gpu[blob_index].dims != 0)
        {
            // buffer to image
            cmd.record_buffer_to_image(d->blob_mats_gpu[blob_index], d->blob_mats_gpu_image[blob_index], d->opt);
        }
        else if (d->blob_mats[blob_index].dims != 0)
        {
            // host to image
            cmd.record_upload(d->blob_mats[blob_index], d->blob_mats_gpu_image[blob_index], d->opt);
        }
        else
        {
            int layer_index = d->net->blobs()[blob_index].producer;
            ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->blob_mats_gpu, d->blob_mats_gpu_image, cmd, d->opt);
        }
    }

    feat = d->blob_mats_gpu_image[blob_index];

    if (feat.empty())
    {
        NCNN_LOGE("extract %d image allocation failed", blob_index);
        ret = -100;
    }

    set_kmp_blocktime(old_blocktime);
    set_flush_denormals(old_flush_denormals);

    return ret;
}
#endif // NCNN_VULKAN

} // namespace ncnn