// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "command.h" #if NCNN_VULKAN #include "option.h" #include "pipeline.h" namespace ncnn { class VkComputePrivate { public: VkComputePrivate(const VulkanDevice* _vkdev); ~VkComputePrivate(); int init(); int begin_command_buffer(); int end_command_buffer(); const VulkanDevice* vkdev; VkCommandPool compute_command_pool; VkCommandBuffer compute_command_buffer; VkFence compute_command_fence; std::vector upload_staging_buffers; std::vector download_post_buffers; std::vector download_post_mats_fp16; std::vector download_post_mats; std::vector image_blocks_to_destroy; // the good-old path for device without VK_KHR_push_descriptor std::vector descriptor_pools; std::vector descriptorsets; struct record { enum { TYPE_copy_buffer, TYPE_copy_image, TYPE_copy_buffer_to_image, TYPE_copy_image_to_buffer, TYPE_bind_pipeline, TYPE_bind_descriptorsets, TYPE_push_constants, TYPE_dispatch, TYPE_memory_barrers, TYPE_buffer_barrers, TYPE_image_barrers, #if NCNN_BENCHMARK TYPE_write_timestamp, #endif // NCNN_BENCHMARK TYPE_post_download, TYPE_post_cast_float16_to_float32, }; int type; VkCommandBuffer command_buffer; union { struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer; struct { VkImage src; VkImageLayout src_layout; VkImage dst; 
VkImageLayout dst_layout; uint32_t region_count; const VkImageCopy* regions; } copy_image; struct { VkBuffer src; VkImage dst; VkImageLayout layout; uint32_t region_count; const VkBufferImageCopy* regions; } copy_buffer_to_image; struct { VkImage src; VkImageLayout layout; VkBuffer dst; uint32_t region_count; const VkBufferImageCopy* regions; } copy_image_to_buffer; struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline; struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets; struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants; struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch; struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers; struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers; struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers; #if NCNN_BENCHMARK struct { uint32_t query; } write_timestamp; #endif // NCNN_BENCHMARK struct { uint32_t download_post_buffer_mat_offset; uint32_t download_post_mat_fp16_offset; } post_download; struct { uint32_t download_post_mat_fp16_offset; uint32_t download_post_mat_offset; int num_threads; } post_cast_float16_to_float32; }; }; std::vector delayed_records; #if NCNN_BENCHMARK uint32_t query_count; VkQueryPool query_pool; #endif // NCNN_BENCHMARK }; VkComputePrivate::VkComputePrivate(const VulkanDevice* _vkdev) : vkdev(_vkdev) { compute_command_pool = 0; compute_command_buffer = 0; compute_command_fence = 0; #if NCNN_BENCHMARK query_count = 0; query_pool = 0; #endif // NCNN_BENCHMARK init(); } 
VkComputePrivate::~VkComputePrivate() { for (size_t i = 0; i < image_blocks_to_destroy.size(); i++) { VkImageMemory* ptr = image_blocks_to_destroy[i]; int old_command_refcount = NCNN_XADD(&ptr->command_refcount, -1); if (ptr->refcount == 0 && old_command_refcount == 1) { // no userspace reference and we are the last command reference vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); delete ptr; } else { // reference exists in user code or other command } } image_blocks_to_destroy.clear(); if (!vkdev->info.support_VK_KHR_push_descriptor()) { for (size_t i = 0; i < descriptorsets.size(); i++) { vkFreeDescriptorSets(vkdev->vkdevice(), descriptor_pools[i], 1, &descriptorsets[i]); vkDestroyDescriptorPool(vkdev->vkdevice(), descriptor_pools[i], 0); } } #if NCNN_BENCHMARK if (query_pool) { // all submitted commands that refer to queryPool must have completed execution vkResetCommandBuffer(compute_command_buffer, 0); vkDestroyQueryPool(vkdev->vkdevice(), query_pool, 0); } #endif // NCNN_BENCHMARK vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0); vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer); vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0); } int VkComputePrivate::init() { // compute_command_pool { VkCommandPoolCreateInfo commandPoolCreateInfo; commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; commandPoolCreateInfo.pNext = 0; commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index(); VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateCommandPool failed %d", ret); return -1; } } // compute_command_buffer { VkCommandBufferAllocateInfo commandBufferAllocateInfo; commandBufferAllocateInfo.sType = 
VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; commandBufferAllocateInfo.pNext = 0; commandBufferAllocateInfo.commandPool = compute_command_pool; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; commandBufferAllocateInfo.commandBufferCount = 1; VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer); if (ret != VK_SUCCESS) { NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret); return -1; } } // compute_command_fence { VkFenceCreateInfo fenceCreateInfo; fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; fenceCreateInfo.pNext = 0; fenceCreateInfo.flags = 0; VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateFence failed %d", ret); return -1; } } if (vkdev->info.support_VK_KHR_push_descriptor()) { begin_command_buffer(); #if NCNN_BENCHMARK if (query_pool) vkCmdResetQueryPool(compute_command_buffer, query_pool, 0, query_count); #endif // NCNN_BENCHMARK } return 0; } int VkComputePrivate::begin_command_buffer() { VkCommandBufferBeginInfo commandBufferBeginInfo; commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; commandBufferBeginInfo.pNext = 0; commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; commandBufferBeginInfo.pInheritanceInfo = 0; VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); if (ret != VK_SUCCESS) { NCNN_LOGE("vkBeginCommandBuffer failed %d", ret); return -1; } return 0; } int VkComputePrivate::end_command_buffer() { VkResult ret = vkEndCommandBuffer(compute_command_buffer); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEndCommandBuffer failed %d", ret); return -1; } return 0; } VkCompute::VkCompute(const VulkanDevice* _vkdev) : vkdev(_vkdev), d(new VkComputePrivate(_vkdev)) { } VkCompute::~VkCompute() { delete d; } void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt) { // 
NCNN_LOGE("record_upload buffer"); Mat src_fp16; if (src.elemsize == src.elempack * 4u) { // cpu cast to fp16 (discrete gpu) if (vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0))) { ncnn::cast_float32_to_float16(src, src_fp16, opt); } else { src_fp16 = src; } } else { src_fp16 = src; } // upload VkMat dst_staging; dst_staging.create_like(src_fp16, opt.staging_vkallocator); if (dst_staging.empty()) return; // stash staging d->upload_staging_buffers.push_back(dst_staging); // NCNN_LOGE("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); // memcpy src to device memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize); dst_staging.allocator->flush(dst_staging.data); // mark device host-write @ null dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; // resolve dst_elempack int dims = src_fp16.dims; int elemcount = 0; if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w; if (dims == 2) elemcount = src_fp16.elempack * src_fp16.h; if (dims == 3 || dims == 4) elemcount = src_fp16.elempack * src_fp16.c; int dst_elempack = 1; if (opt.use_shader_pack8) dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; else dst_elempack = elemcount % 4 == 0 ? 
4 : 1; // gpu cast to fp16 on the fly (integrated gpu) vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); } void VkCompute::record_upload(const Mat& src, VkImageMat& dst, const Option& opt) { // NCNN_LOGE("record_upload image"); Mat src_fp16; if (src.elemsize == src.elempack * 4u) { // cpu cast to fp16 (discrete gpu) if (vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0))) { ncnn::cast_float32_to_float16(src, src_fp16, opt); } else { src_fp16 = src; } } else { src_fp16 = src; } // upload VkMat dst_staging; dst_staging.create_like(src_fp16, opt.staging_vkallocator); if (dst_staging.empty()) return; // stash staging d->upload_staging_buffers.push_back(dst_staging); // memcpy src to device memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize); dst_staging.allocator->flush(dst_staging.data); // mark device host-write @ null dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; // resolve dst_elempack int dims = src_fp16.dims; int elemcount = 0; if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w; if (dims == 2) elemcount = src_fp16.elempack * src_fp16.h; if (dims == 3 || dims == 4) elemcount = src_fp16.elempack * src_fp16.c; int dst_elempack = 1; if (opt.use_shader_pack8) dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; else dst_elempack = elemcount % 4 == 0 ? 
4 : 1; // gpu cast to fp16 on the fly (integrated gpu) if (vkdev->info.bug_buffer_image_load_zero()) { // clone buffer to bridge image VkImageMat dst_image; record_clone(dst_staging, dst_image, opt); if (dst_image.empty()) return; vkdev->convert_packing(dst_image, dst, dst_elempack, *this, opt); // image and imageview can not be destroyed until command execution ends NCNN_XADD(&dst_image.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(dst_image.data); submit_and_wait(); reset(); } else { vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); } } void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) { // NCNN_LOGE("record_download buffer"); // resolve dst_elempack int dims = src.dims; int elemcount = 0; if (dims == 1) elemcount = src.elempack * src.w; if (dims == 2) elemcount = src.elempack * src.h; if (dims == 3 || dims == 4) elemcount = src.elempack * src.c; int dst_elempack = 1; if (opt.use_packing_layout) dst_elempack = elemcount % 4 == 0 ? 
4 : 1; else dst_elempack = 1; // gpu cast to fp32 on the fly (integrated gpu) Option opt_staging = opt; if (vkdev->info.type() != 0) { opt_staging.use_fp16_packed = false; opt_staging.use_fp16_storage = false; } if (!opt_staging.blob_vkallocator->mappable) { opt_staging.blob_vkallocator = opt.staging_vkallocator; } VkMat dst_staging; vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging); // barrier device any @ compute to host-read @ compute if (dst_staging.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || dst_staging.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT) { VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barriers[0].pNext = 0; barriers[0].srcAccessMask = dst_staging.data->access_flags; barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = dst_staging.buffer(); barriers[0].offset = dst_staging.buffer_offset(); barriers[0].size = dst_staging.buffer_capacity(); VkPipelineStageFlags src_stage = dst_staging.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); delete[] barriers; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_buffer_barrers; r.command_buffer = d->compute_command_buffer; r.buffer_barrers.src_stage = src_stage; r.buffer_barrers.dst_stage = dst_stage; r.buffer_barrers.barrier_count = 1; r.buffer_barrers.barriers = barriers; d->delayed_records.push_back(r); } // mark device host-read @ any dst_staging.data->access_flags = VK_ACCESS_HOST_READ_BIT; dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; } // create dst Mat dst_fp16; dst_fp16.create_like(dst_staging, opt.blob_allocator); if (dst_fp16.empty()) 
return; // download d->download_post_buffers.push_back(dst_staging); d->download_post_mats_fp16.push_back(dst_fp16); // post memcpy device to dst { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_post_download; r.command_buffer = 0; r.post_download.download_post_buffer_mat_offset = d->download_post_buffers.size() - 1; r.post_download.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1; d->delayed_records.push_back(r); } // cast to fp32 (discrete gpu) if (dst_fp16.elemsize == dst_fp16.elempack * 2u) { if (vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && dst_fp16.elempack % 4 == 0))) { int dims = dst_fp16.dims; if (dims == 1) dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); if (dims == 2) dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); if (dims == 3) dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); if (dims == 4) dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); d->download_post_mats.push_back(dst); VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32; r.command_buffer = 0; r.post_cast_float16_to_float32.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1; r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; r.post_cast_float16_to_float32.num_threads = opt.num_threads; d->delayed_records.push_back(r); } else { dst = dst_fp16; } } else { dst = dst_fp16; } } void VkCompute::record_download(const VkImageMat& src, Mat& dst, const Option& opt) { // NCNN_LOGE("record_download image"); // resolve dst_elempack int dims = src.dims; int elemcount = 0; if (dims == 1) elemcount = src.elempack * src.w; if (dims == 2) elemcount = src.elempack * src.h; if 
(dims == 3 || dims == 4) elemcount = src.elempack * src.c; int dst_elempack = 1; if (opt.use_packing_layout) dst_elempack = elemcount % 4 == 0 ? 4 : 1; else dst_elempack = 1; // gpu cast to fp32 on the fly (integrated gpu) Option opt_staging = opt; if (vkdev->info.type() != 0) { opt_staging.use_fp16_packed = false; opt_staging.use_fp16_storage = false; } if (!opt_staging.blob_vkallocator->mappable) { opt_staging.blob_vkallocator = opt.staging_vkallocator; } VkMat dst_staging; if (vkdev->info.bug_buffer_image_load_zero()) { VkImageMat src_image; vkdev->convert_packing(src, src_image, dst_elempack, *this, opt); if (src_image.empty()) return; record_clone(src_image, dst_staging, opt_staging); // image and imageview can not be destroyed until command execution ends NCNN_XADD(&src_image.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(src_image.data); } else { vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging); } // image and imageview can not be destroyed until command execution ends NCNN_XADD(&src.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(src.data); // barrier device any @ compute to host-read @ compute if (dst_staging.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || dst_staging.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT) { VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barriers[0].pNext = 0; barriers[0].srcAccessMask = dst_staging.data->access_flags; barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = dst_staging.buffer(); barriers[0].offset = dst_staging.buffer_offset(); barriers[0].size = dst_staging.buffer_capacity(); VkPipelineStageFlags src_stage = dst_staging.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; if 
(vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); delete[] barriers; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_buffer_barrers; r.command_buffer = d->compute_command_buffer; r.buffer_barrers.src_stage = src_stage; r.buffer_barrers.dst_stage = dst_stage; r.buffer_barrers.barrier_count = 1; r.buffer_barrers.barriers = barriers; d->delayed_records.push_back(r); } // mark device host-read @ any dst_staging.data->access_flags = VK_ACCESS_HOST_READ_BIT; dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; } // create dst Mat dst_fp16; dst_fp16.create_like(dst_staging, opt.blob_allocator); if (dst_fp16.empty()) return; // download d->download_post_buffers.push_back(dst_staging); d->download_post_mats_fp16.push_back(dst_fp16); // post memcpy device to dst { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_post_download; r.command_buffer = 0; r.post_download.download_post_buffer_mat_offset = d->download_post_buffers.size() - 1; r.post_download.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1; d->delayed_records.push_back(r); } // cast to fp32 (discrete gpu) if (dst_fp16.elemsize == dst_fp16.elempack * 2u) { if (vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && dst_fp16.elempack % 4 == 0))) { int dims = dst_fp16.dims; if (dims == 1) dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); if (dims == 2) dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); if (dims == 3) dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); if (dims == 4) dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); d->download_post_mats.push_back(dst); 
VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32; r.command_buffer = 0; r.post_cast_float16_to_float32.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1; r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; r.post_cast_float16_to_float32.num_threads = opt.num_threads; d->delayed_records.push_back(r); } else { dst = dst_fp16; } } else { dst = dst_fp16; } } void VkCompute::record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt) { // NCNN_LOGE("record_buffer_to_image"); // resolve dst_elempack int dims = src.dims; int elemcount = 0; if (dims == 1) elemcount = src.elempack * src.w; if (dims == 2) elemcount = src.elempack * src.h; if (dims == 3 || dims == 4) elemcount = src.elempack * src.c; int dst_elempack = 1; if (opt.use_shader_pack8) dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; else dst_elempack = elemcount % 4 == 0 ? 4 : 1; if (vkdev->info.bug_buffer_image_load_zero()) { // clone buffer to bridge image VkImageMat src_image; record_clone(src, src_image, opt); if (src_image.empty()) return; vkdev->convert_packing(src_image, dst, dst_elempack, *this, opt); // image and imageview can not be destroyed until command execution ends NCNN_XADD(&src_image.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(src_image.data); } else { vkdev->convert_packing(src, dst, dst_elempack, *this, opt); } } void VkCompute::record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_image_to_buffer"); // resolve dst_elempack int dims = src.dims; int elemcount = 0; if (dims == 1) elemcount = src.elempack * src.w; if (dims == 2) elemcount = src.elempack * src.h; if (dims == 3 || dims == 4) elemcount = src.elempack * src.c; int dst_elempack = 1; if (opt.use_shader_pack8) dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; else dst_elempack = elemcount % 4 == 0 ? 
4 : 1; if (vkdev->info.bug_buffer_image_load_zero()) { VkImageMat src_image; Option opt_image = opt; opt_image.blob_vkallocator = src.allocator; vkdev->convert_packing(src, src_image, dst_elempack, *this, opt_image); if (src_image.empty()) return; record_clone(src_image, dst, opt); // image and imageview can not be destroyed until command execution ends NCNN_XADD(&src_image.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(src_image.data); } else { vkdev->convert_packing(src, dst, dst_elempack, *this, opt); } // image and imageview can not be destroyed until command execution ends NCNN_XADD(&src.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(src.data); } void VkCompute::record_clone(const Mat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_clone host to buffer"); // host to staging VkMat dst_staging; dst_staging.create_like(src, opt.staging_vkallocator); if (dst_staging.empty()) return; // memcpy src to device memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize); dst_staging.allocator->flush(dst_staging.data); // mark device host-write @ null dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; // staging to device record_clone(dst_staging, dst, opt); // stash staging d->upload_staging_buffers.push_back(dst_staging); } void VkCompute::record_clone(const Mat& src, VkImageMat& dst, const Option& opt) { // NCNN_LOGE("record_clone host to image"); // host to staging VkMat dst_staging; Option opt_staging = opt; opt_staging.blob_vkallocator = opt.staging_vkallocator; record_clone(src, dst_staging, opt_staging); // staging to image record_clone(dst_staging, dst, opt); // stash staging d->upload_staging_buffers.push_back(dst_staging); } void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt) { // NCNN_LOGE("record_clone buffer to host"); if (!src.allocator->mappable) { // device to staging VkMat src_staging; Option opt_staging 
= opt; opt_staging.blob_vkallocator = opt.staging_vkallocator; record_clone(src, src_staging, opt_staging); // staging to host record_clone(src_staging, dst, opt); return; } // create dst dst.create_like(src, opt.blob_allocator); if (dst.empty()) return; // barrier device any @ compute to host-read @ compute if (src.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT) { VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barriers[0].pNext = 0; barriers[0].srcAccessMask = src.data->access_flags; barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = src.buffer(); barriers[0].offset = src.buffer_offset(); barriers[0].size = src.buffer_capacity(); VkPipelineStageFlags src_stage = src.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); delete[] barriers; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_buffer_barrers; r.command_buffer = d->compute_command_buffer; r.buffer_barrers.src_stage = src_stage; r.buffer_barrers.dst_stage = dst_stage; r.buffer_barrers.barrier_count = 1; r.buffer_barrers.barriers = barriers; d->delayed_records.push_back(r); } // mark device host-read @ any src.data->access_flags = VK_ACCESS_HOST_READ_BIT; src.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; } // stash download post buffer and mat d->download_post_buffers.push_back(src); d->download_post_mats_fp16.push_back(dst); // post memcpy device to dst { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_post_download; r.command_buffer = 0; r.post_download.download_post_buffer_mat_offset = 
d->download_post_buffers.size() - 1; r.post_download.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1; d->delayed_records.push_back(r); } } void VkCompute::record_clone(const VkImageMat& src, Mat& dst, const Option& opt) { // NCNN_LOGE("record_clone image to host"); // image to staging VkMat src_staging; Option opt_staging = opt; opt_staging.blob_vkallocator = opt.staging_vkallocator; record_clone(src, src_staging, opt_staging); // staging to host record_clone(src_staging, dst, opt); } void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_clone buffer to buffer"); // create dst dst.create_like(src, opt.blob_vkallocator); if (dst.empty()) return; if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) { // barrier device any @ compute to transfer-read @ compute VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barriers[0].pNext = 0; barriers[0].srcAccessMask = src.data->access_flags; barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = src.buffer(); barriers[0].offset = src.buffer_offset(); barriers[0].size = src.buffer_capacity(); VkPipelineStageFlags src_stage = src.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); delete[] barriers; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_buffer_barrers; r.command_buffer = d->compute_command_buffer; r.buffer_barrers.src_stage = src_stage; r.buffer_barrers.dst_stage = dst_stage; r.buffer_barrers.barrier_count = 1; r.buffer_barrers.barriers = barriers; 
d->delayed_records.push_back(r); } // mark device transfer-read @ transfer src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; } { // barrier device any @ null to transfer-write @ compute // mark device transfer-write @ transfer dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; } // record device to staging { VkBufferCopy* regions = new VkBufferCopy[1]; regions[0].srcOffset = src.buffer_offset(); regions[0].dstOffset = dst.buffer_offset(); regions[0].size = std::min(src.buffer_capacity(), dst.buffer_capacity()); if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdCopyBuffer(d->compute_command_buffer, src.buffer(), dst.buffer(), 1, regions); delete[] regions; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_copy_buffer; r.command_buffer = d->compute_command_buffer; r.copy_buffer.src = src.buffer(); r.copy_buffer.dst = dst.buffer(); r.copy_buffer.region_count = 1; r.copy_buffer.regions = regions; d->delayed_records.push_back(r); } } } void VkCompute::record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt) { // NCNN_LOGE("record_clone image to image"); // create dst dst.create_like(src, opt.blob_vkallocator); if (dst.empty()) return; // image layout transform any @ any to transfer-src-optimal @ compute if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->image_layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) { VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barriers[0].pNext = 0; barriers[0].srcAccessMask = src.data->access_flags; barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; barriers[0].oldLayout = src.data->image_layout; barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; 
barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].image = src.image(); barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; barriers[0].subresourceRange.baseMipLevel = 0; barriers[0].subresourceRange.levelCount = 1; barriers[0].subresourceRange.baseArrayLayer = 0; barriers[0].subresourceRange.layerCount = 1; VkPipelineStageFlags src_stage = src.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); delete[] barriers; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_image_barrers; r.command_buffer = d->compute_command_buffer; r.image_barrers.src_stage = src_stage; r.image_barrers.dst_stage = dst_stage; r.image_barrers.barrier_count = 1; r.image_barrers.barriers = barriers; d->delayed_records.push_back(r); } // mark image transfer-src-optimal @ compute src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; src.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; } // image layout transform undefined @ null to transfer-dst-optimal @ compute { VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barriers[0].pNext = 0; barriers[0].srcAccessMask = 0; barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].image = dst.image(); barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; barriers[0].subresourceRange.baseMipLevel = 0; barriers[0].subresourceRange.levelCount = 1; barriers[0].subresourceRange.baseArrayLayer = 0; 
barriers[0].subresourceRange.layerCount = 1; VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); delete[] barriers; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_image_barrers; r.command_buffer = d->compute_command_buffer; r.image_barrers.src_stage = src_stage; r.image_barrers.dst_stage = dst_stage; r.image_barrers.barrier_count = 1; r.image_barrers.barriers = barriers; d->delayed_records.push_back(r); } // mark image transfer-dst-optimal @ compute dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; dst.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; } // record device to staging { VkImageCopy* regions = new VkImageCopy[1]; regions[0].srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; regions[0].srcSubresource.mipLevel = 0; regions[0].srcSubresource.baseArrayLayer = 0; regions[0].srcSubresource.layerCount = 1; regions[0].srcOffset.x = 0; regions[0].srcOffset.y = 0; regions[0].srcOffset.z = 0; regions[0].dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; regions[0].dstSubresource.mipLevel = 0; regions[0].dstSubresource.baseArrayLayer = 0; regions[0].dstSubresource.layerCount = 1; regions[0].dstOffset.x = 0; regions[0].dstOffset.y = 0; regions[0].dstOffset.z = 0; regions[0].extent.width = src.data->width; regions[0].extent.height = src.data->height; regions[0].extent.depth = src.data->depth; if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdCopyImage(d->compute_command_buffer, src.image(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, regions); delete[] regions; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_copy_image; r.command_buffer = 
d->compute_command_buffer;
            r.copy_image.src = src.image();
            r.copy_image.src_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
            r.copy_image.dst = dst.image();
            r.copy_image.dst_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
            r.copy_image.region_count = 1;
            // ownership of the regions array passes to the delayed record
            r.copy_image.regions = regions;
            d->delayed_records.push_back(r);
        }
    }

    // image and imageview can not be destroyed until command execution ends
    NCNN_XADD(&src.data->command_refcount, 1);
    NCNN_XADD(&dst.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(src.data);
    d->image_blocks_to_destroy.push_back(dst.data);
}

// Record a clone of a device buffer into a device image.
// Creates dst like src, inserts the barriers needed to make src readable and
// dst writable by the transfer stage, then records vkCmdCopyBufferToImage
// (immediately with VK_KHR_push_descriptor, otherwise as a delayed record).
void VkCompute::record_clone(const VkMat& src, VkImageMat& dst, const Option& opt)
{
    // NCNN_LOGE("record_clone buffer to image");

    // create dst
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // barrier device any @ any to transfer-read @ compute
    if (src.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = src.buffer();
        barriers[0].offset = src.buffer_offset();
        barriers[0].size = src.buffer_capacity();

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // record immediately and free the barrier array here
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // delayed path: the record takes ownership of the barrier array
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark device transfer-read @ compute
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // image layout transform undefined @ null to transfer-dst-optimal @ compute
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = dst.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image transfer-dst-optimal @ compute
        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // record device to image
    {
        int region_count;
        VkBufferImageCopy* regions;

        // fast path: a single 3D copy when the per-slice byte size is 16-byte aligned,
        // otherwise one depth-slice copy per channel to honor cstep padding
        if (dst.elemsize * dst.w * dst.h % 16 == 0)
        {
            region_count = 1;
            regions = new VkBufferImageCopy[1];
            regions[0].bufferOffset = src.buffer_offset();
            regions[0].bufferRowLength = 0;
            regions[0].bufferImageHeight = 0;
            regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
            regions[0].imageSubresource.mipLevel = 0;
            regions[0].imageSubresource.baseArrayLayer = 0;
            regions[0].imageSubresource.layerCount = 1;
            regions[0].imageOffset.x = 0;
            regions[0].imageOffset.y = 0;
            regions[0].imageOffset.z = 0;
            regions[0].imageExtent.width = dst.data->width;
            regions[0].imageExtent.height = dst.data->height;
            regions[0].imageExtent.depth = dst.data->depth;
        }
        else
        {
            region_count = dst.c;
            regions = new VkBufferImageCopy[region_count];
            for (int i = 0; i < region_count; i++)
            {
                regions[i].bufferOffset = src.buffer_offset() + src.cstep * src.elemsize * i;
                regions[i].bufferRowLength = 0;
                regions[i].bufferImageHeight = 0;
                regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
                regions[i].imageSubresource.mipLevel = 0;
                regions[i].imageSubresource.baseArrayLayer = 0;
                regions[i].imageSubresource.layerCount = 1;
                regions[i].imageOffset.x = 0;
                regions[i].imageOffset.y = 0;
                regions[i].imageOffset.z = i;
                regions[i].imageExtent.width = dst.data->width;
                regions[i].imageExtent.height = dst.data->height;
                regions[i].imageExtent.depth = 1;
            }
        }

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdCopyBufferToImage(d->compute_command_buffer, src.buffer(), dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, region_count, regions);
            delete[] regions;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_buffer_to_image;
            r.command_buffer = d->compute_command_buffer;
            r.copy_buffer_to_image.src = src.buffer();
            r.copy_buffer_to_image.dst = dst.image();
            r.copy_buffer_to_image.layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
            r.copy_buffer_to_image.region_count = region_count;
            r.copy_buffer_to_image.regions = regions;
            d->delayed_records.push_back(r);
        }
    }

    // image and imageview can not be destroyed until command execution ends
    NCNN_XADD(&dst.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(dst.data);
}

// Record a clone of a device image into a device buffer.
// Mirror of the buffer-to-image path above: transition src to
// transfer-src-optimal, mark dst transfer-write, then record
// vkCmdCopyImageToBuffer (immediately or as a delayed record).
void VkCompute::record_clone(const VkImageMat& src, VkMat& dst, const Option& opt)
{
    // NCNN_LOGE("record_clone image to buffer");

    // create dst
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // image layout transform any @ any to transfer-src-optimal @ compute
    if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->image_layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT)
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].oldLayout = src.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image transfer-src-optimal @ compute
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    {
        // barrier device any @ null to transfer-write @ compute
        // freshly created dst needs no actual barrier, only state bookkeeping

        // mark device transfer-write @ transfer
        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // record image to device
    {
        int region_count;
        VkBufferImageCopy* regions;

        // same alignment split as the upload path, keyed on src geometry
        if (src.elemsize * src.w * src.h % 16 == 0)
        {
            region_count = 1;
            regions = new VkBufferImageCopy[1];
            regions[0].bufferOffset = dst.buffer_offset();
            regions[0].bufferRowLength = 0;
            regions[0].bufferImageHeight = 0;
            regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
            regions[0].imageSubresource.mipLevel = 0;
            regions[0].imageSubresource.baseArrayLayer = 0;
            regions[0].imageSubresource.layerCount = 1;
            regions[0].imageOffset.x = 0;
            regions[0].imageOffset.y = 0;
            regions[0].imageOffset.z = 0;
            regions[0].imageExtent.width = src.data->width;
            regions[0].imageExtent.height = src.data->height;
            regions[0].imageExtent.depth = src.data->depth;
        }
        else
        {
            region_count = src.c;
            regions = new VkBufferImageCopy[region_count];
            for (int i = 0; i < region_count; i++)
            {
                regions[i].bufferOffset = dst.buffer_offset() + dst.cstep * dst.elemsize * i;
                regions[i].bufferRowLength = 0;
                regions[i].bufferImageHeight = 0;
                regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
                regions[i].imageSubresource.mipLevel = 0;
                regions[i].imageSubresource.baseArrayLayer = 0;
                regions[i].imageSubresource.layerCount = 1;
                regions[i].imageOffset.x = 0;
                regions[i].imageOffset.y = 0;
                regions[i].imageOffset.z = i;
                regions[i].imageExtent.width = src.data->width;
                regions[i].imageExtent.height = src.data->height;
                regions[i].imageExtent.depth = 1;
            }
        }

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
vkCmdCopyImageToBuffer(d->compute_command_buffer, src.image(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst.buffer(), region_count, regions); delete[] regions; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_copy_image_to_buffer; r.command_buffer = d->compute_command_buffer; r.copy_image_to_buffer.src = src.image(); r.copy_image_to_buffer.layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; r.copy_image_to_buffer.dst = dst.buffer(); r.copy_image_to_buffer.region_count = region_count; r.copy_image_to_buffer.regions = regions; d->delayed_records.push_back(r); } } // image and imageview can not be destroyed until command execution ends NCNN_XADD(&src.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(src.data); } void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkMat& dispatcher) { record_pipeline(pipeline, bindings, std::vector(), constants, dispatcher); } void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkImageMat& dispatcher) { record_pipeline(pipeline, std::vector(), bindings, constants, dispatcher); } void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkMat& dispatcher) { Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.d, dispatcher.c, (void*)0); record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher_mat); } void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkImageMat& dispatcher) { Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.d, dispatcher.c, (void*)0); record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher_mat); } void VkCompute::record_pipeline(const Pipeline* pipeline, const 
std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const Mat& dispatcher) { // NCNN_LOGE("record_pipeline %p", pipeline); const int buffer_binding_count = (int)buffer_bindings.size(); const int image_binding_count = (int)image_bindings.size(); const int constant_count = (int)constants.size(); const int binding_count = buffer_binding_count + image_binding_count; const ShaderInfo& shader_info = pipeline->shader_info(); if (binding_count != shader_info.binding_count) { NCNN_LOGE("binding_count not match, expect %d but got %d + %d", shader_info.binding_count, buffer_binding_count, image_binding_count); } if (constant_count != shader_info.push_constant_count) { NCNN_LOGE("push_constant_count not match, expect %d but got %d", shader_info.push_constant_count, constant_count); } int buffer_index = 0; int image_index = 0; for (int i = 0; i < binding_count; i++) { int binding_type = shader_info.binding_types[i]; if (binding_type == 1) { const VkMat& binding = buffer_bindings[buffer_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[buffer_index]; buffer_index++; // NCNN_LOGE("binding #%d buffer = %d %d %d %d @ %lu %d = %p +%ld ~%ld", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.buffer(), binding.buffer_offset(), binding.buffer_capacity()); barrier_readwrite(binding); } else if (binding_type == 2) { const VkImageMat& binding = image_bindings[image_index].empty() ? 
vkdev->get_dummy_image() : image_bindings[image_index]; image_index++; // NCNN_LOGE("binding #%d image = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); barrier_readwrite(binding); // image and imageview can not be destroyed until command execution ends NCNN_XADD(&binding.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(binding.data); } else // if (binding_type == 3) { const VkImageMat& binding = image_bindings[image_index].empty() ? vkdev->get_dummy_image_readonly() : image_bindings[image_index]; image_index++; // NCNN_LOGE("binding #%d sampler = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); // if the same image used for both storage image and combined image sampler // only apply image layout transition to general bool image_read_write = false; for (int j = 0; j < image_binding_count; j++) { if (shader_info.binding_types[j] == 2 && binding.data == image_bindings[j].data) { // the same image is used as storage image, skip it image_read_write = true; break; } } if (image_read_write) continue; barrier_readonly(binding); // image and imageview can not be destroyed until command execution ends NCNN_XADD(&binding.data->command_refcount, 1); d->image_blocks_to_destroy.push_back(binding.data); } } // record bind pipeline { if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline()); } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_bind_pipeline; r.command_buffer = d->compute_command_buffer; r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; r.bind_pipeline.pipeline = 
pipeline->pipeline(); d->delayed_records.push_back(r); } } // record update bindings if (binding_count > 0) { std::vector descriptorInfos; { descriptorInfos.resize(sizeof(VkDescriptorBufferInfo) * buffer_binding_count + sizeof(VkDescriptorImageInfo) * image_binding_count); unsigned char* p_descriptorInfos = descriptorInfos.data(); int descriptorBufferInfo_index = 0; int descriptorImageInfo_index = 0; for (int i = 0; i < binding_count; i++) { int binding_type = shader_info.binding_types[i]; if (binding_type == 1) { const VkMat& binding = buffer_bindings[descriptorBufferInfo_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[descriptorBufferInfo_index]; descriptorBufferInfo_index++; VkDescriptorBufferInfo descriptorBufferInfo; descriptorBufferInfo.buffer = binding.buffer(); descriptorBufferInfo.offset = binding.buffer_offset(); descriptorBufferInfo.range = binding.total() * binding.elemsize; memcpy(p_descriptorInfos, &descriptorBufferInfo, sizeof(VkDescriptorBufferInfo)); p_descriptorInfos += sizeof(VkDescriptorBufferInfo); } else //if (binding_type == 2 || binding_type == 3) { const VkImageMat& binding = image_bindings[descriptorImageInfo_index].empty() ? 
vkdev->get_dummy_image() : image_bindings[descriptorImageInfo_index]; descriptorImageInfo_index++; // we always use immutable nearest sampler set in descroptor layout during pipeline creation VkDescriptorImageInfo descriptorImageInfo; descriptorImageInfo.sampler = 0; descriptorImageInfo.imageView = binding.imageview(); descriptorImageInfo.imageLayout = binding.data->image_layout; memcpy(p_descriptorInfos, &descriptorImageInfo, sizeof(VkDescriptorImageInfo)); p_descriptorInfos += sizeof(VkDescriptorImageInfo); } } } if (vkdev->info.support_VK_KHR_push_descriptor()) { vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, descriptorInfos.data()); } else { // create new descriptor_pool and descriptorset VkDescriptorPool descriptor_pool; { int image_binding_count = 0; int sampler_binding_count = 0; for (int i = 0; i < binding_count; i++) { int binding_type = shader_info.binding_types[i]; if (binding_type == 2) image_binding_count++; else // if (binding_type == 3) sampler_binding_count++; } VkDescriptorPoolSize poolSizes[3]; poolSizes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; poolSizes[0].descriptorCount = buffer_binding_count; poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; poolSizes[1].descriptorCount = image_binding_count; poolSizes[2].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; poolSizes[2].descriptorCount = sampler_binding_count; VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; descriptorPoolCreateInfo.pNext = 0; descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; descriptorPoolCreateInfo.maxSets = 1; descriptorPoolCreateInfo.poolSizeCount = 3; descriptorPoolCreateInfo.pPoolSizes = poolSizes; VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); if (ret != VK_SUCCESS) { 
NCNN_LOGE("vkCreateDescriptorPool failed %d", ret); return; } } d->descriptor_pools.push_back(descriptor_pool); VkDescriptorSet descriptorset; { VkDescriptorSetLayout descriptorset_layout = pipeline->descriptorset_layout(); VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; descriptorSetAllocateInfo.pNext = 0; descriptorSetAllocateInfo.descriptorPool = descriptor_pool; descriptorSetAllocateInfo.descriptorSetCount = 1; descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout; VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); if (ret != VK_SUCCESS) { NCNN_LOGE("vkAllocateDescriptorSets failed %d", ret); return; } } d->descriptorsets.push_back(descriptorset); if (vkdev->info.support_VK_KHR_descriptor_update_template()) { vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template(), descriptorInfos.data()); } else { std::vector writeDescriptorSets(binding_count); { const unsigned char* p_descriptorInfos = descriptorInfos.data(); for (int i = 0; i < binding_count; i++) { int binding_type = shader_info.binding_types[i]; writeDescriptorSets[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writeDescriptorSets[i].pNext = 0; writeDescriptorSets[i].dstSet = descriptorset; writeDescriptorSets[i].dstBinding = i; writeDescriptorSets[i].dstArrayElement = 0; writeDescriptorSets[i].descriptorCount = 1; writeDescriptorSets[i].pTexelBufferView = 0; if (binding_type == 1) { writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; writeDescriptorSets[i].pImageInfo = 0; writeDescriptorSets[i].pBufferInfo = (const VkDescriptorBufferInfo*)p_descriptorInfos; p_descriptorInfos += sizeof(VkDescriptorBufferInfo); } else if (binding_type == 2) { writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; writeDescriptorSets[i].pImageInfo = (const 
VkDescriptorImageInfo*)p_descriptorInfos; writeDescriptorSets[i].pBufferInfo = 0; p_descriptorInfos += sizeof(VkDescriptorImageInfo); } else // if (binding_type == 3) { writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos; writeDescriptorSets[i].pBufferInfo = 0; p_descriptorInfos += sizeof(VkDescriptorImageInfo); } } } vkUpdateDescriptorSets(vkdev->vkdevice(), binding_count, writeDescriptorSets.data(), 0, 0); } VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_bind_descriptorsets; r.command_buffer = d->compute_command_buffer; r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout(); r.bind_descriptorsets.descriptorset_count = 1; r.bind_descriptorsets.descriptorset_offset = d->descriptorsets.size() - 1; d->delayed_records.push_back(r); } } // record push constants if (constant_count > 0) { if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdPushConstants(d->compute_command_buffer, pipeline->pipeline_layout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data()); } else { uint32_t size = constant_count * sizeof(vk_constant_type); unsigned char* constant_values = new unsigned char[size]; memcpy(constant_values, constants.data(), size); VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_push_constants; r.command_buffer = d->compute_command_buffer; r.push_constants.pipeline_layout = pipeline->pipeline_layout(); r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT; r.push_constants.size = size; r.push_constants.values = constant_values; d->delayed_records.push_back(r); } } // record dispatch { uint32_t group_count_x = (dispatcher.w + pipeline->local_size_x() - 1) / pipeline->local_size_x(); uint32_t group_count_y = (dispatcher.h * (dispatcher.d ? 
dispatcher.d : 1) + pipeline->local_size_y() - 1) / pipeline->local_size_y(); uint32_t group_count_z = (dispatcher.c + pipeline->local_size_z() - 1) / pipeline->local_size_z(); if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z); } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_dispatch; r.command_buffer = d->compute_command_buffer; r.dispatch.group_count_x = group_count_x; r.dispatch.group_count_y = group_count_y; r.dispatch.group_count_z = group_count_z; d->delayed_records.push_back(r); } } } #if NCNN_BENCHMARK void VkCompute::record_write_timestamp(uint32_t query) { if (vkdev->info.support_VK_KHR_push_descriptor()) { if (d->query_pool) vkCmdWriteTimestamp(d->compute_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, d->query_pool, query); } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_write_timestamp; r.command_buffer = d->compute_command_buffer; r.write_timestamp.query = query; d->delayed_records.push_back(r); } } #endif // NCNN_BENCHMARK #if NCNN_PLATFORM_API #if __ANDROID_API__ >= 26 void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst) { // image layout transform undefined @ null to general @ compute { VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barriers[0].pNext = 0; barriers[0].srcAccessMask = 0; barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT; barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].image = src.image(); barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; barriers[0].subresourceRange.baseMipLevel = 0; 
barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            // delayed path: the record takes ownership of the barrier array
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }
    }

    // record bind pipeline
    {
        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline());
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_pipeline;
            r.command_buffer = d->compute_command_buffer;
            r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_pipeline.pipeline = pipeline->pipeline();
            d->delayed_records.push_back(r);
        }
    }

    // record update bindings
    {
        // binding 0: sampled src image, bindings 1/2: dst buffer (bound twice;
        // presumably the shader reads it at two binding points - see pipeline layout)
        VkDescriptorImageInfo descriptorImageInfo;
        descriptorImageInfo.sampler = pipeline->sampler;
        descriptorImageInfo.imageView = src.imageview();
        descriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;

        VkDescriptorBufferInfo descriptorBufferInfo;
        descriptorBufferInfo.buffer = dst.buffer();
        descriptorBufferInfo.offset = dst.buffer_offset();
        descriptorBufferInfo.range = dst.total() * dst.elemsize;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            struct ImportAndroidHardwareBufferDescriptorInfo
            {
                VkDescriptorImageInfo imageInfo;
                VkDescriptorBufferInfo bufferInfo;
                VkDescriptorBufferInfo buffer4Info;
            };

            ImportAndroidHardwareBufferDescriptorInfo info;
            info.imageInfo = descriptorImageInfo;
            info.bufferInfo = descriptorBufferInfo;
            info.buffer4Info = descriptorBufferInfo;

            vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, &info);
        }
        else
        {
            // create new descriptor_pool and descriptorset
            VkDescriptorPool descriptor_pool;
            {
                VkDescriptorPoolSize poolSizes[2];
                poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                poolSizes[0].descriptorCount = 1;
                poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                poolSizes[1].descriptorCount = 2;

                VkDescriptorPoolCreateInfo descriptorPoolCreateInfo;
                descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
                descriptorPoolCreateInfo.pNext = 0;
                descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
                descriptorPoolCreateInfo.maxSets = 1;
                descriptorPoolCreateInfo.poolSizeCount = 2;
                descriptorPoolCreateInfo.pPoolSizes = poolSizes;

                VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkCreateDescriptorPool failed %d", ret);
                    return;
                }
            }
            d->descriptor_pools.push_back(descriptor_pool);

            VkDescriptorSet descriptorset;
            {
                VkDescriptorSetLayout descriptorset_layout = pipeline->descriptorset_layout();

                VkDescriptorSetAllocateInfo descriptorSetAllocateInfo;
                descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
                descriptorSetAllocateInfo.pNext = 0;
                descriptorSetAllocateInfo.descriptorPool = descriptor_pool;
                descriptorSetAllocateInfo.descriptorSetCount = 1;
                descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout;

                VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkAllocateDescriptorSets failed %d", ret);
                    return;
                }
            }
            d->descriptorsets.push_back(descriptorset);

            if (vkdev->info.support_VK_KHR_descriptor_update_template())
            {
                struct ImportAndroidHardwareBufferDescriptorInfo
                {
                    VkDescriptorImageInfo imageInfo;
                    VkDescriptorBufferInfo bufferInfo;
                    VkDescriptorBufferInfo buffer4Info;
                };

                ImportAndroidHardwareBufferDescriptorInfo info;
                info.imageInfo = descriptorImageInfo;
                info.bufferInfo = descriptorBufferInfo;
                info.buffer4Info = descriptorBufferInfo;

                vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template(), &info);
            }
            else
            {
                // manual writes: sampler at binding 0, dst buffer at bindings 1 and 2
                VkWriteDescriptorSet writeDescriptorSets[3];
                writeDescriptorSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[0].pNext = 0;
                writeDescriptorSets[0].dstSet = descriptorset;
                writeDescriptorSets[0].dstBinding = 0;
                writeDescriptorSets[0].dstArrayElement = 0;
                writeDescriptorSets[0].descriptorCount = 1;
                writeDescriptorSets[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                writeDescriptorSets[0].pImageInfo = &descriptorImageInfo;
                writeDescriptorSets[0].pBufferInfo = 0;
                writeDescriptorSets[0].pTexelBufferView = 0;
                writeDescriptorSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[1].pNext = 0;
                writeDescriptorSets[1].dstSet = descriptorset;
                writeDescriptorSets[1].dstBinding = 1;
                writeDescriptorSets[1].dstArrayElement = 0;
                writeDescriptorSets[1].descriptorCount = 1;
                writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSets[1].pImageInfo = 0;
                writeDescriptorSets[1].pBufferInfo = &descriptorBufferInfo;
                writeDescriptorSets[1].pTexelBufferView = 0;
                writeDescriptorSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[2].pNext = 0;
                writeDescriptorSets[2].dstSet = descriptorset;
                writeDescriptorSets[2].dstBinding = 2;
                writeDescriptorSets[2].dstArrayElement = 0;
                writeDescriptorSets[2].descriptorCount = 1;
                writeDescriptorSets[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSets[2].pImageInfo = 0;
                writeDescriptorSets[2].pBufferInfo = &descriptorBufferInfo;
                writeDescriptorSets[2].pTexelBufferView = 0;

                vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0);
            }

            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_descriptorsets;
            r.command_buffer = d->compute_command_buffer;
            r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout();
            r.bind_descriptorsets.descriptorset_count = 1;
            r.bind_descriptorsets.descriptorset_offset = d->descriptorsets.size() - 1;
            d->delayed_records.push_back(r);
        }
    }

    // record dispatch
    {
        // workgroup counts rounded up to cover the whole dst extent
        uint32_t group_count_x = (dst.w + pipeline->local_size_x() - 1) / pipeline->local_size_x();
        uint32_t group_count_y = (dst.h + pipeline->local_size_y() - 1) / pipeline->local_size_y();
        uint32_t group_count_z = (dst.c + pipeline->local_size_z() - 1) / pipeline->local_size_z();

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_dispatch;
            r.command_buffer = d->compute_command_buffer;
            r.dispatch.group_count_x = group_count_x;
            r.dispatch.group_count_y = group_count_y;
            r.dispatch.group_count_z = group_count_z;
            d->delayed_records.push_back(r);
        }
    }
}

// Record conversion of an imported AHardwareBuffer image into a device image
// (sampled src, storage-image dst bound twice).
void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst)
{
    // image layout transform undefined @ null to general @ compute
    {
        // one barrier for src (to shader-read-only) and one for dst (to general)
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[2];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
barriers[0].subresourceRange.baseMipLevel = 0; barriers[0].subresourceRange.levelCount = 1; barriers[0].subresourceRange.baseArrayLayer = 0; barriers[0].subresourceRange.layerCount = 1; barriers[1].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barriers[1].pNext = 0; barriers[1].srcAccessMask = 0; barriers[1].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; barriers[1].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; barriers[1].newLayout = VK_IMAGE_LAYOUT_GENERAL; barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[1].image = dst.image(); barriers[1].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; barriers[1].subresourceRange.baseMipLevel = 0; barriers[1].subresourceRange.levelCount = 1; barriers[1].subresourceRange.baseArrayLayer = 0; barriers[1].subresourceRange.layerCount = 1; VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 2, barriers); delete[] barriers; } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_image_barrers; r.command_buffer = d->compute_command_buffer; r.image_barrers.src_stage = src_stage; r.image_barrers.dst_stage = dst_stage; r.image_barrers.barrier_count = 2; r.image_barrers.barriers = barriers; d->delayed_records.push_back(r); } } // record bind pipeline { if (vkdev->info.support_VK_KHR_push_descriptor()) { vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline()); } else { VkComputePrivate::record r; r.type = VkComputePrivate::record::TYPE_bind_pipeline; r.command_buffer = d->compute_command_buffer; r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; r.bind_pipeline.pipeline = pipeline->pipeline(); d->delayed_records.push_back(r); } } // record update bindings { 
VkDescriptorImageInfo descriptorImageInfos[3]; descriptorImageInfos[0].sampler = pipeline->sampler; descriptorImageInfos[0].imageView = src.imageview(); descriptorImageInfos[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; descriptorImageInfos[1].sampler = 0; descriptorImageInfos[1].imageView = dst.imageview(); descriptorImageInfos[1].imageLayout = VK_IMAGE_LAYOUT_GENERAL; descriptorImageInfos[2].sampler = 0; descriptorImageInfos[2].imageView = dst.imageview(); descriptorImageInfos[2].imageLayout = VK_IMAGE_LAYOUT_GENERAL; if (vkdev->info.support_VK_KHR_push_descriptor()) { vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, descriptorImageInfos); } else { // create new descriptor_pool and descriptorset VkDescriptorPool descriptor_pool; { VkDescriptorPoolSize poolSizes[2]; poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; poolSizes[0].descriptorCount = 1; poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; poolSizes[1].descriptorCount = 2; VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; descriptorPoolCreateInfo.pNext = 0; descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; descriptorPoolCreateInfo.maxSets = 1; descriptorPoolCreateInfo.poolSizeCount = 2; descriptorPoolCreateInfo.pPoolSizes = poolSizes; VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateDescriptorPool failed %d", ret); return; } } d->descriptor_pools.push_back(descriptor_pool); VkDescriptorSet descriptorset; { VkDescriptorSetLayout descriptorset_layout = pipeline->descriptorset_layout(); VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; descriptorSetAllocateInfo.pNext = 0; 
descriptorSetAllocateInfo.descriptorPool = descriptor_pool; descriptorSetAllocateInfo.descriptorSetCount = 1; descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout; VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); if (ret != VK_SUCCESS) { NCNN_LOGE("vkAllocateDescriptorSets failed %d", ret); return; } } d->descriptorsets.push_back(descriptorset); if (vkdev->info.support_VK_KHR_descriptor_update_template()) { vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template(), descriptorImageInfos); } else { VkWriteDescriptorSet writeDescriptorSets[3]; writeDescriptorSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writeDescriptorSets[0].pNext = 0; writeDescriptorSets[0].dstSet = descriptorset; writeDescriptorSets[0].dstBinding = 0; writeDescriptorSets[0].dstArrayElement = 0; writeDescriptorSets[0].descriptorCount = 1; writeDescriptorSets[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; writeDescriptorSets[0].pImageInfo = &descriptorImageInfos[0]; writeDescriptorSets[0].pBufferInfo = 0; writeDescriptorSets[0].pTexelBufferView = 0; writeDescriptorSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writeDescriptorSets[1].pNext = 0; writeDescriptorSets[1].dstSet = descriptorset; writeDescriptorSets[1].dstBinding = 1; writeDescriptorSets[1].dstArrayElement = 0; writeDescriptorSets[1].descriptorCount = 1; writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; writeDescriptorSets[1].pImageInfo = &descriptorImageInfos[1]; writeDescriptorSets[1].pBufferInfo = 0; writeDescriptorSets[1].pTexelBufferView = 0; writeDescriptorSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writeDescriptorSets[2].pNext = 0; writeDescriptorSets[2].dstSet = descriptorset; writeDescriptorSets[2].dstBinding = 2; writeDescriptorSets[2].dstArrayElement = 0; writeDescriptorSets[2].descriptorCount = 1; writeDescriptorSets[2].descriptorType = 
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                writeDescriptorSets[2].pImageInfo = &descriptorImageInfos[2];
                writeDescriptorSets[2].pBufferInfo = 0;
                writeDescriptorSets[2].pTexelBufferView = 0;

                vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0);
            }

            // defer the actual vkCmdBindDescriptorSets to submit_and_wait();
            // only the offset into d->descriptorsets is stored here
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_descriptorsets;
            r.command_buffer = d->compute_command_buffer;
            r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout();
            r.bind_descriptorsets.descriptorset_count = 1;
            r.bind_descriptorsets.descriptorset_offset = d->descriptorsets.size() - 1;
            d->delayed_records.push_back(r);
        }
    }

    // record dispatch
    {
        // round each dimension up to a whole number of workgroups
        uint32_t group_count_x = (dst.w + pipeline->local_size_x() - 1) / pipeline->local_size_x();
        uint32_t group_count_y = (dst.h + pipeline->local_size_y() - 1) / pipeline->local_size_y();
        uint32_t group_count_z = (dst.c + pipeline->local_size_z() - 1) / pipeline->local_size_z();

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_dispatch;
            r.command_buffer = d->compute_command_buffer;
            r.dispatch.group_count_x = group_count_x;
            r.dispatch.group_count_y = group_count_y;
            r.dispatch.group_count_z = group_count_z;
            d->delayed_records.push_back(r);
        }
    }
}
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API

// Flush all recorded work to the compute queue and block until the fence
// signals. On devices without VK_KHR_push_descriptor the delayed records are
// first replayed into the command buffer; heap-allocated region/barrier arrays
// owned by those records are freed as they are consumed. After the fence wait,
// download/fp16-cast post-records are executed on the host.
int VkCompute::submit_and_wait()
{
    // NCNN_LOGE("submit_and_wait");

    if (!vkdev->info.support_VK_KHR_push_descriptor())
    {
        d->begin_command_buffer();

#if NCNN_BENCHMARK
        if (d->query_pool)
            vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count);
#endif // NCNN_BENCHMARK

        const size_t record_count = d->delayed_records.size();

        // handle delayed records
        for (size_t i = 0; i < record_count; i++)
        {
            const VkComputePrivate::record& r = d->delayed_records[i];

            switch (r.type)
            {
            case VkComputePrivate::record::TYPE_copy_buffer:
            {
                vkCmdCopyBuffer(r.command_buffer, r.copy_buffer.src, r.copy_buffer.dst, r.copy_buffer.region_count, r.copy_buffer.regions);
                // regions array was heap-allocated when the record was queued
                delete[] r.copy_buffer.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_image:
            {
                vkCmdCopyImage(r.command_buffer, r.copy_image.src, r.copy_image.src_layout, r.copy_image.dst, r.copy_image.dst_layout, r.copy_image.region_count, r.copy_image.regions);
                delete[] r.copy_image.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_buffer_to_image:
            {
                vkCmdCopyBufferToImage(r.command_buffer, r.copy_buffer_to_image.src, r.copy_buffer_to_image.dst, r.copy_buffer_to_image.layout, r.copy_buffer_to_image.region_count, r.copy_buffer_to_image.regions);
                delete[] r.copy_buffer_to_image.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_image_to_buffer:
            {
                vkCmdCopyImageToBuffer(r.command_buffer, r.copy_image_to_buffer.src, r.copy_image_to_buffer.layout, r.copy_image_to_buffer.dst, r.copy_image_to_buffer.region_count, r.copy_image_to_buffer.regions);
                delete[] r.copy_image_to_buffer.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_bind_pipeline:
            {
                vkCmdBindPipeline(r.command_buffer, r.bind_pipeline.bind_point, r.bind_pipeline.pipeline);
                break;
            }
            case VkComputePrivate::record::TYPE_bind_descriptorsets:
            {
                // resolve the stored offset into the descriptorset vector now
                vkCmdBindDescriptorSets(r.command_buffer, r.bind_descriptorsets.bind_point, r.bind_descriptorsets.pipeline_layout, 0, r.bind_descriptorsets.descriptorset_count, &d->descriptorsets[r.bind_descriptorsets.descriptorset_offset], 0, 0);
                break;
            }
            case VkComputePrivate::record::TYPE_push_constants:
            {
                vkCmdPushConstants(r.command_buffer, r.push_constants.pipeline_layout, r.push_constants.stage_flags, 0, r.push_constants.size, r.push_constants.values);
                // values was new[]-ed as unsigned char storage
                delete[](unsigned char*) r.push_constants.values;
                break;
            }
            case VkComputePrivate::record::TYPE_dispatch:
            {
                vkCmdDispatch(r.command_buffer, r.dispatch.group_count_x, r.dispatch.group_count_y, r.dispatch.group_count_z);
                break;
            }
            case VkComputePrivate::record::TYPE_memory_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.memory_barrers.src_stage, r.memory_barrers.dst_stage, 0, r.memory_barrers.barrier_count, r.memory_barrers.barriers, 0, 0, 0, 0);
                delete[] r.memory_barrers.barriers;
                break;
            }
            case VkComputePrivate::record::TYPE_buffer_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.buffer_barrers.src_stage, r.buffer_barrers.dst_stage, 0, 0, 0, r.buffer_barrers.barrier_count, r.buffer_barrers.barriers, 0, 0);
                delete[] r.buffer_barrers.barriers;
                break;
            }
            case VkComputePrivate::record::TYPE_image_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.image_barrers.src_stage, r.image_barrers.dst_stage, 0, 0, 0, 0, 0, r.image_barrers.barrier_count, r.image_barrers.barriers);
                delete[] r.image_barrers.barriers;
                break;
            }
#if NCNN_BENCHMARK
            case VkComputePrivate::record::TYPE_write_timestamp:
            {
                if (d->query_pool)
                    vkCmdWriteTimestamp(r.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, d->query_pool, r.write_timestamp.query);
                break;
            }
#endif // NCNN_BENCHMARK
            // post records run on the host after the fence wait, not here
            case VkComputePrivate::record::TYPE_post_download:
            case VkComputePrivate::record::TYPE_post_cast_float16_to_float32:
            default:
                break;
            }
        }
    }

    // end command buffer
    {
        d->end_command_buffer();
    }

    // acquire queue and reclaim on return
    VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
    if (compute_queue == 0)
    {
        NCNN_LOGE("out of compute queue");
        return -1;
    }

    // submit compute
    {
        VkSubmitInfo submitInfo;
        submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
        submitInfo.pNext = 0;
        submitInfo.waitSemaphoreCount = 0;
        submitInfo.pWaitSemaphores = 0;
        submitInfo.pWaitDstStageMask = 0;
        submitInfo.commandBufferCount = 1;
        submitInfo.pCommandBuffers = &d->compute_command_buffer;
        submitInfo.signalSemaphoreCount = 0;
        submitInfo.pSignalSemaphores = 0;

        VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkQueueSubmit failed %d", ret);
vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
            return -1;
        }
    }

    vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);

    // wait
    {
        // block indefinitely until the submitted batch completes
        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &d->compute_command_fence, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }

    // handle delayed post records
    for (size_t i = 0; i < d->delayed_records.size(); i++)
    {
        const VkComputePrivate::record& r = d->delayed_records[i];

        switch (r.type)
        {
        case VkComputePrivate::record::TYPE_post_download:
        {
            // copy the downloaded staging buffer back into host memory
            const VkMat& src = d->download_post_buffers[r.post_download.download_post_buffer_mat_offset];
            Mat& dst = d->download_post_mats_fp16[r.post_download.download_post_mat_fp16_offset];

            // NCNN_LOGE("post_download %p +%d ~%d -> %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data);

            // make device writes visible to the host before the memcpy
            src.allocator->invalidate(src.data);
            memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize);
            break;
        }
        case VkComputePrivate::record::TYPE_post_cast_float16_to_float32:
        {
            // NCNN_LOGE("post_cast_float16_to_float32");

            const Mat& src = d->download_post_mats_fp16[r.post_cast_float16_to_float32.download_post_mat_fp16_offset];
            Mat& dst = d->download_post_mats[r.post_cast_float16_to_float32.download_post_mat_offset];

            Option opt;
            opt.num_threads = r.post_cast_float16_to_float32.num_threads;
            opt.blob_allocator = dst.allocator;

            ncnn::cast_float16_to_float32(src, dst, opt);
            break;
        }
        default:
            break;
        }
    }

    d->delayed_records.clear();

    return 0;
}

// Release per-run resources and return the command buffer and fence to a
// recordable state. With push descriptors the next command buffer is begun
// immediately; without them begin is deferred to submit_and_wait().
int VkCompute::reset()
{
    d->upload_staging_buffers.clear();
    d->download_post_buffers.clear();
    d->download_post_mats_fp16.clear();
    d->download_post_mats.clear();

    for (size_t i = 0; i < d->image_blocks_to_destroy.size(); i++)
    {
        VkImageMemory* ptr = d->image_blocks_to_destroy[i];

        // atomically drop this command's reference on the image block
        int old_command_refcount = NCNN_XADD(&ptr->command_refcount, -1);
        if (ptr->refcount == 0 && old_command_refcount == 1)
        {
            // no userspace reference and we are the last command reference
            vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
            vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

            delete ptr;
        }
        else
        {
            // reference exists in user code or other command
        }
    }
    d->image_blocks_to_destroy.clear();

    if (!vkdev->info.support_VK_KHR_push_descriptor())
    {
        // free the per-run descriptor sets before destroying their pools
        for (size_t i = 0; i < d->descriptorsets.size(); i++)
        {
            vkFreeDescriptorSets(vkdev->vkdevice(), d->descriptor_pools[i], 1, &d->descriptorsets[i]);
            vkDestroyDescriptorPool(vkdev->vkdevice(), d->descriptor_pools[i], 0);
        }
        d->descriptor_pools.clear();
        d->descriptorsets.clear();
    }

    d->delayed_records.clear();

    // reset command buffer and fence
    {
        VkResult ret = vkResetCommandBuffer(d->compute_command_buffer, 0);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkResetCommandBuffer failed %d", ret);
            return -1;
        }
    }
    {
        VkResult ret = vkResetFences(vkdev->vkdevice(), 1, &d->compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkResetFences failed %d", ret);
            return -1;
        }
    }

    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        d->begin_command_buffer();

#if NCNN_BENCHMARK
        if (d->query_pool)
            vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count);
#endif // NCNN_BENCHMARK
    }

    return 0;
}

#if NCNN_BENCHMARK
// Create a timestamp query pool with _query_count slots for profiling.
int VkCompute::create_query_pool(uint32_t _query_count)
{
    d->query_count = _query_count;

    VkQueryPoolCreateInfo queryPoolCreateInfo;
    queryPoolCreateInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
    queryPoolCreateInfo.pNext = 0;
    queryPoolCreateInfo.flags = 0;
    queryPoolCreateInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
    queryPoolCreateInfo.queryCount = d->query_count;
    queryPoolCreateInfo.pipelineStatistics = 0;

    VkResult ret = vkCreateQueryPool(vkdev->vkdevice(), &queryPoolCreateInfo, 0, &d->query_pool);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateQueryPool failed %d", ret);
        return -1;
    }

    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        // the command buffer is already recording on this path
        if (d->query_pool)
            vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count);
    }

    return 0;
}

// Fetch 64-bit timestamps for queries [first_query, first_query + query_count).
int VkCompute::get_query_pool_results(uint32_t
first_query, uint32_t query_count, std::vector& results)
{
    // NOTE(review): extraction appears to have stripped the template argument
    // from std::vector here -- presumably std::vector<uint64_t>; confirm.
    if (results.size() < first_query + query_count)
    {
        NCNN_LOGE("results not large enough");
        return -1;
    }

    // VK_NOT_READY is tolerated: not-yet-written queries keep their old value
    VkResult ret = vkGetQueryPoolResults(vkdev->vkdevice(), d->query_pool, first_query, query_count, query_count * sizeof(uint64_t), results.data() + first_query, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT);
    if (ret != VK_SUCCESS && ret != VK_NOT_READY)
    {
        NCNN_LOGE("vkGetQueryPoolResults failed %d", ret);
        return -1;
    }

    return 0;
}
#endif // NCNN_BENCHMARK

// Make prior writes to the buffer visible before the next compute dispatch
// reads/writes it. No-op when the buffer is already in shader-readwrite state
// at the compute stage.
void VkCompute::barrier_readwrite(const VkMat& binding)
{
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // barrier device any @ compute/null to shader-readwrite @ compute
        // heap-allocated: ownership passes to the delayed record when push
        // descriptors are unavailable (freed later in submit_and_wait())
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = binding.buffer();
        barriers[0].offset = binding.buffer_offset();
        barriers[0].size = binding.buffer_capacity();

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark device shader-readwrite @ compute
        binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}

// Transition the image to GENERAL layout with read+write access at the
// compute stage. No-op when it is already in that exact state.
void VkCompute::barrier_readwrite(const VkImageMat& binding)
{
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // image layout transform any @ any to shader-write @ compute
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        barriers[0].oldLayout = binding.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = binding.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image shader-write @ compute
        binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        binding.data->image_layout = VK_IMAGE_LAYOUT_GENERAL;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}

// Transition the image to SHADER_READ_ONLY_OPTIMAL with read access at the
// compute stage. No-op when it is already in that exact state.
void VkCompute::barrier_readonly(const VkImageMat& binding)
{
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // image layout transform any @ any to shader-readonly-optimal @ compute
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
        barriers[0].oldLayout = binding.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = binding.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image shader-readonly-optimal @ compute
        binding.data->access_flags =
VK_ACCESS_SHADER_READ_BIT;
        binding.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}

// Private state for VkTransfer: a compute command buffer/fence, and -- when the
// device exposes a distinct transfer queue family -- an extra upload command
// buffer plus a semaphore/fence pair to chain upload and compute submissions.
class VkTransferPrivate
{
public:
    VkTransferPrivate(const VulkanDevice* _vkdev);
    ~VkTransferPrivate();

    int init();
    int begin_command_buffer();
    int end_command_buffer();

    const VulkanDevice* vkdev;

    VkCommandPool compute_command_pool;
    VkCommandPool transfer_command_pool;

    VkCommandBuffer upload_command_buffer;
    VkCommandBuffer compute_command_buffer;

    // signaled by the upload submission, waited on by the compute submission
    VkSemaphore upload_compute_semaphore;

    VkFence upload_command_fence;
    VkFence compute_command_fence;

    // keeps staging buffers alive until the transfer completes
    // NOTE(review): template argument appears stripped by extraction --
    // presumably std::vector<VkMat>; confirm.
    std::vector upload_staging_buffers;
};

VkTransferPrivate::VkTransferPrivate(const VulkanDevice* _vkdev)
    : vkdev(_vkdev)
{
    compute_command_pool = 0;
    transfer_command_pool = 0;
    upload_command_buffer = 0;
    compute_command_buffer = 0;
    upload_compute_semaphore = 0;
    upload_command_fence = 0;
    compute_command_fence = 0;

    init();
}

VkTransferPrivate::~VkTransferPrivate()
{
    vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0);
    vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer);
    vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0);

    if (!vkdev->info.unified_compute_transfer_queue())
    {
        // these objects are only created in the split-queue case
        vkDestroyFence(vkdev->vkdevice(), upload_command_fence, 0);
        vkDestroySemaphore(vkdev->vkdevice(), upload_compute_semaphore, 0);
        vkFreeCommandBuffers(vkdev->vkdevice(), transfer_command_pool, 1, &upload_command_buffer);
        vkDestroyCommandPool(vkdev->vkdevice(), transfer_command_pool, 0);
    }
}

// Create pools, command buffers and fences (plus the upload command buffer,
// semaphore and fence when compute and transfer queues differ), then start
// recording. Returns -1 on the first Vulkan failure.
int VkTransferPrivate::init()
{
    // compute_command_pool
    {
        VkCommandPoolCreateInfo commandPoolCreateInfo;
        commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
        commandPoolCreateInfo.pNext = 0;
        commandPoolCreateInfo.flags = 0;
        commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index();

        VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateCommandPool failed %d", ret);
            return -1;
        }
    }

    // compute_command_buffer
    {
        VkCommandBufferAllocateInfo commandBufferAllocateInfo;
        commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
        commandBufferAllocateInfo.pNext = 0;
        commandBufferAllocateInfo.commandPool = compute_command_pool;
        commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
        commandBufferAllocateInfo.commandBufferCount = 1;

        VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret);
            return -1;
        }
    }

    // compute_command_fence
    {
        VkFenceCreateInfo fenceCreateInfo;
        fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
        fenceCreateInfo.pNext = 0;
        fenceCreateInfo.flags = 0;

        VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateFence failed %d", ret);
            return -1;
        }
    }

    if (!vkdev->info.unified_compute_transfer_queue())
    {
        // transfer_command_pool
        {
            VkCommandPoolCreateInfo commandPoolCreateInfo;
            commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
            commandPoolCreateInfo.pNext = 0;
            commandPoolCreateInfo.flags = 0;
            commandPoolCreateInfo.queueFamilyIndex = vkdev->info.transfer_queue_family_index();

            VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &transfer_command_pool);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkCreateCommandPool failed %d", ret);
                return -1;
            }
        }

        // upload_command_buffer
        {
            VkCommandBufferAllocateInfo commandBufferAllocateInfo;
            commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
            commandBufferAllocateInfo.pNext = 0;
            commandBufferAllocateInfo.commandPool = transfer_command_pool;
            commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
            commandBufferAllocateInfo.commandBufferCount = 1;

            VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &upload_command_buffer);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret);
                return -1;
            }
        }

        // upload_compute_semaphore
        {
            VkSemaphoreCreateInfo semaphoreCreateInfo;
            semaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
            semaphoreCreateInfo.pNext = 0;
            semaphoreCreateInfo.flags = 0;

            VkResult ret = vkCreateSemaphore(vkdev->vkdevice(), &semaphoreCreateInfo, 0, &upload_compute_semaphore);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkCreateSemaphore failed %d", ret);
                return -1;
            }
        }

        // upload_command_fence
        {
            VkFenceCreateInfo fenceCreateInfo;
            fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
            fenceCreateInfo.pNext = 0;
            fenceCreateInfo.flags = 0;

            VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &upload_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkCreateFence failed %d", ret);
                return -1;
            }
        }
    }

    begin_command_buffer();

    return 0;
}

// Begin the compute command buffer (and the upload one on split queues) for
// one-time submission.
int VkTransferPrivate::begin_command_buffer()
{
    {
        VkCommandBufferBeginInfo commandBufferBeginInfo;
        commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
        commandBufferBeginInfo.pNext = 0;
        commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
        commandBufferBeginInfo.pInheritanceInfo = 0;

        VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkBeginCommandBuffer failed %d", ret);
            return -1;
        }
    }

    if (!vkdev->info.unified_compute_transfer_queue())
    {
        {
            VkCommandBufferBeginInfo commandBufferBeginInfo;
            commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
            commandBufferBeginInfo.pNext = 0;
            commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
            commandBufferBeginInfo.pInheritanceInfo = 0;

            VkResult ret = vkBeginCommandBuffer(upload_command_buffer, &commandBufferBeginInfo);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkBeginCommandBuffer failed %d", ret);
                return -1;
            }
        }
    }

    return 0;
}

// End recording on the compute (and, on split queues, the upload) command buffer.
int VkTransferPrivate::end_command_buffer()
{
    {
        VkResult ret = vkEndCommandBuffer(compute_command_buffer);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkEndCommandBuffer failed %d", ret);
            return -1;
        }
    }

    if (!vkdev->info.unified_compute_transfer_queue())
    {
        {
            VkResult ret = vkEndCommandBuffer(upload_command_buffer);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkEndCommandBuffer failed %d", ret);
                return -1;
            }
        }
    }

    return 0;
}

VkTransfer::VkTransfer(const VulkanDevice* _vkdev)
    : vkdev(_vkdev), d(new VkTransferPrivate(_vkdev))
{
}

VkTransfer::~VkTransfer()
{
    delete d;
}

// Upload a host Mat into a device VkMat, optionally flattening it to 1-D
// first. fp32 data is cast to fp16 when the options request fp16
// storage/packing.
void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten)
{
    // NCNN_LOGE("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack);

    // NOTE keep the hack here ?
    if (src.elembits() == 32)
    {
        if (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0))
        {
            Mat src_fp16;
            cast_float32_to_float16(src, src_fp16, opt);

            record_upload(src_fp16, dst, opt, flatten);
            return;
        }
    }

    Mat src_flattened = flatten ?
src.reshape(src.w * src.h * src.c) : src; // create dst dst.create_like(src_flattened, opt.blob_vkallocator); if (dst.empty()) { return; } if (dst.allocator->mappable) { // memcpy src_flattened to device memcpy(dst.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); dst.allocator->flush(dst.data); // barrier device host-write @ null to shader-read @ compute { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.buffer = dst.buffer(); barrier.offset = dst.buffer_offset(); barrier.size = dst.buffer_capacity(); VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); } // mark device shader-readwrite @ compute dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; return; } // create staging VkMat dst_staging; dst_staging.create_like(src_flattened, opt.staging_vkallocator); // memcpy src_flattened to staging memcpy(dst_staging.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); dst_staging.allocator->flush(dst_staging.data); VkCommandBuffer command_buffer; if (vkdev->info.unified_compute_transfer_queue()) { command_buffer = d->compute_command_buffer; } else { command_buffer = d->upload_command_buffer; } // barrier staging host-write @ null to transfer-read @ queue { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; barrier.srcQueueFamilyIndex = 
VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.buffer = dst_staging.buffer(); barrier.offset = dst_staging.buffer_offset(); barrier.size = dst_staging.buffer_capacity(); VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); } // record staging to device { VkBufferCopy region; region.srcOffset = dst_staging.buffer_offset(); region.dstOffset = dst.buffer_offset(); region.size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); vkCmdCopyBuffer(command_buffer, dst_staging.buffer(), dst.buffer(), 1, ®ion); } if (vkdev->info.unified_compute_transfer_queue()) { // barrier device transfer-write @ compute to shader-read @ compute { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.buffer = dst.buffer(); barrier.offset = dst.buffer_offset(); barrier.size = dst.buffer_capacity(); VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); } } else { // queue ownership transfer transfer-write @ transfer to shader-read @ compute // release { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; barrier.dstAccessMask = 0; barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index(); barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index(); barrier.buffer = dst.buffer(); barrier.offset = 
dst.buffer_offset(); barrier.size = dst.buffer_capacity(); VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; vkCmdPipelineBarrier(d->upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); } // acquire { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = 0; barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index(); barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index(); barrier.buffer = dst.buffer(); barrier.offset = dst.buffer_offset(); barrier.size = dst.buffer_capacity(); VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); } } // mark device shader-readwrite @ compute dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; // stash staging d->upload_staging_buffers.push_back(dst_staging); } void VkTransfer::record_upload(const Mat& src, VkImageMat& dst, const Option& opt) { // NCNN_LOGE("record_upload image src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); // NOTE keep the hack here ? 
if (src.elembits() == 32) { if (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)) { Mat src_fp16; cast_float32_to_float16(src, src_fp16, opt); record_upload(src_fp16, dst, opt); return; } } // create dst dst.create_like(src, opt.blob_vkallocator); if (dst.empty()) return; // create staging VkMat dst_staging; dst_staging.create_like(src, opt.staging_vkallocator); // memcpy src to staging memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize); dst_staging.allocator->flush(dst_staging.data); VkCommandBuffer command_buffer; if (vkdev->info.unified_compute_transfer_queue()) { command_buffer = d->compute_command_buffer; } else { command_buffer = d->upload_command_buffer; } // barrier staging host-write @ null to transfer-read @ queue { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.buffer = dst_staging.buffer(); barrier.offset = dst_staging.buffer_offset(); barrier.size = dst_staging.buffer_capacity(); VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); } // image layout transform undefined @ null to transfer-dst-optimal @ queue { VkImageMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = 0; barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.image = dst.image(); barrier.subresourceRange.aspectMask = 
VK_IMAGE_ASPECT_COLOR_BIT; barrier.subresourceRange.baseMipLevel = 0; barrier.subresourceRange.levelCount = 1; barrier.subresourceRange.baseArrayLayer = 0; barrier.subresourceRange.layerCount = 1; VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); } // record staging to image { const int channels = dst.c; VkBufferImageCopy* regions = new VkBufferImageCopy[channels]; for (int i = 0; i < channels; i++) { regions[i].bufferOffset = dst_staging.buffer_offset() + dst_staging.cstep * dst_staging.elemsize * i; regions[i].bufferRowLength = 0; regions[i].bufferImageHeight = 0; regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; regions[i].imageSubresource.mipLevel = 0; regions[i].imageSubresource.baseArrayLayer = 0; regions[i].imageSubresource.layerCount = 1; regions[i].imageOffset.x = 0; regions[i].imageOffset.y = 0; regions[i].imageOffset.z = i; regions[i].imageExtent.width = dst.data->width; regions[i].imageExtent.height = dst.data->height; regions[i].imageExtent.depth = 1; } vkCmdCopyBufferToImage(command_buffer, dst_staging.buffer(), dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, channels, regions); delete[] regions; } if (vkdev->info.unified_compute_transfer_queue()) { // image layout transform transfer-dst-optimal @ compute to shader-readonly-optimal @ compute { VkImageMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.image = dst.image(); barrier.subresourceRange.aspectMask = 
VK_IMAGE_ASPECT_COLOR_BIT; barrier.subresourceRange.baseMipLevel = 0; barrier.subresourceRange.levelCount = 1; barrier.subresourceRange.baseArrayLayer = 0; barrier.subresourceRange.layerCount = 1; VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); } } else { // queue ownership transfer transfer-write @ transfer to shader-read @ compute // release { VkImageMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; barrier.dstAccessMask = 0; barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index(); barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index(); barrier.image = dst.image(); barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; barrier.subresourceRange.baseMipLevel = 0; barrier.subresourceRange.levelCount = 1; barrier.subresourceRange.baseArrayLayer = 0; barrier.subresourceRange.layerCount = 1; VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; vkCmdPipelineBarrier(d->upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); } // acquire { VkImageMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barrier.pNext = 0; barrier.srcAccessMask = 0; barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index(); barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index(); barrier.image = dst.image(); 
barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
            barrier.subresourceRange.baseMipLevel = 0;
            barrier.subresourceRange.levelCount = 1;
            barrier.subresourceRange.baseArrayLayer = 0;
            barrier.subresourceRange.layerCount = 1;

            VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

            // acquire half of the queue family ownership transfer, recorded on the compute command buffer
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier);
        }
    }

    // mark device shader-readwrite @ compute
    dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT;
    dst.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
    dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

    // stash staging buffer so it stays alive until the recorded commands complete
    d->upload_staging_buffers.push_back(dst_staging);
}

// Submit all recorded upload commands and block until the GPU has executed them.
// Returns 0 on success, -1 when queue acquisition, submission, or the fence wait fails.
int VkTransfer::submit_and_wait()
{
    // NCNN_LOGE("submit_and_wait");

    // end command buffer
    {
        d->end_command_buffer();
    }

    VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
    if (compute_queue == 0)
    {
        NCNN_LOGE("out of compute queue");
        return -1;
    }

    if (vkdev->info.unified_compute_transfer_queue())
    {
        // single queue family: submit the compute command buffer alone,
        // signaled complete via compute_command_fence
        {
            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 0;
            submitInfo.pWaitSemaphores = 0;
            submitInfo.pWaitDstStageMask = 0;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->compute_command_buffer;
            submitInfo.signalSemaphoreCount = 0;
            submitInfo.pSignalSemaphores = 0;

            VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }
    }
    else
    {
        VkQueue transfer_queue = vkdev->acquire_queue(vkdev->info.transfer_queue_family_index());
        if (transfer_queue == 0)
        {
            NCNN_LOGE("out of transfer queue");
            // release the compute queue acquired above before bailing out
            vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
            return -1;
        }

        // submit upload; the compute submission below waits on upload_compute_semaphore
        {
            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 0;
            submitInfo.pWaitSemaphores = 0;
            submitInfo.pWaitDstStageMask = 0;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->upload_command_buffer;
            submitInfo.signalSemaphoreCount = 1;
            submitInfo.pSignalSemaphores = &d->upload_compute_semaphore;

            VkResult ret = vkQueueSubmit(transfer_queue, 1, &submitInfo, d->upload_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }
        // submit compute, gated on the upload semaphore
        {
            VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; // FIXME

            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 1;
            submitInfo.pWaitSemaphores = &d->upload_compute_semaphore;
            submitInfo.pWaitDstStageMask = &wait_dst_stage;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->compute_command_buffer;
            submitInfo.signalSemaphoreCount = 0;
            submitInfo.pSignalSemaphores = 0;

            VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }

        vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
    }

    vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);

    // wait on the fence(s); (uint64_t)-1 means no timeout
    if (vkdev->info.unified_compute_transfer_queue())
    {
        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &d->compute_command_fence, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }
    else
    {
        // wait for both the upload and the compute submissions
        VkFence fences[2] = {d->upload_command_fence, d->compute_command_fence};

        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }

    return 0;
}

} // namespace ncnn

#endif // NCNN_VULKAN