| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #include "command.h" |
| |
|
| | #if NCNN_VULKAN |
| |
|
| | #include "option.h" |
| | #include "pipeline.h" |
| |
|
| | namespace ncnn { |
| |
|
// Private implementation (pimpl) behind VkCompute.
// Owns the compute command pool / command buffer / fence and all bookkeeping
// for staged uploads, deferred downloads and delayed command replay.
class VkComputePrivate
{
public:
    VkComputePrivate(const VulkanDevice* _vkdev);
    ~VkComputePrivate();

    // Creates the command pool, primary command buffer and fence.
    int init();
    int begin_command_buffer();
    int end_command_buffer();

    // Borrowed device handle; not owned by this object.
    const VulkanDevice* vkdev;

    VkCommandPool compute_command_pool;

    VkCommandBuffer compute_command_buffer;

    // Signaled when a submission completes; waited on by submit_and_wait.
    VkFence compute_command_fence;

    // Host-visible staging buffers kept alive until the commands that read
    // them have been submitted and completed.
    std::vector<VkMat> upload_staging_buffers;
    // Parallel bookkeeping for deferred downloads: staging buffer, the fp16
    // (or raw) host mat it is copied into, and the final fp32 host mat.
    std::vector<VkMat> download_post_buffers;
    std::vector<Mat> download_post_mats_fp16;
    std::vector<Mat> download_post_mats;

    // Image memory blocks whose destruction is deferred until this command
    // object dies (command_refcount keeps them alive while still recorded).
    std::vector<VkImageMemory*> image_blocks_to_destroy;

    // Descriptor pools/sets allocated only when VK_KHR_push_descriptor is
    // unavailable; the two vectors are parallel (descriptor_pools[i] owns
    // descriptorsets[i]).
    std::vector<VkDescriptorPool> descriptor_pools;
    std::vector<VkDescriptorSet> descriptorsets;

    // One deferred operation. When push descriptors are not supported the
    // vkCmd* calls cannot be issued immediately, so they are queued here and
    // replayed in order at submit time. The TYPE_post_* entries are host-side
    // steps executed after the fence signals.
    struct record
    {
        enum
        {
            TYPE_copy_buffer,
            TYPE_copy_image,
            TYPE_copy_buffer_to_image,
            TYPE_copy_image_to_buffer,
            TYPE_bind_pipeline,
            TYPE_bind_descriptorsets,
            TYPE_push_constants,
            TYPE_dispatch,
            TYPE_memory_barrers,
            TYPE_buffer_barrers,
            TYPE_image_barrers,

#if NCNN_BENCHMARK
            TYPE_write_timestamp,
#endif

            TYPE_post_download,
            TYPE_post_cast_float16_to_float32,
        };

        int type;
        VkCommandBuffer command_buffer;

        // Payload for each record type; only the member matching `type` is
        // valid. Pointer members (regions/barriers/values) are heap arrays
        // owned by the record and freed after replay.
        union
        {
            struct
            {
                VkBuffer src;
                VkBuffer dst;
                uint32_t region_count;
                const VkBufferCopy* regions;
            } copy_buffer;
            struct
            {
                VkImage src;
                VkImageLayout src_layout;
                VkImage dst;
                VkImageLayout dst_layout;
                uint32_t region_count;
                const VkImageCopy* regions;
            } copy_image;
            struct
            {
                VkBuffer src;
                VkImage dst;
                VkImageLayout layout;
                uint32_t region_count;
                const VkBufferImageCopy* regions;
            } copy_buffer_to_image;
            struct
            {
                VkImage src;
                VkImageLayout layout;
                VkBuffer dst;
                uint32_t region_count;
                const VkBufferImageCopy* regions;
            } copy_image_to_buffer;

            struct
            {
                VkPipelineBindPoint bind_point;
                VkPipeline pipeline;
            } bind_pipeline;
            struct
            {
                VkPipelineBindPoint bind_point;
                VkPipelineLayout pipeline_layout;
                uint32_t descriptorset_count;
                // Index into the descriptorsets vector, not a Vulkan offset.
                uint32_t descriptorset_offset;
            } bind_descriptorsets;
            struct
            {
                VkPipelineLayout pipeline_layout;
                VkShaderStageFlags stage_flags;
                uint32_t size;
                const void* values;
            } push_constants;

            struct
            {
                uint32_t group_count_x;
                uint32_t group_count_y;
                uint32_t group_count_z;
            } dispatch;

            struct
            {
                VkPipelineStageFlags src_stage;
                VkPipelineStageFlags dst_stage;
                uint32_t barrier_count;
                const VkMemoryBarrier* barriers;
            } memory_barrers;
            struct
            {
                VkPipelineStageFlags src_stage;
                VkPipelineStageFlags dst_stage;
                uint32_t barrier_count;
                const VkBufferMemoryBarrier* barriers;
            } buffer_barrers;
            struct
            {
                VkPipelineStageFlags src_stage;
                VkPipelineStageFlags dst_stage;
                uint32_t barrier_count;
                const VkImageMemoryBarrier* barriers;
            } image_barrers;

#if NCNN_BENCHMARK
            struct
            {
                uint32_t query;
            } write_timestamp;
#endif

            struct
            {
                // Indices into download_post_buffers / download_post_mats_fp16.
                uint32_t download_post_buffer_mat_offset;
                uint32_t download_post_mat_fp16_offset;
            } post_download;
            struct
            {
                // Indices into download_post_mats_fp16 / download_post_mats.
                uint32_t download_post_mat_fp16_offset;
                uint32_t download_post_mat_offset;
                int num_threads;
            } post_cast_float16_to_float32;
        };
    };

    // Operations queued for replay at submit time (non-push-descriptor path)
    // plus host-side post steps.
    std::vector<record> delayed_records;

#if NCNN_BENCHMARK
    uint32_t query_count;
    VkQueryPool query_pool;
#endif
};
| |
|
// Zero all Vulkan handles before running init() so the destructor is safe to
// run even if creation fails partway through.
VkComputePrivate::VkComputePrivate(const VulkanDevice* _vkdev)
    : vkdev(_vkdev)
{
    compute_command_pool = 0;
    compute_command_buffer = 0;
    compute_command_fence = 0;

#if NCNN_BENCHMARK
    query_count = 0;
    query_pool = 0;
#endif

    // NOTE(review): init()'s return value is ignored; on failure the handles
    // stay null and later recording calls will misbehave — confirm callers
    // tolerate a partially-initialized instance.
    init();
}
| |
|
// Tears down deferred image blocks, descriptor objects and the command
// pool/buffer/fence. Destruction order matters: objects referenced by the
// command buffer are released before the buffer and pool themselves.
VkComputePrivate::~VkComputePrivate()
{
    for (size_t i = 0; i < image_blocks_to_destroy.size(); i++)
    {
        VkImageMemory* ptr = image_blocks_to_destroy[i];

        // Drop this command object's reference; destroy only when no user
        // reference remains and we held the last command reference.
        int old_command_refcount = NCNN_XADD(&ptr->command_refcount, -1);
        if (ptr->refcount == 0 && old_command_refcount == 1)
        {
            // no user reference and we were the last command reference
            vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
            vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

            delete ptr;
        }
        else
        {
            // still referenced elsewhere; the remaining holder frees it
        }
    }
    image_blocks_to_destroy.clear();

    // Descriptor pools/sets only exist on the non-push-descriptor path.
    if (!vkdev->info.support_VK_KHR_push_descriptor())
    {
        for (size_t i = 0; i < descriptorsets.size(); i++)
        {
            vkFreeDescriptorSets(vkdev->vkdevice(), descriptor_pools[i], 1, &descriptorsets[i]);
            vkDestroyDescriptorPool(vkdev->vkdevice(), descriptor_pools[i], 0);
        }
    }

#if NCNN_BENCHMARK
    if (query_pool)
    {
        // Reset the command buffer first so no recorded command still
        // references the query pool when it is destroyed.
        vkResetCommandBuffer(compute_command_buffer, 0);

        vkDestroyQueryPool(vkdev->vkdevice(), query_pool, 0);
    }
#endif

    vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0);

    vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer);
    vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0);
}
| |
|
// Creates the command pool, a primary command buffer and an unsignaled fence
// on the device's compute queue family. Returns 0 on success, -1 on the
// first Vulkan failure (earlier objects are left for the destructor).
int VkComputePrivate::init()
{
    // command pool
    {
        VkCommandPoolCreateInfo commandPoolCreateInfo;
        commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
        commandPoolCreateInfo.pNext = 0;
        // allow per-buffer vkResetCommandBuffer (used for reuse after submit)
        commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
        commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index();

        VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateCommandPool failed %d", ret);
            return -1;
        }
    }

    // command buffer
    {
        VkCommandBufferAllocateInfo commandBufferAllocateInfo;
        commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
        commandBufferAllocateInfo.pNext = 0;
        commandBufferAllocateInfo.commandPool = compute_command_pool;
        commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
        commandBufferAllocateInfo.commandBufferCount = 1;

        VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret);
            return -1;
        }
    }

    // fence
    {
        VkFenceCreateInfo fenceCreateInfo;
        fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
        fenceCreateInfo.pNext = 0;
        fenceCreateInfo.flags = 0;

        VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateFence failed %d", ret);
            return -1;
        }
    }

    // With push descriptors, commands are recorded immediately rather than
    // queued in delayed_records, so recording starts right away.
    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        begin_command_buffer();

#if NCNN_BENCHMARK
        if (query_pool)
            vkCmdResetQueryPool(compute_command_buffer, query_pool, 0, query_count);
#endif
    }

    return 0;
}
| |
|
| | int VkComputePrivate::begin_command_buffer() |
| | { |
| | VkCommandBufferBeginInfo commandBufferBeginInfo; |
| | commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; |
| | commandBufferBeginInfo.pNext = 0; |
| | commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; |
| | commandBufferBeginInfo.pInheritanceInfo = 0; |
| |
|
| | VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkBeginCommandBuffer failed %d", ret); |
| | return -1; |
| | } |
| |
|
| | return 0; |
| | } |
| |
|
| | int VkComputePrivate::end_command_buffer() |
| | { |
| | VkResult ret = vkEndCommandBuffer(compute_command_buffer); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkEndCommandBuffer failed %d", ret); |
| | return -1; |
| | } |
| |
|
| | return 0; |
| | } |
| |
|
// Public wrapper: allocate the private implementation bound to the device.
VkCompute::VkCompute(const VulkanDevice* _vkdev)
    : vkdev(_vkdev), d(new VkComputePrivate(_vkdev))
{
}
| |
|
// Releases the pimpl; all Vulkan cleanup happens in ~VkComputePrivate.
VkCompute::~VkCompute()
{
    delete d;
}
| |
|
// Record an upload of a host Mat into a device VkMat:
// optional host-side fp32->fp16 cast, copy into a mappable staging buffer,
// then a packing-conversion pass into the destination layout.
void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt)
{
    // Host cast fp32 -> fp16 when the data is fp32-sized and fp16 storage /
    // packing is requested. NOTE(review): type() == 0 presumably selects
    // discrete GPUs per ncnn convention — confirm against gpu.h.
    Mat src_fp16;
    if (src.elemsize == src.elempack * 4u)
    {
        if (vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)))
        {
            ncnn::cast_float32_to_float16(src, src_fp16, opt);
        }
        else
        {
            src_fp16 = src;
        }
    }
    else
    {
        // already fp16 (or other non-fp32 layout); upload as-is
        src_fp16 = src;
    }

    // upload to a host-mappable staging buffer
    VkMat dst_staging;
    dst_staging.create_like(src_fp16, opt.staging_vkallocator);
    if (dst_staging.empty())
        return;

    // keep the staging buffer alive until submission completes
    d->upload_staging_buffers.push_back(dst_staging);

    // memcpy into the mapped staging memory and flush for non-coherent heaps
    memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize);
    dst_staging.allocator->flush(dst_staging.data);

    // mark the buffer as last touched by a host write so the next consumer
    // inserts the right barrier
    dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT;
    dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;

    // resolve the destination element packing from the outermost dimension
    int dims = src_fp16.dims;
    int elemcount = 0;
    if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w;
    if (dims == 2) elemcount = src_fp16.elempack * src_fp16.h;
    if (dims == 3 || dims == 4) elemcount = src_fp16.elempack * src_fp16.c;

    int dst_elempack = 1;
    if (opt.use_shader_pack8)
        dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1;
    else
        dst_elempack = elemcount % 4 == 0 ? 4 : 1;

    // record the staging -> dst packing conversion on the device
    vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt);
}
| |
|
// Record an upload of a host Mat into a device VkImageMat. Same staging
// scheme as the buffer overload, with an extra buffer->image hop on devices
// whose buffer-image loads are broken.
void VkCompute::record_upload(const Mat& src, VkImageMat& dst, const Option& opt)
{
    // Host cast fp32 -> fp16 when requested (see buffer overload).
    Mat src_fp16;
    if (src.elemsize == src.elempack * 4u)
    {
        if (vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)))
        {
            ncnn::cast_float32_to_float16(src, src_fp16, opt);
        }
        else
        {
            src_fp16 = src;
        }
    }
    else
    {
        src_fp16 = src;
    }

    // upload into a host-mappable staging buffer
    VkMat dst_staging;
    dst_staging.create_like(src_fp16, opt.staging_vkallocator);
    if (dst_staging.empty())
        return;

    // keep the staging buffer alive until submission completes
    d->upload_staging_buffers.push_back(dst_staging);

    // copy + flush for non-coherent heaps
    memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize);
    dst_staging.allocator->flush(dst_staging.data);

    // record the host write so the next consumer inserts the right barrier
    dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT;
    dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;

    // resolve destination element packing from the outermost dimension
    int dims = src_fp16.dims;
    int elemcount = 0;
    if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w;
    if (dims == 2) elemcount = src_fp16.elempack * src_fp16.h;
    if (dims == 3 || dims == 4) elemcount = src_fp16.elempack * src_fp16.c;

    int dst_elempack = 1;
    if (opt.use_shader_pack8)
        dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1;
    else
        dst_elempack = elemcount % 4 == 0 ? 4 : 1;

    // Workaround path for drivers where reading a buffer in an image-convert
    // shader yields zeros: go buffer -> intermediate image -> converted image.
    if (vkdev->info.bug_buffer_image_load_zero())
    {
        VkImageMat dst_image;
        record_clone(dst_staging, dst_image, opt);
        if (dst_image.empty())
            return;

        vkdev->convert_packing(dst_image, dst, dst_elempack, *this, opt);

        // defer destruction of the intermediate image until this command
        // object is destroyed
        NCNN_XADD(&dst_image.data->command_refcount, 1);
        d->image_blocks_to_destroy.push_back(dst_image.data);

        // flush immediately on this workaround path
        submit_and_wait();
        reset();
    }
    else
    {
        vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt);
    }
}
| |
|
// Record a download of a device VkMat into a host Mat: pack-convert into a
// mappable staging buffer, barrier to host reads, then queue host-side
// post-download (memcpy) and optional fp16->fp32 cast records that run after
// the fence signals.
void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt)
{
    // resolve destination element packing from the outermost dimension
    int dims = src.dims;
    int elemcount = 0;
    if (dims == 1) elemcount = src.elempack * src.w;
    if (dims == 2) elemcount = src.elempack * src.h;
    if (dims == 3 || dims == 4) elemcount = src.elempack * src.c;

    int dst_elempack = 1;
    if (opt.use_packing_layout)
        dst_elempack = elemcount % 4 == 0 ? 4 : 1;
    else
        dst_elempack = 1;

    // On non-type-0 devices, disable fp16 so the conversion shader emits
    // fp32 directly into the staging buffer. If the blob allocator is not
    // host-mappable, stage through the dedicated staging allocator instead.
    Option opt_staging = opt;
    if (vkdev->info.type() != 0)
    {
        opt_staging.use_fp16_packed = false;
        opt_staging.use_fp16_storage = false;
    }
    if (!opt_staging.blob_vkallocator->mappable)
    {
        opt_staging.blob_vkallocator = opt.staging_vkallocator;
    }

    VkMat dst_staging;
    vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging);
    // NOTE(review): dst_staging.data is dereferenced below without an empty
    // check — presumably convert_packing cannot fail here; confirm.

    // barrier: make the device writes visible to host reads, unless the
    // buffer is already in host-read state
    if (dst_staging.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || dst_staging.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT)
    {
        // heap-allocated because the delayed-record path frees it at replay
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = dst_staging.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = dst_staging.buffer();
        barriers[0].offset = dst_staging.buffer_offset();
        barriers[0].size = dst_staging.buffer_capacity();

        VkPipelineStageFlags src_stage = dst_staging.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // immediate recording path
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // delayed path: barrier is replayed at submit time
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // record the new state so later consumers skip redundant barriers
        dst_staging.data->access_flags = VK_ACCESS_HOST_READ_BIT;
        dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
    }

    // host mat that receives the staged bytes after the fence signals
    Mat dst_fp16;
    dst_fp16.create_like(dst_staging, opt.blob_allocator);
    if (dst_fp16.empty())
        return;

    // keep both alive for the post-download step
    d->download_post_buffers.push_back(dst_staging);
    d->download_post_mats_fp16.push_back(dst_fp16);

    // queue host-side staging -> dst_fp16 memcpy
    {
        VkComputePrivate::record r;
        r.type = VkComputePrivate::record::TYPE_post_download;
        r.command_buffer = 0;
        r.post_download.download_post_buffer_mat_offset = d->download_post_buffers.size() - 1;
        r.post_download.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1;
        d->delayed_records.push_back(r);
    }

    // If the staged data is fp16 and fp16 was requested, allocate an fp32
    // dst and queue a host cast; otherwise dst aliases the staged mat.
    if (dst_fp16.elemsize == dst_fp16.elempack * 2u)
    {
        if (vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && dst_fp16.elempack % 4 == 0)))
        {
            int dims = dst_fp16.dims;
            if (dims == 1)
                dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dims == 2)
                dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dims == 3)
                dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dims == 4)
                dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);

            d->download_post_mats.push_back(dst);

            // queue host fp16 -> fp32 cast
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32;
            r.command_buffer = 0;
            r.post_cast_float16_to_float32.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1;
            r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1;
            r.post_cast_float16_to_float32.num_threads = opt.num_threads;
            d->delayed_records.push_back(r);
        }
        else
        {
            dst = dst_fp16;
        }
    }
    else
    {
        dst = dst_fp16;
    }
}
| |
|
// Record a download of a device VkImageMat into a host Mat. Mirrors the
// buffer overload, with an image -> buffer hop (and a driver workaround
// variant) before the host barrier and post-download records.
void VkCompute::record_download(const VkImageMat& src, Mat& dst, const Option& opt)
{
    // resolve destination element packing from the outermost dimension
    int dims = src.dims;
    int elemcount = 0;
    if (dims == 1) elemcount = src.elempack * src.w;
    if (dims == 2) elemcount = src.elempack * src.h;
    if (dims == 3 || dims == 4) elemcount = src.elempack * src.c;

    int dst_elempack = 1;
    if (opt.use_packing_layout)
        dst_elempack = elemcount % 4 == 0 ? 4 : 1;
    else
        dst_elempack = 1;

    // Disable fp16 on non-type-0 devices so fp32 lands in staging directly;
    // fall back to the staging allocator if the blob allocator is not
    // host-mappable.
    Option opt_staging = opt;
    if (vkdev->info.type() != 0)
    {
        opt_staging.use_fp16_packed = false;
        opt_staging.use_fp16_storage = false;
    }
    if (!opt_staging.blob_vkallocator->mappable)
    {
        opt_staging.blob_vkallocator = opt.staging_vkallocator;
    }

    VkMat dst_staging;
    if (vkdev->info.bug_buffer_image_load_zero())
    {
        // Workaround: convert within image space first, then clone the image
        // into the staging buffer.
        VkImageMat src_image;
        vkdev->convert_packing(src, src_image, dst_elempack, *this, opt);
        if (src_image.empty())
            return;

        record_clone(src_image, dst_staging, opt_staging);

        // defer destruction of the intermediate image
        NCNN_XADD(&src_image.data->command_refcount, 1);
        d->image_blocks_to_destroy.push_back(src_image.data);
    }
    else
    {
        vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging);
    }

    // defer destruction of the source image until this command object dies
    NCNN_XADD(&src.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(src.data);

    // barrier: make device writes visible to host reads, unless already in
    // host-read state
    if (dst_staging.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || dst_staging.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT)
    {
        // heap-allocated because the delayed-record path frees it at replay
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = dst_staging.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = dst_staging.buffer();
        barriers[0].offset = dst_staging.buffer_offset();
        barriers[0].size = dst_staging.buffer_capacity();

        VkPipelineStageFlags src_stage = dst_staging.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // immediate recording path
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // delayed path: barrier is replayed at submit time
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // record the new state so later consumers skip redundant barriers
        dst_staging.data->access_flags = VK_ACCESS_HOST_READ_BIT;
        dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
    }

    // host mat that receives the staged bytes after the fence signals
    Mat dst_fp16;
    dst_fp16.create_like(dst_staging, opt.blob_allocator);
    if (dst_fp16.empty())
        return;

    // keep both alive for the post-download step
    d->download_post_buffers.push_back(dst_staging);
    d->download_post_mats_fp16.push_back(dst_fp16);

    // queue host-side staging -> dst_fp16 memcpy
    {
        VkComputePrivate::record r;
        r.type = VkComputePrivate::record::TYPE_post_download;
        r.command_buffer = 0;
        r.post_download.download_post_buffer_mat_offset = d->download_post_buffers.size() - 1;
        r.post_download.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1;
        d->delayed_records.push_back(r);
    }

    // Queue a host fp16 -> fp32 cast when the staged data is fp16 and fp16
    // was requested; otherwise dst aliases the staged mat.
    if (dst_fp16.elemsize == dst_fp16.elempack * 2u)
    {
        if (vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && dst_fp16.elempack % 4 == 0)))
        {
            int dims = dst_fp16.dims;
            if (dims == 1)
                dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dims == 2)
                dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dims == 3)
                dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dims == 4)
                dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);

            d->download_post_mats.push_back(dst);

            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32;
            r.command_buffer = 0;
            r.post_cast_float16_to_float32.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1;
            r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1;
            r.post_cast_float16_to_float32.num_threads = opt.num_threads;
            d->delayed_records.push_back(r);
        }
        else
        {
            dst = dst_fp16;
        }
    }
    else
    {
        dst = dst_fp16;
    }
}
| |
|
// Record a conversion from device buffer storage to device image storage,
// choosing the destination packing and working around drivers whose
// buffer-to-image shader loads read zeros.
void VkCompute::record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt)
{
    // resolve destination element packing from the outermost dimension
    int dims = src.dims;
    int elemcount = 0;
    if (dims == 1) elemcount = src.elempack * src.w;
    if (dims == 2) elemcount = src.elempack * src.h;
    if (dims == 3 || dims == 4) elemcount = src.elempack * src.c;

    int dst_elempack = 1;
    if (opt.use_shader_pack8)
        dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1;
    else
        dst_elempack = elemcount % 4 == 0 ? 4 : 1;

    if (vkdev->info.bug_buffer_image_load_zero())
    {
        // Workaround: clone the buffer into an intermediate image first,
        // then convert image -> image.
        VkImageMat src_image;
        record_clone(src, src_image, opt);
        if (src_image.empty())
            return;

        vkdev->convert_packing(src_image, dst, dst_elempack, *this, opt);

        // defer destruction of the intermediate image
        NCNN_XADD(&src_image.data->command_refcount, 1);
        d->image_blocks_to_destroy.push_back(src_image.data);
    }
    else
    {
        vkdev->convert_packing(src, dst, dst_elempack, *this, opt);
    }
}
| |
|
// Record a conversion from device image storage to device buffer storage,
// choosing the destination packing and working around the buffer-image
// load-zero driver bug.
void VkCompute::record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt)
{
    // resolve destination element packing from the outermost dimension
    int dims = src.dims;
    int elemcount = 0;
    if (dims == 1) elemcount = src.elempack * src.w;
    if (dims == 2) elemcount = src.elempack * src.h;
    if (dims == 3 || dims == 4) elemcount = src.elempack * src.c;

    int dst_elempack = 1;
    if (opt.use_shader_pack8)
        dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1;
    else
        dst_elempack = elemcount % 4 == 0 ? 4 : 1;

    if (vkdev->info.bug_buffer_image_load_zero())
    {
        // Workaround: convert within image space, then clone image -> buffer.
        // The intermediate image reuses the source's allocator.
        VkImageMat src_image;
        Option opt_image = opt;
        opt_image.blob_vkallocator = src.allocator;
        vkdev->convert_packing(src, src_image, dst_elempack, *this, opt_image);
        if (src_image.empty())
            return;

        record_clone(src_image, dst, opt);

        // defer destruction of the intermediate image
        NCNN_XADD(&src_image.data->command_refcount, 1);
        d->image_blocks_to_destroy.push_back(src_image.data);
    }
    else
    {
        vkdev->convert_packing(src, dst, dst_elempack, *this, opt);
    }

    // defer destruction of the source image until this command object dies
    NCNN_XADD(&src.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(src.data);
}
| |
|
// Record a byte-exact clone of a host Mat into a device VkMat via a
// host-mappable staging buffer (no packing or dtype conversion).
void VkCompute::record_clone(const Mat& src, VkMat& dst, const Option& opt)
{
    // host-mappable staging buffer sized like src
    VkMat dst_staging;
    dst_staging.create_like(src, opt.staging_vkallocator);
    if (dst_staging.empty())
        return;

    // copy into mapped memory and flush for non-coherent heaps
    memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize);
    dst_staging.allocator->flush(dst_staging.data);

    // record the host write so the device-side copy inserts the right barrier
    dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT;
    dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;

    // record staging -> dst device copy
    record_clone(dst_staging, dst, opt);

    // keep the staging buffer alive until submission completes
    d->upload_staging_buffers.push_back(dst_staging);
}
| |
|
// Record a byte-exact clone of a host Mat into a device VkImageMat:
// host -> staging buffer -> image, reusing the other clone overloads.
void VkCompute::record_clone(const Mat& src, VkImageMat& dst, const Option& opt)
{
    // host -> staging buffer (allocated from the staging allocator)
    VkMat dst_staging;
    Option opt_staging = opt;
    opt_staging.blob_vkallocator = opt.staging_vkallocator;
    record_clone(src, dst_staging, opt_staging);

    // staging buffer -> image
    record_clone(dst_staging, dst, opt);

    // keep the staging buffer alive until submission completes
    d->upload_staging_buffers.push_back(dst_staging);
}
| |
|
// Record a byte-exact clone of a device VkMat into a host Mat. If src is
// not host-mappable it is first cloned into a mappable staging buffer; the
// actual host memcpy is deferred as a post_download record.
void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt)
{
    if (!src.allocator->mappable)
    {
        // device buffer -> mappable staging buffer
        VkMat src_staging;
        Option opt_staging = opt;
        opt_staging.blob_vkallocator = opt.staging_vkallocator;
        record_clone(src, src_staging, opt_staging);

        // staging buffer -> host mat (recurses into the mappable path)
        record_clone(src_staging, dst, opt);

        return;
    }

    // host destination sized like src
    dst.create_like(src, opt.blob_allocator);
    if (dst.empty())
        return;

    // barrier: make device writes visible to host reads, unless already in
    // host-read state
    if (src.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT)
    {
        // heap-allocated because the delayed-record path frees it at replay
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = src.buffer();
        barriers[0].offset = src.buffer_offset();
        barriers[0].size = src.buffer_capacity();

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // immediate recording path
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // delayed path: barrier is replayed at submit time
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // record the new state so later consumers skip redundant barriers
        src.data->access_flags = VK_ACCESS_HOST_READ_BIT;
        src.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
    }

    // keep both alive for the deferred host memcpy
    d->download_post_buffers.push_back(src);
    d->download_post_mats_fp16.push_back(dst);

    // queue host-side src -> dst memcpy, executed after the fence signals
    {
        VkComputePrivate::record r;
        r.type = VkComputePrivate::record::TYPE_post_download;
        r.command_buffer = 0;
        r.post_download.download_post_buffer_mat_offset = d->download_post_buffers.size() - 1;
        r.post_download.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1;
        d->delayed_records.push_back(r);
    }
}
| |
|
// Record a byte-exact clone of a device VkImageMat into a host Mat:
// image -> staging buffer -> host, reusing the other clone overloads.
void VkCompute::record_clone(const VkImageMat& src, Mat& dst, const Option& opt)
{
    // image -> mappable staging buffer
    VkMat src_staging;
    Option opt_staging = opt;
    opt_staging.blob_vkallocator = opt.staging_vkallocator;
    record_clone(src, src_staging, opt_staging);

    // staging buffer -> host mat
    record_clone(src_staging, dst, opt);
}
| |
|
// Record a device buffer -> device buffer clone: barrier src into
// transfer-read state, mark dst as transfer-write, then vkCmdCopyBuffer
// (immediately or as a delayed record).
void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt)
{
    // device destination sized like src
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT)
    {
        // barrier: bring src into transfer-read state for the copy
        // (heap-allocated because the delayed-record path frees it at replay)
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = src.buffer();
        barriers[0].offset = src.buffer_offset();
        barriers[0].size = src.buffer_capacity();

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // immediate recording path
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // delayed path: barrier is replayed at submit time
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // record the new state so later consumers skip redundant barriers
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    {
        // mark dst as written by transfer so its next consumer barriers
        // correctly (freshly created buffer needs no barrier itself)
        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // record the copy itself
    {
        // heap-allocated because the delayed-record path frees it at replay;
        // copy is clamped to the smaller of the two capacities
        VkBufferCopy* regions = new VkBufferCopy[1];
        regions[0].srcOffset = src.buffer_offset();
        regions[0].dstOffset = dst.buffer_offset();
        regions[0].size = std::min(src.buffer_capacity(), dst.buffer_capacity());

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdCopyBuffer(d->compute_command_buffer, src.buffer(), dst.buffer(), 1, regions);
            delete[] regions;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_buffer;
            r.command_buffer = d->compute_command_buffer;
            r.copy_buffer.src = src.buffer();
            r.copy_buffer.dst = dst.buffer();
            r.copy_buffer.region_count = 1;
            r.copy_buffer.regions = regions;
            d->delayed_records.push_back(r);
        }
    }
}
| |
|
// Record commands that clone one GPU image blob into a freshly allocated one.
// dst is created with the same geometry/packing as src via opt.blob_vkallocator;
// on allocation failure the function returns early with dst left empty.
// All commands go into d->compute_command_buffer immediately when
// VK_KHR_push_descriptor is available, otherwise they are queued as delayed
// records (the heap-allocated barrier/region arrays are then owned by the
// record and presumably freed when the delayed records are replayed — the
// replay code is outside this chunk).
void VkCompute::record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt)
{
    // allocate dst with the same shape as src
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // Transition src to TRANSFER_SRC_OPTIMAL readable by the transfer stage,
    // unless it is already in exactly that state (read access, src-optimal
    // layout, transfer stage).
    if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->image_layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT)
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].oldLayout = src.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // record now; barrier array no longer needed afterwards
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            // defer: ownership of barriers transfers to the delayed record
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // track the new state on the blob so later barriers can chain from it
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // Transition the freshly created dst from UNDEFINED to TRANSFER_DST_OPTIMAL.
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = dst.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // Single whole-image copy region (one mip, one layer, full extent).
    {
        VkImageCopy* regions = new VkImageCopy[1];
        regions[0].srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        regions[0].srcSubresource.mipLevel = 0;
        regions[0].srcSubresource.baseArrayLayer = 0;
        regions[0].srcSubresource.layerCount = 1;
        regions[0].srcOffset.x = 0;
        regions[0].srcOffset.y = 0;
        regions[0].srcOffset.z = 0;
        regions[0].dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        regions[0].dstSubresource.mipLevel = 0;
        regions[0].dstSubresource.baseArrayLayer = 0;
        regions[0].dstSubresource.layerCount = 1;
        regions[0].dstOffset.x = 0;
        regions[0].dstOffset.y = 0;
        regions[0].dstOffset.z = 0;
        regions[0].extent.width = src.data->width;
        regions[0].extent.height = src.data->height;
        regions[0].extent.depth = src.data->depth;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdCopyImage(d->compute_command_buffer, src.image(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, regions);
            delete[] regions;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_image;
            r.command_buffer = d->compute_command_buffer;
            r.copy_image.src = src.image();
            r.copy_image.src_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
            r.copy_image.dst = dst.image();
            r.copy_image.dst_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
            r.copy_image.region_count = 1;
            r.copy_image.regions = regions;
            d->delayed_records.push_back(r);
        }
    }

    // Keep both image blocks alive until command execution finishes: bump the
    // refcount and queue them for destruction after submission.
    NCNN_XADD(&src.data->command_refcount, 1);
    NCNN_XADD(&dst.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(src.data);
    d->image_blocks_to_destroy.push_back(dst.data);
}
| |
|
// Record commands that clone a GPU buffer blob into a freshly allocated GPU
// image blob. dst is created via opt.blob_vkallocator; on allocation failure
// the function returns early with dst left empty. Commands are recorded
// immediately when VK_KHR_push_descriptor is supported, otherwise queued as
// delayed records which take ownership of the heap-allocated arrays.
void VkCompute::record_clone(const VkMat& src, VkImageMat& dst, const Option& opt)
{
    // allocate dst image with the same shape as the src buffer blob
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // Make the src buffer readable by the transfer stage, unless it is
    // already in that state (no pending shader write, transfer stage).
    if (src.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = src.buffer();
        barriers[0].offset = src.buffer_offset();
        barriers[0].size = src.buffer_capacity();

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // defer: the delayed record takes ownership of barriers
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // track the new state on the blob for subsequent barrier chaining
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // Transition the freshly created dst image UNDEFINED -> TRANSFER_DST_OPTIMAL.
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = dst.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // Copy buffer -> image. When the per-channel byte size is a 16-byte
    // multiple, the channels appear tightly packed in the buffer and one 3D
    // region covers the whole blob; otherwise copy channel-by-channel at
    // cstep strides, one depth slice per region.
    // NOTE(review): the %16 test presumably mirrors the allocator's cstep
    // alignment so the single-region path is only taken when cstep adds no
    // padding — confirm against the allocator.
    {
        int region_count;
        VkBufferImageCopy* regions;
        if (dst.elemsize * dst.w * dst.h % 16 == 0)
        {
            region_count = 1;
            regions = new VkBufferImageCopy[1];
            regions[0].bufferOffset = src.buffer_offset();
            regions[0].bufferRowLength = 0;
            regions[0].bufferImageHeight = 0;
            regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
            regions[0].imageSubresource.mipLevel = 0;
            regions[0].imageSubresource.baseArrayLayer = 0;
            regions[0].imageSubresource.layerCount = 1;
            regions[0].imageOffset.x = 0;
            regions[0].imageOffset.y = 0;
            regions[0].imageOffset.z = 0;
            regions[0].imageExtent.width = dst.data->width;
            regions[0].imageExtent.height = dst.data->height;
            regions[0].imageExtent.depth = dst.data->depth;
        }
        else
        {
            region_count = dst.c;
            regions = new VkBufferImageCopy[region_count];
            for (int i = 0; i < region_count; i++)
            {
                regions[i].bufferOffset = src.buffer_offset() + src.cstep * src.elemsize * i;
                regions[i].bufferRowLength = 0;
                regions[i].bufferImageHeight = 0;
                regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
                regions[i].imageSubresource.mipLevel = 0;
                regions[i].imageSubresource.baseArrayLayer = 0;
                regions[i].imageSubresource.layerCount = 1;
                regions[i].imageOffset.x = 0;
                regions[i].imageOffset.y = 0;
                regions[i].imageOffset.z = i;
                regions[i].imageExtent.width = dst.data->width;
                regions[i].imageExtent.height = dst.data->height;
                regions[i].imageExtent.depth = 1;
            }
        }

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdCopyBufferToImage(d->compute_command_buffer, src.buffer(), dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, region_count, regions);
            delete[] regions;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_buffer_to_image;
            r.command_buffer = d->compute_command_buffer;
            r.copy_buffer_to_image.src = src.buffer();
            r.copy_buffer_to_image.dst = dst.image();
            r.copy_buffer_to_image.layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
            r.copy_buffer_to_image.region_count = region_count;
            r.copy_buffer_to_image.regions = regions;
            d->delayed_records.push_back(r);
        }
    }

    // Keep the dst image block alive until command execution finishes.
    NCNN_XADD(&dst.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(dst.data);
}
| |
|
// Record commands that clone a GPU image blob into a freshly allocated GPU
// buffer blob. dst is created via opt.blob_vkallocator; on allocation failure
// the function returns early with dst left empty. Mirrors the buffer->image
// clone above with the transfer direction reversed.
void VkCompute::record_clone(const VkImageMat& src, VkMat& dst, const Option& opt)
{
    // allocate dst buffer with the same shape as the src image blob
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // Transition src to TRANSFER_SRC_OPTIMAL readable by the transfer stage,
    // unless it is already in exactly that state.
    if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->image_layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT)
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].oldLayout = src.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            // defer: the delayed record takes ownership of barriers
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // track the new state on the blob for subsequent barrier chaining
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // The dst buffer is brand new, so no barrier is recorded here — only its
    // tracked state is primed to transfer-write @ transfer for later chaining.
    {
        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // Copy image -> buffer. One 3D region when per-channel bytes are a
    // 16-byte multiple (channels tightly packed in the buffer); otherwise one
    // region per channel at cstep strides (see the buffer->image clone above).
    {
        int region_count;
        VkBufferImageCopy* regions;
        if (src.elemsize * src.w * src.h % 16 == 0)
        {
            region_count = 1;
            regions = new VkBufferImageCopy[1];
            regions[0].bufferOffset = dst.buffer_offset();
            regions[0].bufferRowLength = 0;
            regions[0].bufferImageHeight = 0;
            regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
            regions[0].imageSubresource.mipLevel = 0;
            regions[0].imageSubresource.baseArrayLayer = 0;
            regions[0].imageSubresource.layerCount = 1;
            regions[0].imageOffset.x = 0;
            regions[0].imageOffset.y = 0;
            regions[0].imageOffset.z = 0;
            regions[0].imageExtent.width = src.data->width;
            regions[0].imageExtent.height = src.data->height;
            regions[0].imageExtent.depth = src.data->depth;
        }
        else
        {
            region_count = src.c;
            regions = new VkBufferImageCopy[region_count];
            for (int i = 0; i < region_count; i++)
            {
                regions[i].bufferOffset = dst.buffer_offset() + dst.cstep * dst.elemsize * i;
                regions[i].bufferRowLength = 0;
                regions[i].bufferImageHeight = 0;
                regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
                regions[i].imageSubresource.mipLevel = 0;
                regions[i].imageSubresource.baseArrayLayer = 0;
                regions[i].imageSubresource.layerCount = 1;
                regions[i].imageOffset.x = 0;
                regions[i].imageOffset.y = 0;
                regions[i].imageOffset.z = i;
                regions[i].imageExtent.width = src.data->width;
                regions[i].imageExtent.height = src.data->height;
                regions[i].imageExtent.depth = 1;
            }
        }

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdCopyImageToBuffer(d->compute_command_buffer, src.image(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst.buffer(), region_count, regions);
            delete[] regions;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_image_to_buffer;
            r.command_buffer = d->compute_command_buffer;
            r.copy_image_to_buffer.src = src.image();
            r.copy_image_to_buffer.layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
            r.copy_image_to_buffer.dst = dst.buffer();
            r.copy_image_to_buffer.region_count = region_count;
            r.copy_image_to_buffer.regions = regions;
            d->delayed_records.push_back(r);
        }
    }

    // Keep the src image block alive until command execution finishes.
    NCNN_XADD(&src.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(src.data);
}
| |
|
// Convenience overload: dispatch with buffer bindings only (no image
// bindings), sized by a VkMat dispatcher.
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher)
{
    record_pipeline(pipeline, bindings, std::vector<VkImageMat>(), constants, dispatcher);
}
| |
|
// Convenience overload: dispatch with image bindings only (no buffer
// bindings), sized by a VkImageMat dispatcher.
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher)
{
    record_pipeline(pipeline, std::vector<VkMat>(), bindings, constants, dispatcher);
}
| |
|
| | void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher) |
| | { |
| | Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.d, dispatcher.c, (void*)0); |
| |
|
| | record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher_mat); |
| | } |
| |
|
| | void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher) |
| | { |
| | Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.d, dispatcher.c, (void*)0); |
| |
|
| | record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher_mat); |
| | } |
| |
|
// Core dispatch recorder: emits barriers for every binding, binds the compute
// pipeline, updates/pushes descriptors, pushes constants and records the
// vkCmdDispatch. When VK_KHR_push_descriptor is unavailable everything is
// queued as delayed records instead and a per-dispatch descriptor pool/set is
// allocated. The dispatcher Mat only supplies w/h/d/c extents.
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher)
{
    const int buffer_binding_count = (int)buffer_bindings.size();
    const int image_binding_count = (int)image_bindings.size();
    const int constant_count = (int)constants.size();

    const int binding_count = buffer_binding_count + image_binding_count;
    const ShaderInfo& shader_info = pipeline->shader_info();

    // sanity checks against the shader's reflected interface — warn only,
    // recording continues regardless
    if (binding_count != shader_info.binding_count)
    {
        NCNN_LOGE("binding_count not match, expect %d but got %d + %d", shader_info.binding_count, buffer_binding_count, image_binding_count);
    }

    if (constant_count != shader_info.push_constant_count)
    {
        NCNN_LOGE("push_constant_count not match, expect %d but got %d", shader_info.push_constant_count, constant_count);
    }

    // Emit a barrier per binding according to its reflected type:
    //   1 = storage buffer, 2 = storage image, otherwise combined image sampler.
    // Empty bindings fall back to the device's dummy buffer/image.
    int buffer_index = 0;
    int image_index = 0;
    for (int i = 0; i < binding_count; i++)
    {
        int binding_type = shader_info.binding_types[i];

        if (binding_type == 1)
        {
            const VkMat& binding = buffer_bindings[buffer_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[buffer_index];
            buffer_index++;

            barrier_readwrite(binding);
        }
        else if (binding_type == 2)
        {
            const VkImageMat& binding = image_bindings[image_index].empty() ? vkdev->get_dummy_image() : image_bindings[image_index];
            image_index++;

            barrier_readwrite(binding);

            // keep the image block alive until command execution finishes
            NCNN_XADD(&binding.data->command_refcount, 1);
            d->image_blocks_to_destroy.push_back(binding.data);
        }
        else
        {
            const VkImageMat& binding = image_bindings[image_index].empty() ? vkdev->get_dummy_image_readonly() : image_bindings[image_index];
            image_index++;

            // If the same image is also bound as a storage image in this
            // dispatch, skip the read-only barrier so its layout stays usable
            // for read-write access.
            // NOTE(review): j iterates over image_binding_count but indexes
            // shader_info.binding_types[j], which is indexed by overall
            // binding slot — when buffer and image bindings are mixed these
            // indices diverge; confirm this is intentional upstream.
            bool image_read_write = false;
            for (int j = 0; j < image_binding_count; j++)
            {
                if (shader_info.binding_types[j] == 2 && binding.data == image_bindings[j].data)
                {
                    image_read_write = true;
                    break;
                }
            }
            if (image_read_write)
                continue;

            barrier_readonly(binding);

            // keep the image block alive until command execution finishes
            NCNN_XADD(&binding.data->command_refcount, 1);
            d->image_blocks_to_destroy.push_back(binding.data);
        }
    }

    // bind the compute pipeline (immediately or as a delayed record)
    {
        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline());
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_pipeline;
            r.command_buffer = d->compute_command_buffer;
            r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_pipeline.pipeline = pipeline->pipeline();
            d->delayed_records.push_back(r);
        }
    }

    // update descriptors
    if (binding_count > 0)
    {
        // Pack descriptor infos into one contiguous byte blob in binding
        // order, matching the pipeline's descriptor update template layout:
        // VkDescriptorBufferInfo for buffers, VkDescriptorImageInfo otherwise.
        std::vector<unsigned char> descriptorInfos;
        {
            descriptorInfos.resize(sizeof(VkDescriptorBufferInfo) * buffer_binding_count + sizeof(VkDescriptorImageInfo) * image_binding_count);

            unsigned char* p_descriptorInfos = descriptorInfos.data();
            int descriptorBufferInfo_index = 0;
            int descriptorImageInfo_index = 0;
            for (int i = 0; i < binding_count; i++)
            {
                int binding_type = shader_info.binding_types[i];

                if (binding_type == 1)
                {
                    const VkMat& binding = buffer_bindings[descriptorBufferInfo_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[descriptorBufferInfo_index];
                    descriptorBufferInfo_index++;

                    VkDescriptorBufferInfo descriptorBufferInfo;
                    descriptorBufferInfo.buffer = binding.buffer();
                    descriptorBufferInfo.offset = binding.buffer_offset();
                    descriptorBufferInfo.range = binding.total() * binding.elemsize;

                    memcpy(p_descriptorInfos, &descriptorBufferInfo, sizeof(VkDescriptorBufferInfo));
                    p_descriptorInfos += sizeof(VkDescriptorBufferInfo);
                }
                else
                {
                    // NOTE(review): this fallback uses get_dummy_image() for
                    // both type 2 and the sampler type, while the barrier loop
                    // above uses get_dummy_image_readonly() for samplers —
                    // confirm intentional.
                    const VkImageMat& binding = image_bindings[descriptorImageInfo_index].empty() ? vkdev->get_dummy_image() : image_bindings[descriptorImageInfo_index];
                    descriptorImageInfo_index++;

                    // sampler left null — presumably immutable samplers in the
                    // descriptor set layout cover the sampler bindings; confirm
                    VkDescriptorImageInfo descriptorImageInfo;
                    descriptorImageInfo.sampler = 0;
                    descriptorImageInfo.imageView = binding.imageview();
                    descriptorImageInfo.imageLayout = binding.data->image_layout;

                    memcpy(p_descriptorInfos, &descriptorImageInfo, sizeof(VkDescriptorImageInfo));
                    p_descriptorInfos += sizeof(VkDescriptorImageInfo);
                }
            }
        }

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // fast path: push the packed blob straight into the command buffer
            vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, descriptorInfos.data());
        }
        else
        {
            // Slow path: allocate a dedicated descriptor pool + set for this
            // dispatch, fill it, then queue a delayed bind. Pool and set are
            // retained in d-> for later cleanup.
            VkDescriptorPool descriptor_pool;
            {
                // note: shadows the outer image_binding_count on purpose —
                // here it counts storage-image bindings only
                int image_binding_count = 0;
                int sampler_binding_count = 0;
                for (int i = 0; i < binding_count; i++)
                {
                    int binding_type = shader_info.binding_types[i];

                    if (binding_type == 2)
                        image_binding_count++;
                    else
                        sampler_binding_count++;
                }

                VkDescriptorPoolSize poolSizes[3];
                poolSizes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                poolSizes[0].descriptorCount = buffer_binding_count;
                poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                poolSizes[1].descriptorCount = image_binding_count;
                poolSizes[2].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                poolSizes[2].descriptorCount = sampler_binding_count;

                VkDescriptorPoolCreateInfo descriptorPoolCreateInfo;
                descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
                descriptorPoolCreateInfo.pNext = 0;
                descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
                descriptorPoolCreateInfo.maxSets = 1;
                descriptorPoolCreateInfo.poolSizeCount = 3;
                descriptorPoolCreateInfo.pPoolSizes = poolSizes;

                VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkCreateDescriptorPool failed %d", ret);
                    return;
                }
            }
            d->descriptor_pools.push_back(descriptor_pool);

            VkDescriptorSet descriptorset;
            {
                VkDescriptorSetLayout descriptorset_layout = pipeline->descriptorset_layout();

                VkDescriptorSetAllocateInfo descriptorSetAllocateInfo;
                descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
                descriptorSetAllocateInfo.pNext = 0;
                descriptorSetAllocateInfo.descriptorPool = descriptor_pool;
                descriptorSetAllocateInfo.descriptorSetCount = 1;
                descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout;

                VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkAllocateDescriptorSets failed %d", ret);
                    return;
                }
            }
            d->descriptorsets.push_back(descriptorset);

            if (vkdev->info.support_VK_KHR_descriptor_update_template())
            {
                // template update consumes the same packed blob as the push path
                vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template(), descriptorInfos.data());
            }
            else
            {
                // manual fallback: one VkWriteDescriptorSet per binding,
                // pointing into the packed blob (which outlives this call —
                // vkUpdateDescriptorSets is invoked before the vector dies)
                std::vector<VkWriteDescriptorSet> writeDescriptorSets(binding_count);
                {
                    const unsigned char* p_descriptorInfos = descriptorInfos.data();
                    for (int i = 0; i < binding_count; i++)
                    {
                        int binding_type = shader_info.binding_types[i];

                        writeDescriptorSets[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                        writeDescriptorSets[i].pNext = 0;
                        writeDescriptorSets[i].dstSet = descriptorset;
                        writeDescriptorSets[i].dstBinding = i;
                        writeDescriptorSets[i].dstArrayElement = 0;
                        writeDescriptorSets[i].descriptorCount = 1;
                        writeDescriptorSets[i].pTexelBufferView = 0;

                        if (binding_type == 1)
                        {
                            writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                            writeDescriptorSets[i].pImageInfo = 0;
                            writeDescriptorSets[i].pBufferInfo = (const VkDescriptorBufferInfo*)p_descriptorInfos;

                            p_descriptorInfos += sizeof(VkDescriptorBufferInfo);
                        }
                        else if (binding_type == 2)
                        {
                            writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                            writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos;
                            writeDescriptorSets[i].pBufferInfo = 0;

                            p_descriptorInfos += sizeof(VkDescriptorImageInfo);
                        }
                        else
                        {
                            writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                            writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos;
                            writeDescriptorSets[i].pBufferInfo = 0;

                            p_descriptorInfos += sizeof(VkDescriptorImageInfo);
                        }
                    }
                }

                vkUpdateDescriptorSets(vkdev->vkdevice(), binding_count, writeDescriptorSets.data(), 0, 0);
            }

            // the delayed bind references the set by its index in d->descriptorsets
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_descriptorsets;
            r.command_buffer = d->compute_command_buffer;
            r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout();
            r.bind_descriptorsets.descriptorset_count = 1;
            r.bind_descriptorsets.descriptorset_offset = d->descriptorsets.size() - 1;
            d->delayed_records.push_back(r);
        }
    }

    // push constants (the delayed record owns its heap copy of the values)
    if (constant_count > 0)
    {
        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPushConstants(d->compute_command_buffer, pipeline->pipeline_layout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data());
        }
        else
        {
            uint32_t size = constant_count * sizeof(vk_constant_type);
            unsigned char* constant_values = new unsigned char[size];
            memcpy(constant_values, constants.data(), size);

            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_push_constants;
            r.command_buffer = d->compute_command_buffer;
            r.push_constants.pipeline_layout = pipeline->pipeline_layout();
            r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT;
            r.push_constants.size = size;
            r.push_constants.values = constant_values;
            d->delayed_records.push_back(r);
        }
    }

    // dispatch: ceil-divide the dispatcher extents by the pipeline's local
    // workgroup size; h and d are folded together into the y dimension
    {
        uint32_t group_count_x = (dispatcher.w + pipeline->local_size_x() - 1) / pipeline->local_size_x();
        uint32_t group_count_y = (dispatcher.h * (dispatcher.d ? dispatcher.d : 1) + pipeline->local_size_y() - 1) / pipeline->local_size_y();
        uint32_t group_count_z = (dispatcher.c + pipeline->local_size_z() - 1) / pipeline->local_size_z();

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_dispatch;
            r.command_buffer = d->compute_command_buffer;
            r.dispatch.group_count_x = group_count_x;
            r.dispatch.group_count_y = group_count_y;
            r.dispatch.group_count_z = group_count_z;
            d->delayed_records.push_back(r);
        }
    }
}
| |
|
#if NCNN_BENCHMARK
// Record a timestamp write at the compute-shader stage into query slot
// `query` of the private query pool, for benchmarking. Queued as a delayed
// record when push descriptors are unavailable.
void VkCompute::record_write_timestamp(uint32_t query)
{
    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        // silently skipped when no query pool was created
        if (d->query_pool)
            vkCmdWriteTimestamp(d->compute_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, d->query_pool, query);
    }
    else
    {
        // NOTE(review): no query_pool check here — presumably the replay of
        // delayed records performs it; confirm against the replay code
        VkComputePrivate::record r;
        r.type = VkComputePrivate::record::TYPE_write_timestamp;
        r.command_buffer = d->compute_command_buffer;
        r.write_timestamp.query = query;
        d->delayed_records.push_back(r);
    }
}
#endif
| |
|
| | #if NCNN_PLATFORM_API |
| | #if __ANDROID_API__ >= 26 |
void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst)
{
    // Record a compute pass that samples the imported AHardwareBuffer image (src)
    // and writes the converted result into a plain device buffer (dst).
    // Every stage below has two paths: with VK_KHR_push_descriptor the commands are
    // recorded into the command buffer immediately; without it they are queued as
    // delayed records and replayed by submit_and_wait().

    // image layout transition: UNDEFINED -> SHADER_READ_ONLY_OPTIMAL for src
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            // recorded immediately, so the heap array can be released here
            delete[] barriers;
        }
        else
        {
            // ownership of the barriers array moves into the delayed record;
            // submit_and_wait() performs the matching delete[]
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }
    }

    // bind the conversion compute pipeline
    {
        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline());
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_pipeline;
            r.command_buffer = d->compute_command_buffer;
            r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_pipeline.pipeline = pipeline->pipeline();
            d->delayed_records.push_back(r);
        }
    }

    // update descriptorset: binding 0 = combined image sampler (src),
    // bindings 1 and 2 = the same dst storage buffer (full view and "buffer4" view)
    {
        VkDescriptorImageInfo descriptorImageInfo;
        descriptorImageInfo.sampler = pipeline->sampler;
        descriptorImageInfo.imageView = src.imageview();
        descriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;

        VkDescriptorBufferInfo descriptorBufferInfo;
        descriptorBufferInfo.buffer = dst.buffer();
        descriptorBufferInfo.offset = dst.buffer_offset();
        descriptorBufferInfo.range = dst.total() * dst.elemsize;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // layout must match the pipeline's descriptor update template
            struct ImportAndroidHardwareBufferDescriptorInfo
            {
                VkDescriptorImageInfo imageInfo;
                VkDescriptorBufferInfo bufferInfo;
                VkDescriptorBufferInfo buffer4Info;
            };

            ImportAndroidHardwareBufferDescriptorInfo info;
            info.imageInfo = descriptorImageInfo;
            info.bufferInfo = descriptorBufferInfo;
            info.buffer4Info = descriptorBufferInfo;

            vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, &info);
        }
        else
        {
            // allocate a single-use descriptor pool (1 sampler + 2 storage buffers)
            VkDescriptorPool descriptor_pool;
            {
                VkDescriptorPoolSize poolSizes[2];
                poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                poolSizes[0].descriptorCount = 1;
                poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                poolSizes[1].descriptorCount = 2;

                VkDescriptorPoolCreateInfo descriptorPoolCreateInfo;
                descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
                descriptorPoolCreateInfo.pNext = 0;
                descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
                descriptorPoolCreateInfo.maxSets = 1;
                descriptorPoolCreateInfo.poolSizeCount = 2;
                descriptorPoolCreateInfo.pPoolSizes = poolSizes;

                VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkCreateDescriptorPool failed %d", ret);
                    return;
                }
            }
            // pool is retained so reset() can destroy it after submission
            d->descriptor_pools.push_back(descriptor_pool);

            VkDescriptorSet descriptorset;
            {
                VkDescriptorSetLayout descriptorset_layout = pipeline->descriptorset_layout();

                VkDescriptorSetAllocateInfo descriptorSetAllocateInfo;
                descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
                descriptorSetAllocateInfo.pNext = 0;
                descriptorSetAllocateInfo.descriptorPool = descriptor_pool;
                descriptorSetAllocateInfo.descriptorSetCount = 1;
                descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout;

                VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkAllocateDescriptorSets failed %d", ret);
                    return;
                }
            }
            d->descriptorsets.push_back(descriptorset);

            if (vkdev->info.support_VK_KHR_descriptor_update_template())
            {
                // same template layout as the push-descriptor path above
                struct ImportAndroidHardwareBufferDescriptorInfo
                {
                    VkDescriptorImageInfo imageInfo;
                    VkDescriptorBufferInfo bufferInfo;
                    VkDescriptorBufferInfo buffer4Info;
                };

                ImportAndroidHardwareBufferDescriptorInfo info;
                info.imageInfo = descriptorImageInfo;
                info.bufferInfo = descriptorBufferInfo;
                info.buffer4Info = descriptorBufferInfo;

                vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template(), &info);
            }
            else
            {
                // plain descriptor writes: bindings 0..2 as described above
                VkWriteDescriptorSet writeDescriptorSets[3];
                writeDescriptorSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[0].pNext = 0;
                writeDescriptorSets[0].dstSet = descriptorset;
                writeDescriptorSets[0].dstBinding = 0;
                writeDescriptorSets[0].dstArrayElement = 0;
                writeDescriptorSets[0].descriptorCount = 1;
                writeDescriptorSets[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                writeDescriptorSets[0].pImageInfo = &descriptorImageInfo;
                writeDescriptorSets[0].pBufferInfo = 0;
                writeDescriptorSets[0].pTexelBufferView = 0;
                writeDescriptorSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[1].pNext = 0;
                writeDescriptorSets[1].dstSet = descriptorset;
                writeDescriptorSets[1].dstBinding = 1;
                writeDescriptorSets[1].dstArrayElement = 0;
                writeDescriptorSets[1].descriptorCount = 1;
                writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSets[1].pImageInfo = 0;
                writeDescriptorSets[1].pBufferInfo = &descriptorBufferInfo;
                writeDescriptorSets[1].pTexelBufferView = 0;
                writeDescriptorSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[2].pNext = 0;
                writeDescriptorSets[2].dstSet = descriptorset;
                writeDescriptorSets[2].dstBinding = 2;
                writeDescriptorSets[2].dstArrayElement = 0;
                writeDescriptorSets[2].descriptorCount = 1;
                writeDescriptorSets[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSets[2].pImageInfo = 0;
                writeDescriptorSets[2].pBufferInfo = &descriptorBufferInfo;
                writeDescriptorSets[2].pTexelBufferView = 0;

                vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0);
            }

            // defer the actual vkCmdBindDescriptorSets to submit_and_wait();
            // the set is addressed by its offset into d->descriptorsets
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_descriptorsets;
            r.command_buffer = d->compute_command_buffer;
            r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout();
            r.bind_descriptorsets.descriptorset_count = 1;
            r.bind_descriptorsets.descriptorset_offset = d->descriptorsets.size() - 1;
            d->delayed_records.push_back(r);
        }
    }

    // record dispatch, one workgroup per local_size tile of dst
    {
        uint32_t group_count_x = (dst.w + pipeline->local_size_x() - 1) / pipeline->local_size_x();
        uint32_t group_count_y = (dst.h + pipeline->local_size_y() - 1) / pipeline->local_size_y();
        uint32_t group_count_z = (dst.c + pipeline->local_size_z() - 1) / pipeline->local_size_z();

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_dispatch;
            r.command_buffer = d->compute_command_buffer;
            r.dispatch.group_count_x = group_count_x;
            r.dispatch.group_count_y = group_count_y;
            r.dispatch.group_count_z = group_count_z;
            d->delayed_records.push_back(r);
        }
    }
}
| |
|
void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst)
{
    // Image-to-image variant: samples the imported AHardwareBuffer image (src)
    // and writes into a storage image (dst).  Same two-path scheme as the buffer
    // variant: record immediately with VK_KHR_push_descriptor, otherwise queue
    // delayed records for submit_and_wait().

    // layout transitions: src UNDEFINED -> SHADER_READ_ONLY_OPTIMAL (sampled),
    // dst UNDEFINED -> GENERAL (storage image write)
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[2];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;
        barriers[1].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[1].pNext = 0;
        barriers[1].srcAccessMask = 0;
        barriers[1].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
        barriers[1].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[1].newLayout = VK_IMAGE_LAYOUT_GENERAL;
        barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[1].image = dst.image();
        barriers[1].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[1].subresourceRange.baseMipLevel = 0;
        barriers[1].subresourceRange.levelCount = 1;
        barriers[1].subresourceRange.baseArrayLayer = 0;
        barriers[1].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 2, barriers);
            // recorded immediately, release the heap array now
            delete[] barriers;
        }
        else
        {
            // delayed record owns the barriers array; submit_and_wait() delete[]s it
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 2;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }
    }

    // bind the conversion compute pipeline
    {
        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline());
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_pipeline;
            r.command_buffer = d->compute_command_buffer;
            r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_pipeline.pipeline = pipeline->pipeline();
            d->delayed_records.push_back(r);
        }
    }

    // update descriptorset: binding 0 = combined image sampler (src),
    // bindings 1 and 2 = dst as a storage image (bound twice)
    {
        VkDescriptorImageInfo descriptorImageInfos[3];
        descriptorImageInfos[0].sampler = pipeline->sampler;
        descriptorImageInfos[0].imageView = src.imageview();
        descriptorImageInfos[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        descriptorImageInfos[1].sampler = 0;
        descriptorImageInfos[1].imageView = dst.imageview();
        descriptorImageInfos[1].imageLayout = VK_IMAGE_LAYOUT_GENERAL;
        descriptorImageInfos[2].sampler = 0;
        descriptorImageInfos[2].imageView = dst.imageview();
        descriptorImageInfos[2].imageLayout = VK_IMAGE_LAYOUT_GENERAL;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // the array layout matches the pipeline's descriptor update template
            vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, descriptorImageInfos);
        }
        else
        {
            // allocate a single-use descriptor pool (1 sampler + 2 storage images)
            VkDescriptorPool descriptor_pool;
            {
                VkDescriptorPoolSize poolSizes[2];
                poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                poolSizes[0].descriptorCount = 1;
                poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                poolSizes[1].descriptorCount = 2;

                VkDescriptorPoolCreateInfo descriptorPoolCreateInfo;
                descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
                descriptorPoolCreateInfo.pNext = 0;
                descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
                descriptorPoolCreateInfo.maxSets = 1;
                descriptorPoolCreateInfo.poolSizeCount = 2;
                descriptorPoolCreateInfo.pPoolSizes = poolSizes;

                VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkCreateDescriptorPool failed %d", ret);
                    return;
                }
            }
            // pool is retained so reset() can destroy it after submission
            d->descriptor_pools.push_back(descriptor_pool);

            VkDescriptorSet descriptorset;
            {
                VkDescriptorSetLayout descriptorset_layout = pipeline->descriptorset_layout();

                VkDescriptorSetAllocateInfo descriptorSetAllocateInfo;
                descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
                descriptorSetAllocateInfo.pNext = 0;
                descriptorSetAllocateInfo.descriptorPool = descriptor_pool;
                descriptorSetAllocateInfo.descriptorSetCount = 1;
                descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout;

                VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkAllocateDescriptorSets failed %d", ret);
                    return;
                }
            }
            d->descriptorsets.push_back(descriptorset);

            if (vkdev->info.support_VK_KHR_descriptor_update_template())
            {
                vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template(), descriptorImageInfos);
            }
            else
            {
                // plain descriptor writes: bindings 0..2 as described above
                VkWriteDescriptorSet writeDescriptorSets[3];
                writeDescriptorSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[0].pNext = 0;
                writeDescriptorSets[0].dstSet = descriptorset;
                writeDescriptorSets[0].dstBinding = 0;
                writeDescriptorSets[0].dstArrayElement = 0;
                writeDescriptorSets[0].descriptorCount = 1;
                writeDescriptorSets[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                writeDescriptorSets[0].pImageInfo = &descriptorImageInfos[0];
                writeDescriptorSets[0].pBufferInfo = 0;
                writeDescriptorSets[0].pTexelBufferView = 0;
                writeDescriptorSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[1].pNext = 0;
                writeDescriptorSets[1].dstSet = descriptorset;
                writeDescriptorSets[1].dstBinding = 1;
                writeDescriptorSets[1].dstArrayElement = 0;
                writeDescriptorSets[1].descriptorCount = 1;
                writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                writeDescriptorSets[1].pImageInfo = &descriptorImageInfos[1];
                writeDescriptorSets[1].pBufferInfo = 0;
                writeDescriptorSets[1].pTexelBufferView = 0;
                writeDescriptorSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[2].pNext = 0;
                writeDescriptorSets[2].dstSet = descriptorset;
                writeDescriptorSets[2].dstBinding = 2;
                writeDescriptorSets[2].dstArrayElement = 0;
                writeDescriptorSets[2].descriptorCount = 1;
                writeDescriptorSets[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                writeDescriptorSets[2].pImageInfo = &descriptorImageInfos[2];
                writeDescriptorSets[2].pBufferInfo = 0;
                writeDescriptorSets[2].pTexelBufferView = 0;

                vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0);
            }

            // defer vkCmdBindDescriptorSets to submit_and_wait(); the set is
            // addressed by its offset into d->descriptorsets
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_descriptorsets;
            r.command_buffer = d->compute_command_buffer;
            r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout();
            r.bind_descriptorsets.descriptorset_count = 1;
            r.bind_descriptorsets.descriptorset_offset = d->descriptorsets.size() - 1;
            d->delayed_records.push_back(r);
        }
    }

    // record dispatch, one workgroup per local_size tile of dst
    {
        uint32_t group_count_x = (dst.w + pipeline->local_size_x() - 1) / pipeline->local_size_x();
        uint32_t group_count_y = (dst.h + pipeline->local_size_y() - 1) / pipeline->local_size_y();
        uint32_t group_count_z = (dst.c + pipeline->local_size_z() - 1) / pipeline->local_size_z();

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_dispatch;
            r.command_buffer = d->compute_command_buffer;
            r.dispatch.group_count_x = group_count_x;
            r.dispatch.group_count_y = group_count_y;
            r.dispatch.group_count_z = group_count_z;
            d->delayed_records.push_back(r);
        }
    }
}
| | #endif |
| | #endif |
| |
|
int VkCompute::submit_and_wait()
{
    // Flush all recorded work to the compute queue and block until the GPU
    // finishes, then run the CPU-side post steps (download copies, fp16->fp32
    // casts).  Returns 0 on success, -1 on any Vulkan failure.

    // without push descriptor support the command buffer is still empty:
    // open it now and replay every delayed record in order
    if (!vkdev->info.support_VK_KHR_push_descriptor())
    {
        d->begin_command_buffer();

#if NCNN_BENCHMARK
        if (d->query_pool)
            vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count);
#endif

        const size_t record_count = d->delayed_records.size();

        // replay the delayed command sequence; heap-owned payloads (regions,
        // barriers, push-constant values) are delete[]d as they are consumed
        for (size_t i = 0; i < record_count; i++)
        {
            const VkComputePrivate::record& r = d->delayed_records[i];

            switch (r.type)
            {
            case VkComputePrivate::record::TYPE_copy_buffer:
            {
                vkCmdCopyBuffer(r.command_buffer, r.copy_buffer.src, r.copy_buffer.dst, r.copy_buffer.region_count, r.copy_buffer.regions);
                delete[] r.copy_buffer.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_image:
            {
                vkCmdCopyImage(r.command_buffer, r.copy_image.src, r.copy_image.src_layout, r.copy_image.dst, r.copy_image.dst_layout, r.copy_image.region_count, r.copy_image.regions);
                delete[] r.copy_image.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_buffer_to_image:
            {
                vkCmdCopyBufferToImage(r.command_buffer, r.copy_buffer_to_image.src, r.copy_buffer_to_image.dst, r.copy_buffer_to_image.layout, r.copy_buffer_to_image.region_count, r.copy_buffer_to_image.regions);
                delete[] r.copy_buffer_to_image.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_image_to_buffer:
            {
                vkCmdCopyImageToBuffer(r.command_buffer, r.copy_image_to_buffer.src, r.copy_image_to_buffer.layout, r.copy_image_to_buffer.dst, r.copy_image_to_buffer.region_count, r.copy_image_to_buffer.regions);
                delete[] r.copy_image_to_buffer.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_bind_pipeline:
            {
                vkCmdBindPipeline(r.command_buffer, r.bind_pipeline.bind_point, r.bind_pipeline.pipeline);
                break;
            }
            case VkComputePrivate::record::TYPE_bind_descriptorsets:
            {
                // descriptor sets were stashed in d->descriptorsets; the record
                // carries only an offset into that vector
                vkCmdBindDescriptorSets(r.command_buffer, r.bind_descriptorsets.bind_point, r.bind_descriptorsets.pipeline_layout, 0, r.bind_descriptorsets.descriptorset_count, &d->descriptorsets[r.bind_descriptorsets.descriptorset_offset], 0, 0);
                break;
            }
            case VkComputePrivate::record::TYPE_push_constants:
            {
                vkCmdPushConstants(r.command_buffer, r.push_constants.pipeline_layout, r.push_constants.stage_flags, 0, r.push_constants.size, r.push_constants.values);
                delete[](unsigned char*) r.push_constants.values;
                break;
            }
            case VkComputePrivate::record::TYPE_dispatch:
            {
                vkCmdDispatch(r.command_buffer, r.dispatch.group_count_x, r.dispatch.group_count_y, r.dispatch.group_count_z);
                break;
            }
            case VkComputePrivate::record::TYPE_memory_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.memory_barrers.src_stage, r.memory_barrers.dst_stage, 0, r.memory_barrers.barrier_count, r.memory_barrers.barriers, 0, 0, 0, 0);
                delete[] r.memory_barrers.barriers;
                break;
            }
            case VkComputePrivate::record::TYPE_buffer_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.buffer_barrers.src_stage, r.buffer_barrers.dst_stage, 0, 0, 0, r.buffer_barrers.barrier_count, r.buffer_barrers.barriers, 0, 0);
                delete[] r.buffer_barrers.barriers;
                break;
            }
            case VkComputePrivate::record::TYPE_image_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.image_barrers.src_stage, r.image_barrers.dst_stage, 0, 0, 0, 0, 0, r.image_barrers.barrier_count, r.image_barrers.barriers);
                delete[] r.image_barrers.barriers;
                break;
            }
#if NCNN_BENCHMARK
            case VkComputePrivate::record::TYPE_write_timestamp:
            {
                if (d->query_pool)
                    vkCmdWriteTimestamp(r.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, d->query_pool, r.write_timestamp.query);
                break;
            }
#endif
            // post-download / cast records are CPU-side and handled after the fence
            case VkComputePrivate::record::TYPE_post_download:
            case VkComputePrivate::record::TYPE_post_cast_float16_to_float32:
            default:
                break;
            }
        }
    }

    // end command buffer
    {
        d->end_command_buffer();
    }

    // acquire queue and reclaim on return
    VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
    if (compute_queue == 0)
    {
        NCNN_LOGE("out of compute queue");
        return -1;
    }

    // submit compute
    {
        VkSubmitInfo submitInfo;
        submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
        submitInfo.pNext = 0;
        submitInfo.waitSemaphoreCount = 0;
        submitInfo.pWaitSemaphores = 0;
        submitInfo.pWaitDstStageMask = 0;
        submitInfo.commandBufferCount = 1;
        submitInfo.pCommandBuffers = &d->compute_command_buffer;
        submitInfo.signalSemaphoreCount = 0;
        submitInfo.pSignalSemaphores = 0;

        VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkQueueSubmit failed %d", ret);
            // give the queue back before bailing out
            vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
            return -1;
        }
    }

    vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);

    // wait for the fence signalled at submission; (uint64_t)-1 means no timeout
    {
        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &d->compute_command_fence, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }

    // GPU work is done: run the CPU-side post steps in recorded order
    for (size_t i = 0; i < d->delayed_records.size(); i++)
    {
        const VkComputePrivate::record& r = d->delayed_records[i];

        switch (r.type)
        {
        case VkComputePrivate::record::TYPE_post_download:
        {
            const VkMat& src = d->download_post_buffers[r.post_download.download_post_buffer_mat_offset];
            Mat& dst = d->download_post_mats_fp16[r.post_download.download_post_mat_fp16_offset];

            // make the host-visible staging memory coherent, then copy it out

            src.allocator->invalidate(src.data);
            memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize);
            break;
        }
        case VkComputePrivate::record::TYPE_post_cast_float16_to_float32:
        {
            // convert the downloaded fp16 mat to the caller's fp32 mat

            const Mat& src = d->download_post_mats_fp16[r.post_cast_float16_to_float32.download_post_mat_fp16_offset];
            Mat& dst = d->download_post_mats[r.post_cast_float16_to_float32.download_post_mat_offset];

            Option opt;
            opt.num_threads = r.post_cast_float16_to_float32.num_threads;
            opt.blob_allocator = dst.allocator;
            ncnn::cast_float16_to_float32(src, dst, opt);
            break;
        }
        default:
            break;
        }
    }

    d->delayed_records.clear();

    return 0;
}
| |
|
int VkCompute::reset()
{
    // Return the VkCompute to a reusable state after submit_and_wait():
    // drop staging/download buffers, release deferred image blocks and
    // descriptor objects, reset the command buffer and fence, and (on the
    // push-descriptor path) reopen the command buffer for fresh recording.

    d->upload_staging_buffers.clear();
    d->download_post_buffers.clear();
    d->download_post_mats_fp16.clear();
    d->download_post_mats.clear();

    // release image views/images whose destruction was deferred until the
    // command buffer no longer references them
    for (size_t i = 0; i < d->image_blocks_to_destroy.size(); i++)
    {
        VkImageMemory* ptr = d->image_blocks_to_destroy[i];

        // drop our command reference; destroy only when the user refcount is
        // also gone (NCNN_XADD returns the pre-decrement value)
        int old_command_refcount = NCNN_XADD(&ptr->command_refcount, -1);
        if (ptr->refcount == 0 && old_command_refcount == 1)
        {
            // the last reference was ours: tear down the image now
            vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
            vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

            delete ptr;
        }
        else
        {
            // still referenced elsewhere; its owner will destroy it
        }
    }
    d->image_blocks_to_destroy.clear();

    // non-push-descriptor path allocated one pool per descriptor set;
    // free the set then destroy its pool (vectors are kept in lockstep)
    if (!vkdev->info.support_VK_KHR_push_descriptor())
    {
        for (size_t i = 0; i < d->descriptorsets.size(); i++)
        {
            vkFreeDescriptorSets(vkdev->vkdevice(), d->descriptor_pools[i], 1, &d->descriptorsets[i]);
            vkDestroyDescriptorPool(vkdev->vkdevice(), d->descriptor_pools[i], 0);
        }
        d->descriptor_pools.clear();
        d->descriptorsets.clear();
    }

    d->delayed_records.clear();

    // reset command buffer and fence for the next round
    {
        VkResult ret = vkResetCommandBuffer(d->compute_command_buffer, 0);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkResetCommandBuffer failed %d", ret);
            return -1;
        }
    }
    {
        VkResult ret = vkResetFences(vkdev->vkdevice(), 1, &d->compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkResetFences failed %d", ret);
            return -1;
        }
    }

    // with push descriptors, recording happens eagerly, so reopen the
    // command buffer immediately (the delayed path opens it in submit_and_wait)
    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        d->begin_command_buffer();

#if NCNN_BENCHMARK
        if (d->query_pool)
            vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count);
#endif
    }

    return 0;
}
| |
|
| | #if NCNN_BENCHMARK |
| | int VkCompute::create_query_pool(uint32_t _query_count) |
| | { |
| | d->query_count = _query_count; |
| |
|
| | VkQueryPoolCreateInfo queryPoolCreateInfo; |
| | queryPoolCreateInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; |
| | queryPoolCreateInfo.pNext = 0; |
| | queryPoolCreateInfo.flags = 0; |
| | queryPoolCreateInfo.queryType = VK_QUERY_TYPE_TIMESTAMP; |
| | queryPoolCreateInfo.queryCount = d->query_count; |
| | queryPoolCreateInfo.pipelineStatistics = 0; |
| |
|
| | VkResult ret = vkCreateQueryPool(vkdev->vkdevice(), &queryPoolCreateInfo, 0, &d->query_pool); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkCreateQueryPool failed %d", ret); |
| | return -1; |
| | } |
| |
|
| | if (vkdev->info.support_VK_KHR_push_descriptor()) |
| | { |
| | if (d->query_pool) |
| | vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count); |
| | } |
| |
|
| | return 0; |
| | } |
| |
|
| | int VkCompute::get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results) |
| | { |
| | if (results.size() < first_query + query_count) |
| | { |
| | NCNN_LOGE("results not large enough"); |
| | return -1; |
| | } |
| |
|
| | VkResult ret = vkGetQueryPoolResults(vkdev->vkdevice(), d->query_pool, first_query, query_count, |
| | query_count * sizeof(uint64_t), results.data() + first_query, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT); |
| | if (ret != VK_SUCCESS && ret != VK_NOT_READY) |
| | { |
| | NCNN_LOGE("vkGetQueryPoolResults failed %d", ret); |
| | return -1; |
| | } |
| |
|
| | return 0; |
| | } |
| | #endif |
| |
|
void VkCompute::barrier_readwrite(const VkMat& binding)
{
    // Insert a buffer memory barrier so a compute shader may both read and
    // write `binding`.  Skipped when the buffer was last written/used in a
    // compatible state (no prior shader write and already at the compute stage).
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // build a single buffer barrier covering the whole backing region
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = binding.buffer();
        barriers[0].offset = binding.buffer_offset();
        barriers[0].size = binding.buffer_capacity();

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            // recorded immediately; array no longer needed
            delete[] barriers;
        }
        else
        {
            // delayed record takes ownership; submit_and_wait() delete[]s it
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // track the new access/stage state on the shared VkBufferMemory
        binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}
| |
|
void VkCompute::barrier_readwrite(const VkImageMat& binding)
{
    // Insert an image memory barrier so a compute shader may both read and
    // write `binding` as a storage image (GENERAL layout).  Skipped when the
    // image is already in GENERAL layout at the compute stage with no pending
    // shader write.
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // transition whole image (single mip level / array layer) to GENERAL
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        barriers[0].oldLayout = binding.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = binding.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            // recorded immediately; array no longer needed
            delete[] barriers;
        }
        else
        {
            // delayed record takes ownership; submit_and_wait() delete[]s it
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // track the new access/layout/stage state on the shared VkImageMemory
        binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        binding.data->image_layout = VK_IMAGE_LAYOUT_GENERAL;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}
| |
|
void VkCompute::barrier_readonly(const VkImageMat& binding)
{
    // Insert an image memory barrier so a compute shader may sample `binding`
    // (SHADER_READ_ONLY_OPTIMAL layout).  Skipped when the image is already in
    // that layout at the compute stage with no pending shader write.
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // transition whole image (single mip level / array layer) to read-only
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
        barriers[0].oldLayout = binding.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = binding.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            // recorded immediately; array no longer needed
            delete[] barriers;
        }
        else
        {
            // delayed record takes ownership; submit_and_wait() delete[]s it
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // track the new access/layout/stage state on the shared VkImageMemory
        binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT;
        binding.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}
| |
|
// Private state for VkTransfer (host -> device upload recording).
// Owns the command pools/buffers and synchronization objects; the
// transfer-queue members are only created when the device does NOT have a
// unified compute/transfer queue.
class VkTransferPrivate
{
public:
    VkTransferPrivate(const VulkanDevice* _vkdev);
    ~VkTransferPrivate();

    // Creates pools/buffers/fences (and semaphore on dual-queue devices),
    // then starts recording.  Returns 0 on success, -1 on failure.
    int init();
    // Puts the command buffer(s) into the recording state.
    int begin_command_buffer();
    // Finishes recording on the command buffer(s).
    int end_command_buffer();

    // Associated device (not owned).
    const VulkanDevice* vkdev;

    // Pool for the compute-queue command buffer; transfer pool is only
    // created on dual-queue devices.
    VkCommandPool compute_command_pool;
    VkCommandPool transfer_command_pool;

    // upload_command_buffer is only allocated on dual-queue devices.
    VkCommandBuffer upload_command_buffer;
    VkCommandBuffer compute_command_buffer;

    // Signaled by the upload submission, waited on by the compute submission.
    VkSemaphore upload_compute_semaphore;

    // upload_command_fence is only created on dual-queue devices.
    VkFence upload_command_fence;
    VkFence compute_command_fence;

    // Staging buffers kept alive until the upload completes.
    std::vector<VkMat> upload_staging_buffers;
};
| |
|
| | VkTransferPrivate::VkTransferPrivate(const VulkanDevice* _vkdev) |
| | : vkdev(_vkdev) |
| | { |
| | compute_command_pool = 0; |
| | transfer_command_pool = 0; |
| |
|
| | upload_command_buffer = 0; |
| | compute_command_buffer = 0; |
| |
|
| | upload_compute_semaphore = 0; |
| |
|
| | upload_command_fence = 0; |
| | compute_command_fence = 0; |
| |
|
| | init(); |
| | } |
| |
|
| | VkTransferPrivate::~VkTransferPrivate() |
| | { |
| | vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0); |
| |
|
| | vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer); |
| | vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0); |
| |
|
| | if (!vkdev->info.unified_compute_transfer_queue()) |
| | { |
| | vkDestroyFence(vkdev->vkdevice(), upload_command_fence, 0); |
| |
|
| | vkDestroySemaphore(vkdev->vkdevice(), upload_compute_semaphore, 0); |
| |
|
| | vkFreeCommandBuffers(vkdev->vkdevice(), transfer_command_pool, 1, &upload_command_buffer); |
| | vkDestroyCommandPool(vkdev->vkdevice(), transfer_command_pool, 0); |
| | } |
| | } |
| |
|
| | int VkTransferPrivate::init() |
| | { |
| | |
| | { |
| | VkCommandPoolCreateInfo commandPoolCreateInfo; |
| | commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; |
| | commandPoolCreateInfo.pNext = 0; |
| | commandPoolCreateInfo.flags = 0; |
| | commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index(); |
| |
|
| | VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkCreateCommandPool failed %d", ret); |
| | return -1; |
| | } |
| | } |
| |
|
| | |
| | { |
| | VkCommandBufferAllocateInfo commandBufferAllocateInfo; |
| | commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; |
| | commandBufferAllocateInfo.pNext = 0; |
| | commandBufferAllocateInfo.commandPool = compute_command_pool; |
| | commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; |
| | commandBufferAllocateInfo.commandBufferCount = 1; |
| |
|
| | VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret); |
| | return -1; |
| | } |
| | } |
| |
|
| | |
| | { |
| | VkFenceCreateInfo fenceCreateInfo; |
| | fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; |
| | fenceCreateInfo.pNext = 0; |
| | fenceCreateInfo.flags = 0; |
| |
|
| | VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkCreateFence failed %d", ret); |
| | return -1; |
| | } |
| | } |
| |
|
| | if (!vkdev->info.unified_compute_transfer_queue()) |
| | { |
| | |
| | { |
| | VkCommandPoolCreateInfo commandPoolCreateInfo; |
| | commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; |
| | commandPoolCreateInfo.pNext = 0; |
| | commandPoolCreateInfo.flags = 0; |
| | commandPoolCreateInfo.queueFamilyIndex = vkdev->info.transfer_queue_family_index(); |
| |
|
| | VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &transfer_command_pool); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkCreateCommandPool failed %d", ret); |
| | return -1; |
| | } |
| | } |
| |
|
| | |
| | { |
| | VkCommandBufferAllocateInfo commandBufferAllocateInfo; |
| | commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; |
| | commandBufferAllocateInfo.pNext = 0; |
| | commandBufferAllocateInfo.commandPool = transfer_command_pool; |
| | commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; |
| | commandBufferAllocateInfo.commandBufferCount = 1; |
| |
|
| | VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &upload_command_buffer); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret); |
| | return -1; |
| | } |
| | } |
| |
|
| | |
| | { |
| | VkSemaphoreCreateInfo semaphoreCreateInfo; |
| | semaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; |
| | semaphoreCreateInfo.pNext = 0; |
| | semaphoreCreateInfo.flags = 0; |
| |
|
| | VkResult ret = vkCreateSemaphore(vkdev->vkdevice(), &semaphoreCreateInfo, 0, &upload_compute_semaphore); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkCreateSemaphore failed %d", ret); |
| | return -1; |
| | } |
| | } |
| |
|
| | |
| | { |
| | VkFenceCreateInfo fenceCreateInfo; |
| | fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; |
| | fenceCreateInfo.pNext = 0; |
| | fenceCreateInfo.flags = 0; |
| |
|
| | VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &upload_command_fence); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkCreateFence failed %d", ret); |
| | return -1; |
| | } |
| | } |
| | } |
| |
|
| | begin_command_buffer(); |
| |
|
| | return 0; |
| | } |
| |
|
| | int VkTransferPrivate::begin_command_buffer() |
| | { |
| | { |
| | VkCommandBufferBeginInfo commandBufferBeginInfo; |
| | commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; |
| | commandBufferBeginInfo.pNext = 0; |
| | commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; |
| | commandBufferBeginInfo.pInheritanceInfo = 0; |
| |
|
| | VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkBeginCommandBuffer failed %d", ret); |
| | return -1; |
| | } |
| | } |
| |
|
| | if (!vkdev->info.unified_compute_transfer_queue()) |
| | { |
| | { |
| | VkCommandBufferBeginInfo commandBufferBeginInfo; |
| | commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; |
| | commandBufferBeginInfo.pNext = 0; |
| | commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; |
| | commandBufferBeginInfo.pInheritanceInfo = 0; |
| |
|
| | VkResult ret = vkBeginCommandBuffer(upload_command_buffer, &commandBufferBeginInfo); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkBeginCommandBuffer failed %d", ret); |
| | return -1; |
| | } |
| | } |
| | } |
| |
|
| | return 0; |
| | } |
| |
|
| | int VkTransferPrivate::end_command_buffer() |
| | { |
| | { |
| | VkResult ret = vkEndCommandBuffer(compute_command_buffer); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkEndCommandBuffer failed %d", ret); |
| | return -1; |
| | } |
| | } |
| |
|
| | if (!vkdev->info.unified_compute_transfer_queue()) |
| | { |
| | { |
| | VkResult ret = vkEndCommandBuffer(upload_command_buffer); |
| | if (ret != VK_SUCCESS) |
| | { |
| | NCNN_LOGE("vkEndCommandBuffer failed %d", ret); |
| | return -1; |
| | } |
| | } |
| | } |
| |
|
| | return 0; |
| | } |
| |
|
// Construct an uploader bound to the given device; the private state
// (pools, buffers, fences) is created eagerly by VkTransferPrivate.
VkTransfer::VkTransfer(const VulkanDevice* _vkdev)
    : vkdev(_vkdev), d(new VkTransferPrivate(_vkdev))
{
}
| |
|
// Release the private state; ~VkTransferPrivate destroys all Vulkan objects.
VkTransfer::~VkTransfer()
{
    delete d;
}
| |
|
| | void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten) |
| | { |
| | |
| |
|
| | |
| | if (src.elembits() == 32) |
| | { |
| | if (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)) |
| | { |
| | Mat src_fp16; |
| | cast_float32_to_float16(src, src_fp16, opt); |
| |
|
| | record_upload(src_fp16, dst, opt, flatten); |
| |
|
| | return; |
| | } |
| | } |
| |
|
| | Mat src_flattened = flatten ? src.reshape(src.w * src.h * src.c) : src; |
| |
|
| | |
| | dst.create_like(src_flattened, opt.blob_vkallocator); |
| |
|
| | if (dst.empty()) |
| | { |
| | return; |
| | } |
| |
|
| | if (dst.allocator->mappable) |
| | { |
| | |
| | memcpy(dst.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); |
| | dst.allocator->flush(dst.data); |
| |
|
| | |
| | { |
| | VkBufferMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; |
| | barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; |
| | barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.buffer = dst.buffer(); |
| | barrier.offset = dst.buffer_offset(); |
| | barrier.size = dst.buffer_capacity(); |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| |
|
| | vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); |
| | } |
| |
|
| | |
| | dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; |
| | dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| |
|
| | return; |
| | } |
| |
|
| | |
| | VkMat dst_staging; |
| | dst_staging.create_like(src_flattened, opt.staging_vkallocator); |
| |
|
| | |
| | memcpy(dst_staging.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); |
| | dst_staging.allocator->flush(dst_staging.data); |
| |
|
| | VkCommandBuffer command_buffer; |
| | if (vkdev->info.unified_compute_transfer_queue()) |
| | { |
| | command_buffer = d->compute_command_buffer; |
| | } |
| | else |
| | { |
| | command_buffer = d->upload_command_buffer; |
| | } |
| |
|
| | |
| | { |
| | VkBufferMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; |
| | barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; |
| | barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.buffer = dst_staging.buffer(); |
| | barrier.offset = dst_staging.buffer_offset(); |
| | barrier.size = dst_staging.buffer_capacity(); |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| |
|
| | vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); |
| | } |
| |
|
| | |
| | { |
| | VkBufferCopy region; |
| | region.srcOffset = dst_staging.buffer_offset(); |
| | region.dstOffset = dst.buffer_offset(); |
| | region.size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); |
| |
|
| | vkCmdCopyBuffer(command_buffer, dst_staging.buffer(), dst.buffer(), 1, ®ion); |
| | } |
| |
|
| | if (vkdev->info.unified_compute_transfer_queue()) |
| | { |
| | |
| | { |
| | VkBufferMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; |
| | barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; |
| | barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.buffer = dst.buffer(); |
| | barrier.offset = dst.buffer_offset(); |
| | barrier.size = dst.buffer_capacity(); |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| |
|
| | vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); |
| | } |
| | } |
| | else |
| | { |
| | |
| |
|
| | |
| | { |
| | VkBufferMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; |
| | barrier.dstAccessMask = 0; |
| | barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index(); |
| | barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index(); |
| | barrier.buffer = dst.buffer(); |
| | barrier.offset = dst.buffer_offset(); |
| | barrier.size = dst.buffer_capacity(); |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; |
| |
|
| | vkCmdPipelineBarrier(d->upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); |
| | } |
| |
|
| | |
| | { |
| | VkBufferMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = 0; |
| | barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; |
| | barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index(); |
| | barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index(); |
| | barrier.buffer = dst.buffer(); |
| | barrier.offset = dst.buffer_offset(); |
| | barrier.size = dst.buffer_capacity(); |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| |
|
| | vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); |
| | } |
| | } |
| |
|
| | |
| | dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; |
| | dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| |
|
| | |
| | d->upload_staging_buffers.push_back(dst_staging); |
| | } |
| |
|
| | void VkTransfer::record_upload(const Mat& src, VkImageMat& dst, const Option& opt) |
| | { |
| | |
| |
|
| | |
| | if (src.elembits() == 32) |
| | { |
| | if (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)) |
| | { |
| | Mat src_fp16; |
| | cast_float32_to_float16(src, src_fp16, opt); |
| |
|
| | record_upload(src_fp16, dst, opt); |
| |
|
| | return; |
| | } |
| | } |
| |
|
| | |
| | dst.create_like(src, opt.blob_vkallocator); |
| | if (dst.empty()) |
| | return; |
| |
|
| | |
| | VkMat dst_staging; |
| | dst_staging.create_like(src, opt.staging_vkallocator); |
| |
|
| | |
| | memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize); |
| | dst_staging.allocator->flush(dst_staging.data); |
| |
|
| | VkCommandBuffer command_buffer; |
| | if (vkdev->info.unified_compute_transfer_queue()) |
| | { |
| | command_buffer = d->compute_command_buffer; |
| | } |
| | else |
| | { |
| | command_buffer = d->upload_command_buffer; |
| | } |
| |
|
| | |
| | { |
| | VkBufferMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; |
| | barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; |
| | barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.buffer = dst_staging.buffer(); |
| | barrier.offset = dst_staging.buffer_offset(); |
| | barrier.size = dst_staging.buffer_capacity(); |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| |
|
| | vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); |
| | } |
| |
|
| | |
| | { |
| | VkImageMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = 0; |
| | barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; |
| | barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; |
| | barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; |
| | barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.image = dst.image(); |
| | barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| | barrier.subresourceRange.baseMipLevel = 0; |
| | barrier.subresourceRange.levelCount = 1; |
| | barrier.subresourceRange.baseArrayLayer = 0; |
| | barrier.subresourceRange.layerCount = 1; |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| |
|
| | vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); |
| | } |
| |
|
| | |
| | { |
| | const int channels = dst.c; |
| | VkBufferImageCopy* regions = new VkBufferImageCopy[channels]; |
| | for (int i = 0; i < channels; i++) |
| | { |
| | regions[i].bufferOffset = dst_staging.buffer_offset() + dst_staging.cstep * dst_staging.elemsize * i; |
| | regions[i].bufferRowLength = 0; |
| | regions[i].bufferImageHeight = 0; |
| | regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| | regions[i].imageSubresource.mipLevel = 0; |
| | regions[i].imageSubresource.baseArrayLayer = 0; |
| | regions[i].imageSubresource.layerCount = 1; |
| | regions[i].imageOffset.x = 0; |
| | regions[i].imageOffset.y = 0; |
| | regions[i].imageOffset.z = i; |
| | regions[i].imageExtent.width = dst.data->width; |
| | regions[i].imageExtent.height = dst.data->height; |
| | regions[i].imageExtent.depth = 1; |
| | } |
| |
|
| | vkCmdCopyBufferToImage(command_buffer, dst_staging.buffer(), dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, channels, regions); |
| | delete[] regions; |
| | } |
| |
|
| | if (vkdev->info.unified_compute_transfer_queue()) |
| | { |
| | |
| | { |
| | VkImageMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; |
| | barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; |
| | barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; |
| | barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; |
| | barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; |
| | barrier.image = dst.image(); |
| | barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| | barrier.subresourceRange.baseMipLevel = 0; |
| | barrier.subresourceRange.levelCount = 1; |
| | barrier.subresourceRange.baseArrayLayer = 0; |
| | barrier.subresourceRange.layerCount = 1; |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| |
|
| | vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); |
| | } |
| | } |
| | else |
| | { |
| | |
| |
|
| | |
| | { |
| | VkImageMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; |
| | barrier.dstAccessMask = 0; |
| | barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; |
| | barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; |
| | barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index(); |
| | barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index(); |
| | barrier.image = dst.image(); |
| | barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| | barrier.subresourceRange.baseMipLevel = 0; |
| | barrier.subresourceRange.levelCount = 1; |
| | barrier.subresourceRange.baseArrayLayer = 0; |
| | barrier.subresourceRange.layerCount = 1; |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; |
| |
|
| | vkCmdPipelineBarrier(d->upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); |
| | } |
| |
|
| | |
| | { |
| | VkImageMemoryBarrier barrier; |
| | barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; |
| | barrier.pNext = 0; |
| | barrier.srcAccessMask = 0; |
| | barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; |
| | barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; |
| | barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; |
| | barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index(); |
| | barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index(); |
| | barrier.image = dst.image(); |
| | barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| | barrier.subresourceRange.baseMipLevel = 0; |
| | barrier.subresourceRange.levelCount = 1; |
| | barrier.subresourceRange.baseArrayLayer = 0; |
| | barrier.subresourceRange.layerCount = 1; |
| |
|
| | VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; |
| | VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| |
|
| | vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); |
| | } |
| | } |
| |
|
| | |
| | dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; |
| | dst.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; |
| | dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| |
|
| | |
| | d->upload_staging_buffers.push_back(dst_staging); |
| | } |
| |
|
// Submit the recorded upload work and block until the GPU finishes.
// Unified queue: one submission on the compute queue.
// Dual queue: the upload submission signals a semaphore that the compute
// submission waits on, then both fences are awaited.
// Returns 0 on success, -1 on any queue/submit/fence failure.
int VkTransfer::submit_and_wait()
{
    // NOTE(review): the result of end_command_buffer() is ignored here - TODO confirm intended.

    // end command buffer
    {
        d->end_command_buffer();
    }

    VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
    if (compute_queue == 0)
    {
        NCNN_LOGE("out of compute queue");
        return -1;
    }

    if (vkdev->info.unified_compute_transfer_queue())
    {
        // submit the single compute command buffer
        {
            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 0;
            submitInfo.pWaitSemaphores = 0;
            submitInfo.pWaitDstStageMask = 0;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->compute_command_buffer;
            submitInfo.signalSemaphoreCount = 0;
            submitInfo.pSignalSemaphores = 0;

            VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                // return the queue before bailing out
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }
    }
    else
    {
        VkQueue transfer_queue = vkdev->acquire_queue(vkdev->info.transfer_queue_family_index());
        if (transfer_queue == 0)
        {
            NCNN_LOGE("out of transfer queue");
            vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
            return -1;
        }

        // submit the upload command buffer; it signals the semaphore on completion
        {
            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 0;
            submitInfo.pWaitSemaphores = 0;
            submitInfo.pWaitDstStageMask = 0;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->upload_command_buffer;
            submitInfo.signalSemaphoreCount = 1;
            submitInfo.pSignalSemaphores = &d->upload_compute_semaphore;

            VkResult ret = vkQueueSubmit(transfer_queue, 1, &submitInfo, d->upload_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                // return both queues before bailing out
                vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }
        // submit the compute command buffer; it waits on the upload semaphore
        {
            VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;

            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 1;
            submitInfo.pWaitSemaphores = &d->upload_compute_semaphore;
            submitInfo.pWaitDstStageMask = &wait_dst_stage;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->compute_command_buffer;
            submitInfo.signalSemaphoreCount = 0;
            submitInfo.pSignalSemaphores = 0;

            VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }

        vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
    }

    vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);

    // wait for the fence(s) signaled by the submission(s) above
    if (vkdev->info.unified_compute_transfer_queue())
    {
        // (uint64_t)-1 == UINT64_MAX: wait indefinitely
        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &d->compute_command_fence, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }
    else
    {
        VkFence fences[2] = {d->upload_command_fence, d->compute_command_fence};

        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }

    return 0;
}
| |
|
| | } |
| |
|
| | #endif |
| |
|