// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// Common op attribute parser.

static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p)
{
    // Mip and filter modes.
    OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode));
    OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported"));
    p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR);

    // Mip level clamp.
    if (p.enableMip)
    {
        OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit));
        OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level"));
        ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op.
    }

    // Boundary mode.
    OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode));
    OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported"));
}
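
// Note on mode coupling: mipmapping is implied by the filter mode alone. The two
// *_MIPMAP_* modes are the only ones that read max_mip_level (and, in the forward
// op, tex_const); for example, TEX_MODE_LINEAR samples the base level only and
// leaves p.enableMip false.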
//------------------------------------------------------------------------
// Forward TensorFlow op.

struct TextureFwdOp : public OpKernel
{
    TextureKernelParams m_attribs;
    PersistentTensor    m_persistentMipTensor; // Used if texture is constant and mips are enabled.
    bool                m_persistentMipTensorInitialized;

    TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
        m_persistentMipTensorInitialized = false;
        parseOpAttributes(ctx, m_attribs);
    }
    void Compute(OpKernelContext* ctx)
    {
        TextureKernelParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
        bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);

        // Get input. Without mipmapping there is no uv_da input, so alias it to uv to keep the index valid.
        const Tensor& tex   = ctx->input(0);
        const Tensor& uv    = ctx->input(1);
        const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1);

        // Extract input dimensions.
        p.n         = (uv.dims() > 0) ? uv.dim_size(0) : 0;
        p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
        p.imgWidth  = (uv.dims() > 2) ? uv.dim_size(2) : 0;
        p.texDepth  = (tex.dims() > 0) ? tex.dim_size(0) : 0;
        if (!cube_mode)
        {
            p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
            p.texWidth  = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.channels  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
        }
        else
        {
            p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.texWidth  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
            p.channels  = (tex.dims() > 4) ? tex.dim_size(4) : 0;
        }
        // Sanity checks.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape [>0, >0, >0, >0]"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
        }
        else
        {
            OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape [>0, 6, >0, >0, >0] in cube map mode"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
            OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
        }
        OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
        OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large"));
        if (p.enableMip)
        {
            if (!cube_mode)
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
            else
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
        }
        // Get input pointers.
        p.tex[0] = tex.flat<float>().data();
        p.uv = uv.flat<float>().data();
        p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;

        // Allocate output tensor.
        Tensor* out_tensor = NULL;
        TensorShape out_shape;
        out_shape.AddDim(p.n);
        out_shape.AddDim(p.imgHeight);
        out_shape.AddDim(p.imgWidth);
        out_shape.AddDim(p.channels);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
        p.out = out_tensor->flat<float>().data();
        // Choose kernel variants based on channel count.
        void* args[] = {&p};
        int channel_div_idx = 0;
        if (!(p.channels & 3))
            channel_div_idx = 2; // Channel count divisible by 4.
        else if (!(p.channels & 1))
            channel_div_idx = 1; // Channel count divisible by 2.
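
        // The bit masks implement divisibility tests: (p.channels & 3) == 0 iff the
        // channel count is a multiple of 4. E.g. channels == 8 selects the float4
        // kernel variants and channels == 6 the float2 variants; the wider variants
        // move 2 or 4 channels per memory access, hence the alignment checks further below.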
        // Mip-related setup.
        float* pmip = 0;
        if (p.enableMip)
        {
            // Generate mip offsets.
            int mipOffsets[TEX_MAX_MIP_LEVEL];
            int mipTotal = calculateMipInfo(ctx, p, mipOffsets);

            // Mip output tensor.
            Tensor* mip_tensor = NULL;
            TensorShape mip_shape;
            mip_shape.AddDim(mipTotal);

            // If texture is constant, calculate mip stack only once.
            bool computeMip = true;
            if (p.texConst)
            {
                // First execution?
                if (!m_persistentMipTensorInitialized)
                {
                    // Allocate a persistent mip tensor.
                    OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor));
                    m_persistentMipTensorInitialized = true;
                }
                else
                {
                    // Reuse the persistent tensor, do not recompute mip levels.
                    mip_tensor = m_persistentMipTensor.AccessTensor(ctx);
                    computeMip = false;
                }

                // Set as output tensor as well.
                ctx->set_output(1, *mip_tensor);
            }
            else
            {
                // Allocate an output tensor as usual.
                OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor));
            }

            pmip = mip_tensor->flat<float>().data(); // Pointer to data.
            for (int i=1; i <= p.mipLevelMax; i++)
                p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels.

            // Build mip levels if needed.
            if (computeMip)
            {
                for (int i=1; i <= p.mipLevelMax; i++)
                {
                    int2 ms = mipLevelSize(p, i);
                    int3 sz = make_int3(ms.x, ms.y, p.texDepth);
                    dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y);
                    dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1));
                    p.mipLevelOut = i;

                    void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 };
                    OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream));
                }
            }
        }
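
        // Layout note: the mip stack is a single flat float buffer of mipTotal
        // elements, as computed by calculateMipInfo() above. Level i (i >= 1) starts
        // mipOffsets[i] floats into the stack and p.tex[i] points there, while
        // p.tex[0] is the original texture tensor.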
        // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
        if (!cube_mode)
            OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
        if ((p.channels & 3) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip output tensor not aligned to float4"));
        }
        if ((p.channels & 1) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip output tensor not aligned to float2"));
        }
        if (!cube_mode)
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
        else
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));
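
        // These checks are not expected to fire: TensorFlow's allocator returns
        // buffers with generous alignment (typically 64 bytes or more), so 8/16-byte
        // alignment holds for whole tensors. They guard against future changes that
        // might hand the kernels interior, unaligned slices of a larger buffer.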
        // Choose launch parameters for texture lookup kernel.
        dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
        dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);

        // Choose kernel based on filter mode, cube mode, and channel count.
        void* func_tbl[TEX_MODE_COUNT * 3 * 2] = {
            (void*)TextureFwdKernelNearest1,
            (void*)TextureFwdKernelNearest2,
            (void*)TextureFwdKernelNearest4,
            (void*)TextureFwdKernelLinear1,
            (void*)TextureFwdKernelLinear2,
            (void*)TextureFwdKernelLinear4,
            (void*)TextureFwdKernelLinearMipmapNearest1,
            (void*)TextureFwdKernelLinearMipmapNearest2,
            (void*)TextureFwdKernelLinearMipmapNearest4,
            (void*)TextureFwdKernelLinearMipmapLinear1,
            (void*)TextureFwdKernelLinearMipmapLinear2,
            (void*)TextureFwdKernelLinearMipmapLinear4,
            (void*)TextureFwdKernelCubeNearest1,
            (void*)TextureFwdKernelCubeNearest2,
            (void*)TextureFwdKernelCubeNearest4,
            (void*)TextureFwdKernelCubeLinear1,
            (void*)TextureFwdKernelCubeLinear2,
            (void*)TextureFwdKernelCubeLinear4,
            (void*)TextureFwdKernelCubeLinearMipmapNearest1,
            (void*)TextureFwdKernelCubeLinearMipmapNearest2,
            (void*)TextureFwdKernelCubeLinearMipmapNearest4,
            (void*)TextureFwdKernelCubeLinearMipmapLinear1,
            (void*)TextureFwdKernelCubeLinearMipmapLinear2,
            (void*)TextureFwdKernelCubeLinearMipmapLinear4,
        };
        // Function index.
        int func_idx = p.filterMode;
        if (cube_mode)
            func_idx += TEX_MODE_COUNT; // Cube map variants occupy the second half of the table.
        func_idx = func_idx * 3 + channel_div_idx; // Channel vectorization variant.
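
        // Example (illustrative, assuming the TEX_MODE_* enum values follow the row
        // order of the table above): linear filtering in cube map mode with channels
        // divisible by 4 gives func_idx = (TEX_MODE_LINEAR + TEX_MODE_COUNT) * 3 + 2,
        // which selects TextureFwdKernelCubeLinear4.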
        // Launch kernel.
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
    }
};
| REGISTER_OP("TextureFwd") | |
| .Input ("tex: float") | |
| .Input ("uv: float") | |
| .Output ("out: float") | |
| .Attr ("filter_mode: int") | |
| .Attr ("boundary_mode: int"); | |
| REGISTER_OP("TextureFwdMip") | |
| .Input ("tex: float") | |
| .Input ("uv: float") | |
| .Input ("uv_da: float") | |
| .Output ("out: float") | |
| .Output ("mip: float") | |
| .Attr ("filter_mode: int") | |
| .Attr ("boundary_mode: int") | |
| .Attr ("tex_const: int") | |
| .Attr ("max_mip_level: int"); | |
| REGISTER_KERNEL_BUILDER(Name("TextureFwd") .Device(DEVICE_GPU), TextureFwdOp); | |
| REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp); | |
//------------------------------------------------------------------------
// Gradient TensorFlow op.

struct TextureGradOp : public OpKernel
{
    TextureKernelParams m_attribs;

    TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
        parseOpAttributes(ctx, m_attribs);
    }
    void Compute(OpKernelContext* ctx)
    {
        TextureKernelParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
        bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);

        // Get input. Without mipmapping there are no uv_da and mip inputs, so alias them to dy to keep the indices valid.
        const Tensor& tex   = ctx->input(0);
        const Tensor& uv    = ctx->input(1);
        const Tensor& dy    = ctx->input(2);
        const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2);
        const Tensor& mip   = ctx->input(p.enableMip ? 4 : 2);

        // Extract input dimensions.
        p.n         = (uv.dims() > 0) ? uv.dim_size(0) : 0;
        p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
        p.imgWidth  = (uv.dims() > 2) ? uv.dim_size(2) : 0;
        p.texDepth  = (tex.dims() > 0) ? tex.dim_size(0) : 0;
        if (!cube_mode)
        {
            p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
            p.texWidth  = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.channels  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
        }
        else
        {
            p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.texWidth  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
            p.channels  = (tex.dims() > 4) ? tex.dim_size(4) : 0;
        }

        // Sanity checks.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape [>0, >0, >0, >0]"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
        }
        else
        {
            OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape [>0, 6, >0, >0, >0] in cube map mode"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
            OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
        }
        OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
        OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]"));
        if (p.enableMip)
        {
            if (!cube_mode)
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
            else
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
        }
        // Get input pointers.
        p.tex[0] = tex.flat<float>().data();
        p.uv = uv.flat<float>().data();
        p.dy = dy.flat<float>().data();
        p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;
        float* pmip = p.enableMip ? (float*)mip.flat<float>().data() : 0;

        // Allocate output tensor for tex gradient.
        Tensor* grad_tex_tensor = NULL;
        TensorShape grad_tex_shape;
        grad_tex_shape.AddDim(p.texDepth);
        if (cube_mode)
            grad_tex_shape.AddDim(6);
        grad_tex_shape.AddDim(p.texHeight);
        grad_tex_shape.AddDim(p.texWidth);
        grad_tex_shape.AddDim(p.channels);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor));
        p.gradTex[0] = grad_tex_tensor->flat<float>().data();

        // Allocate output tensor for uv gradient.
        if (p.filterMode != TEX_MODE_NEAREST)
        {
            TensorShape grad_uv_shape;
            Tensor* grad_uv_tensor = NULL;
            grad_uv_shape.AddDim(p.n);
            grad_uv_shape.AddDim(p.imgHeight);
            grad_uv_shape.AddDim(p.imgWidth);
            grad_uv_shape.AddDim(uv.dim_size(3));
            OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor));
            p.gradUV = grad_uv_tensor->flat<float>().data();

            // Allocate output tensor for uv_da gradient.
            if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
            {
                Tensor* grad_uv_da_tensor = NULL;
                grad_uv_shape.set_dim(3, uv_da.dim_size(3));
                OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor));
                p.gradUVDA = grad_uv_da_tensor->flat<float>().data();
            }
        }
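
        // The number of outputs thus depends on the filter mode, matching the op
        // registrations below: nearest produces grad_tex only, linear adds grad_uv,
        // and linear-mipmap-linear additionally produces grad_uv_da. Linear-mipmap-
        // nearest gets no uv_da gradient because its mip level choice is discrete.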
        // Choose kernel variants based on channel count.
        int channel_div_idx = 0;
        if (!(p.channels & 3))
            channel_div_idx = 2; // Channel count divisible by 4.
        else if (!(p.channels & 1))
            channel_div_idx = 1; // Channel count divisible by 2.
        // Mip-related setup.
        Tensor grad_mip_tensor;
        float* pgradMip = 0;
        if (p.enableMip)
        {
            // Generate mip offsets.
            int mipOffsets[TEX_MAX_MIP_LEVEL];
            int mipTotal = calculateMipInfo(ctx, p, mipOffsets);

            // Get space for temporary mip gradients.
            TensorShape grad_mip_shape;
            grad_mip_shape.AddDim(mipTotal);
            OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor));
            pgradMip = grad_mip_tensor.flat<float>().data();
            for (int i=1; i <= p.mipLevelMax; i++)
            {
                p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels.
                p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients.
            }

            // Clear mip gradients.
            OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(pgradMip, 0, mipTotal * sizeof(float), stream));
        }
        // Initialize texture gradients to zero. Use size_t so large textures cannot overflow the byte count.
        size_t texBytes = (size_t)p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float);
        if (cube_mode)
            texBytes *= 6;
        OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex[0], 0, texBytes, stream));
        // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4"));
        }
        else
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2"));
        }
        if ((p.channels & 3) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 15), errors::Internal("grad_tex output tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), errors::Internal("dy input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 15), errors::Internal("internal mip gradient tensor not aligned to float4"));
        }
        if ((p.channels & 1) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 7), errors::Internal("grad_tex output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 7), errors::Internal("internal mip gradient tensor not aligned to float2"));
        }
        // Choose launch parameters for main gradient kernel.
        void* args[] = {&p};
        dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
        dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);

        void* func_tbl[TEX_MODE_COUNT * 2] = {
            (void*)TextureGradKernelNearest,
            (void*)TextureGradKernelLinear,
            (void*)TextureGradKernelLinearMipmapNearest,
            (void*)TextureGradKernelLinearMipmapLinear,
            (void*)TextureGradKernelCubeNearest,
            (void*)TextureGradKernelCubeLinear,
            (void*)TextureGradKernelCubeLinearMipmapNearest,
            (void*)TextureGradKernelCubeLinearMipmapLinear,
        };

        // Function index.
        int func_idx = p.filterMode;
        if (cube_mode)
            func_idx += TEX_MODE_COUNT;

        // Launch main gradient kernel.
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
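
        // Two-pass gradient scheme: the kernel above scatters dL/dtex into the
        // per-level buffers wired up in p.gradTex[]; the kernel below then folds the
        // mip-level gradients back into the base-level grad_tex output, using dynamic
        // shared memory (p.channels floats per thread) as per-block accumulation space.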
        // Launch kernel to pull gradients from mip levels.
        if (p.enableMip)
        {
            dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight);
            dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1));
            int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float);

            void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 };
            OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream));
        }
    }
};

REGISTER_OP("TextureGradNearest")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Output     ("grad_tex: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int");

REGISTER_OP("TextureGradLinear")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int");

REGISTER_OP("TextureGradLinearMipmapNearest")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Input      ("uv_da: float")
    .Input      ("mip: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int")
    .Attr       ("max_mip_level: int");

REGISTER_OP("TextureGradLinearMipmapLinear")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Input      ("uv_da: float")
    .Input      ("mip: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Output     ("grad_uv_da: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int")
    .Attr       ("max_mip_level: int");

REGISTER_KERNEL_BUILDER(Name("TextureGradNearest")            .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinear")             .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp);

//------------------------------------------------------------------------