# EVOLVE-BLOCK-START """ Initial Grayscale submission with Triton kernel. Y = 0.2989 R + 0.5870 G + 0.1140 B """ import torch import triton import triton.language as tl @triton.jit def grayscale_kernel( rgb_ptr, out_ptr, H, W, stride_h, stride_w, stride_c, BLOCK_SIZE: tl.constexpr, ): pid = tl.program_id(0) n_pixels = H * W block_start = pid * BLOCK_SIZE offsets = block_start + tl.arange(0, BLOCK_SIZE) mask = offsets < n_pixels h_idx = offsets // W w_idx = offsets % W r_ptr = rgb_ptr + h_idx * stride_h + w_idx * stride_w + 0 * stride_c g_ptr = rgb_ptr + h_idx * stride_h + w_idx * stride_w + 1 * stride_c b_ptr = rgb_ptr + h_idx * stride_h + w_idx * stride_w + 2 * stride_c r = tl.load(r_ptr, mask=mask) g = tl.load(g_ptr, mask=mask) b = tl.load(b_ptr, mask=mask) gray = 0.2989 * r + 0.5870 * g + 0.1140 * b out_offsets = h_idx * W + w_idx tl.store(out_ptr + out_offsets, gray, mask=mask) def custom_kernel(data): rgb, output = data H, W, C = rgb.shape assert C == 3 rgb = rgb.contiguous() stride_h, stride_w, stride_c = rgb.stride() n_pixels = H * W BLOCK_SIZE = 1024 grid = (triton.cdiv(n_pixels, BLOCK_SIZE),) grayscale_kernel[grid]( rgb, output, H, W, stride_h, stride_w, stride_c, BLOCK_SIZE=BLOCK_SIZE, ) return output # EVOLVE-BLOCK-END