| // Copyright (C) 2018-2022 Intel Corporation |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
|
|
| |
| |
|
|
| __kernel void grn(__global const half *restrict src_data, __global half *restrict dst_data, int C, float bias) |
| { |
| __local half src[8 * 1024]; |
| __local half dst[8 * 1024]; |
|
|
| const size_t index = get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0); |
|
|
| event_t e1 = async_work_group_copy_3D3D( |
| src, // dst |
| src_data + index, // src |
| get_local_size(0), // num_elements_per_line, |
| get_local_size(1), // num_lines, |
| get_global_size(0) - get_local_size(0), // src_line_stride, |
| 0, // dst_line_stride, |
| C, // num_planes, |
| get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src_plane_stride |
| 0, // dst_plane_stride |
| 0); |
| wait_group_events(1, &e1); |
|
|
| float variance = bias + 1e-9f; |
|
|
| |
| for (int c = 0; c < C; c++) { |
| float val = (float)src[c * get_local_size(1) * get_local_size(0) |
| + get_local_id(1) * get_local_size(0) |
| + get_local_id(0)]; |
| variance += val * val; |
| } |
|
|
| half hvariance = (half)(native_rsqrt((half)(variance / 16.f)) * 0.25f); |
|
|
| |
| for (int c = 0; c < C; c++) { |
| dst[c * get_local_size(1) * get_local_size(0) |
| + get_local_id(1) * get_local_size(0) |
| + get_local_id(0)] = |
| src[c * get_local_size(1) * get_local_size(0) |
| + get_local_id(1) * get_local_size(0) + get_local_id(0)] * hvariance; |
| } |
|
|
| barrier(CLK_LOCAL_MEM_FENCE); |
|
|
| event_t e2 = async_work_group_copy_3D3D( |
| dst_data + index, // src |
| dst, // dst |
| get_local_size(0), // num_elements_per_line, |
| get_local_size(1), // num_lines, |
| 0, // src_line_stride, |
| get_global_size(0) - get_local_size(0), // dst_line_stride, |
| C, // num_planes, |
| 0, // src_plane_stride |
| get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride |
| 0); |
| wait_group_events(1, &e2); |
| } |
|
|