|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
|
|
|
|
#ifdef cl_intel_subgroups |
|
|
#pragma OPENCL EXTENSION cl_intel_subgroups : enable |
|
|
#else |
|
|
#pragma OPENCL EXTENSION cl_khr_subgroups : enable |
|
|
#endif |
|
|
|
|
|
#ifdef cl_intel_required_subgroup_size |
|
|
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable |
|
|
#define INTEL_GPU 1 |
|
|
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) |
|
|
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) |
|
|
#elif defined(cl_qcom_reqd_sub_group_size) |
|
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable |
|
|
#define ADRENO_GPU 1 |
|
|
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) |
|
|
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) |
|
|
#endif |
|
|
|
|
|
#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; } |
|
|
|
|
|
enum ggml_sort_order { |
|
|
GGML_SORT_ORDER_ASC, |
|
|
GGML_SORT_ORDER_DESC, |
|
|
}; |
|
|
|
|
|
kernel void kernel_argsort_f32_i32( |
|
|
global float * src0, |
|
|
ulong offset0, |
|
|
global int * dst, |
|
|
ulong offsetd, |
|
|
const int ne00, |
|
|
const int ne00_pad, |
|
|
const int order, |
|
|
local int * dst_row |
|
|
) { |
|
|
|
|
|
int col = get_local_id(0); |
|
|
int row = get_group_id(1); |
|
|
|
|
|
if (col >= ne00_pad) { |
|
|
return; |
|
|
} |
|
|
|
|
|
src0 = (global char *)((global char *)src0 + offset0); |
|
|
dst = (global float *)((global char *)dst + offsetd); |
|
|
|
|
|
global float * x_row = src0 + row * ne00; |
|
|
|
|
|
|
|
|
dst_row[col] = col; |
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
for (int k = 2; k <= ne00_pad; k *= 2) { |
|
|
for (int j = k / 2; j > 0; j /= 2) { |
|
|
int ixj = col ^ j; |
|
|
if (ixj > col) { |
|
|
if ((col & k) == 0) { |
|
|
if (dst_row[col] >= ne00 || |
|
|
(dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ? |
|
|
x_row[dst_row[col]] > x_row[dst_row[ixj]] : |
|
|
x_row[dst_row[col]] < x_row[dst_row[ixj]])) |
|
|
) { |
|
|
SWAP(dst_row[col], dst_row[ixj], int); |
|
|
} |
|
|
} else { |
|
|
if (dst_row[ixj] >= ne00 || |
|
|
(dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ? |
|
|
x_row[dst_row[col]] < x_row[dst_row[ixj]] : |
|
|
x_row[dst_row[col]] > x_row[dst_row[ixj]])) |
|
|
) { |
|
|
SWAP(dst_row[col], dst_row[ixj], int); |
|
|
} |
|
|
} |
|
|
} |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
if (col < ne00) { |
|
|
dst[row * ne00 + col] = dst_row[col]; |
|
|
} |
|
|
} |
|
|
|