// stable-diffusion-2-1-ait / CLIPTextModel / fused_elementwise_376.cu
#include <cuda_fp16.h>
#include <cuda_bf16.h>
using bfloat16 = nv_bfloat16;
using bfloat16_2 = nv_bfloat162;
#include <cassert> // assert() is used by NOT_IMPLEMENTED() below.
#include <cmath> // std::ceil is used by the launcher at the bottom.
#include "cutlass/cutlass.h"
#include "cutlass/fast_math.h"
#include "cutlass/constants.h"
#include "cutlass/epilogue/thread/activation.h"
#include "math_constants.h"
namespace {
#define FUSED_ELE_THREAD_SIZE 256
const int N_ELEMENTS_PER_THREAD = sizeof(uint4) / sizeof(half);
const int N_ELEMENTS_PER_READ = sizeof(uint4) / sizeof(half);
const int N_OPS_PER_THREAD = sizeof(uint4) / sizeof(half2);
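// Sanity checks on the vector-width arithmetic above: one uint4 (16 bytes)
// packs 8 half elements, which are processed as 4 half2 operations per thread.
static_assert(sizeof(uint4) == 16, "expected 16-byte uint4 vectors");
static_assert(sizeof(half) == 2, "expected 2-byte half elements");
static_assert(sizeof(half2) == 4, "expected 4-byte half2 pairs");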
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef CUSTOM_MATH
#define CUSTOM_MATH
#ifndef __HALF2_TO_UI
#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int*>(&(var)))
#endif
#ifndef __HALF_TO_US
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#endif
#define NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
#define CUDA_FP16_ZERO \
__half { \
0x0u \
}
#define CUDA_BF16_ZERO \
__nv_bfloat16 { \
0x0u \
}
#define CUDA_FP162_ZERO \
__half2 { \
0x0u, 0x0u \
}
#define CUDA_BF162_ZERO \
__nv_bfloat162 { \
0x0u, 0x0u \
}
#define CUDA_FP16_ONE \
__half_raw { \
0x3c00u \
}
#define CUDA_BF16_ONE \
__nv_bfloat16_raw { \
0x3f80u \
}
#define CUDA_FP16_ONE_HALF \
__half_raw { \
0x3800u \
}
#define CUDA_BF16_ONE_HALF \
__nv_bfloat16_raw { \
0x3f00u \
}
// sqrt(2 / pi)
#define CUDA_BF16_K1 \
__nv_bfloat16_raw { \
0x3f4c \
}
// 2/(3*pi) - 1/6
#define CUDA_BF16_K3 \
__nv_bfloat16_raw { \
0x3d3a \
}
template <typename T>
__device__ T sign_custom(const T a) {
return T(a > T(0)) - T(a < T(0));
}
__device__ half2 h2sign_custom(const half2 a) {
return __hsub2(__hgt2(a, CUDA_FP162_ZERO), __hlt2(a, CUDA_FP162_ZERO));
}
__device__ bfloat16_2 h2sign_custom(const bfloat16_2 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hsub2(__hgt2(a, CUDA_BF162_ZERO), __hlt2(a, CUDA_BF162_ZERO));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half2 fast_tanh(half2 x) {
#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
(__CUDA_ARCH__ >= 750)
asm volatile("tanh.approx.f16x2 %0, %1;"
: "=r"(__HALF2_TO_UI(x))
: "r"(__HALF2_TO_UI(x)));
return x;
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half fast_tanh(half x) {
#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
(__CUDA_ARCH__ >= 750)
asm volatile("tanh.approx.f16 %0, %1;"
: "=h"(__HALF_TO_US(x))
: "h"(__HALF_TO_US(x)));
return x;
#else
return half(cutlass::fast_tanh(float(x)));
#endif
}
__device__ bfloat16_2 fast_tanh(bfloat16_2 x) {
#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
(__CUDA_ARCH__ >= 900)
asm volatile("tanh.approx.bf16x2 %0, %1;"
: "=r"(__HALF_TO_UI(x))
: "r"(__HALF_TO_UI(x)));
return x;
#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return bfloat16_2(
{cutlass::fast_tanh(float(x.x)), cutlass::fast_tanh(float(x.y))});
#else
NOT_IMPLEMENTED();
#endif
}
__device__ bfloat16 fast_tanh(bfloat16 x) {
#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
(__CUDA_ARCH__ >= 900)
asm volatile("tanh.approx.bf16 %0, %1;"
: "=h"(__HALF_TO_US(x))
: "h"(__HALF_TO_US(x)));
return x;
#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return cutlass::fast_tanh(float(x));
#else
NOT_IMPLEMENTED();
#endif
}
// sigmoid(x) = (tanh(x / 2) + 1) / 2; the half/bf16 variants below use the
// same identity.
__device__ float fsigmoid_custom(const float a) {
  return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;
}
__device__ half hsigmoid_custom(const half a) {
return __hmul(
(__hadd(fast_tanh(__hmul(a, CUDA_FP16_ONE_HALF)), CUDA_FP16_ONE)),
CUDA_FP16_ONE_HALF);
}
__device__ half2 h2sigmoid_custom(const half2 a) {
const auto halfX2 = half2(CUDA_FP16_ONE_HALF, CUDA_FP16_ONE_HALF);
const auto oneX2 = half2(CUDA_FP16_ONE, CUDA_FP16_ONE);
return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2);
}
__device__ bfloat16 hsigmoid_custom(const bfloat16 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmul(
(__hadd(fast_tanh(__hmul(a, CUDA_BF16_ONE_HALF)), CUDA_BF16_ONE)),
CUDA_BF16_ONE_HALF);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ bfloat16_2 h2sigmoid_custom(const bfloat16_2 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
const auto halfX2 = bfloat16_2(CUDA_BF16_ONE_HALF, CUDA_BF16_ONE_HALF);
const auto oneX2 = bfloat16_2(CUDA_BF16_ONE, CUDA_BF16_ONE);
return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float fsilu(const float a) {
return a * fsigmoid_custom(a);
}
__device__ half hsilu(const half a) {
return __hmul(a, hsigmoid_custom(a));
}
__device__ bfloat16 hsilu(const bfloat16 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmul(a, hsigmoid_custom(a));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half2 h2silu(const half2 a) {
return __hmul2(a, h2sigmoid_custom(a));
}
__device__ bfloat16_2 h2silu(const bfloat16_2 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmul2(a, h2sigmoid_custom(a));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float leaky_relu(const float a, const float negativeSlope) {
return a > 0.f ? a : a * negativeSlope;
}
__device__ half leaky_relu(const half a, const half negativeSlope) {
return a > half(0.f) ? a : __hmul(a, negativeSlope);
}
__device__ bfloat16 leaky_relu(const bfloat16 a, const bfloat16 negativeSlope) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return a > bfloat16(0.f) ? a : __hmul(a, negativeSlope);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half2 leaky_relu(const half2 a, const half2 negativeSlope) {
return half2(
leaky_relu(a.x, negativeSlope.x), leaky_relu(a.y, negativeSlope.y));
}
__device__ bfloat16_2
leaky_relu(const bfloat16_2 a, const bfloat16_2 negativeSlope) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return bfloat16_2(
leaky_relu(a.x, negativeSlope.x), leaky_relu(a.y, negativeSlope.y));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float relu(const float a) {
return fmaxf(a, 0.f);
}
__device__ half relu(const half a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax(a, CUDA_FP16_ZERO);
#else
return a > CUDA_FP16_ZERO ? a : CUDA_FP16_ZERO;
#endif
}
__device__ half2 relu(const half2 a) {
const half2 zeroX2 = half2(CUDA_FP16_ZERO, CUDA_FP16_ZERO);
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax2(a, zeroX2);
#else
return half2(relu(a.x), relu(a.y));
#endif
}
__device__ bfloat16 relu(const bfloat16 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax(a, CUDA_BF16_ZERO);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ bfloat16_2 relu(const bfloat16_2 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax2(a, CUDA_BF162_ZERO);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ bfloat16
hard_tanh(const bfloat16 a, const bfloat16 min_val, const bfloat16 max_val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax(min_val, __hmin(max_val, a));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half
hard_tanh(const half a, const half min_val, const half max_val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax(min_val, __hmin(max_val, a));
#else
return a > max_val ? max_val : a < min_val ? min_val : a;
#endif
}
__device__ float hard_tanh(
const float a,
const float min_val,
const float max_val) {
return fmaxf(min_val, fminf(max_val, a));
}
__device__ half2
h2hard_tanh(const half2 a, const half2 min_val, const half2 max_val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax2(min_val, __hmin2(max_val, a));
#else
return half2(
hard_tanh(a.x, min_val.x, max_val.x),
hard_tanh(a.y, min_val.y, max_val.y));
#endif
}
__device__ bfloat16_2 h2hard_tanh(
const bfloat16_2 a,
const bfloat16_2 min_val,
const bfloat16_2 max_val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax2(min_val, __hmin2(max_val, a));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half replace_if_inf(
const half a,
const half inf_replace,
const half neginf_replace) {
auto is_inf = __hisinf(a);
if (is_inf == -1) {
return neginf_replace;
}
if (is_inf == 1) {
return inf_replace;
}
return a;
}
__device__ bfloat16 replace_if_inf(
const bfloat16 a,
const bfloat16 inf_replace,
const bfloat16 neginf_replace) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
auto is_inf = __hisinf(a);
if (is_inf == -1) {
return neginf_replace;
}
if (is_inf == 1) {
return inf_replace;
}
return a;
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float replace_if_inf(
const float a,
const float inf_replace,
const float neginf_replace) {
if (isinf(a)) {
return (a > 0) ? inf_replace : neginf_replace;
}
return a;
}
__device__ half2 nan_to_num(
const half2 a,
const half2 nan_replace,
const half2 inf_replace,
const half2 neginf_replace) {
half2 isnan = __hisnan2(a);
return half2(
isnan.x ? nan_replace.x
: replace_if_inf(a.x, inf_replace.x, neginf_replace.x),
isnan.y ? nan_replace.y
: replace_if_inf(a.y, inf_replace.y, neginf_replace.y));
}
__device__ bfloat16_2 nan_to_num(
const bfloat16_2 a,
const bfloat16_2 nan_replace,
const bfloat16_2 inf_replace,
const bfloat16_2 neginf_replace) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
bfloat16_2 isnan = __hisnan2(a);
return bfloat16_2(
isnan.x ? nan_replace.x
: replace_if_inf(a.x, inf_replace.x, neginf_replace.x),
isnan.y ? nan_replace.y
: replace_if_inf(a.y, inf_replace.y, neginf_replace.y));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half nan_to_num(
const half a,
const half nan_replace,
const half inf_replace,
const half neginf_replace) {
if (__hisnan(a)) {
return nan_replace;
}
return replace_if_inf(a, inf_replace, neginf_replace);
}
__device__ bfloat16 nan_to_num(
const bfloat16 a,
const bfloat16 nan_replace,
const bfloat16 inf_replace,
const bfloat16 neginf_replace) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
if (__hisnan(a)) {
return nan_replace;
}
return replace_if_inf(a, inf_replace, neginf_replace);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float nan_to_num(
const float a,
const float nan_replace,
const float inf_replace,
const float neginf_replace) {
if (isnan(a)) {
return nan_replace;
}
return replace_if_inf(a, inf_replace, neginf_replace);
}
__device__ half2 clamp_nan_to_num(
const half2 a,
const half2 clamp_min,
const half2 clamp_max,
const half2 nan_replace) {
half2 isnan = __hisnan2(a);
return half2(
isnan.x ? nan_replace.x : hard_tanh(a.x, clamp_min.x, clamp_max.x),
isnan.y ? nan_replace.y : hard_tanh(a.y, clamp_min.y, clamp_max.y));
}
__device__ bfloat16_2 clamp_nan_to_num(
const bfloat16_2 a,
const bfloat16_2 clamp_min,
const bfloat16_2 clamp_max,
const bfloat16_2 nan_replace) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
auto isnan = __hisnan2(a);
return bfloat16_2(
isnan.x ? nan_replace.x : hard_tanh(a.x, clamp_min.x, clamp_max.x),
isnan.y ? nan_replace.y : hard_tanh(a.y, clamp_min.y, clamp_max.y));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half clamp_nan_to_num(
const half a,
const half clamp_min,
const half clamp_max,
const half nan_replace) {
return __hisnan(a) ? nan_replace : hard_tanh(a, clamp_min, clamp_max);
}
__device__ bfloat16 clamp_nan_to_num(
const bfloat16 a,
const bfloat16 clamp_min,
const bfloat16 clamp_max,
const bfloat16 nan_replace) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hisnan(a) ? nan_replace : hard_tanh(a, clamp_min, clamp_max);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float clamp_nan_to_num(
const float a,
const float clamp_min,
const float clamp_max,
const float nan_replace) {
return isnan(a) ? nan_replace : hard_tanh(a, clamp_min, clamp_max);
}
// Fallback implementations for __CUDA_ARCH__ < 800
__device__ half nanh() {
return __float2half(nanf(""));
}
__device__ bool half_isnan(half h) {
  return h != h; // NaN is the only value that compares unequal to itself.
}
__device__ half hmin(half a, half b) {
return (a < b) ? a : b;
}
__device__ half hmax(half a, half b) {
return (a > b) ? a : b;
}
// max/min functions that let NaNs pass through
__device__ float fmaxf_nan(const float a, const float b) {
return (isnan(a) || isnan(b)) ? nanf("") : fmaxf(a, b);
}
__device__ half hmax_nan(const half a, const half b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax_nan(a, b);
#else
return (half_isnan(a) || half_isnan(b)) ? nanh() : hmax(a, b);
#endif
}
__device__ bfloat16 hmax_nan(const bfloat16 a, const bfloat16 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax_nan(a, b);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half2 hmax2_nan(const half2 a, const half2 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax2_nan(a, b);
#else
return half2(hmax_nan(a.x, b.x), hmax_nan(a.y, b.y));
#endif
}
__device__ bfloat16_2 hmax2_nan(const bfloat16_2 a, const bfloat16_2 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmax2_nan(a, b);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float fminf_nan(const float a, const float b) {
return (isnan(a) || isnan(b)) ? nanf("") : fminf(a, b);
}
__device__ half hmin_nan(const half a, const half b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmin_nan(a, b);
#else
return (half_isnan(a) || half_isnan(b)) ? nanh() : hmin(a, b);
#endif
}
__device__ bfloat16 hmin_nan(const bfloat16 a, const bfloat16 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmin_nan(a, b);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half2 hmin2_nan(const half2 a, const half2 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmin2_nan(a, b);
#else
return half2(hmin_nan(a.x, b.x), hmin_nan(a.y, b.y));
#endif
}
__device__ bfloat16_2 hmin2_nan(const bfloat16_2 a, const bfloat16_2 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmin2_nan(a, b);
#else
NOT_IMPLEMENTED();
#endif
}
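// Note the contrast with the IEEE-style intrinsics: fmaxf(NAN, 1.f) returns
// 1.f (the non-NaN operand wins), whereas fmaxf_nan(NAN, 1.f) returns NaN.
// The *_nan variants above propagate NaNs so that downstream nan_to_num-style
// ops can still observe them.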
// pow impl
__device__ half hpow(const half a, const half b);
__device__ bfloat16 hpow(const bfloat16 a, const bfloat16 b);
__device__ half2 h2pow(const half2 a, const half2 b) {
half b1 = __low2half(b);
half b2 = __high2half(b);
if (b1 != b2) {
half a1 = __low2half(a);
half a2 = __high2half(a);
half c1 = hpow(a1, b1);
half c2 = hpow(a2, b2);
return __halves2half2(c1, c2);
}
  // New special cases can be added here if needed, e.g. a powi fast path
  // for cases where b is an integer.
if (__hbeq2(b, half2(0.0, 0.0))) {
return half2(1.0, 1.0);
}
if (__hbeq2(b, half2(1.0, 1.0))) {
return a;
}
if (__hbeq2(b, half2(2.0, 2.0))) {
return __hmul2(a, a);
}
if (__hbeq2(b, half2(3.0, 3.0))) {
return __hmul2(__hmul2(a, a), a);
}
if (__hbeq2(b, half2(0.5, 0.5))) {
return h2sqrt(a);
}
if (__hbeq2(b, half2(-0.5, -0.5))) {
return h2rsqrt(a);
}
if (__hbeq2(b, half2(-1.0, -1.0))) {
return __h2div(half2(1.0, 1.0), a);
}
if (__hbeq2(b, half2(-2.0, -2.0))) {
return __h2div(half2(1.0, 1.0), __hmul2(a, a));
}
half a1 = __low2half(a);
half a2 = __high2half(a);
// low 16 bits
half c1 =
static_cast<half>(pow(static_cast<double>(a1), static_cast<double>(b1)));
// high 16 bits
half c2 =
static_cast<half>(pow(static_cast<double>(a2), static_cast<double>(b2)));
return __halves2half2(c1, c2);
}
__device__ bfloat16_2 h2pow(const bfloat16_2 a, const bfloat16_2 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
auto b1 = __low2bfloat16(b);
auto b2 = __high2bfloat16(b);
if (b1 != b2) {
auto a1 = __low2bfloat16(a);
auto a2 = __high2bfloat16(a);
auto c1 = hpow(a1, b1);
auto c2 = hpow(a2, b2);
return __halves2bfloat162(c1, c2);
}
  // New special cases can be added here if needed, e.g. a powi fast path
  // for cases where b is an integer.
if (__hbeq2(b, bfloat16_2(0.0, 0.0))) {
return bfloat16_2(1.0, 1.0);
}
if (__hbeq2(b, bfloat16_2(1.0, 1.0))) {
return a;
}
if (__hbeq2(b, bfloat16_2(2.0, 2.0))) {
return __hmul2(a, a);
}
if (__hbeq2(b, bfloat16_2(3.0, 3.0))) {
return __hmul2(__hmul2(a, a), a);
}
if (__hbeq2(b, bfloat16_2(0.5, 0.5))) {
return h2sqrt(a);
}
if (__hbeq2(b, bfloat16_2(-0.5, -0.5))) {
return h2rsqrt(a);
}
if (__hbeq2(b, bfloat16_2(-1.0, -1.0))) {
return __h2div(bfloat16_2(1.0, 1.0), a);
}
if (__hbeq2(b, bfloat16_2(-2.0, -2.0))) {
return __h2div(bfloat16_2(1.0, 1.0), __hmul2(a, a));
}
auto a1 = __low2bfloat16(a);
auto a2 = __high2bfloat16(a);
// low 16 bits
auto c1 = static_cast<bfloat16>(
pow(static_cast<double>(__bfloat162float(a1)),
static_cast<double>(__bfloat162float(b1))));
// high 16 bits
auto c2 = static_cast<bfloat16>(
pow(static_cast<double>(__bfloat162float(a2)),
static_cast<double>(__bfloat162float(b2))));
return __halves2bfloat162(c1, c2);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half hpow(const half a, const half b) {
if (b == half(0.0)) {
return half(1.0);
}
if (b == half(1.0)) {
return a;
}
if (b == half(2.0)) {
return a * a;
}
if (b == half(3.0)) {
return a * a * a;
}
if (b == half(0.5)) {
return hsqrt(a);
}
if (b == half(-0.5)) {
return hrsqrt(a);
}
if (b == half(-1.0)) {
return half(1.0) / a;
}
if (b == half(-2.0)) {
return half(1.0) / (a * a);
}
return static_cast<half>(pow(static_cast<double>(a), static_cast<double>(b)));
}
__device__ bfloat16 hpow(const bfloat16 a, const bfloat16 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
if (b == bfloat16(0.0)) {
return bfloat16(1.0);
}
if (b == bfloat16(1.0)) {
return a;
}
if (b == bfloat16(2.0)) {
return a * a;
}
if (b == bfloat16(3.0)) {
return a * a * a;
}
if (b == bfloat16(0.5)) {
return hsqrt(a);
}
if (b == bfloat16(-0.5)) {
return hrsqrt(a);
}
if (b == bfloat16(-1.0)) {
return bfloat16(1.0) / a;
}
if (b == bfloat16(-2.0)) {
return bfloat16(1.0) / (a * a);
}
return static_cast<bfloat16>(
pow(static_cast<double>(__bfloat162float(a)),
static_cast<double>(__bfloat162float(b))));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float fpow(const float a, const float b) {
if (b == float(0.0)) {
return float(1.0);
}
if (b == float(1.0)) {
return a;
}
if (b == float(2.0)) {
return a * a;
}
if (b == float(3.0)) {
return a * a * a;
}
if (b == float(0.5)) {
return sqrt(a);
}
if (b == float(-0.5)) {
return rsqrt(a);
}
if (b == float(-1.0)) {
return float(1.0) / a;
}
if (b == float(-2.0)) {
return float(1.0) / (a * a);
}
return static_cast<float>(
pow(static_cast<double>(a), static_cast<double>(b)));
}
//
// GELU function definitions implemented as described by
// Hendrycks, D., and Gimpel, K. in
// "Gaussian Error Linear Units (GELUs)." (2020)
// https://arxiv.org/pdf/1606.08415.pdf
//
// Floating-point constants are Taylor coefficients described in the paper.
//
__device__ half hgelu(const half a) {
cutlass::epilogue::thread::GELU<cutlass::half_t> gelu_op;
return static_cast<half>(gelu_op(static_cast<cutlass::half_t>(a)));
}
__device__ bfloat16 hgelu(const bfloat16 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmul(
a,
__hmul(
CUDA_BF16_ONE_HALF,
__hadd(
CUDA_BF16_ONE,
bfloat16(erff(__bfloat162float(a) * rsqrtf(2.f))))));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float fgelu(const float a) {
return a * .5f * (1.f + erff(a * rsqrtf(2.f)));
}
__device__ half h_fast_gelu(const half a) {
cutlass::epilogue::thread::GELU_taylor<cutlass::half_t> gelu_op;
return static_cast<half>(gelu_op(static_cast<cutlass::half_t>(a)));
}
// The constant 0.044715 used in the linked paper
// (https://arxiv.org/pdf/1606.08415.pdf) slightly differs from the one
// computed analytically, 2/(3*pi) - 1/6 ~ 0.045539, which CUDA_BF16_K3
// above approximates:
//   atanh(x) = x + x^3/3 + O(x^5),
//   erf(x/sqrt(2)) = sqrt(2/pi)*(x - x^3/6 + O(x^5)),
//   atanh(erf(x/sqrt(2))) = sqrt(2/pi)*x + (sqrt(2/pi)*x)^3/3
//       - (sqrt(2/pi)/6)*x^3 + O(x^5)
//     = sqrt(2/pi)*x*(1 + (2/(3*pi) - 1/6)*x^2 + O(x^4)).
// The Cutlass implementation hardcodes the constant from the paper.
__device__ bfloat16 h_fast_gelu(const bfloat16 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hmul(
a,
__hmul(
CUDA_BF16_ONE_HALF,
__hadd(
CUDA_BF16_ONE,
fast_tanh(
__hmul(CUDA_BF16_K1, a) *
__hadd(CUDA_BF16_ONE, __hmul(CUDA_BF16_K3, a * a))))));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float f_fast_gelu(const float a) {
cutlass::epilogue::thread::GELU_taylor<float> gelu_op;
return gelu_op(a);
}
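// For reference, a plain-float sketch of the same tanh-based approximation
// with the paper's constants written out (illustrative only; the kernels in
// this file use the cutlass functors and bf16 intrinsics above instead):
__device__ float f_fast_gelu_ref(const float a) {
  // 0.7978845608f ~ sqrt(2/pi); 0.044715f is the cubic constant from the
  // paper (see the CUDA_BF16_K3 discussion above).
  return 0.5f * a *
      (1.f + tanhf(0.7978845608f * a * (1.f + 0.044715f * a * a)));
}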
__device__ float fsoftplus(
const float a,
const float beta,
const float threshold) {
return (a * beta > threshold) ? a : log1pf(expf(a * beta)) / beta;
}
__device__ half hsoftplus(const half a, const half beta, const half threshold) {
return __hgt(__hmul(a, beta), threshold)
? a
: __hdiv(hlog(__hadd(CUDA_FP16_ONE, hexp(__hmul(a, beta)))), beta);
}
__device__ bfloat16
hsoftplus(const bfloat16 a, const bfloat16 beta, const bfloat16 threshold) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hgt(__hmul(a, beta), threshold)
? a
: __hdiv(hlog(__hadd(CUDA_BF16_ONE, hexp(__hmul(a, beta)))), beta);
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half2
h2softplus(const half2 a, const half2 beta, const half2 threshold) {
return half2(
hsoftplus(a.x, beta.x, threshold.x), hsoftplus(a.y, beta.y, threshold.y));
}
__device__ bfloat16_2 h2softplus(
const bfloat16_2 a,
const bfloat16_2 beta,
const bfloat16_2 threshold) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return bfloat16_2(
hsoftplus(a.x, beta.x, threshold.x), hsoftplus(a.y, beta.y, threshold.y));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float felu(const float op_input, const float alpha) {
return op_input > 0.f ? op_input : alpha * (expf(op_input) - 1.0f);
}
__device__ half helu(const half op_input, const half alpha) {
return __hgt(op_input, CUDA_FP16_ZERO)
? op_input
: __hmul(alpha, __hsub(hexp(op_input), CUDA_FP16_ONE));
}
__device__ bfloat16 helu(const bfloat16 op_input, const bfloat16 alpha) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hgt(op_input, CUDA_BF16_ZERO)
? op_input
: __hmul(alpha, __hsub(hexp(op_input), CUDA_BF16_ONE));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half2 h2elu(const half2 op_input, const half2 alpha) {
return half2(helu(op_input.x, alpha.x), helu(op_input.y, alpha.y));
}
__device__ bfloat16_2 h2elu(const bfloat16_2 op_input, const bfloat16_2 alpha) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return bfloat16_2(helu(op_input.x, alpha.x), helu(op_input.y, alpha.y));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half hsoftsign(const half a) {
return __hdiv(a, __hadd(CUDA_FP16_ONE, __habs(a)));
}
__device__ half2 h2softsign(const half2 a) {
return __h2div(a, __hadd2(half2(1.0, 1.0), __habs2(a)));
}
__device__ float fsoftsign(const float a) {
return a / (1.0f + fabsf(a));
}
__device__ bfloat16 hsoftsign(const bfloat16 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __hdiv(a, __hadd(CUDA_BF16_ONE, __habs(a)));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ bfloat16_2 h2softsign(const bfloat16_2 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return __h2div(a, __hadd2(bfloat16_2(1.0, 1.0), __habs2(a)));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ float floor_div(const float a, const float b) {
return floor(a / b);
}
__device__ half floor_div(const half a, const half b) {
return hfloor(__hdiv(a, b));
}
__device__ bfloat16 floor_div(const bfloat16 a, const bfloat16 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return hfloor(__hdiv(a, b));
#else
NOT_IMPLEMENTED();
#endif
}
__device__ half2 floor_div(const half2 a, const half2 b) {
return half2(floor_div(a.x, b.x), floor_div(a.y, b.y));
}
__device__ bfloat16_2 floor_div(const bfloat16_2 a, const bfloat16_2 b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
return bfloat16_2(floor_div(a.x, b.x), floor_div(a.y, b.y));
#else
NOT_IMPLEMENTED();
#endif
}
#endif
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef AIT_TENSOR_ACCESSOR_CUH
#define AIT_TENSOR_ACCESSOR_CUH
// Returns a strided address computed from a base pointer, an index, and
// stride information.
// DATA_T: tensor data type.
// READ_T: actual data type used when reading data. e.g. for a "half"
// tensor, READ_T could be uint4 when all data is aligned.
// data: A base pointer in READ_T type.
// idx: read index in terms of READ_T.
// offset, original_total_elements_from_stride_dim and
// actual_total_elements_from_stride_dim are the corresponding data member
// values of TensorAccessor.
template <typename DATA_T, typename READ_T, bool is_contiguous>
__device__ __forceinline__ READ_T* get_strided_address(
READ_T* data,
int64_t idx,
int64_t offset,
int64_t original_total_elements_from_stride_dim,
int64_t actual_total_elements_from_stride_dim) {
(void)original_total_elements_from_stride_dim; // Suppress incorrect declared
// but never referenced warning
// from nvcc.
(void)actual_total_elements_from_stride_dim; // Ditto.
if constexpr (is_contiguous) {
return reinterpret_cast<READ_T*>(reinterpret_cast<DATA_T*>(data) + offset) +
idx;
} else {
constexpr int N_ELEMENTS_PER_READ = sizeof(READ_T) / sizeof(DATA_T);
int64_t data_idx = idx * N_ELEMENTS_PER_READ;
int64_t num_rows = data_idx / original_total_elements_from_stride_dim;
int64_t row_offset = data_idx % original_total_elements_from_stride_dim;
data_idx =
num_rows * actual_total_elements_from_stride_dim + row_offset + offset;
return reinterpret_cast<READ_T*>(
reinterpret_cast<DATA_T*>(data) + data_idx);
}
return nullptr; // Suppress incorrect warning about missing return statement
// from nvcc.
}
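// Worked example (hypothetical numbers): for DATA_T = half, READ_T = uint4
// (N_ELEMENTS_PER_READ = 8), offset = 16,
// original_total_elements_from_stride_dim = 96 (logical row length) and
// actual_total_elements_from_stride_dim = 128 (padded row length), a read at
// idx = 13 maps to data_idx = 13 * 8 = 104 -> num_rows = 1, row_offset = 8,
// so the returned address is element 1 * 128 + 8 + 16 = 152 of the
// underlying half buffer.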
static inline uint64_t max_power2_divisor(uint64_t n) {
// max power of 2 which divides n
return n & (~(n - 1));
}
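// n & ~(n - 1) isolates the lowest set bit: e.g. max_power2_divisor(24) == 8
// (24 = 0b11000), and max_power2_divisor(0) == 0 under this convention.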
// A TensorAccessor which handles strided tensor access underneath.
struct TensorAccessor {
int64_t offset{0};
bool is_contiguous{true};
int stride_dim{-1};
int64_t original_total_elements_from_stride_dim{-1};
int64_t actual_total_elements_from_stride_dim{-1};
// Returns an address based on a base pointer and an index.
// DATA_T: tensor data type.
// READ_T: actual data type used when reading data. e.g. for a "half"
// tensor, READ_T could be uint4 when all data is aligned.
// data: A base pointer in READ_T type.
// idx: read index in terms of READ_T.
template <typename DATA_T, typename READ_T>
__device__ inline READ_T* get(READ_T* data, int64_t idx) const {
return is_contiguous ? get_strided_address<DATA_T, READ_T, true>(
data,
idx,
offset,
original_total_elements_from_stride_dim,
actual_total_elements_from_stride_dim)
: get_strided_address<DATA_T, READ_T, false>(
data,
idx,
offset,
original_total_elements_from_stride_dim,
actual_total_elements_from_stride_dim);
}
uint64_t max_alignment() const {
// gcd of max alignments
auto alignment = max_power2_divisor(offset);
if (!is_contiguous) {
alignment |= max_power2_divisor(original_total_elements_from_stride_dim);
alignment |= max_power2_divisor(actual_total_elements_from_stride_dim);
}
return max_power2_divisor(alignment);
}
bool is_valid_alignment(uint64_t n) const {
// n is a power of 2; return whether tensor accessor alignment is divisible
// by n.
return !(max_alignment() & (n - 1));
}
};
#endif
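// A minimal usage sketch of TensorAccessor (hypothetical field values; the
// generated model fills these in at codegen time): view logical rows of 96
// halves inside a buffer padded to a row stride of 128 halves, starting 16
// elements in. tensor_accessor_example is illustrative only; the kernel
// below uses the contiguous fast path directly.
__device__ __forceinline__ half2* tensor_accessor_example(
    half2* base,
    int64_t idx) {
  TensorAccessor acc;
  acc.offset = 16;
  acc.is_contiguous = false;
  acc.stride_dim = 1;
  acc.original_total_elements_from_stride_dim = 96;
  acc.actual_total_elements_from_stride_dim = 128;
  // For idx = 13: data_idx = 26 halves -> row 0, column 26, so the result
  // points at element 0 * 128 + 26 + 16 = 42 of the underlying half buffer.
  return acc.get<half, half2>(base, idx);
}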
__global__ void fused_elementwise_376(
    uint4* output0,
    const uint4* input0,
    const uint4* input1,
    int64_t n_elements) {
const int bid = blockIdx.x;
const int tid = threadIdx.x;
const int64_t idx = bid * FUSED_ELE_THREAD_SIZE + tid;
const int64_t idx_elem = idx * N_ELEMENTS_PER_THREAD;
if (idx_elem >= n_elements) {
return;
}
// Vectorized access: each input is read through uint4, 8 halves at a time.
uint4* input_tmp0 = const_cast<uint4*>(input0);
input_tmp0 = get_strided_address</*data_t*/ half,
/*read_t*/ uint4,
/*is_contiguous*/ true>(
input_tmp0, idx, 0, 0, 0);
uint4 tmp_i0 = *input_tmp0;
const half2* p_tmp_i0 = reinterpret_cast<const half2*>(&tmp_i0);
uint4* input_tmp1 = const_cast<uint4*>(input1);
input_tmp1 = get_strided_address</*data_t*/ half,
/*read_t*/ uint4,
/*is_contiguous*/ true>(
input_tmp1, idx, 0, 0, 0);
uint4 tmp_i1 = *input_tmp1;
const half2* p_tmp_i1 = reinterpret_cast<const half2*>(&tmp_i1);
uint4 tmp_o0;
half2* p_tmp_o0 = reinterpret_cast<half2*>(&tmp_o0);
// The fused op itself: packed half2 addition of the two input vectors.
#pragma unroll
for (int i = 0; i < N_OPS_PER_THREAD; ++i) {
  p_tmp_o0[i] = __hadd2(p_tmp_i1[i], p_tmp_i0[i]);
}
output0 = get_strided_address</*data_t*/ half,
/*read_t*/ uint4,
/*is_contiguous*/ true>(
output0, idx, 0, 0, 0);
*output0 = tmp_o0;
}
} // namespace
void invoke_fused_elementwise_376(
    void* output0,
    const void* input0,
    const void* input1,
    int64_t n_elements,
    cudaStream_t stream) {
  if (n_elements == 0) {
    return;
  }
  // Grid size: each block of FUSED_ELE_THREAD_SIZE threads covers
  // FUSED_ELE_THREAD_SIZE * N_ELEMENTS_PER_THREAD half elements; round up.
  // (block_size here is the grid dimension, i.e. the number of blocks.)
  int block_size = static_cast<int>(std::ceil(
      static_cast<double>(n_elements) / N_ELEMENTS_PER_THREAD /
      FUSED_ELE_THREAD_SIZE));
  fused_elementwise_376<<<block_size, FUSED_ELE_THREAD_SIZE, 0, stream>>>(
      reinterpret_cast<uint4*>(output0),
      reinterpret_cast<const uint4*>(input0),
      reinterpret_cast<const uint4*>(input1),
      n_elements);
}
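// Example host-side call (a hypothetical sketch; in the real model the AIT
// runtime owns the buffers and stream). Adds two half tensors of n elements:
//
//   const int64_t n = 8 * 1024;
//   half *a, *b, *out;
//   cudaMalloc(&a, n * sizeof(half));
//   cudaMalloc(&b, n * sizeof(half));
//   cudaMalloc(&out, n * sizeof(half));
//   // ... fill a and b ...
//   invoke_fused_elementwise_376(out, a, b, n, /*stream=*/0);
//   cudaDeviceSynchronize();
//
// Note: inputs are read as uint4, so the buffers must be 16-byte aligned and
// n should keep the last thread's full 8-element read within bounds.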