|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "nvHLSLExtnsInternal.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// NvShfl: records a NV_EXTN_OP_SHFL request in g_NvidiaExt.
//   val     - value stored in src0u.x
//   srcLane - lane selector stored in src0u.y
//   width   - converted by __NvGetShflMaskFromWidth() and stored in src0u.z
//             (defaults to NV_WARP_SIZE)
// Returns the value produced by a second IncrementCounter() call on g_NvidiaExt.
// NOTE(review): presumably the driver substitutes the shuffled value for that
// second counter result - confirm against the NVAPI HLSL extension docs.
int NvShfl(int val, uint srcLane, int width = NV_WARP_SIZE)
{
    // The first counter bump selects the entry that carries this request.
    uint slot = g_NvidiaExt.IncrementCounter();

    // Operands first, opcode last (same store order as before).
    g_NvidiaExt[slot].src0u.x = val;
    g_NvidiaExt[slot].src0u.y = srcLane;
    g_NvidiaExt[slot].src0u.z = __NvGetShflMaskFromWidth(width);
    g_NvidiaExt[slot].opcode  = NV_EXTN_OP_SHFL;

    // The second counter bump yields the value handed back to the caller.
    int result = g_NvidiaExt.IncrementCounter();
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// NvShflUp: records a NV_EXTN_OP_SHFL_UP request in g_NvidiaExt.
//   val   - value stored in src0u.x
//   delta - lane offset stored in src0u.y
//   width - encoded as (NV_WARP_SIZE - width) << 8 and stored in src0u.z
//           (defaults to NV_WARP_SIZE)
// Returns the value produced by a second IncrementCounter() call on g_NvidiaExt.
// NOTE(review): unlike the sibling shuffles, this one does not go through
// __NvGetShflMaskFromWidth(); presumably intentional for the "up" variant - confirm.
int NvShflUp(int val, uint delta, int width = NV_WARP_SIZE)
{
    // The first counter bump selects the entry that carries this request.
    uint slot = g_NvidiaExt.IncrementCounter();

    // Operands first, opcode last (same store order as before).
    g_NvidiaExt[slot].src0u.x = val;
    g_NvidiaExt[slot].src0u.y = delta;
    g_NvidiaExt[slot].src0u.z = (NV_WARP_SIZE - width) << 8;
    g_NvidiaExt[slot].opcode  = NV_EXTN_OP_SHFL_UP;

    // The second counter bump yields the value handed back to the caller.
    int result = g_NvidiaExt.IncrementCounter();
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// NvShflDown: records a NV_EXTN_OP_SHFL_DOWN request in g_NvidiaExt.
//   val   - value stored in src0u.x
//   delta - lane offset stored in src0u.y
//   width - converted by __NvGetShflMaskFromWidth() and stored in src0u.z
//           (defaults to NV_WARP_SIZE)
// Returns the value produced by a second IncrementCounter() call on g_NvidiaExt.
int NvShflDown(int val, uint delta, int width = NV_WARP_SIZE)
{
    // The first counter bump selects the entry that carries this request.
    uint slot = g_NvidiaExt.IncrementCounter();

    // Operands first, opcode last (same store order as before).
    g_NvidiaExt[slot].src0u.x = val;
    g_NvidiaExt[slot].src0u.y = delta;
    g_NvidiaExt[slot].src0u.z = __NvGetShflMaskFromWidth(width);
    g_NvidiaExt[slot].opcode  = NV_EXTN_OP_SHFL_DOWN;

    // The second counter bump yields the value handed back to the caller.
    int result = g_NvidiaExt.IncrementCounter();
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// NvShflXor: records a NV_EXTN_OP_SHFL_XOR request in g_NvidiaExt.
//   val      - value stored in src0u.x
//   laneMask - mask stored in src0u.y
//   width    - converted by __NvGetShflMaskFromWidth() and stored in src0u.z
//              (defaults to NV_WARP_SIZE)
// Returns the value produced by a second IncrementCounter() call on g_NvidiaExt.
int NvShflXor(int val, uint laneMask, int width = NV_WARP_SIZE)
{
    // The first counter bump selects the entry that carries this request.
    uint slot = g_NvidiaExt.IncrementCounter();

    // Operands first, opcode last (same store order as before).
    g_NvidiaExt[slot].src0u.x = val;
    g_NvidiaExt[slot].src0u.y = laneMask;
    g_NvidiaExt[slot].src0u.z = __NvGetShflMaskFromWidth(width);
    g_NvidiaExt[slot].opcode  = NV_EXTN_OP_SHFL_XOR;

    // The second counter bump yields the value handed back to the caller.
    int result = g_NvidiaExt.IncrementCounter();
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// NvAny: records a NV_EXTN_OP_VOTE_ANY request in g_NvidiaExt.
//   predicate - value stored in src0u.x
// Returns the value produced by a second IncrementCounter() call on g_NvidiaExt.
uint NvAny(int predicate)
{
    // The first counter bump selects the entry that carries this request.
    uint slot = g_NvidiaExt.IncrementCounter();

    g_NvidiaExt[slot].src0u.x = predicate;
    g_NvidiaExt[slot].opcode  = NV_EXTN_OP_VOTE_ANY;

    // The second counter bump yields the value handed back to the caller.
    uint result = g_NvidiaExt.IncrementCounter();
    return result;
}
|
|
|
|
|
|
|
|
// NvAll: records a NV_EXTN_OP_VOTE_ALL request in g_NvidiaExt.
//   predicate - value stored in src0u.x
// Returns the value produced by a second IncrementCounter() call on g_NvidiaExt.
uint NvAll(int predicate)
{
    // The first counter bump selects the entry that carries this request.
    uint slot = g_NvidiaExt.IncrementCounter();

    g_NvidiaExt[slot].src0u.x = predicate;
    g_NvidiaExt[slot].opcode  = NV_EXTN_OP_VOTE_ALL;

    // The second counter bump yields the value handed back to the caller.
    uint result = g_NvidiaExt.IncrementCounter();
    return result;
}
|
|
|
|
|
|
|
|
// NvBallot: records a NV_EXTN_OP_VOTE_BALLOT request in g_NvidiaExt.
//   predicate - value stored in src0u.x
// Returns the value produced by a second IncrementCounter() call on g_NvidiaExt.
uint NvBallot(int predicate)
{
    // The first counter bump selects the entry that carries this request.
    uint slot = g_NvidiaExt.IncrementCounter();

    g_NvidiaExt[slot].src0u.x = predicate;
    g_NvidiaExt[slot].opcode  = NV_EXTN_OP_VOTE_BALLOT;

    // The second counter bump yields the value handed back to the caller.
    uint result = g_NvidiaExt.IncrementCounter();
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// NvGetLaneId: records a NV_EXTN_OP_GET_LANE_ID request in g_NvidiaExt.
// No operands are written; only the opcode field is set.
// Returns the value produced by a second IncrementCounter() call on g_NvidiaExt.
int NvGetLaneId()
{
    // The first counter bump selects the entry that carries this request.
    uint slot = g_NvidiaExt.IncrementCounter();

    g_NvidiaExt[slot].opcode = NV_EXTN_OP_GET_LANE_ID;

    // The second counter bump yields the value handed back to the caller.
    int result = g_NvidiaExt.IncrementCounter();
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// FP16x2 atomic ADD on a RWByteAddressBuffer.
//   uav         - target buffer
//   byteAddress - address forwarded unchanged to the helper
//   fp16x2Val   - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its result.
uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MIN on a RWByteAddressBuffer.
//   uav         - target buffer
//   byteAddress - address forwarded unchanged to the helper
//   fp16x2Val   - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its result.
uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MAX on a RWByteAddressBuffer.
//   uav         - target buffer
//   byteAddress - address forwarded unchanged to the helper
//   fp16x2Val   - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its result.
uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
// FP16x2 atomic ADD on a RWByteAddressBuffer, taking the operand as a float2.
//   uav         - target buffer
//   byteAddress - address forwarded unchanged to the helper
//   val         - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its result.
uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MIN on a RWByteAddressBuffer, taking the operand as a float2.
//   uav         - target buffer
//   byteAddress - address forwarded unchanged to the helper
//   val         - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its result.
uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MAX on a RWByteAddressBuffer, taking the operand as a float2.
//   uav         - target buffer
//   byteAddress - address forwarded unchanged to the helper
//   val         - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its result.
uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// FP16x2 atomic ADD on a RWTexture1D<float2>.
//   uav       - target texture
//   address   - uint coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its result.
uint NvInterlockedAddFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MIN on a RWTexture1D<float2>.
//   uav       - target texture
//   address   - uint coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its result.
uint NvInterlockedMinFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MAX on a RWTexture1D<float2>.
//   uav       - target texture
//   address   - uint coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its result.
uint NvInterlockedMaxFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
// FP16x2 atomic ADD on a RWTexture2D<float2>.
//   uav       - target texture
//   address   - uint2 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its result.
uint NvInterlockedAddFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MIN on a RWTexture2D<float2>.
//   uav       - target texture
//   address   - uint2 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its result.
uint NvInterlockedMinFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MAX on a RWTexture2D<float2>.
//   uav       - target texture
//   address   - uint2 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its result.
uint NvInterlockedMaxFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
// FP16x2 atomic ADD on a RWTexture3D<float2>.
//   uav       - target texture
//   address   - uint3 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its result.
uint NvInterlockedAddFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MIN on a RWTexture3D<float2>.
//   uav       - target texture
//   address   - uint3 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its result.
uint NvInterlockedMinFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MAX on a RWTexture3D<float2>.
//   uav       - target texture
//   address   - uint3 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint, forwarded unchanged
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its result.
uint NvInterlockedMaxFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
// FP16x2 atomic ADD on a RWTexture1D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its result.
uint NvInterlockedAddFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MIN on a RWTexture1D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its result.
uint NvInterlockedMinFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MAX on a RWTexture1D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its result.
uint NvInterlockedMaxFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
// FP16x2 atomic ADD on a RWTexture2D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint2 coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its result.
uint NvInterlockedAddFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MIN on a RWTexture2D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint2 coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its result.
uint NvInterlockedMinFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MAX on a RWTexture2D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint2 coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its result.
uint NvInterlockedMaxFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
// FP16x2 atomic ADD on a RWTexture3D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint3 coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its result.
uint NvInterlockedAddFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MIN on a RWTexture3D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint3 coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its result.
uint NvInterlockedMinFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x2 atomic MAX on a RWTexture3D<float2>, taking the operand as a float2.
//   uav     - target texture
//   address - uint3 coordinate forwarded unchanged to the helper
//   val     - float2 operand, converted with __fp32x2Tofp16x2() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its result.
uint NvInterlockedMaxFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
{
    uint result = __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// FP16x4 atomic ADD on a RWTexture1D<float4>.
//   uav       - target texture
//   address   - uint coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_ADD, and returns its uint2 result.
uint2 NvInterlockedAddFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MIN on a RWTexture1D<float4>.
//   uav       - target texture
//   address   - uint coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_MIN, and returns its uint2 result.
uint2 NvInterlockedMinFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MAX on a RWTexture1D<float4>.
//   uav       - target texture
//   address   - uint coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_MAX, and returns its uint2 result.
uint2 NvInterlockedMaxFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
// FP16x4 atomic ADD on a RWTexture2D<float4>.
//   uav       - target texture
//   address   - uint2 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_ADD, and returns its uint2 result.
uint2 NvInterlockedAddFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MIN on a RWTexture2D<float4>.
//   uav       - target texture
//   address   - uint2 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_MIN, and returns its uint2 result.
uint2 NvInterlockedMinFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MAX on a RWTexture2D<float4>.
//   uav       - target texture
//   address   - uint2 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_MAX, and returns its uint2 result.
uint2 NvInterlockedMaxFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
// FP16x4 atomic ADD on a RWTexture3D<float4>.
//   uav       - target texture
//   address   - uint3 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_ADD, and returns its uint2 result.
uint2 NvInterlockedAddFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MIN on a RWTexture3D<float4>.
//   uav       - target texture
//   address   - uint3 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_MIN, and returns its uint2 result.
uint2 NvInterlockedMinFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MAX on a RWTexture3D<float4>.
//   uav       - target texture
//   address   - uint3 coordinate forwarded unchanged to the helper
//   fp16x2Val - operand as a uint2, forwarded unchanged
// Forwards to the __NvAtomicOpFP16x2 overload selected by these argument types,
// with NV_EXTN_ATOM_MAX, and returns its uint2 result.
uint2 NvInterlockedMaxFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
|
|
|
// FP16x4 atomic ADD on a RWTexture1D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its uint2 result.
uint2 NvInterlockedAddFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MIN on a RWTexture1D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its uint2 result.
uint2 NvInterlockedMinFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MAX on a RWTexture1D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its uint2 result.
uint2 NvInterlockedMaxFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
// FP16x4 atomic ADD on a RWTexture2D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint2 coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its uint2 result.
uint2 NvInterlockedAddFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MIN on a RWTexture2D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint2 coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its uint2 result.
uint2 NvInterlockedMinFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MAX on a RWTexture2D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint2 coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its uint2 result.
uint2 NvInterlockedMaxFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
// FP16x4 atomic ADD on a RWTexture3D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint3 coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_ADD and returns its uint2 result.
uint2 NvInterlockedAddFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MIN on a RWTexture3D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint3 coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MIN and returns its uint2 result.
uint2 NvInterlockedMinFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
    return result;
}
|
|
|
|
|
// FP16x4 atomic MAX on a RWTexture3D<float4>, taking the operand as a float4.
//   uav     - target texture
//   address - uint3 coordinate forwarded unchanged to the helper
//   val     - float4 operand, converted with __fp32x4Tofp16x4() before forwarding
// Forwards to __NvAtomicOpFP16x2 with NV_EXTN_ATOM_MAX and returns its uint2 result.
uint2 NvInterlockedMaxFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
{
    uint2 result = __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// FP32 atomic ADD on a RWByteAddressBuffer.
//   uav         - target buffer
//   byteAddress - address forwarded unchanged to the helper
//   val         - float operand, forwarded unchanged
// Forwards to __NvAtomicAddFP32 and returns its float result.
float NvInterlockedAddFp32(RWByteAddressBuffer uav, uint byteAddress, float val)
{
    float result = __NvAtomicAddFP32(uav, byteAddress, val);
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// FP32 atomic ADD on a RWTexture1D<float>.
//   uav     - target texture
//   address - uint coordinate forwarded unchanged to the helper
//   val     - float operand, forwarded unchanged
// Forwards to __NvAtomicAddFP32 and returns its float result.
float NvInterlockedAddFp32(RWTexture1D<float> uav, uint address, float val)
{
    float result = __NvAtomicAddFP32(uav, address, val);
    return result;
}
|
|
|
|
|
// FP32 atomic ADD on a RWTexture2D<float>.
//   uav     - target texture
//   address - uint2 coordinate forwarded unchanged to the helper
//   val     - float operand, forwarded unchanged
// Forwards to __NvAtomicAddFP32 and returns its float result.
float NvInterlockedAddFp32(RWTexture2D<float> uav, uint2 address, float val)
{
    float result = __NvAtomicAddFP32(uav, address, val);
    return result;
}
|
|
|
|
|
// FP32 atomic ADD on a RWTexture3D<float>.
//   uav     - target texture
//   address - uint3 coordinate forwarded unchanged to the helper
//   val     - float operand, forwarded unchanged
// Forwards to __NvAtomicAddFP32 and returns its float result.
float NvInterlockedAddFp32(RWTexture3D<float> uav, uint3 address, float val)
{
    float result = __NvAtomicAddFP32(uav, address, val);
    return result;
}
|
|
|
|
|
|