|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "nvShaderExtnEnums.h" |
|
|
|
|
|
// Record written to the g_NvidiaExt pseudo-UAV to pass an NVIDIA shader
// extension operation (opcode + operands) to the driver. The driver
// pattern-matches accesses to this struct, so the field layout is part of
// the driver ABI — do not reorder or resize fields.
struct NvShaderExtnStruct
{
    uint opcode;          // NV_EXTN_OP_* opcode selecting the extension operation
    uint rid;             // resource id        — NOTE(review): inferred from name; confirm against NVAPI docs
    uint sid;             // sampler id         — NOTE(review): inferred from name; confirm against NVAPI docs

    uint4 dst1u;          // extra destination operand (for ops with more than one output)
    uint4 padding0[3];    // unused / reserved

    uint4 src0u;          // uint source operand 0
    uint4 src1u;          // uint source operand 1
    uint4 src2u;          // uint source operand 2
    uint4 dst0u;          // uint destination operand (result read back by the helpers)

    uint markUavRef;      // set to 1 to tag the next UAV access as the op's target resource
    float padding1[28];   // pads the struct to 256 bytes total
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Pseudo-UAV through which extension opcodes are communicated to the driver.
// The application must #define NV_SHADER_EXTN_SLOT (e.g. u7) — and, when using
// register spaces, NV_SHADER_EXTN_REGISTER_SPACE — before including this file.
#ifdef NV_SHADER_EXTN_REGISTER_SPACE
RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT, NV_SHADER_EXTN_REGISTER_SPACE );
#else
RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT );
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Builds the control mask used by warp shuffle operations from the logical
// segment width: bits [12:8] carry (NV_WARP_SIZE - width) and bits [4:0]
// carry the maximum lane index (0x1F).
int __NvGetShflMaskFromWidth(uint width)
{
    uint segmentBits = (NV_WARP_SIZE - width) << 8;
    uint maxLaneBits = 0x1F;
    return int(segmentBits | maxLaneBits);
}
|
|
|
|
|
|
|
|
|
|
|
// Binds the given UAV to the next extension intrinsic: the markUavRef write
// followed by a dummy store to the UAV is a marker sequence the NVIDIA driver
// recognizes to identify the target resource. Do not reorder these statements.
void __NvReferenceUAVForOp(RWByteAddressBuffer uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav.Store(index, 0);   // dummy write; consumed by the driver, not executed as-is
}
|
|
|
|
|
// RWTexture1D<float2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = float2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<float2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = float2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<float2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = float2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture1D<float4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = float4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<float4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = float4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<float4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = float4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture1D<float> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0.0f;   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<float> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0.0f;   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<float> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0.0f;   // dummy write to bind the UAV
}
|
|
|
|
|
|
|
|
// RWTexture1D<uint2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = uint2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<uint2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = uint2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<uint2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = uint2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture1D<uint4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = uint4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<uint4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = uint4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<uint4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = uint4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture1D<uint> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0;   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<uint> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0;   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<uint> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0;   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture1D<int2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = int2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<int2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = int2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<int2> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = int2(0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture1D<int4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = int4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<int4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = int4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<int4> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = int4(0,0,0,0);   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture1D<int> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture1D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0;   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture2D<int> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture2D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0;   // dummy write to bind the UAV
}
|
|
|
|
|
// RWTexture3D<int> overload; same driver marker sequence as the
// RWByteAddressBuffer overload above.
void __NvReferenceUAVForOp(RWTexture3D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0;   // dummy write to bind the UAV
}
|
|
|
|
|
|
|
|
|
|
|
// Values accepted by the atomicOpType argument of the fp16/fp32 atomic
// helpers below. The numeric values are part of the driver ABI.
#define NV_EXTN_ATOM_ADD 3
#define NV_EXTN_ATOM_MAX 6
#define NV_EXTN_ATOM_MIN 7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Packed fp16x2 atomic (atomicOpType = NV_EXTN_ATOM_ADD/MIN/MAX) at
// byteAddress of a byte-address buffer. Returns the previous packed fp16x2
// value. The g_NvidiaExt writes form the marker sequence the driver replaces
// with the hardware atomic — do not reorder them.
uint __NvAtomicOpFP16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);   // identify the target UAV for the driver
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = byteAddress;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;

    return g_NvidiaExt[index].dst0u.x;   // previous value at byteAddress
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Packed fp16x2 atomic on an RWTexture1D<float2> texel; returns the previous
// packed fp16x2 value. Same driver marker sequence as the buffer overload.
uint __NvAtomicOpFP16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;

    return g_NvidiaExt[index].dst0u.x;
}
|
|
|
|
|
// Packed fp16x2 atomic on an RWTexture2D<float2> texel; returns the previous
// packed fp16x2 value. Same driver marker sequence as the buffer overload.
uint __NvAtomicOpFP16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;

    return g_NvidiaExt[index].dst0u.x;
}
|
|
|
|
|
// Packed fp16x2 atomic on an RWTexture3D<float2> texel; returns the previous
// packed fp16x2 value. Same driver marker sequence as the buffer overload.
uint __NvAtomicOpFP16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;

    return g_NvidiaExt[index].dst0u.x;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Packed fp16x2 atomic on an RWTexture1D<float4>: each float4 texel holds two
// fp16x2 pairs, so the operation is issued twice — the xy half at address*2
// and the zw half at address*2 + 1. Returns both previous packed values.
uint2 __NvAtomicOpFP16x2(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    uint2 retVal;

    // first fp16x2 pair (xy half of the texel)
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address * 2;
    g_NvidiaExt[index].src1u.x = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second fp16x2 pair (zw half of the texel)
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address * 2 + 1;
    g_NvidiaExt[index].src1u.x = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}
|
|
|
|
|
// Packed fp16x2 atomic on an RWTexture2D<float4>: each float4 texel holds two
// fp16x2 pairs, so the operation is issued twice — the xy half at x*2 and the
// zw half at x*2 + 1. Returns both previous packed values.
uint2 __NvAtomicOpFP16x2(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    uint2 retVal;

    // first fp16x2 pair (xy half of the texel)
    uint2 addressTemp = uint2(address.x * 2, address.y);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = addressTemp;
    g_NvidiaExt[index].src1u.x = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second fp16x2 pair (zw half of the texel)
    addressTemp.x++;
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = addressTemp;
    g_NvidiaExt[index].src1u.x = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}
|
|
|
|
|
// Packed fp16x2 atomic on an RWTexture3D<float4>: each float4 texel holds two
// fp16x2 pairs, so the operation is issued twice — the xy half at x*2 and the
// zw half at x*2 + 1. Returns both previous packed values.
uint2 __NvAtomicOpFP16x2(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    uint2 retVal;

    // first fp16x2 pair (xy half of the texel)
    uint3 addressTemp = uint3(address.x * 2, address.y, address.z);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = addressTemp;
    g_NvidiaExt[index].src1u.x = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second fp16x2 pair (zw half of the texel)
    addressTemp.x++;
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = addressTemp;
    g_NvidiaExt[index].src1u.x = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}
|
|
|
|
|
// Packs two fp32 values into a single uint holding two fp16 halves:
// val.y in the high 16 bits, val.x in the low 16 bits.
uint __fp32x2Tofp16x2(float2 val)
{
    uint lowHalf  = f32tof16(val.x);
    uint highHalf = f32tof16(val.y) << 16;
    return highHalf | lowHalf;
}
|
|
|
|
|
// Packs four fp32 values into two uints of fp16 pairs:
// result.x = (y<<16)|x, result.y = (w<<16)|z.
uint2 __fp32x4Tofp16x4(float4 val)
{
    uint2 packed;
    packed.x = (f32tof16(val.y) << 16) | f32tof16(val.x);
    packed.y = (f32tof16(val.w) << 16) | f32tof16(val.z);
    return packed;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// fp32 atomic add at byteAddress of a byte-address buffer; returns the
// previous value. The g_NvidiaExt writes form the marker sequence the driver
// replaces with the hardware atomic — do not reorder them.
float __NvAtomicAddFP32(RWByteAddressBuffer uav, uint byteAddress, float val)
{
    __NvReferenceUAVForOp(uav);   // identify the target UAV for the driver
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = byteAddress;
    g_NvidiaExt[index].src1u.x = asuint(val);   // fp32 bits passed through a uint operand
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;

    return asfloat(g_NvidiaExt[index].dst0u.x);   // previous value at byteAddress
}
|
|
|
|
|
// fp32 atomic add on an RWTexture1D<float> texel; returns the previous value.
// Same driver marker sequence as the buffer overload.
float __NvAtomicAddFP32(RWTexture1D<float> uav, uint address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].src1u.x = asuint(val);
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;

    return asfloat(g_NvidiaExt[index].dst0u.x);
}
|
|
|
|
|
// fp32 atomic add on an RWTexture2D<float> texel; returns the previous value.
// Same driver marker sequence as the buffer overload.
float __NvAtomicAddFP32(RWTexture2D<float> uav, uint2 address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].src1u.x = asuint(val);
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;

    return asfloat(g_NvidiaExt[index].dst0u.x);
}
|
|
|
|
|
// fp32 atomic add on an RWTexture3D<float> texel; returns the previous value.
// Same driver marker sequence as the buffer overload.
float __NvAtomicAddFP32(RWTexture3D<float> uav, uint3 address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].src1u.x = asuint(val);
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;

    return asfloat(g_NvidiaExt[index].dst0u.x);
}
|
|
|
|
|
|