|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| #if !defined(CURAND_KERNEL_H_) |
| #define CURAND_KERNEL_H_ |
|
|
| |
| |
| |
| |
| |
|
|
| #if !defined(QUALIFIERS) |
| #define QUALIFIERS static __forceinline__ __device__ |
| #endif |
|
|
| |
| #if !defined(GCC_UNUSED_PARAMETER) |
| #if defined(__GNUC__) |
| #define GCC_UNUSED_PARAMETER __attribute__((unused)) |
| #else |
| #define GCC_UNUSED_PARAMETER |
| #endif |
| #endif |
|
|
| #include <nv/target> |
|
|
| #ifdef __CUDACC_RTC__ |
| #define CURAND_DETAIL_USE_CUDA_STL |
| #endif |
|
|
| #if __cplusplus >= 201103L |
| # ifdef CURAND_DETAIL_USE_CUDA_STL |
| # define CURAND_STD cuda::std |
| # include <cuda/std/type_traits> |
| # else |
| # define CURAND_STD std |
| # include <type_traits> |
| # endif |
| #else |
| |
| # define CURAND_STD curand_detail |
| namespace curand_detail { |
| template<bool B, class T = void> |
| struct enable_if {}; |
|
|
| template<class T> |
| struct enable_if<true, T> { typedef T type; }; |
|
|
| template<class T, class U> |
| struct is_same { static const bool value = false; }; |
|
|
| template<class T> |
| struct is_same<T, T> { static const bool value = true; }; |
| } |
| #endif |
|
|
| #ifndef __CUDACC_RTC__ |
| #include <math.h> |
| #endif |
|
|
| #include "curand.h" |
| #include "curand_discrete.h" |
| #include "curand_precalc.h" |
| #include "curand_mrg32k3a.h" |
| #include "curand_mtgp32_kernel.h" |
| #include "curand_philox4x32_x.h" |
| #include "curand_globals.h" |
|
|
| |
| |
| |
| |
| |
| |
/*
 * Minimal generator state used for internal testing only: a single
 * 32-bit counter (see the curandStateTest_t curand()/skipahead()
 * overloads below). Not a real RNG.
 */
struct curandStateTest {
    unsigned int v;
};

/* Test state typedef */
typedef struct curandStateTest curandStateTest_t;
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
/* Forward declaration of the XORWOW generator state type. */
struct curandStateXORWOW;

/*
 * XORWOW generator state.
 * v[5] holds the 160-bit xorshift state; d is an additive Weyl counter
 * (incremented by 362437 on every draw — see curand(curandStateXORWOW_t*)).
 * The boxmuller_* members cache the spare value of a Box-Muller pair for
 * the normal/log-normal generators (managed in curand_normal.h /
 * curand_lognormal.h, not visible in this chunk).
 */
struct curandStateXORWOW {
    unsigned int d, v[5];
    int boxmuller_flag;
    int boxmuller_flag_double;
    float boxmuller_extra;
    double boxmuller_extra_double;
};

/* XORWOW state typedef */
typedef struct curandStateXORWOW curandStateXORWOW_t;

/* Flag bits — presumably for the boxmuller_flag* fields above; the code
 * that sets/tests them lives in the distribution headers (not shown). */
#define EXTRA_FLAG_NORMAL 0x00000001
#define EXTRA_FLAG_LOG_NORMAL 0x00000002
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
/* MRG32k3a moduli: m1 = 2^32 - 209, m2 = 2^32 - 22853 (as doubles; the
 * recurrence is evaluated in double-precision arithmetic). */
#define MRG32K3A_MOD1 4294967087.
#define MRG32K3A_MOD2 4294944443.

/* Recurrence coefficients. A13N/A23N are the magnitudes of the negative
 * a13/a23 terms; the generator subtracts them (see curand_MRG32k3a). */
#define MRG32K3A_A12 1403580.
#define MRG32K3A_A13N 810728.
#define MRG32K3A_A21 527612.
#define MRG32K3A_A23N 1370589.
/* Approximately 1/2^32 (~1/m1): scales a raw draw toward (0,1]. */
#define MRG32K3A_NORM (2.3283065498378288e-10)

/* 2^32/m1: maps a raw draw in (0, m1) onto the full 32-bit unsigned
 * range (used by curand(curandStateMRG32k3a_t*)). */
#define MRG32K3A_BITS_NORM 1.000000048662
|
|
| |
|
|
|
|
|
|
|
|
| |
| |
| |
/* Forward declaration of the MRG32k3a generator state type. */
struct curandStateMRG32k3a;

/*
 * MRG32k3a state: the two order-3 recurrence histories, s1 (mod m1) and
 * s2 (mod m2). Values are stored as unsigned ints but the recurrence is
 * computed in double precision. The boxmuller_* members cache Box-Muller
 * spares for the normal generators (curand_normal.h, not shown here).
 */
struct curandStateMRG32k3a {
    unsigned int s1[3];
    unsigned int s2[3];
    int boxmuller_flag;
    int boxmuller_flag_double;
    float boxmuller_extra;
    double boxmuller_extra_double;
};

/* MRG32k3a state typedef */
typedef struct curandStateMRG32k3a curandStateMRG32k3a_t;
| |
|
|
| |
| |
| |
| |
/* Forward declaration of the 32-bit Sobol generator state type. */
struct curandStateSobol32;

/*
 * 32-bit Sobol state: i is the sequence index, x the current sample,
 * c the scramble constant (always 0 for the unscrambled generator —
 * see curand_init), plus the 32 per-dimension direction vectors.
 */
struct curandStateSobol32 {
    unsigned int i, x, c;
    unsigned int direction_vectors[32];
};

/* Sobol32 state typedef */
typedef struct curandStateSobol32 curandStateSobol32_t;

/* Forward declaration of the scrambled 32-bit Sobol state type. */
struct curandStateScrambledSobol32;

/*
 * Scrambled 32-bit Sobol state — same layout as curandStateSobol32,
 * but c holds the caller-supplied scramble constant XORed into x.
 */
struct curandStateScrambledSobol32 {
    unsigned int i, x, c;
    unsigned int direction_vectors[32];
};

/* Scrambled Sobol32 state typedef */
typedef struct curandStateScrambledSobol32 curandStateScrambledSobol32_t;
| |
|
|
| |
| |
| |
/* Forward declaration of the 64-bit Sobol generator state type. */
struct curandStateSobol64;

/*
 * 64-bit Sobol state: i is the sequence index, x the current sample,
 * c the scramble constant (0 for the unscrambled generator), plus the
 * 64 per-dimension direction vectors.
 */
struct curandStateSobol64 {
    unsigned long long i, x, c;
    unsigned long long direction_vectors[64];
};

/* Sobol64 state typedef */
typedef struct curandStateSobol64 curandStateSobol64_t;

/* Forward declaration of the scrambled 64-bit Sobol state type. */
struct curandStateScrambledSobol64;

/*
 * Scrambled 64-bit Sobol state — same layout as curandStateSobol64,
 * but c holds the caller-supplied scramble constant XORed into x.
 */
struct curandStateScrambledSobol64 {
    unsigned long long i, x, c;
    unsigned long long direction_vectors[64];
};

/* Scrambled Sobol64 state typedef */
typedef struct curandStateScrambledSobol64 curandStateScrambledSobol64_t;

/* XORWOW is the default generator: curandState / curandState_t alias it. */
typedef struct curandStateXORWOW curandState_t;
typedef struct curandStateXORWOW curandState;
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * In-place bit-matrix / bit-vector product over GF(2):
 * vector = vector * matrix, where matrix is (N*32) x N bits stored as
 * N*32 rows of N 32-bit words. Each set bit of the input selects the
 * corresponding matrix row, which is XORed into the accumulator.
 */
template<int N>
QUALIFIERS void __curand_matvec_inplace(unsigned int *vector, unsigned int *matrix)
{
    unsigned int result[N] = { 0 };
    for(int i = 0; i < N; i++) {
/* Partial unroll on device only: full 32x unroll would bloat registers. */
#ifdef __CUDA_ARCH__
#pragma unroll 16
#endif
        for(int j = 0; j < 32; j++) {
            if(vector[i] & (1 << j)) {
                for(int k = 0; k < N; k++) {
                    result[k] ^= matrix[N * (i * 32 + j) + k];
                }
            }
        }
    }
    /* Write the product back over the input vector. */
    for(int i = 0; i < N; i++) {
        vector[i] = result[i];
    }
}
|
|
/*
 * Bit-matrix / bit-vector product over GF(2): result = vector * matrix.
 * matrix is (n*32) x n bits, stored as n*32 rows of n 32-bit words; each
 * set bit of vector XORs the matching matrix row into result.
 */
QUALIFIERS void __curand_matvec(unsigned int *vector, unsigned int *matrix,
                                unsigned int *result, int n)
{
    for(int k = 0; k < n; k++) {
        result[k] = 0;
    }
    for(int word = 0; word < n; word++) {
        const unsigned int bits = vector[word];
        for(int bit = 0; bit < 32; bit++) {
            if(bits & (1u << bit)) {
                const unsigned int *row = matrix + n * (word * 32 + bit);
                for(int k = 0; k < n; k++) {
                    result[k] ^= row[k];
                }
            }
        }
    }
}
|
|
| |
/* Fill matrix with the (n*32) x (n*32) identity over GF(2): row r has a
 * single set bit at position r, i.e. word r/32 holds 1 << (r % 32). */
QUALIFIERS void __curand_matidentity(unsigned int *matrix, int n)
{
    for(int row = 0; row < n * 32; row++) {
        const int word = row / 32;
        const int bit = row & 31;
        for(int col = 0; col < n; col++) {
            matrix[row * n + col] = (col == word) ? (1u << bit) : 0u;
        }
    }
}
|
|
| |
| |
/* In-place GF(2) matrix product: matrixA = matrixA * matrixB.
 * Each row of matrixA is replaced by its product with matrixB;
 * n must not exceed MAX_XOR_N (row buffer size). */
QUALIFIERS void __curand_matmat(unsigned int *matrixA, unsigned int *matrixB, int n)
{
    unsigned int rowProduct[MAX_XOR_N];
    for(int row = 0; row < n * 32; row++) {
        __curand_matvec(matrixA + row * n, matrixB, rowProduct, n);
        for(int col = 0; col < n; col++) {
            matrixA[row * n + col] = rowProduct[col];
        }
    }
}
|
|
| |
/* Copy n state words from vectorA into vector (forward order). */
QUALIFIERS void __curand_veccopy(unsigned int *vector, unsigned int *vectorA, int n)
{
    unsigned int *dst = vector;
    const unsigned int *src = vectorA;
    for(int remaining = n; remaining > 0; --remaining) {
        *dst++ = *src++;
    }
}
|
|
| |
/* Copy an (n*32) x n bit-matrix (n*n*32 words) from matrixA to matrix. */
QUALIFIERS void __curand_matcopy(unsigned int *matrix, unsigned int *matrixA, int n)
{
    const int words = n * n * 32;
    for(int idx = 0; idx != words; ++idx) {
        matrix[idx] = matrixA[idx];
    }
}
|
|
| |
/*
 * matrix = matrixA^p over GF(2), via binary square-and-multiply.
 * matrixR holds the running power of matrixA; matrixS is scratch for the
 * squaring step. n is the state size in words (n <= MAX_XOR_N); note the
 * two scratch matrices are sized for the worst case on the stack.
 */
QUALIFIERS void __curand_matpow(unsigned int *matrix, unsigned int *matrixA,
                                unsigned long long p, int n)
{
    unsigned int matrixR[MAX_XOR_N * MAX_XOR_N * 32];
    unsigned int matrixS[MAX_XOR_N * MAX_XOR_N * 32];
    /* Start from the identity and multiply in matrixA^(2^k) for set bits of p. */
    __curand_matidentity(matrix, n);
    __curand_matcopy(matrixR, matrixA, n);
    while(p) {
        if(p & 1) {
            __curand_matmat(matrix, matrixR, n);
        }
        /* Square matrixR for the next bit. */
        __curand_matcopy(matrixS, matrixR, n);
        __curand_matmat(matrixR, matrixS, n);
        p >>= 1;
    }
}
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
/* Reduce i modulo m in double precision, returning a value in [0, m). */
QUALIFIERS double curand_MRGmod(double i, double m)
{
    double reduced = i - floor(i / m) * m;
    if (reduced < 0.0) {
        reduced += m;
    }
    return reduced;
}
|
|
| |
| |
|
|
/*
 * (i * j) mod m without losing precision in a 53-bit double mantissa:
 * i is split at radix 2^17 (131072), the halves are multiplied and
 * reduced separately, then recombined modulo m.
 */
QUALIFIERS double curand_MRGmodMul(double i, double j, double m)
{
    /* i = hi * 2^17 + lo */
    double hi = floor(i / 131072.0);
    double lo = i - hi * 131072.0;

    double hiPart = curand_MRGmod(hi * j, m) * 131072.0;
    double loPart = curand_MRGmod(lo * j, m);
    double product = curand_MRGmod(hiPart + loPart, m);

    if (product < 0.0) {
        product += m;
    }
    return product;
}
|
|
| |
|
|
/*
 * o = (i1 * i2) mod m for 3x3 matrices, computed in double precision.
 * The full product is buffered before being written out, so o may
 * alias i1 or i2 (callers rely on this).
 */
QUALIFIERS void curand_MRGmatMul3x3(unsigned int i1[][3],unsigned int i2[][3],unsigned int o[][3],double m)
{
    double buffer[3][3];
    for (int row = 0; row < 3; row++) {
        for (int col = 0; col < 3; col++) {
            double e0 = curand_MRGmodMul(i1[row][0], i2[0][col], m);
            double e1 = curand_MRGmodMul(i1[row][1], i2[1][col], m);
            double e2 = curand_MRGmodMul(i1[row][2], i2[2][col], m);
            buffer[row][col] = curand_MRGmod(e0 + e1 + e2, m);
        }
    }
    for (int row = 0; row < 3; row++) {
        for (int col = 0; col < 3; col++) {
            o[row][col] = (unsigned int)buffer[row][col];
        }
    }
}
|
|
| |
|
|
/*
 * v = (i * v) mod m for a 3x3 matrix and 3-vector, in double precision.
 * The product is buffered before v is overwritten, so the original
 * vector is read in full first.
 */
QUALIFIERS void curand_MRGmatVecMul3x3( unsigned int i[][3], unsigned int v[], double m)
{
    double acc[3];
    for (int row = 0; row < 3; row++) {
        double e0 = curand_MRGmodMul(i[row][0], v[0], m);
        double e1 = curand_MRGmodMul(i[row][1], v[1], m);
        double e2 = curand_MRGmodMul(i[row][2], v[2], m);
        acc[row] = curand_MRGmod(e0 + e1 + e2, m);
    }
    for (int row = 0; row < 3; row++) {
        v[row] = (unsigned int)acc[row];
    }
}
|
|
| |
| |
| |
|
|
/*
 * o = base^pow (mod m), where in[k] is presumably the precomputed power
 * base^(2^k) — TODO confirm against the tables in curand_mrg32k3a.h.
 * Standard binary exponentiation: o starts as the identity and the
 * matrices for the set bits of pow are multiplied in.
 */
QUALIFIERS void curand_MRGmatPow3x3( unsigned int in[][3][3], unsigned int o[][3], double m, unsigned long long pow )
{
    int i,j;
    for ( i = 0; i < 3; i++ ) {
        for ( j = 0; j < 3; j++ ) {
            o[i][j] = 0;
            if ( i == j ) o[i][j] = 1;
        }
    }
    i = 0;
    /* NOTE(review): identity * (1,0,0) leaves o[0] unchanged — this call
     * appears to be a no-op; kept as-is to preserve exact behavior. */
    curand_MRGmatVecMul3x3(o,o[0],m);
    while (pow) {
        if ( pow & 1ll ) {
            curand_MRGmatMul3x3(in[i], o, o, m);
        }
        i++;
        pow >>= 1;
    }
}
|
|
| |
| |
|
|
/*
 * o = in^(2^pow) (mod m), computed by squaring the matrix pow times.
 * pow is reduced mod 191 — presumably period-related for MRG32k3a
 * (the generator's period is near 2^191); TODO confirm.
 * NOTE(review): the function name is misspelled ("curnand") in the
 * original API; it is kept unchanged for source compatibility.
 */
QUALIFIERS void curnand_MRGmatPow2Pow3x3( double in[][3], double o[][3], double m, unsigned long pow )
{
    unsigned int temp[3][3];
    int i,j;
    pow = pow % 191;
    for ( i = 0; i < 3; i++ ) {
        for ( j = 0; j < 3; j++ ) {
            temp[i][j] = (unsigned int)in[i][j];
        }
    }
    /* Repeated squaring: temp <- temp * temp, pow times. */
    while (pow) {
        curand_MRGmatMul3x3(temp, temp, temp, m);
        pow--;
    }
    for ( i = 0; i < 3; i++ ) {
        for ( j = 0; j < 3; j++ ) {
            o[i][j] = temp[i][j];
        }
    }
}
|
|
| |
|
|
| |
| |
| |
|
|
| |
|
|
/*
 * Initialize a TEST generator state. The seed, subsequence, and offset
 * are folded into the single counter word with fixed multipliers
 * (wrapping mod 2^32); testing use only.
 */
QUALIFIERS void curand_init(unsigned long long seed,
                            unsigned long long subsequence,
                            unsigned long long offset,
                            curandStateTest_t *state)
{
    unsigned int mixed = (unsigned int)(seed * 3);
    mixed += (unsigned int)(subsequence * 31337);
    mixed += (unsigned int)offset;
    state->v = mixed;
}
|
|
|
|
/* Return the TEST state's counter value, then advance it by one. */
QUALIFIERS unsigned int curand(curandStateTest_t *state)
{
    return state->v++;
}
|
|
/* Advance the TEST generator by n positions (counter wraps mod 2^32). */
QUALIFIERS void skipahead(unsigned long long n, curandStateTest_t *state)
{
    state->v = state->v + (unsigned int)n;
}
|
|
| |
|
|
/*
 * Build the 32n x 32n one-step transition matrix of generator T over
 * GF(2): for each basis state (exactly one bit of v set, d = 0), run one
 * step of curand() and record the resulting state words as a matrix row.
 */
template <typename T, int n>
QUALIFIERS void __curand_generate_skipahead_matrix_xor(unsigned int matrix[])
{
    T state;

    for(int i = 0; i < 32 * n; i++) {
        /* Set up the i-th basis vector as the state. */
        state.d = 0;
        for(int j = 0; j < n; j++) {
            state.v[j] = 0;
        }
        state.v[i / 32] = (1 << (i & 31));
        /* One generator step maps the basis vector to matrix row i. */
        curand(&state);
        for(int j = 0; j < n; j++) {
            matrix[i * n + j] = state.v[j];
        }
    }
}
|
|
/*
 * Skip the XORWOW-style generator ahead x positions within its current
 * sequence, using caller-provided scratch memory.
 * scratch layout (in 32-bit words):
 *   [0, n*n*32)                  matrix   - accumulated power
 *   [n*n*32, 2*n*n*32)           matrixA  - running squared matrix
 *   [2*n*n*32, 2*n*n*32 + n)     vector   - working copy of state->v
 *   [... + n, ... + 2n)          result   - matvec output buffer
 * Low bits of x are consumed with precomputed offset matrices
 * (precalc_xorwow_offset_matrix[k] covering blocks of PRECALC_BLOCK_SIZE
 * bits); any remainder falls back to on-the-fly matrix squaring.
 * Also advances the Weyl counter d by 362437 * x, matching x calls of
 * curand().
 */
template <typename T, int n>
QUALIFIERS void _skipahead_scratch(unsigned long long x, T *state, unsigned int *scratch)
{
    unsigned int *matrix = scratch;
    unsigned int *matrixA = scratch + (n * n * 32);
    unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
    unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
    unsigned long long p = x;
    for(int i = 0; i < n; i++) {
        vector[i] = state->v[i];
    }
    int matrix_num = 0;
    /* Consume PRECALC_BLOCK_SIZE bits of p per precomputed matrix. */
    while(p && (matrix_num < PRECALC_NUM_MATRICES - 1)) {
        for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
            NV_IF_ELSE_TARGET(NV_IS_DEVICE,
                __curand_matvec(vector, precalc_xorwow_offset_matrix[matrix_num], result, n);
            ,
                __curand_matvec(vector, precalc_xorwow_offset_matrix_host[matrix_num], result, n);
            )
            __curand_veccopy(vector, result, n);
        }
        p >>= PRECALC_BLOCK_SIZE;
        matrix_num++;
    }
    /* Remaining high bits: continue squaring from the last precomputed matrix. */
    if(p) {
        NV_IF_ELSE_TARGET(NV_IS_DEVICE,
            __curand_matcopy(matrix, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
            __curand_matcopy(matrixA, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
        ,
            __curand_matcopy(matrix, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
            __curand_matcopy(matrixA, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
        )
    }
    while(p) {
        for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
            __curand_matvec(vector, matrixA, result, n);
            __curand_veccopy(vector, result, n);
        }
        p >>= SKIPAHEAD_BLOCKSIZE;
        if(p) {
            for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
                __curand_matmat(matrix, matrixA, n);
                __curand_matcopy(matrixA, matrix, n);
            }
        }
    }
    for(int i = 0; i < n; i++) {
        state->v[i] = vector[i];
    }
    /* Weyl counter advances 362437 per draw (wraps mod 2^32). */
    state->d += 362437 * (unsigned int)x;
}

/*
 * Skip the generator ahead x whole sequences (2^67 positions each, per
 * the public skipahead_sequence docs — see wrapper below). Same scratch
 * layout and algorithm as _skipahead_scratch, but uses the sequence
 * matrices (precalc_xorwow_matrix*) and leaves the Weyl counter d
 * unchanged.
 */
template <typename T, int n>
QUALIFIERS void _skipahead_sequence_scratch(unsigned long long x, T *state, unsigned int *scratch)
{
    unsigned int *matrix = scratch;
    unsigned int *matrixA = scratch + (n * n * 32);
    unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
    unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
    unsigned long long p = x;
    for(int i = 0; i < n; i++) {
        vector[i] = state->v[i];
    }
    int matrix_num = 0;
    while(p && matrix_num < PRECALC_NUM_MATRICES - 1) {
        for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
            NV_IF_ELSE_TARGET(NV_IS_DEVICE,
                __curand_matvec(vector, precalc_xorwow_matrix[matrix_num], result, n);
            ,
                __curand_matvec(vector, precalc_xorwow_matrix_host[matrix_num], result, n);
            )
            __curand_veccopy(vector, result, n);
        }
        p >>= PRECALC_BLOCK_SIZE;
        matrix_num++;
    }
    if(p) {
        NV_IF_ELSE_TARGET(NV_IS_DEVICE,
            __curand_matcopy(matrix, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
            __curand_matcopy(matrixA, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
        ,
            __curand_matcopy(matrix, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
            __curand_matcopy(matrixA, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
        )
    }
    while(p) {
        for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
            __curand_matvec(vector, matrixA, result, n);
            __curand_veccopy(vector, result, n);
        }
        p >>= SKIPAHEAD_BLOCKSIZE;
        if(p) {
            for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
                __curand_matmat(matrix, matrixA, n);
                __curand_matcopy(matrixA, matrix, n);
            }
        }
    }
    for(int i = 0; i < n; i++) {
        state->v[i] = vector[i];
    }
    /* d intentionally untouched: whole-sequence jumps preserve the Weyl phase. */
}
|
|
/*
 * Skip ahead x positions using only the precomputed offset matrices —
 * no scratch memory and no matrix squaring. Assumes the precalc table
 * covers all 64 bits of x (64 / PRECALC_BLOCK_SIZE matrices — TODO
 * confirm table size in curand_precalc.h). Advances the Weyl counter d
 * by 362437 * x, matching x calls of curand().
 */
template <typename T, int N>
QUALIFIERS void _skipahead_inplace(const unsigned long long x, T *state)
{
    unsigned long long p = x;
    int matrix_num = 0;
    while(p) {
        /* Apply the 2^(PRECALC_BLOCK_SIZE * matrix_num) step matrix
         * once per low bit group of p. */
        for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
            NV_IF_ELSE_TARGET(NV_IS_DEVICE,
                __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix[matrix_num]);
            ,
                __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix_host[matrix_num]);
            )
        }
        p >>= PRECALC_BLOCK_SIZE;
        matrix_num++;
    }
    state->d += 362437 * (unsigned int)x;
}

/*
 * Skip ahead x whole sequences using the precomputed sequence matrices.
 * Same structure as _skipahead_inplace, but uses precalc_xorwow_matrix*
 * and leaves the Weyl counter d unchanged.
 */
template <typename T, int N>
QUALIFIERS void _skipahead_sequence_inplace(unsigned long long x, T *state)
{
    int matrix_num = 0;
    while(x) {
        for(unsigned int t = 0; t < (x & PRECALC_BLOCK_MASK); t++) {
            NV_IF_ELSE_TARGET(NV_IS_DEVICE,
                __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix[matrix_num]);
            ,
                __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix_host[matrix_num]);
            )
        }
        x >>= PRECALC_BLOCK_SIZE;
        matrix_num++;
    }
    /* d intentionally untouched for whole-sequence jumps. */
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Advance the XORWOW generator n positions within its sequence
 * (equivalent to n calls of curand(), in O(log n) matrix applications).
 */
QUALIFIERS void skipahead(unsigned long long n, curandStateXORWOW_t *state)
{
    _skipahead_inplace<curandStateXORWOW_t, 5>(n, state);
}

/*
 * Advance the XORWOW generator n whole sequences (the per-subsequence
 * stride used by curand_init; 2^67 positions per the cuRAND docs —
 * TODO confirm).
 */
QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateXORWOW_t *state)
{
    _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(n, state);
}
|
|
/*
 * Seed an XORWOW state (scratch-buffer variant).
 * The 64-bit seed is split into two words, XORed with fixed masks, and
 * scrambled with fixed multipliers; the results perturb the reference
 * xorwow seed constants so that every state word depends on the seed.
 * subsequence and offset are then applied via the scratch skipaheads,
 * and the Box-Muller caches are cleared.
 */
QUALIFIERS void _curand_init_scratch(unsigned long long seed,
                                     unsigned long long subsequence,
                                     unsigned long long offset,
                                     curandStateXORWOW_t *state,
                                     unsigned int *scratch)
{
    /* Split and mask the seed halves. */
    unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
    unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
    /* Scramble with fixed odd multipliers (wraps mod 2^32). */
    unsigned int t0 = 1099087573UL * s0;
    unsigned int t1 = 2591861531UL * s1;
    state->d = 6615241 + t1 + t0;
    state->v[0] = 123456789UL + t0;
    state->v[1] = 362436069UL ^ t0;
    state->v[2] = 521288629UL + t1;
    state->v[3] = 88675123UL ^ t1;
    state->v[4] = 5783321UL + t0;
    _skipahead_sequence_scratch<curandStateXORWOW_t, 5>(subsequence, state, scratch);
    _skipahead_scratch<curandStateXORWOW_t, 5>(offset, state, scratch);
    state->boxmuller_flag = 0;
    state->boxmuller_flag_double = 0;
    state->boxmuller_extra = 0.f;
    state->boxmuller_extra_double = 0.;
}

/*
 * Seed an XORWOW state (in-place variant). Identical seed scrambling to
 * _curand_init_scratch, but uses the scratch-free skipahead routines.
 */
QUALIFIERS void _curand_init_inplace(unsigned long long seed,
                                     unsigned long long subsequence,
                                     unsigned long long offset,
                                     curandStateXORWOW_t *state)
{
    /* Split and mask the seed halves. */
    unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
    unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
    /* Scramble with fixed odd multipliers (wraps mod 2^32). */
    unsigned int t0 = 1099087573UL * s0;
    unsigned int t1 = 2591861531UL * s1;
    state->d = 6615241 + t1 + t0;
    state->v[0] = 123456789UL + t0;
    state->v[1] = 362436069UL ^ t0;
    state->v[2] = 521288629UL + t1;
    state->v[3] = 88675123UL ^ t1;
    state->v[4] = 5783321UL + t0;
    _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(subsequence, state);
    _skipahead_inplace<curandStateXORWOW_t, 5>(offset, state);
    state->boxmuller_flag = 0;
    state->boxmuller_flag_double = 0;
    state->boxmuller_extra = 0.f;
    state->boxmuller_extra_double = 0.;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Initialize an XORWOW generator state from (seed, subsequence, offset).
 * Thin public wrapper over _curand_init_inplace.
 */
QUALIFIERS void curand_init(unsigned long long seed,
                            unsigned long long subsequence,
                            unsigned long long offset,
                            curandStateXORWOW_t *state)
{
    _curand_init_inplace(seed, subsequence, offset, state);
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Return 32 random bits from an XORWOW generator and advance the state.
 * One step of Marsaglia's xorwow: a 160-bit xorshift update on v[0..4]
 * combined with the additive Weyl sequence d (increment 362437).
 */
QUALIFIERS unsigned int curand(curandStateXORWOW_t *state)
{
    unsigned int t;
    t = (state->v[0] ^ (state->v[0] >> 2));
    /* Shift the window down and fold t into the new top word. */
    state->v[0] = state->v[1];
    state->v[1] = state->v[2];
    state->v[2] = state->v[3];
    state->v[3] = state->v[4];
    state->v[4] = (state->v[4] ^ (state->v[4] <<4)) ^ (t ^ (t << 1));
    state->d += 362437;
    return state->v[4] + state->d;
}
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
/*
 * Return 32 random bits from a Philox4x32-10 generator.
 * Each Philox round produces four 32-bit words (state->output);
 * STATE (0..3) selects which word to hand out next. After the fourth
 * word the 128-bit counter is incremented and a new block is generated.
 */
QUALIFIERS unsigned int curand(curandStatePhilox4_32_10_t *state)
{
    unsigned int ret;
    /* default covers STATE == 0; post-increment advances the sub-counter. */
    switch(state->STATE++){
    default:
        ret = state->output.x;
        break;
    case 1:
        ret = state->output.y;
        break;
    case 2:
        ret = state->output.z;
        break;
    case 3:
        ret = state->output.w;
        break;
    }
    /* Block exhausted: advance the counter and refill the output block. */
    if(state->STATE == 4){
        Philox_State_Incr(state);
        state->output = curand_Philox4x32_10(state->ctr,state->key);
        state->STATE = 0;
    }
    return ret;
}

/*
 * Return four 32-bit random values at once from a Philox4x32-10
 * generator. The counter is always advanced one block; when STATE is
 * nonzero the result straddles the old and the freshly generated block
 * so the stream matches four successive curand() calls.
 */
QUALIFIERS uint4 curand4(curandStatePhilox4_32_10_t *state)
{
    uint4 r;

    uint4 tmp = state->output;
    Philox_State_Incr(state);
    state->output= curand_Philox4x32_10(state->ctr,state->key);
    switch(state->STATE){
    case 0:
        /* Aligned: the buffered block is returned whole. */
        return tmp;
    case 1:
        r.x = tmp.y;
        r.y = tmp.z;
        r.z = tmp.w;
        r.w = state->output.x;
        break;
    case 2:
        r.x = tmp.z;
        r.y = tmp.w;
        r.z = state->output.x;
        r.w = state->output.y;
        break;
    case 3:
        r.x = tmp.w;
        r.y = state->output.x;
        r.z = state->output.y;
        r.w = state->output.z;
        break;
    default:
        /* Unreachable for a valid STATE (0..3). */
        return tmp;
    }
    return r;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Advance a Philox4x32-10 generator n positions: the low two bits move
 * the sub-counter STATE (with carry into the block counter), the rest
 * advances the 128-bit counter by n/4 blocks, then the output block is
 * regenerated for the new counter.
 */
QUALIFIERS void skipahead(unsigned long long n, curandStatePhilox4_32_10_t *state)
{
    state->STATE += (n & 3);
    n /= 4;
    if( state->STATE > 3 ){
        n += 1;
        state->STATE -= 4;
    }
    Philox_State_Incr(state, n);
    state->output = curand_Philox4x32_10(state->ctr,state->key);
}

/*
 * Advance a Philox4x32-10 generator n sequences by bumping the high
 * half of the 128-bit counter (Philox_State_Incr_hi), then regenerating
 * the output block. STATE is left unchanged.
 */
QUALIFIERS void skipahead_sequence(unsigned long long n, curandStatePhilox4_32_10_t *state)
{
    Philox_State_Incr_hi(state, n);
    state->output = curand_Philox4x32_10(state->ctr,state->key);
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Initialize a Philox4x32-10 state: zero the 128-bit counter, load the
 * 64-bit seed as the key, clear the sub-counter and Box-Muller caches,
 * then position the stream via the two skipahead routines.
 */
QUALIFIERS void curand_init(unsigned long long seed,
                            unsigned long long subsequence,
                            unsigned long long offset,
                            curandStatePhilox4_32_10_t *state)
{
    state->ctr = make_uint4(0, 0, 0, 0);
    state->key.x = (unsigned int)seed;
    state->key.y = (unsigned int)(seed>>32);
    state->STATE = 0;
    state->boxmuller_flag = 0;
    state->boxmuller_flag_double = 0;
    state->boxmuller_extra = 0.f;
    state->boxmuller_extra_double = 0.;
    /* Note: also generates the first output block (via skipahead). */
    skipahead_sequence(subsequence, state);
    skipahead(offset, state);
}
|
|
|
|
| |
|
|
| |
/*
 * Wide unsigned multiply-add: r = (u64)a * b + c, via the mad.wide.u32
 * PTX instruction. Only emitted for SM61+; on other targets it returns
 * 0 — callers must guard with the same NV_PROVIDES_SM_61 condition
 * (see curand_MRG32k3a below).
 */
QUALIFIERS unsigned long long __curand_umad(GCC_UNUSED_PARAMETER unsigned int a, GCC_UNUSED_PARAMETER unsigned int b, GCC_UNUSED_PARAMETER unsigned long long c)
{
    unsigned long long r = 0;
    NV_IF_TARGET(NV_PROVIDES_SM_61,
        asm("mad.wide.u32 %0, %1, %2, %3;"
            : "=l"(r) : "r"(a), "r"(b), "l"(c));
    )
    return r;
}
/*
 * Wide unsigned multiply: r = (u64)a * b, via mul.wide.u32.
 * Same SM61+ caveat as __curand_umad.
 */
QUALIFIERS unsigned long long __curand_umul(GCC_UNUSED_PARAMETER unsigned int a, GCC_UNUSED_PARAMETER unsigned int b)
{
    unsigned long long r = 0;
    NV_IF_TARGET(NV_PROVIDES_SM_61,
        asm("mul.wide.u32 %0, %1, %2;"
            : "=l"(r) : "r"(a), "r"(b));
    )
    return r;
}
/*
 * One step of L'Ecuyer's MRG32k3a: advances both order-3 recurrences
 * and returns (s1 - s2) mod m1 as a double in (0, m1].
 * Three implementations, selected at compile time:
 *   - SM61+: integer arithmetic with wide mul/mad and a fast partial
 *     reduction exploiting m = 2^32 - c (m1c = 209, m2c = 22853);
 *   - older device targets: double-precision with fma-based reduction
 *     using split high/low reciprocals of the moduli;
 *   - host: portable double-precision path via curand_MRGmod.
 * All three must produce bit-identical streams.
 */
QUALIFIERS double curand_MRG32k3a (curandStateMRG32k3a_t *state)
{
NV_IF_TARGET(NV_PROVIDES_SM_61,
    const unsigned int m1 = 4294967087u;
    const unsigned int m2 = 4294944443u;
    const unsigned int m1c = 209u;
    const unsigned int m2c = 22853u;
    const unsigned int a12 = 1403580u;
    const unsigned int a13n = 810728u;
    const unsigned int a21 = 527612u;
    const unsigned int a23n = 1370589u;

    unsigned long long p1;
    unsigned long long p2;
    /* a13n term folded in as a13n*(m1 - s) to keep everything non-negative. */
    const unsigned long long p3 = __curand_umul(a13n, m1 - state->s1[0]);
    p1 = __curand_umad(a12, state->s1[1], p3);

    /* Reduce mod m1 = 2^32 - m1c: fold the high word via * m1c. */
    p1 = __curand_umul(p1 >> 32, m1c) + (p1 & 0xffffffff);
    if (p1 >= m1) p1 -= m1;

    state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = p1;
    const unsigned long long p4 = __curand_umul(a23n, m2 - state->s2[0]);
    p2 = __curand_umad(a21, state->s2[2], p4);

    /* m2c is larger, so two folding rounds are needed before the final
     * conditional subtraction. */
    p2 = __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
    p2 = __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
    if (p2 >= m2) p2 -= m2;

    state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = p2;

    /* Combine: (s1 - s2) mod m1, mapped into (0, m1]. */
    const unsigned int p5 = (unsigned int)p1 - (unsigned int)p2;
    if(p1 <= p2) return p5 + m1;
    return p5;
)
NV_IF_TARGET(NV_IS_DEVICE,
    /* Pre-SM61 device path: double-precision recurrence. */
    const double m1 = 4294967087.;
    const double m2 = 4294944443.;
    const double a12 = 1403580.;
    const double a13n = 810728.;
    const double a21 = 527612.;
    const double a23n = 1370589.;

    /* Split reciprocals of m1/m2 (high + low parts) so that
     * q = trunc(p * (1/m)) is exact via fma. */
    const double rh1 = 2.3283065498378290e-010;
    const double rl1 = -1.7354913086174288e-026;
    const double rh2 = 2.3283188252407387e-010;
    const double rl2 = 2.4081018096503646e-026;

    double q;
    double p1;
    double p2;
    p1 = a12 * state->s1[1] - a13n * state->s1[0];
    q = trunc (fma (p1, rh1, p1 * rl1));
    p1 -= q * m1;
    if (p1 < 0.0) p1 += m1;
    state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = (unsigned int)p1;
    p2 = a21 * state->s2[2] - a23n * state->s2[0];
    q = trunc (fma (p2, rh2, p2 * rl2));
    p2 -= q * m2;
    if (p2 < 0.0) p2 += m2;
    state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = (unsigned int)p2;
    if (p1 <= p2) return (p1 - p2 + m1);
    else return (p1 - p2);
)
    /* Host fallback: portable double-precision path. */
    double p1;
    double p2;
    double r;
    p1 = (MRG32K3A_A12 * state->s1[1]) - (MRG32K3A_A13N * state->s1[0]);
    p1 = curand_MRGmod(p1, MRG32K3A_MOD1);
    if (p1 < 0.0) p1 += MRG32K3A_MOD1;
    state->s1[0] = state->s1[1];
    state->s1[1] = state->s1[2];
    state->s1[2] = (unsigned int)p1;
    p2 = (MRG32K3A_A21 * state->s2[2]) - (MRG32K3A_A23N * state->s2[0]);
    p2 = curand_MRGmod(p2, MRG32K3A_MOD2);
    if (p2 < 0) p2 += MRG32K3A_MOD2;
    state->s2[0] = state->s2[1];
    state->s2[1] = state->s2[2];
    state->s2[2] = (unsigned int)p2;
    r = p1 - p2;
    if (r <= 0) r += MRG32K3A_MOD1;
    return r;
}
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Return 32 random bits from an MRG32k3a generator: the raw draw in
 * (0, m1] is stretched onto the full 32-bit range by the 2^32/m1 factor
 * MRG32K3A_BITS_NORM, then truncated to an unsigned int.
 */
QUALIFIERS unsigned int curand(curandStateMRG32k3a_t *state)
{
    double dRet;
    dRet = (double)curand_MRG32k3a(state)*(double)MRG32K3A_BITS_NORM;
    return (unsigned int)dRet;
}
|
|
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Advance an MRG32k3a generator n positions within its sequence:
 * build A^n for each component from the precomputed power-of-two
 * matrices (mrg32k3aM1/M2, or the *Host tables on the host) and apply
 * it to the s1/s2 state vectors.
 */
QUALIFIERS void skipahead(unsigned long long n, curandStateMRG32k3a_t *state)
{
    unsigned int t[3][3];
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        curand_MRGmatPow3x3( mrg32k3aM1, t, MRG32K3A_MOD1, n);
        curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
        curand_MRGmatPow3x3(mrg32k3aM2, t, MRG32K3A_MOD2, n);
        curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
    ,
        curand_MRGmatPow3x3( mrg32k3aM1Host, t, MRG32K3A_MOD1, n);
        curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
        curand_MRGmatPow3x3(mrg32k3aM2Host, t, MRG32K3A_MOD2, n);
        curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
    )
}

/*
 * Advance an MRG32k3a generator n subsequences, using the precomputed
 * subsequence-jump matrices (mrg32k3aM1SubSeq/M2SubSeq; 2^76 steps per
 * subsequence per the cuRAND docs — TODO confirm).
 */
QUALIFIERS void skipahead_subsequence(unsigned long long n, curandStateMRG32k3a_t *state)
{
    unsigned int t[3][3];
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        curand_MRGmatPow3x3( mrg32k3aM1SubSeq, t, MRG32K3A_MOD1, n);
        curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
        curand_MRGmatPow3x3( mrg32k3aM2SubSeq, t, MRG32K3A_MOD2, n);
        curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
    ,
        curand_MRGmatPow3x3( mrg32k3aM1SubSeqHost, t, MRG32K3A_MOD1, n);
        curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
        curand_MRGmatPow3x3( mrg32k3aM2SubSeqHost, t, MRG32K3A_MOD2, n);
        curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
    )
}

/*
 * Advance an MRG32k3a generator n whole sequences, using the
 * precomputed sequence-jump matrices (mrg32k3aM1Seq/M2Seq; 2^127 steps
 * per sequence per the cuRAND docs — TODO confirm).
 */
QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateMRG32k3a_t *state)
{
    unsigned int t[3][3];
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        curand_MRGmatPow3x3( mrg32k3aM1Seq, t, MRG32K3A_MOD1, n);
        curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
        curand_MRGmatPow3x3( mrg32k3aM2Seq, t, MRG32K3A_MOD2, n);
        curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
    ,
        curand_MRGmatPow3x3( mrg32k3aM1SeqHost, t, MRG32K3A_MOD1, n);
        curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
        curand_MRGmatPow3x3( mrg32k3aM2SeqHost, t, MRG32K3A_MOD2, n);
        curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
    )
}
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Initialize an MRG32k3a state. All six state words start at the
 * reference seed 12345; a nonzero seed perturbs them via modular
 * multiplication with the masked seed halves. The stream is then
 * positioned with skipahead_subsequence/skipahead, and the Box-Muller
 * caches are cleared.
 */
QUALIFIERS void curand_init(unsigned long long seed,
                            unsigned long long subsequence,
                            unsigned long long offset,
                            curandStateMRG32k3a_t *state)
{
    int i;
    for ( i=0; i<3; i++ ) {
        state->s1[i] = 12345u;
        state->s2[i] = 12345u;
    }
    if (seed != 0ull) {
        /* Mix both halves of the seed into every state word; the modular
         * multiply keeps each word inside its modulus (and nonzero). */
        unsigned int x1 = ((unsigned int)seed) ^ 0x55555555UL;
        unsigned int x2 = (unsigned int)((seed >> 32) ^ 0xAAAAAAAAUL);
        state->s1[0] = (unsigned int)curand_MRGmodMul(x1, state->s1[0], MRG32K3A_MOD1);
        state->s1[1] = (unsigned int)curand_MRGmodMul(x2, state->s1[1], MRG32K3A_MOD1);
        state->s1[2] = (unsigned int)curand_MRGmodMul(x1, state->s1[2], MRG32K3A_MOD1);
        state->s2[0] = (unsigned int)curand_MRGmodMul(x2, state->s2[0], MRG32K3A_MOD2);
        state->s2[1] = (unsigned int)curand_MRGmodMul(x1, state->s2[1], MRG32K3A_MOD2);
        state->s2[2] = (unsigned int)curand_MRGmodMul(x2, state->s2[2], MRG32K3A_MOD2);
    }
    skipahead_subsequence( subsequence, state );
    skipahead( offset, state );
    state->boxmuller_flag = 0;
    state->boxmuller_flag_double = 0;
    state->boxmuller_extra = 0.f;
    state->boxmuller_extra_double = 0.;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Skip a 32-bit (scrambled) Sobol generator ahead n positions in O(32):
 * rebuild x directly from the Gray code of the new index i — XOR the
 * direction vector for every set Gray-code bit, starting from the
 * scramble constant c (0 for unscrambled states).
 * SFINAE restricts this overload to the two 32-bit Sobol state types.
 */
template <typename T>
QUALIFIERS
typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol32_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol32_t*, T>::value>::type
skipahead(unsigned int n, T state)
{
    unsigned int i_gray;
    state->x = state->c;
    state->i += n;
    /* Gray code of the new index selects the direction vectors. */
    i_gray = state->i ^ (state->i >> 1);
    for(unsigned int k = 0; k < 32; k++) {
        if(i_gray & (1 << k)) {
            state->x ^= state->direction_vectors[k];
        }
    }
    return;
}

/*
 * 64-bit counterpart of the Sobol skipahead above: rebuilds x from the
 * Gray code of the new 64-bit index. Restricted to the two 64-bit Sobol
 * state types.
 */
template <typename T>
QUALIFIERS
typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol64_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol64_t*, T>::value>::type
skipahead(unsigned long long n, T state)
{
    unsigned long long i_gray;
    state->x = state->c;
    state->i += n;
    /* Gray code of the new index selects the direction vectors. */
    i_gray = state->i ^ (state->i >> 1);
    for(unsigned k = 0; k < 64; k++) {
        if(i_gray & (1ULL << k)) {
            state->x ^= state->direction_vectors[k];
        }
    }
    return;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Initialize an unscrambled 32-bit Sobol state for one dimension:
 * copy the 32 direction vectors, clear the scramble constant, and
 * position the sequence at `offset` via skipahead.
 */
QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
                            unsigned int offset,
                            curandStateSobol32_t *state)
{
    state->i = 0;
    state->c = 0;
    for(int i = 0; i < 32; i++) {
        state->direction_vectors[i] = direction_vectors[i];
    }
    state->x = 0;
    skipahead<curandStateSobol32_t *>(offset, state);
}

/*
 * Initialize a scrambled 32-bit Sobol state: same as above, but the
 * caller-supplied scramble_c is stored in c and seeds x before the
 * offset skipahead.
 */
QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
                            unsigned int scramble_c,
                            unsigned int offset,
                            curandStateScrambledSobol32_t *state)
{
    state->i = 0;
    state->c = scramble_c;
    for(int i = 0; i < 32; i++) {
        state->direction_vectors[i] = direction_vectors[i];
    }
    state->x = state->c;
    skipahead<curandStateScrambledSobol32_t *>(offset, state);
}
|
|
/*
 * Position of the lowest zero bit of x (i.e. the count of trailing one
 * bits), clamped to 31 when x is all ones. Used to pick the Sobol
 * direction vector for a Gray-code increment.
 */
QUALIFIERS int __curand_find_trailing_zero(unsigned int x)
{
NV_IF_ELSE_TARGET(NV_IS_DEVICE,
    /* __ffs(~x) gives the 1-based position of the lowest zero; 0 if none. */
    const int pos = __ffs(~x);
    return pos ? pos - 1 : 31;
,
    int ones = 0;
    while(x & 1) {
        ones++;
        x >>= 1;
    }
    return (ones == 32) ? 31 : ones;
)
}
|
|
/*
 * 64-bit variant: position of the lowest zero bit of x (count of
 * trailing one bits), clamped to 63 when x is all ones.
 */
QUALIFIERS int __curand_find_trailing_zero(unsigned long long x)
{
NV_IF_ELSE_TARGET(NV_IS_DEVICE,
    /* __ffsll(~x) gives the 1-based position of the lowest zero; 0 if none. */
    const int pos = __ffsll(~x);
    return pos ? pos - 1 : 63;
,
    int ones = 0;
    while(x & 1) {
        ones++;
        x >>= 1;
    }
    return (ones == 64) ? 63 : ones;
)
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Initialize an unscrambled 64-bit Sobol state for one dimension:
 * copy the 64 direction vectors, clear the scramble constant, and
 * position the sequence at `offset` via skipahead.
 */
QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
                            unsigned long long offset,
                            curandStateSobol64_t *state)
{
    state->i = 0;
    state->c = 0;
    for(int i = 0; i < 64; i++) {
        state->direction_vectors[i] = direction_vectors[i];
    }
    state->x = 0;
    skipahead<curandStateSobol64_t *>(offset, state);
}

/*
 * Initialize a scrambled 64-bit Sobol state: same as above, but the
 * caller-supplied scramble_c is stored in c and seeds x before the
 * offset skipahead.
 */
QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
                            unsigned long long scramble_c,
                            unsigned long long offset,
                            curandStateScrambledSobol64_t *state)
{
    state->i = 0;
    state->c = scramble_c;
    for(int i = 0; i < 64; i++) {
        state->direction_vectors[i] = direction_vectors[i];
    }
    state->x = state->c;
    skipahead<curandStateScrambledSobol64_t *>(offset, state);
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
/*
 * Return the current 32-bit Sobol sample and advance one position.
 * Gray-code incremental update: the next x differs from the current one
 * by the direction vector indexed by the trailing-zero position of i.
 */
QUALIFIERS unsigned int curand(curandStateSobol32_t * state)
{
    unsigned int res = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i ++;
    return res;
}

/*
 * Scrambled 32-bit Sobol draw — identical incremental update; the
 * scramble constant is already folded into x at init/skipahead time.
 */
QUALIFIERS unsigned int curand(curandStateScrambledSobol32_t * state)
{
    unsigned int res = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i ++;
    return res;
}

/*
 * Return the current 64-bit Sobol sample and advance one position
 * (same Gray-code incremental update as the 32-bit generator).
 */
QUALIFIERS unsigned long long curand(curandStateSobol64_t * state)
{
    unsigned long long res = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i ++;
    return res;
}

/*
 * Scrambled 64-bit Sobol draw — identical incremental update; the
 * scramble constant is already folded into x at init/skipahead time.
 */
QUALIFIERS unsigned long long curand(curandStateScrambledSobol64_t * state)
{
    unsigned long long res = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i ++;
    return res;
}
|
|
| #include "curand_uniform.h" |
| #include "curand_normal.h" |
| #include "curand_lognormal.h" |
| #include "curand_poisson.h" |
| #include "curand_discrete2.h" |
|
|
/*
 * Return a pointer to row n of the precomputed XORWOW skip-ahead tables
 * (device copy). NOTE(review): every branch ultimately indexes with the
 * same n, and the n==0/n==2 cases are redundant with the fallthrough —
 * presumably written this way to force both matrix arrays to be
 * referenced/instantiated; verify before simplifying.
 */
__device__ static inline unsigned int *__get_precalculated_matrix(int n)
{
    if(n == 0) {
        return precalc_xorwow_matrix[n];
    }
    if(n == 2) {
        return precalc_xorwow_offset_matrix[n];
    }
    return precalc_xorwow_matrix[n];
}

#ifndef __CUDACC_RTC__
/* Host-side counterpart of __get_precalculated_matrix (same redundant
 * branch structure, same caveat as above). */
__host__ static inline unsigned int *__get_precalculated_matrix_host(int n)
{
    if(n == 1) {
        return precalc_xorwow_matrix_host[n];
    }
    if(n == 3) {
        return precalc_xorwow_offset_matrix_host[n];
    }
    return precalc_xorwow_matrix_host[n];
}
#endif
|
|
/*
 * Return a pointer into the precomputed MRG32k3a jump-matrix tables
 * (device copies). NOTE(review): as with __get_precalculated_matrix,
 * every branch indexes with the caller's n; the even-valued cases map
 * each of the six tables, presumably to keep them all referenced —
 * verify before simplifying.
 */
__device__ static inline unsigned int *__get_mrg32k3a_matrix(int n)
{
    if(n == 0) {
        return mrg32k3aM1[n][0];
    }
    if(n == 2) {
        return mrg32k3aM2[n][0];
    }
    if(n == 4) {
        return mrg32k3aM1SubSeq[n][0];
    }
    if(n == 6) {
        return mrg32k3aM2SubSeq[n][0];
    }
    if(n == 8) {
        return mrg32k3aM1Seq[n][0];
    }
    if(n == 10) {
        return mrg32k3aM2Seq[n][0];
    }
    return mrg32k3aM1[n][0];
}

#ifndef __CUDACC_RTC__
/* Host-side counterpart of __get_mrg32k3a_matrix (odd-valued selectors;
 * same caveat as above). */
__host__ static inline unsigned int *__get_mrg32k3a_matrix_host(int n)
{
    if(n == 1) {
        return mrg32k3aM1Host[n][0];
    }
    if(n == 3) {
        return mrg32k3aM2Host[n][0];
    }
    if(n == 5) {
        return mrg32k3aM1SubSeqHost[n][0];
    }
    if(n == 7) {
        return mrg32k3aM2SubSeqHost[n][0];
    }
    if(n == 9) {
        return mrg32k3aM1SeqHost[n][0];
    }
    if(n == 11) {
        return mrg32k3aM2SeqHost[n][0];
    }
    return mrg32k3aM1Host[n][0];
}

/* Expose the host copy of the log-gamma table (used by the Poisson
 * generator headers included above). */
__host__ static inline double *__get__cr_lgamma_table_host(void) {
    return __cr_lgamma_table;
}
#endif
| #endif |
|
|
| |
|
|
| #endif |
|
|