| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| #pragma once |
|
|
| #if defined(_WIN32) || defined(_WIN64) |
| #include <windows.h> |
| #undef small |
| #else |
| #include <sys/resource.h> |
| #endif |
|
|
| #include <cuda_runtime.h> |
|
|
| #include <stdio.h> |
| #include <float.h> |
|
|
| #include <cmath> |
| #include <string> |
| #include <vector> |
| #include <sstream> |
| #include <iostream> |
| #include <limits> |
|
|
| #include "mersenne.h" |
| #include "half.h" |
|
|
| #include "cub/util_debug.cuh" |
| #include "cub/util_device.cuh" |
| #include "cub/util_type.cuh" |
| #include "cub/util_macro.cuh" |
| #include "cub/iterator/discard_output_iterator.cuh" |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| template <typename T, typename U> |
| T SafeBitCast(const U& in) |
| { |
| static_assert(sizeof(T) == sizeof(U), "Types must be same size."); |
| T out; |
| memcpy(&out, &in, sizeof(T)); |
| return out; |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| #define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);} |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| struct CommandLineArgs |
| { |
|
|
| std::vector<std::string> keys; |
| std::vector<std::string> values; |
| std::vector<std::string> args; |
| cudaDeviceProp deviceProp; |
| float device_giga_bandwidth; |
| size_t device_free_physmem; |
| size_t device_total_physmem; |
|
|
| |
| |
| |
| CommandLineArgs(int argc, char **argv) : |
| keys(10), |
| values(10) |
| { |
| using namespace std; |
|
|
| |
| unsigned int mersenne_init[4]= {0x123, 0x234, 0x345, 0x456}; |
| mersenne::init_by_array(mersenne_init, 4); |
|
|
| for (int i = 1; i < argc; i++) |
| { |
| string arg = argv[i]; |
|
|
| if ((arg[0] != '-') || (arg[1] != '-')) |
| { |
| args.push_back(arg); |
| continue; |
| } |
|
|
| string::size_type pos; |
| string key, val; |
| if ((pos = arg.find('=')) == string::npos) { |
| key = string(arg, 2, arg.length() - 2); |
| val = ""; |
| } else { |
| key = string(arg, 2, pos - 2); |
| val = string(arg, pos + 1, arg.length() - 1); |
| } |
|
|
| keys.push_back(key); |
| values.push_back(val); |
| } |
| } |
|
|
|
|
| |
| |
| |
| bool CheckCmdLineFlag(const char* arg_name) |
| { |
| using namespace std; |
|
|
| for (int i = 0; i < int(keys.size()); ++i) |
| { |
| if (keys[i] == string(arg_name)) |
| return true; |
| } |
| return false; |
| } |
|
|
|
|
| |
| |
| |
| template <typename T> |
| int NumNakedArgs() |
| { |
| return args.size(); |
| } |
|
|
|
|
| |
| |
| |
| template <typename T> |
| void GetCmdLineArgument(int index, T &val) |
| { |
| using namespace std; |
| if (index < args.size()) { |
| istringstream str_stream(args[index]); |
| str_stream >> val; |
| } |
| } |
|
|
| |
| |
| |
| template <typename T> |
| void GetCmdLineArgument(const char *arg_name, T &val) |
| { |
| using namespace std; |
|
|
| for (int i = 0; i < int(keys.size()); ++i) |
| { |
| if (keys[i] == string(arg_name)) |
| { |
| istringstream str_stream(values[i]); |
| str_stream >> val; |
| } |
| } |
| } |
|
|
|
|
| |
| |
| |
| template <typename T> |
| void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals) |
| { |
| using namespace std; |
|
|
| if (CheckCmdLineFlag(arg_name)) |
| { |
| |
| vals.clear(); |
|
|
| |
| for (int i = 0; i < keys.size(); ++i) |
| { |
| if (keys[i] == string(arg_name)) |
| { |
| string val_string(values[i]); |
| istringstream str_stream(val_string); |
| string::size_type old_pos = 0; |
| string::size_type new_pos = 0; |
|
|
| |
| T val; |
| while ((new_pos = val_string.find(',', old_pos)) != string::npos) |
| { |
| if (new_pos != old_pos) |
| { |
| str_stream.width(new_pos - old_pos); |
| str_stream >> val; |
| vals.push_back(val); |
| } |
|
|
| |
| str_stream.ignore(1); |
| old_pos = new_pos + 1; |
| } |
|
|
| |
| str_stream >> val; |
| vals.push_back(val); |
| } |
| } |
| } |
| } |
|
|
|
|
| |
| |
| |
| int ParsedArgc() |
| { |
| return (int) keys.size(); |
| } |
|
|
| |
| |
| |
| cudaError_t DeviceInit(int dev = -1) |
| { |
| cudaError_t error = cudaSuccess; |
|
|
| do |
| { |
| int deviceCount; |
| error = CubDebug(cudaGetDeviceCount(&deviceCount)); |
| if (error) break; |
|
|
| if (deviceCount == 0) { |
| fprintf(stderr, "No devices supporting CUDA.\n"); |
| exit(1); |
| } |
| if (dev < 0) |
| { |
| GetCmdLineArgument("device", dev); |
| } |
| if ((dev > deviceCount - 1) || (dev < 0)) |
| { |
| dev = 0; |
| } |
|
|
| error = CubDebug(cudaSetDevice(dev)); |
| if (error) break; |
|
|
| CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem)); |
|
|
| int ptx_version = 0; |
| error = CubDebug(cub::PtxVersion(ptx_version)); |
| if (error) break; |
|
|
| error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev)); |
| if (error) break; |
|
|
| if (deviceProp.major < 1) { |
| fprintf(stderr, "Device does not support CUDA.\n"); |
| exit(1); |
| } |
|
|
| device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000; |
|
|
| if (!CheckCmdLineFlag("quiet")) |
| { |
| printf( |
| "Using device %d: %s (PTX version %d, SM%d, %d SMs, " |
| "%lld free / %lld total MB physmem, " |
| "%.3f GB/s @ %d kHz mem clock, ECC %s)\n", |
| dev, |
| deviceProp.name, |
| ptx_version, |
| deviceProp.major * 100 + deviceProp.minor * 10, |
| deviceProp.multiProcessorCount, |
| (unsigned long long) device_free_physmem / 1024 / 1024, |
| (unsigned long long) device_total_physmem / 1024 / 1024, |
| device_giga_bandwidth, |
| deviceProp.memoryClockRate, |
| (deviceProp.ECCEnabled) ? "on" : "off"); |
| fflush(stdout); |
| } |
|
|
| } while (0); |
|
|
| return error; |
| } |
| }; |
|
|
| |
| |
| |
|
|
| int g_num_rand_samples = 0; |
|
|
|
|
| template <typename T> |
| bool IsNaN(T ) { return false; } |
|
|
| template<> |
| __noinline__ bool IsNaN<float>(float val) |
| { |
| return std::isnan(val); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<float1>(float1 val) |
| { |
| return (IsNaN(val.x)); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<float2>(float2 val) |
| { |
| return (IsNaN(val.y) || IsNaN(val.x)); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<float3>(float3 val) |
| { |
| return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x)); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<float4>(float4 val) |
| { |
| return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z)); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<double>(double val) |
| { |
| return std::isnan(val); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<double1>(double1 val) |
| { |
| return (IsNaN(val.x)); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<double2>(double2 val) |
| { |
| return (IsNaN(val.y) || IsNaN(val.x)); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<double3>(double3 val) |
| { |
| return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x)); |
| } |
|
|
| template<> |
| __noinline__ bool IsNaN<double4>(double4 val) |
| { |
| return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z)); |
| } |
|
|
|
|
| template<> |
| __noinline__ bool IsNaN<half_t>(half_t val) |
| { |
| const auto bits = SafeBitCast<unsigned short>(val); |
|
|
| |
| return (((bits >= 0x7C01) && (bits <= 0x7FFF)) || |
| ((bits >= 0xFC01) )); |
| } |
|
|
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| template <typename K> |
| void RandomBits( |
| K &key, |
| int entropy_reduction = 0, |
| int begin_bit = 0, |
| int end_bit = sizeof(K) * 8) |
| { |
| const int NUM_BYTES = sizeof(K); |
| const int WORD_BYTES = sizeof(unsigned int); |
| const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES; |
|
|
| unsigned int word_buff[NUM_WORDS]; |
|
|
| if (entropy_reduction == -1) |
| { |
| memset((void *) &key, 0, sizeof(key)); |
| return; |
| } |
|
|
| if (end_bit < 0) |
| end_bit = sizeof(K) * 8; |
|
|
| while (true) |
| { |
| |
| for (int j = 0; j < NUM_WORDS; j++) |
| { |
| int current_bit = j * WORD_BYTES * 8; |
|
|
| unsigned int word = 0xffffffff; |
| word &= 0xffffffff << CUB_MAX(0, begin_bit - current_bit); |
| word &= 0xffffffff >> CUB_MAX(0, (current_bit + (WORD_BYTES * 8)) - end_bit); |
|
|
| for (int i = 0; i <= entropy_reduction; i++) |
| { |
| |
| word &= mersenne::genrand_int32(); |
| g_num_rand_samples++; |
| } |
|
|
| word_buff[j] = word; |
| } |
|
|
| memcpy(&key, word_buff, sizeof(K)); |
|
|
| K copy = key; |
| if (!IsNaN(copy)) |
| break; |
| } |
| } |
|
|
| |
| template <typename T> |
| T RandomValue(T max) |
| { |
| unsigned int bits; |
| unsigned int max_int = (unsigned int) -1; |
| do { |
| RandomBits(bits); |
| } while (bits == max_int); |
|
|
| return (T) ((double(bits) / double(max_int)) * double(max)); |
| } |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| template <typename T> |
| T CoutCast(T val) { return val; } |
|
|
| int CoutCast(char val) { return val; } |
|
|
| int CoutCast(unsigned char val) { return val; } |
|
|
| int CoutCast(signed char val) { return val; } |
|
|
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| enum GenMode |
| { |
| UNIFORM, |
| INTEGER_SEED, |
| RANDOM, |
| RANDOM_BIT, |
| }; |
|
|
| |
| |
| |
| template <typename T> |
| __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) |
| { |
| switch (gen_mode) |
| { |
| #if (CUB_PTX_ARCH == 0) |
| case RANDOM: |
| RandomBits(value); |
| break; |
| case RANDOM_BIT: |
| char c; |
| RandomBits(c, 0, 0, 1); |
| value = (c > 0) ? (T) 1 : (T) -1; |
| break; |
| #endif |
| case UNIFORM: |
| value = 2; |
| break; |
| case INTEGER_SEED: |
| default: |
| value = (T) index; |
| break; |
| } |
| } |
|
|
|
|
| |
| |
| |
| __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, int index = 0) |
| { |
| switch (gen_mode) |
| { |
| #if (CUB_PTX_ARCH == 0) |
| case RANDOM: |
| case RANDOM_BIT: |
| char c; |
| RandomBits(c, 0, 0, 1); |
| value = (c > 0); |
| break; |
| #endif |
| case UNIFORM: |
| value = true; |
| break; |
| case INTEGER_SEED: |
| default: |
| value = (index > 0); |
| break; |
| } |
| } |
|
|
|
|
| |
| |
| |
| __host__ __device__ __forceinline__ void InitValue(GenMode , |
| cub::NullType &, |
| int = 0) |
| {} |
|
|
|
|
| |
| |
| |
| template <typename KeyT, typename ValueT> |
| __host__ __device__ __forceinline__ void InitValue( |
| GenMode gen_mode, |
| cub::KeyValuePair<KeyT, ValueT>& value, |
| int index = 0) |
| { |
| InitValue(gen_mode, value.value, index); |
|
|
| |
| RandomBits(value.key, 3); |
| value.key = (value.key & 0x1); |
| } |
|
|
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| template <typename Key, typename Value> |
| std::ostream& operator<<(std::ostream& os, const cub::KeyValuePair<Key, Value> &val) |
| { |
| os << '(' << CoutCast(val.key) << ',' << CoutCast(val.value) << ')'; |
| return os; |
| } |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| #define CUB_VEC_OVERLOAD_1(T, BaseT) \ |
| \ |
| std::ostream& operator<<( \ |
| std::ostream& os, \ |
| const T& val) \ |
| { \ |
| os << '(' << CoutCast(val.x) << ')'; \ |
| return os; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator!=( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x != b.x); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator==( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x == b.x); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \ |
| { \ |
| InitValue(gen_mode, value.x, index); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator>( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x > b.x); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator<( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x < b.x); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ T operator+( \ |
| T a, \ |
| T b) \ |
| { \ |
| T retval = make_##T(a.x + b.x); \ |
| return retval; \ |
| } \ |
| namespace cub { \ |
| template<> \ |
| struct NumericTraits<T> \ |
| { \ |
| static const Category CATEGORY = NOT_A_NUMBER; \ |
| enum { \ |
| PRIMITIVE = false, \ |
| NULL_TYPE = false, \ |
| }; \ |
| static T Max() \ |
| { \ |
| T retval = { \ |
| NumericTraits<BaseT>::Max()}; \ |
| return retval; \ |
| } \ |
| static T Lowest() \ |
| { \ |
| T retval = { \ |
| NumericTraits<BaseT>::Lowest()}; \ |
| return retval; \ |
| } \ |
| }; \ |
| } |
|
|
|
|
|
|
| |
| |
| |
| #define CUB_VEC_OVERLOAD_2(T, BaseT) \ |
| \ |
| std::ostream& operator<<( \ |
| std::ostream& os, \ |
| const T& val) \ |
| { \ |
| os << '(' \ |
| << CoutCast(val.x) << ',' \ |
| << CoutCast(val.y) << ')'; \ |
| return os; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator!=( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x != b.x) || \ |
| (a.y != b.y); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator==( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x == b.x) && \ |
| (a.y == b.y); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \ |
| { \ |
| InitValue(gen_mode, value.x, index); \ |
| InitValue(gen_mode, value.y, index); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator>( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| if (a.x > b.x) return true; else if (b.x > a.x) return false; \ |
| return a.y > b.y; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator<( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| if (a.x < b.x) return true; else if (b.x < a.x) return false; \ |
| return a.y < b.y; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ T operator+( \ |
| T a, \ |
| T b) \ |
| { \ |
| T retval = make_##T( \ |
| a.x + b.x, \ |
| a.y + b.y); \ |
| return retval; \ |
| } \ |
| namespace cub { \ |
| template<> \ |
| struct NumericTraits<T> \ |
| { \ |
| static const Category CATEGORY = NOT_A_NUMBER; \ |
| enum { \ |
| PRIMITIVE = false, \ |
| NULL_TYPE = false, \ |
| }; \ |
| static T Max() \ |
| { \ |
| T retval = { \ |
| NumericTraits<BaseT>::Max(), \ |
| NumericTraits<BaseT>::Max()}; \ |
| return retval; \ |
| } \ |
| static T Lowest() \ |
| { \ |
| T retval = { \ |
| NumericTraits<BaseT>::Lowest(), \ |
| NumericTraits<BaseT>::Lowest()}; \ |
| return retval; \ |
| } \ |
| }; \ |
| } |
|
|
|
|
|
|
| |
| |
| |
| #define CUB_VEC_OVERLOAD_3(T, BaseT) \ |
| \ |
| std::ostream& operator<<( \ |
| std::ostream& os, \ |
| const T& val) \ |
| { \ |
| os << '(' \ |
| << CoutCast(val.x) << ',' \ |
| << CoutCast(val.y) << ',' \ |
| << CoutCast(val.z) << ')'; \ |
| return os; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator!=( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x != b.x) || \ |
| (a.y != b.y) || \ |
| (a.z != b.z); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator==( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x == b.x) && \ |
| (a.y == b.y) && \ |
| (a.z == b.z); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \ |
| { \ |
| InitValue(gen_mode, value.x, index); \ |
| InitValue(gen_mode, value.y, index); \ |
| InitValue(gen_mode, value.z, index); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator>( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| if (a.x > b.x) return true; else if (b.x > a.x) return false; \ |
| if (a.y > b.y) return true; else if (b.y > a.y) return false; \ |
| return a.z > b.z; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator<( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| if (a.x < b.x) return true; else if (b.x < a.x) return false; \ |
| if (a.y < b.y) return true; else if (b.y < a.y) return false; \ |
| return a.z < b.z; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ T operator+( \ |
| T a, \ |
| T b) \ |
| { \ |
| T retval = make_##T( \ |
| a.x + b.x, \ |
| a.y + b.y, \ |
| a.z + b.z); \ |
| return retval; \ |
| } \ |
| namespace cub { \ |
| template<> \ |
| struct NumericTraits<T> \ |
| { \ |
| static const Category CATEGORY = NOT_A_NUMBER; \ |
| enum { \ |
| PRIMITIVE = false, \ |
| NULL_TYPE = false, \ |
| }; \ |
| static T Max() \ |
| { \ |
| T retval = { \ |
| NumericTraits<BaseT>::Max(), \ |
| NumericTraits<BaseT>::Max(), \ |
| NumericTraits<BaseT>::Max()}; \ |
| return retval; \ |
| } \ |
| static T Lowest() \ |
| { \ |
| T retval = { \ |
| NumericTraits<BaseT>::Lowest(), \ |
| NumericTraits<BaseT>::Lowest(), \ |
| NumericTraits<BaseT>::Lowest()}; \ |
| return retval; \ |
| } \ |
| }; \ |
| } |
|
|
|
|
| |
| |
| |
| #define CUB_VEC_OVERLOAD_4(T, BaseT) \ |
| \ |
| std::ostream& operator<<( \ |
| std::ostream& os, \ |
| const T& val) \ |
| { \ |
| os << '(' \ |
| << CoutCast(val.x) << ',' \ |
| << CoutCast(val.y) << ',' \ |
| << CoutCast(val.z) << ',' \ |
| << CoutCast(val.w) << ')'; \ |
| return os; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator!=( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x != b.x) || \ |
| (a.y != b.y) || \ |
| (a.z != b.z) || \ |
| (a.w != b.w); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator==( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| return (a.x == b.x) && \ |
| (a.y == b.y) && \ |
| (a.z == b.z) && \ |
| (a.w == b.w); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \ |
| { \ |
| InitValue(gen_mode, value.x, index); \ |
| InitValue(gen_mode, value.y, index); \ |
| InitValue(gen_mode, value.z, index); \ |
| InitValue(gen_mode, value.w, index); \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator>( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| if (a.x > b.x) return true; else if (b.x > a.x) return false; \ |
| if (a.y > b.y) return true; else if (b.y > a.y) return false; \ |
| if (a.z > b.z) return true; else if (b.z > a.z) return false; \ |
| return a.w > b.w; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ bool operator<( \ |
| const T &a, \ |
| const T &b) \ |
| { \ |
| if (a.x < b.x) return true; else if (b.x < a.x) return false; \ |
| if (a.y < b.y) return true; else if (b.y < a.y) return false; \ |
| if (a.z < b.z) return true; else if (b.z < a.z) return false; \ |
| return a.w < b.w; \ |
| } \ |
| \ |
| __host__ __device__ __forceinline__ T operator+( \ |
| T a, \ |
| T b) \ |
| { \ |
| T retval = make_##T( \ |
| a.x + b.x, \ |
| a.y + b.y, \ |
| a.z + b.z, \ |
| a.w + b.w); \ |
| return retval; \ |
| } \ |
| namespace cub { \ |
| template<> \ |
| struct NumericTraits<T> \ |
| { \ |
| static const Category CATEGORY = NOT_A_NUMBER; \ |
| enum { \ |
| PRIMITIVE = false, \ |
| NULL_TYPE = false, \ |
| }; \ |
| static T Max() \ |
| { \ |
| T retval = { \ |
| NumericTraits<BaseT>::Max(), \ |
| NumericTraits<BaseT>::Max(), \ |
| NumericTraits<BaseT>::Max(), \ |
| NumericTraits<BaseT>::Max()}; \ |
| return retval; \ |
| } \ |
| static T Lowest() \ |
| { \ |
| T retval = { \ |
| NumericTraits<BaseT>::Lowest(), \ |
| NumericTraits<BaseT>::Lowest(), \ |
| NumericTraits<BaseT>::Lowest(), \ |
| NumericTraits<BaseT>::Lowest()}; \ |
| return retval; \ |
| } \ |
| }; \ |
| } |
|
|
| |
| |
| |
| #define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT) \ |
| CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT) \ |
| CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT) \ |
| CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT) \ |
| CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT) |
|
|
| |
| |
| |
| CUB_VEC_OVERLOAD(char, char) |
| CUB_VEC_OVERLOAD(short, short) |
| CUB_VEC_OVERLOAD(int, int) |
| CUB_VEC_OVERLOAD(long, long) |
| CUB_VEC_OVERLOAD(longlong, long long) |
| CUB_VEC_OVERLOAD(uchar, unsigned char) |
| CUB_VEC_OVERLOAD(ushort, unsigned short) |
| CUB_VEC_OVERLOAD(uint, unsigned int) |
| CUB_VEC_OVERLOAD(ulong, unsigned long) |
| CUB_VEC_OVERLOAD(ulonglong, unsigned long long) |
| CUB_VEC_OVERLOAD(float, float) |
| CUB_VEC_OVERLOAD(double, double) |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| struct TestFoo |
| { |
| long long x; |
| int y; |
| short z; |
| char w; |
|
|
| |
| static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w) |
| { |
| TestFoo retval = {x, y, z, w}; |
| return retval; |
| } |
|
|
| |
| __host__ __device__ __forceinline__ TestFoo& operator =(int b) |
| { |
| x = b; |
| y = b; |
| z = b; |
| w = b; |
| return *this; |
| } |
|
|
| |
| __host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const |
| { |
| return MakeTestFoo(x + b.x, y + b.y, z + b.z, w + b.w); |
| } |
|
|
| |
| __host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const |
| { |
| return (x != b.x) || (y != b.y) || (z != b.z) || (w != b.w); |
| } |
|
|
| |
| __host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const |
| { |
| return (x == b.x) && (y == b.y) && (z == b.z) && (w == b.w); |
| } |
|
|
| |
| __host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const |
| { |
| if (x < b.x) return true; else if (b.x < x) return false; |
| if (y < b.y) return true; else if (b.y < y) return false; |
| if (z < b.z) return true; else if (b.z < z) return false; |
| return w < b.w; |
| } |
|
|
| |
| __host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const |
| { |
| if (x > b.x) return true; else if (b.x > x) return false; |
| if (y > b.y) return true; else if (b.y > y) return false; |
| if (z > b.z) return true; else if (b.z > z) return false; |
| return w > b.w; |
| } |
|
|
| }; |
|
|
| |
| |
| |
| std::ostream& operator<<(std::ostream& os, const TestFoo& val) |
| { |
| os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')'; |
| return os; |
| } |
|
|
| |
| |
| |
| __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, int index = 0) |
| { |
| InitValue(gen_mode, value.x, index); |
| InitValue(gen_mode, value.y, index); |
| InitValue(gen_mode, value.z, index); |
| InitValue(gen_mode, value.w, index); |
| } |
|
|
|
|
| |
| namespace cub { |
| template<> |
| struct NumericTraits<TestFoo> |
| { |
| static const Category CATEGORY = NOT_A_NUMBER; |
| enum { |
| PRIMITIVE = false, |
| NULL_TYPE = false, |
| }; |
| static TestFoo Max() |
| { |
| return TestFoo::MakeTestFoo( |
| NumericTraits<long long>::Max(), |
| NumericTraits<int>::Max(), |
| NumericTraits<short>::Max(), |
| NumericTraits<char>::Max()); |
| } |
|
|
| static TestFoo Lowest() |
| { |
| return TestFoo::MakeTestFoo( |
| NumericTraits<long long>::Lowest(), |
| NumericTraits<int>::Lowest(), |
| NumericTraits<short>::Lowest(), |
| NumericTraits<char>::Lowest()); |
| } |
| }; |
| } |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| struct TestBar |
| { |
| long long x; |
| int y; |
|
|
| |
| __host__ __device__ __forceinline__ TestBar() : x(0), y(0) |
| {} |
|
|
| |
| __host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b) |
| {} |
|
|
| |
| __host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y) |
| {} |
|
|
| |
| __host__ __device__ __forceinline__ TestBar& operator =(int b) |
| { |
| x = b; |
| y = b; |
| return *this; |
| } |
|
|
| |
| __host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const |
| { |
| return TestBar(x + b.x, y + b.y); |
| } |
|
|
| |
| __host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const |
| { |
| return (x != b.x) || (y != b.y); |
| } |
|
|
| |
| __host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const |
| { |
| return (x == b.x) && (y == b.y); |
| } |
|
|
| |
| __host__ __device__ __forceinline__ bool operator <(const TestBar &b) const |
| { |
| if (x < b.x) return true; else if (b.x < x) return false; |
| return y < b.y; |
| } |
|
|
| |
| __host__ __device__ __forceinline__ bool operator >(const TestBar &b) const |
| { |
| if (x > b.x) return true; else if (b.x > x) return false; |
| return y > b.y; |
| } |
|
|
| }; |
|
|
|
|
| |
| |
| |
| std::ostream& operator<<(std::ostream& os, const TestBar& val) |
| { |
| os << '(' << val.x << ',' << val.y << ')'; |
| return os; |
| } |
|
|
| |
| |
| |
| __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, int index = 0) |
| { |
| InitValue(gen_mode, value.x, index); |
| InitValue(gen_mode, value.y, index); |
| } |
|
|
| |
| namespace cub { |
| template<> |
| struct NumericTraits<TestBar> |
| { |
| static const Category CATEGORY = NOT_A_NUMBER; |
| enum { |
| PRIMITIVE = false, |
| NULL_TYPE = false, |
| }; |
| static TestBar Max() |
| { |
| return TestBar( |
| NumericTraits<long long>::Max(), |
| NumericTraits<int>::Max()); |
| } |
|
|
| static TestBar Lowest() |
| { |
| return TestBar( |
| NumericTraits<long long>::Lowest(), |
| NumericTraits<int>::Lowest()); |
| } |
| }; |
| } |
|
|
|
|
| |
| |
| |
|
|
|
|
| |
| |
| |
| template <typename S, typename T, typename OffsetT> |
| int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true) |
| { |
| for (OffsetT i = 0; i < len; i++) |
| { |
| if (computed[i] != reference[i]) |
| { |
| if (verbose) std::cout << "INCORRECT: [" << i << "]: " |
| << CoutCast(computed[i]) << " != " |
| << CoutCast(reference[i]); |
| return 1; |
| } |
| } |
| return 0; |
| } |
|
|
|
|
| |
| |
| |
| template <typename OffsetT> |
| int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true) |
| { |
| for (OffsetT i = 0; i < len; i++) |
| { |
| if (computed[i] != reference[i]) |
| { |
| float difference = std::abs(computed[i]-reference[i]); |
| float fraction = difference / std::abs(reference[i]); |
|
|
| if (fraction > 0.0001) |
| { |
| if (verbose) std::cout << "INCORRECT: [" << i << "]: " |
| << "(computed) " << CoutCast(computed[i]) << " != " |
| << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")"; |
| return 1; |
| } |
| } |
| } |
| return 0; |
| } |
|
|
|
|
| |
| |
| |
| template <typename OffsetT> |
| int CompareResults(cub::NullType* computed, cub::NullType* reference, OffsetT len, bool verbose = true) |
| { |
| return 0; |
| } |
|
|
| |
| |
| |
| template <typename OffsetT> |
| int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true) |
| { |
| for (OffsetT i = 0; i < len; i++) |
| { |
| if (computed[i] != reference[i]) |
| { |
| double difference = std::abs(computed[i]-reference[i]); |
| double fraction = difference / std::abs(reference[i]); |
|
|
| if (fraction > 0.0001) |
| { |
| if (verbose) std::cout << "INCORRECT: [" << i << "]: " |
| << CoutCast(computed[i]) << " != " |
| << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")"; |
| return 1; |
| } |
| } |
| } |
| return 0; |
| } |
|
|
|
|
| |
| |
| |
| |
| int CompareDeviceResults( |
| cub::NullType *, |
| cub::NullType *, |
| size_t , |
| bool = true, |
| bool = false) |
| { |
| return 0; |
| } |
|
|
| |
| |
| |
| |
| template <typename S, typename OffsetT> |
| int CompareDeviceResults( |
| S *h_reference, |
| cub::DiscardOutputIterator<OffsetT> d_data, |
| size_t num_items, |
| bool verbose = true, |
| bool display_data = false) |
| { |
| return 0; |
| } |
|
|
| |
| |
| |
| |
| template <typename S, typename T> |
| int CompareDeviceResults( |
| S *h_reference, |
| T *d_data, |
| size_t num_items, |
| bool verbose = true, |
| bool display_data = false) |
| { |
| |
| T *h_data = (T*) malloc(num_items * sizeof(T)); |
|
|
| |
| cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); |
|
|
| |
| if (display_data) |
| { |
| printf("Reference:\n"); |
| for (int i = 0; i < int(num_items); i++) |
| { |
| std::cout << CoutCast(h_reference[i]) << ", "; |
| } |
| printf("\n\nComputed:\n"); |
| for (int i = 0; i < int(num_items); i++) |
| { |
| std::cout << CoutCast(h_data[i]) << ", "; |
| } |
| printf("\n\n"); |
| } |
|
|
| |
| int retval = CompareResults(h_data, h_reference, num_items, verbose); |
|
|
| |
| if (h_data) free(h_data); |
|
|
| return retval; |
| } |
|
|
|
|
| |
| |
| |
| |
| template <typename T> |
| int CompareDeviceDeviceResults( |
| T *d_reference, |
| T *d_data, |
| size_t num_items, |
| bool verbose = true, |
| bool display_data = false) |
| { |
| |
| T *h_reference = (T*) malloc(num_items * sizeof(T)); |
| T *h_data = (T*) malloc(num_items * sizeof(T)); |
|
|
| |
| cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost); |
| cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); |
|
|
| |
| if (display_data) { |
| printf("Reference:\n"); |
| for (int i = 0; i < num_items; i++) |
| { |
| std::cout << CoutCast(h_reference[i]) << ", "; |
| } |
| printf("\n\nComputed:\n"); |
| for (int i = 0; i < num_items; i++) |
| { |
| std::cout << CoutCast(h_data[i]) << ", "; |
| } |
| printf("\n\n"); |
| } |
|
|
| |
| int retval = CompareResults(h_data, h_reference, num_items, verbose); |
|
|
| |
| if (h_reference) free(h_reference); |
| if (h_data) free(h_data); |
|
|
| return retval; |
| } |
|
|
|
|
| |
| |
| |
| void DisplayResults( |
| cub::NullType *, |
| size_t ) |
| {} |
|
|
|
|
| |
| |
| |
| template <typename InputIteratorT> |
| void DisplayResults( |
| InputIteratorT h_data, |
| size_t num_items) |
| { |
| |
| for (int i = 0; i < int(num_items); i++) |
| { |
| std::cout << CoutCast(h_data[i]) << ", "; |
| } |
| printf("\n"); |
| } |
|
|
|
|
| |
| |
| |
| template <typename T> |
| void DisplayDeviceResults( |
| T *d_data, |
| size_t num_items) |
| { |
| |
| T *h_data = (T*) malloc(num_items * sizeof(T)); |
|
|
| |
| cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); |
|
|
| DisplayResults(h_data, num_items); |
|
|
| |
| if (h_data) free(h_data); |
| } |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| void InitializeSegments( |
| int num_items, |
| int num_segments, |
| int *h_segment_offsets, |
| bool verbose = false) |
| { |
| if (num_segments <= 0) |
| return; |
|
|
| unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments; |
| int offset = 0; |
| for (int i = 0; i < num_segments; ++i) |
| { |
| h_segment_offsets[i] = offset; |
|
|
| unsigned int segment_length = RandomValue((expected_segment_length * 2) + 1); |
| offset += segment_length; |
| offset = CUB_MIN(offset, num_items); |
| } |
| h_segment_offsets[num_segments] = num_items; |
|
|
| if (verbose) |
| { |
| printf("Segment offsets: "); |
| DisplayResults(h_segment_offsets, num_segments + 1); |
| } |
| } |
|
|
|
|
| |
| |
| |
|
|
|
|
| struct CpuTimer |
| { |
| #if defined(_WIN32) || defined(_WIN64) |
|
|
| LARGE_INTEGER ll_freq; |
| LARGE_INTEGER ll_start; |
| LARGE_INTEGER ll_stop; |
|
|
| CpuTimer() |
| { |
| QueryPerformanceFrequency(&ll_freq); |
| } |
|
|
| void Start() |
| { |
| QueryPerformanceCounter(&ll_start); |
| } |
|
|
| void Stop() |
| { |
| QueryPerformanceCounter(&ll_stop); |
| } |
|
|
| float ElapsedMillis() |
| { |
| double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart); |
| double stop = double(ll_stop.QuadPart) / double(ll_freq.QuadPart); |
|
|
| return float((stop - start) * 1000); |
| } |
|
|
| #else |
|
|
| rusage start; |
| rusage stop; |
|
|
| void Start() |
| { |
| getrusage(RUSAGE_SELF, &start); |
| } |
|
|
| void Stop() |
| { |
| getrusage(RUSAGE_SELF, &stop); |
| } |
|
|
| float ElapsedMillis() |
| { |
| float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec; |
| float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec; |
|
|
| return (sec * 1000) + (usec / 1000); |
| } |
|
|
| #endif |
| }; |
|
|
| struct GpuTimer |
| { |
| cudaEvent_t start; |
| cudaEvent_t stop; |
|
|
| GpuTimer() |
| { |
| cudaEventCreate(&start); |
| cudaEventCreate(&stop); |
| } |
|
|
| ~GpuTimer() |
| { |
| cudaEventDestroy(start); |
| cudaEventDestroy(stop); |
| } |
|
|
| void Start() |
| { |
| cudaEventRecord(start, 0); |
| } |
|
|
| void Stop() |
| { |
| cudaEventRecord(stop, 0); |
| } |
|
|
| float ElapsedMillis() |
| { |
| float elapsed; |
| cudaEventSynchronize(stop); |
| cudaEventElapsedTime(&elapsed, start, stop); |
| return elapsed; |
| } |
| }; |
|
|