namespace wp
{

const int ARRAY_MAX_DIMS = 4; // must match constant in types.py

// must match constants in types.py
const int ARRAY_TYPE_REGULAR = 0;
const int ARRAY_TYPE_INDEXED = 1;
const int ARRAY_TYPE_FABRIC = 2;
const int ARRAY_TYPE_FABRIC_INDEXED = 3;

struct shape_t
{
    int dims[ARRAY_MAX_DIMS];

    CUDA_CALLABLE inline shape_t() : dims() {}

    CUDA_CALLABLE inline int operator[](int i) const
    {
        assert(i < ARRAY_MAX_DIMS);
        return dims[i];
    }

    CUDA_CALLABLE inline int& operator[](int i)
    {
        assert(i < ARRAY_MAX_DIMS);
        return dims[i];
    }
};
CUDA_CALLABLE inline int extract(const shape_t& s, int i)
{
    return s.dims[i];
}

CUDA_CALLABLE inline void adj_extract(const shape_t& s, int i, const shape_t& adj_s, int adj_i, int adj_ret) {}

inline CUDA_CALLABLE void print(shape_t s)
{
    // todo: only print valid dims; currently shape has a fixed size,
    // but we don't know how many dims are valid (e.g. 1d, 2d, etc.),
    // so we should probably store ndim with shape
    printf("(%d, %d, %d, %d)\n", s.dims[0], s.dims[1], s.dims[2], s.dims[3]);
}

inline CUDA_CALLABLE void adj_print(shape_t s, shape_t& adj_s) {}
template <typename T>
struct array_t
{
    CUDA_CALLABLE inline array_t() {}
    CUDA_CALLABLE inline array_t(int) {} // for backward a = 0 initialization syntax

    // constructor for 1d array
    CUDA_CALLABLE array_t(T* data, int size, T* grad=nullptr) : data(data), grad(grad)
    {
        shape.dims[0] = size;
        shape.dims[1] = 0;
        shape.dims[2] = 0;
        shape.dims[3] = 0;
        ndim = 1;
        strides[0] = sizeof(T);
        strides[1] = 0;
        strides[2] = 0;
        strides[3] = 0;
    }

    // constructor for 2d array
    CUDA_CALLABLE array_t(T* data, int dim0, int dim1, T* grad=nullptr) : data(data), grad(grad)
    {
        shape.dims[0] = dim0;
        shape.dims[1] = dim1;
        shape.dims[2] = 0;
        shape.dims[3] = 0;
        ndim = 2;
        strides[0] = dim1 * sizeof(T);
        strides[1] = sizeof(T);
        strides[2] = 0;
        strides[3] = 0;
    }

    // constructor for 3d array
    CUDA_CALLABLE array_t(T* data, int dim0, int dim1, int dim2, T* grad=nullptr) : data(data), grad(grad)
    {
        shape.dims[0] = dim0;
        shape.dims[1] = dim1;
        shape.dims[2] = dim2;
        shape.dims[3] = 0;
        ndim = 3;
        strides[0] = dim1 * dim2 * sizeof(T);
        strides[1] = dim2 * sizeof(T);
        strides[2] = sizeof(T);
        strides[3] = 0;
    }

    // constructor for 4d array
    CUDA_CALLABLE array_t(T* data, int dim0, int dim1, int dim2, int dim3, T* grad=nullptr) : data(data), grad(grad)
    {
        shape.dims[0] = dim0;
        shape.dims[1] = dim1;
        shape.dims[2] = dim2;
        shape.dims[3] = dim3;
        ndim = 4;
        strides[0] = dim1 * dim2 * dim3 * sizeof(T);
        strides[1] = dim2 * dim3 * sizeof(T);
        strides[2] = dim3 * sizeof(T);
        strides[3] = sizeof(T);
    }

    CUDA_CALLABLE inline bool empty() const { return !data; }

    T* data{nullptr};
    T* grad{nullptr};
    shape_t shape;
    int strides[ARRAY_MAX_DIMS];
    int ndim;

    CUDA_CALLABLE inline operator T*() const { return data; }
};
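
// Illustrative usage sketch (not part of this header's API surface): wrapping
// existing memory in a 2d array_t and the row-major byte strides the
// constructor computes. The float element type and sizes are assumptions for
// the example only.
//
//     float storage[6] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
//     wp::array_t<float> a(storage, 2, 3);  // shape (2, 3), ndim == 2
//     // a.strides == {3*sizeof(float), sizeof(float), 0, 0} == {12, 4, 0, 0}
//     // a.data == storage, a.grad == nullptr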
// TODO:
// - templated index type?
// - templated dimensionality? (also for array_t to save space when passing arrays to kernels)
template <typename T>
struct indexedarray_t
{
    CUDA_CALLABLE inline indexedarray_t() {}
    CUDA_CALLABLE inline indexedarray_t(int) {} // for backward a = 0 initialization syntax

    CUDA_CALLABLE inline bool empty() const { return !arr.data; }

    array_t<T> arr;
    int* indices[ARRAY_MAX_DIMS]; // index array per dimension (can be NULL)
    shape_t shape;                // element count per dimension (num. indices if indexed, array dim if not)
};
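
// Illustrative sketch (not part of the API): an indexedarray_t gathers a
// subset of an underlying array through per-dimension index buffers. The
// values below are assumptions for the example only.
//
//     float storage[4] = {10.f, 20.f, 30.f, 40.f};
//     int sel[2] = {3, 1};                      // pick elements 3 and 1
//     wp::indexedarray_t<float> ia;
//     ia.arr = wp::array_t<float>(storage, 4);  // base 1d array, length 4
//     ia.indices[0] = sel;
//     ia.shape[0] = 2;                          // two indexed elements
//     // index(ia, 0) == 40.f (storage[3]), index(ia, 1) == 20.f (storage[1])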
// return the stride (in bytes) of the given dimension
template <typename T>
CUDA_CALLABLE inline size_t stride(const array_t<T>& a, int dim)
{
    return size_t(a.strides[dim]);
}

template <typename T>
CUDA_CALLABLE inline T* data_at_byte_offset(const array_t<T>& a, size_t byte_offset)
{
    return reinterpret_cast<T*>(reinterpret_cast<char*>(a.data) + byte_offset);
}

template <typename T>
CUDA_CALLABLE inline T* grad_at_byte_offset(const array_t<T>& a, size_t byte_offset)
{
    return reinterpret_cast<T*>(reinterpret_cast<char*>(a.grad) + byte_offset);
}

template <typename T>
CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i)
{
    assert(i >= 0 && i < arr.shape[0]);

    return i*stride(arr, 0);
}

template <typename T>
CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i, int j)
{
    assert(i >= 0 && i < arr.shape[0]);
    assert(j >= 0 && j < arr.shape[1]);

    return i*stride(arr, 0) + j*stride(arr, 1);
}

template <typename T>
CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i, int j, int k)
{
    assert(i >= 0 && i < arr.shape[0]);
    assert(j >= 0 && j < arr.shape[1]);
    assert(k >= 0 && k < arr.shape[2]);

    return i*stride(arr, 0) + j*stride(arr, 1) + k*stride(arr, 2);
}

template <typename T>
CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i, int j, int k, int l)
{
    assert(i >= 0 && i < arr.shape[0]);
    assert(j >= 0 && j < arr.shape[1]);
    assert(k >= 0 && k < arr.shape[2]);
    assert(l >= 0 && l < arr.shape[3]);

    return i*stride(arr, 0) + j*stride(arr, 1) + k*stride(arr, 2) + l*stride(arr, 3);
}
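
// Worked example (assumes the 2x3 float array from the sketch above): with
// byte strides {12, 4}, element (i=1, j=2) sits at byte offset
// 1*12 + 2*4 = 20, i.e. the sixth float in the underlying storage.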
template <typename T>
CUDA_CALLABLE inline T& index(const array_t<T>& arr, int i)
{
    assert(arr.ndim == 1);

    T& result = *data_at_byte_offset(arr, byte_offset(arr, i));
    FP_VERIFY_FWD_1(result)

    return result;
}

template <typename T>
CUDA_CALLABLE inline T& index(const array_t<T>& arr, int i, int j)
{
    assert(arr.ndim == 2);

    T& result = *data_at_byte_offset(arr, byte_offset(arr, i, j));
    FP_VERIFY_FWD_2(result)

    return result;
}

template <typename T>
CUDA_CALLABLE inline T& index(const array_t<T>& arr, int i, int j, int k)
{
    assert(arr.ndim == 3);

    T& result = *data_at_byte_offset(arr, byte_offset(arr, i, j, k));
    FP_VERIFY_FWD_3(result)

    return result;
}

template <typename T>
CUDA_CALLABLE inline T& index(const array_t<T>& arr, int i, int j, int k, int l)
{
    assert(arr.ndim == 4);

    T& result = *data_at_byte_offset(arr, byte_offset(arr, i, j, k, l));
    FP_VERIFY_FWD_4(result)

    return result;
}

template <typename T>
CUDA_CALLABLE inline T& index_grad(const array_t<T>& arr, int i)
{
    T& result = *grad_at_byte_offset(arr, byte_offset(arr, i));
    FP_VERIFY_FWD_1(result)

    return result;
}

template <typename T>
CUDA_CALLABLE inline T& index_grad(const array_t<T>& arr, int i, int j)
{
    T& result = *grad_at_byte_offset(arr, byte_offset(arr, i, j));
    FP_VERIFY_FWD_2(result)

    return result;
}

template <typename T>
CUDA_CALLABLE inline T& index_grad(const array_t<T>& arr, int i, int j, int k)
{
    T& result = *grad_at_byte_offset(arr, byte_offset(arr, i, j, k));
    FP_VERIFY_FWD_3(result)

    return result;
}

template <typename T>
CUDA_CALLABLE inline T& index_grad(const array_t<T>& arr, int i, int j, int k, int l)
{
    T& result = *grad_at_byte_offset(arr, byte_offset(arr, i, j, k, l));
    FP_VERIFY_FWD_4(result)

    return result;
}
template <typename T>
CUDA_CALLABLE inline T& index(const indexedarray_t<T>& iarr, int i)
{
    assert(iarr.arr.ndim == 1);
    assert(i >= 0 && i < iarr.shape[0]);

    if (iarr.indices[0])
    {
        i = iarr.indices[0][i];
        assert(i >= 0 && i < iarr.arr.shape[0]);
    }

    T& result = *data_at_byte_offset(iarr.arr, byte_offset(iarr.arr, i));
    FP_VERIFY_FWD_1(result)

    return result;
}
template <typename T>
CUDA_CALLABLE inline T& index(const indexedarray_t<T>& iarr, int i, int j)
{
    assert(iarr.arr.ndim == 2);
    assert(i >= 0 && i < iarr.shape[0]);
    assert(j >= 0 && j < iarr.shape[1]);

    if (iarr.indices[0])
    {
        i = iarr.indices[0][i];
        assert(i >= 0 && i < iarr.arr.shape[0]);
    }
    if (iarr.indices[1])
    {
        j = iarr.indices[1][j];
        assert(j >= 0 && j < iarr.arr.shape[1]);
    }

    T& result = *data_at_byte_offset(iarr.arr, byte_offset(iarr.arr, i, j));
    FP_VERIFY_FWD_2(result)

    return result;
}
template <typename T>
CUDA_CALLABLE inline T& index(const indexedarray_t<T>& iarr, int i, int j, int k)
{
    assert(iarr.arr.ndim == 3);
    assert(i >= 0 && i < iarr.shape[0]);
    assert(j >= 0 && j < iarr.shape[1]);
    assert(k >= 0 && k < iarr.shape[2]);

    if (iarr.indices[0])
    {
        i = iarr.indices[0][i];
        assert(i >= 0 && i < iarr.arr.shape[0]);
    }
    if (iarr.indices[1])
    {
        j = iarr.indices[1][j];
        assert(j >= 0 && j < iarr.arr.shape[1]);
    }
    if (iarr.indices[2])
    {
        k = iarr.indices[2][k];
        assert(k >= 0 && k < iarr.arr.shape[2]);
    }

    T& result = *data_at_byte_offset(iarr.arr, byte_offset(iarr.arr, i, j, k));
    FP_VERIFY_FWD_3(result)

    return result;
}
template <typename T>
CUDA_CALLABLE inline T& index(const indexedarray_t<T>& iarr, int i, int j, int k, int l)
{
    assert(iarr.arr.ndim == 4);
    assert(i >= 0 && i < iarr.shape[0]);
    assert(j >= 0 && j < iarr.shape[1]);
    assert(k >= 0 && k < iarr.shape[2]);
    assert(l >= 0 && l < iarr.shape[3]);

    if (iarr.indices[0])
    {
        i = iarr.indices[0][i];
        assert(i >= 0 && i < iarr.arr.shape[0]);
    }
    if (iarr.indices[1])
    {
        j = iarr.indices[1][j];
        assert(j >= 0 && j < iarr.arr.shape[1]);
    }
    if (iarr.indices[2])
    {
        k = iarr.indices[2][k];
        assert(k >= 0 && k < iarr.arr.shape[2]);
    }
    if (iarr.indices[3])
    {
        l = iarr.indices[3][l];
        assert(l >= 0 && l < iarr.arr.shape[3]);
    }

    T& result = *data_at_byte_offset(iarr.arr, byte_offset(iarr.arr, i, j, k, l));
    FP_VERIFY_FWD_4(result)

    return result;
}
template <typename T>
CUDA_CALLABLE inline array_t<T> view(array_t<T>& src, int i)
{
    assert(src.ndim > 1);
    assert(i >= 0 && i < src.shape[0]);

    array_t<T> a;
    a.data = data_at_byte_offset(src, byte_offset(src, i));
    a.shape[0] = src.shape[1];
    a.shape[1] = src.shape[2];
    a.shape[2] = src.shape[3];
    a.strides[0] = src.strides[1];
    a.strides[1] = src.strides[2];
    a.strides[2] = src.strides[3];
    a.ndim = src.ndim-1;

    return a;
}

template <typename T>
CUDA_CALLABLE inline array_t<T> view(array_t<T>& src, int i, int j)
{
    assert(src.ndim > 2);
    assert(i >= 0 && i < src.shape[0]);
    assert(j >= 0 && j < src.shape[1]);

    array_t<T> a;
    a.data = data_at_byte_offset(src, byte_offset(src, i, j));
    a.shape[0] = src.shape[2];
    a.shape[1] = src.shape[3];
    a.strides[0] = src.strides[2];
    a.strides[1] = src.strides[3];
    a.ndim = src.ndim-2;

    return a;
}

template <typename T>
CUDA_CALLABLE inline array_t<T> view(array_t<T>& src, int i, int j, int k)
{
    assert(src.ndim > 3);
    assert(i >= 0 && i < src.shape[0]);
    assert(j >= 0 && j < src.shape[1]);
    assert(k >= 0 && k < src.shape[2]);

    array_t<T> a;
    a.data = data_at_byte_offset(src, byte_offset(src, i, j, k));
    a.shape[0] = src.shape[3];
    a.strides[0] = src.strides[3];
    a.ndim = src.ndim-3;

    return a;
}
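
// Illustrative sketch (not part of the API): view() returns a lower-rank
// alias into the same storage, so writes through the view are visible in the
// source array. Assumes the 2x3 float array from the earlier example.
//
//     wp::array_t<float> row1 = wp::view(a, 1);  // 1d view of row 1, shape (3,)
//     // row1.data == storage + 3; writing row1 elements mutates `a`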
template <typename T>
CUDA_CALLABLE inline indexedarray_t<T> view(indexedarray_t<T>& src, int i)
{
    assert(src.arr.ndim > 1);

    if (src.indices[0])
    {
        assert(i >= 0 && i < src.shape[0]);
        i = src.indices[0][i];
    }

    indexedarray_t<T> a;
    a.arr = view(src.arr, i);
    a.indices[0] = src.indices[1];
    a.indices[1] = src.indices[2];
    a.indices[2] = src.indices[3];
    a.shape[0] = src.shape[1];
    a.shape[1] = src.shape[2];
    a.shape[2] = src.shape[3];

    return a;
}

template <typename T>
CUDA_CALLABLE inline indexedarray_t<T> view(indexedarray_t<T>& src, int i, int j)
{
    assert(src.arr.ndim > 2);

    if (src.indices[0])
    {
        assert(i >= 0 && i < src.shape[0]);
        i = src.indices[0][i];
    }
    if (src.indices[1])
    {
        assert(j >= 0 && j < src.shape[1]);
        j = src.indices[1][j];
    }

    indexedarray_t<T> a;
    a.arr = view(src.arr, i, j);
    a.indices[0] = src.indices[2];
    a.indices[1] = src.indices[3];
    a.shape[0] = src.shape[2];
    a.shape[1] = src.shape[3];

    return a;
}

template <typename T>
CUDA_CALLABLE inline indexedarray_t<T> view(indexedarray_t<T>& src, int i, int j, int k)
{
    assert(src.arr.ndim > 3);

    if (src.indices[0])
    {
        assert(i >= 0 && i < src.shape[0]);
        i = src.indices[0][i];
    }
    if (src.indices[1])
    {
        assert(j >= 0 && j < src.shape[1]);
        j = src.indices[1][j];
    }
    if (src.indices[2])
    {
        assert(k >= 0 && k < src.shape[2]);
        k = src.indices[2][k];
    }

    indexedarray_t<T> a;
    a.arr = view(src.arr, i, j, k);
    a.indices[0] = src.indices[3];
    a.shape[0] = src.shape[3];

    return a;
}

template<template<typename> class A1, template<typename> class A2, template<typename> class A3, typename T>
inline CUDA_CALLABLE void adj_view(A1<T>& src, int i, A2<T>& adj_src, int adj_i, A3<T> adj_ret) {}

template<template<typename> class A1, template<typename> class A2, template<typename> class A3, typename T>
inline CUDA_CALLABLE void adj_view(A1<T>& src, int i, int j, A2<T>& adj_src, int adj_i, int adj_j, A3<T> adj_ret) {}

template<template<typename> class A1, template<typename> class A2, template<typename> class A3, typename T>
inline CUDA_CALLABLE void adj_view(A1<T>& src, int i, int j, int k, A2<T>& adj_src, int adj_i, int adj_j, int adj_k, A3<T> adj_ret) {}
// TODO: lower_bound() for indexed arrays?
template <typename T>
CUDA_CALLABLE inline int lower_bound(const array_t<T>& arr, int arr_begin, int arr_end, T value)
{
    assert(arr.ndim == 1);

    int lower = arr_begin;
    int upper = arr_end - 1;

    while(lower < upper)
    {
        int mid = lower + (upper - lower) / 2;

        // note: arr[mid] reads arr.data directly through the implicit T*
        // conversion, so the array is assumed to be contiguous
        if (arr[mid] < value)
        {
            lower = mid + 1;
        }
        else
        {
            upper = mid;
        }
    }

    return lower;
}

template <typename T>
CUDA_CALLABLE inline int lower_bound(const array_t<T>& arr, T value)
{
    return lower_bound(arr, 0, arr.shape[0], value);
}

template <typename T> inline CUDA_CALLABLE void adj_lower_bound(const array_t<T>& arr, T value, array_t<T> adj_arr, T adj_value, int adj_ret) {}
template <typename T> inline CUDA_CALLABLE void adj_lower_bound(const array_t<T>& arr, int arr_begin, int arr_end, T value, array_t<T> adj_arr, int adj_arr_begin, int adj_arr_end, T adj_value, int adj_ret) {}
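
// Illustrative sketch (not part of the API): lower_bound() binary-searches a
// sorted 1d array for the first element not less than value; the result is
// clamped to arr_end - 1 rather than returning arr_end as std::lower_bound
// does. Values below are assumptions for the example only.
//
//     float sorted[5] = {1.f, 2.f, 4.f, 4.f, 8.f};
//     wp::array_t<float> s(sorted, 5);
//     // lower_bound(s, 4.f) == 2  (first element >= 4)
//     // lower_bound(s, 9.f) == 4  (clamped to the last valid index)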
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_add(const A<T>& buf, int i, T value) { return atomic_add(&index(buf, i), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_add(const A<T>& buf, int i, int j, T value) { return atomic_add(&index(buf, i, j), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_add(const A<T>& buf, int i, int j, int k, T value) { return atomic_add(&index(buf, i, j, k), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_add(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_add(&index(buf, i, j, k, l), value); }

template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_sub(const A<T>& buf, int i, T value) { return atomic_add(&index(buf, i), -value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_sub(const A<T>& buf, int i, int j, T value) { return atomic_add(&index(buf, i, j), -value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_sub(const A<T>& buf, int i, int j, int k, T value) { return atomic_add(&index(buf, i, j, k), -value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_sub(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_add(&index(buf, i, j, k, l), -value); }

template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_min(const A<T>& buf, int i, T value) { return atomic_min(&index(buf, i), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_min(const A<T>& buf, int i, int j, T value) { return atomic_min(&index(buf, i, j), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_min(const A<T>& buf, int i, int j, int k, T value) { return atomic_min(&index(buf, i, j, k), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_min(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_min(&index(buf, i, j, k, l), value); }

template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, T value) { return atomic_max(&index(buf, i), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, T value) { return atomic_max(&index(buf, i, j), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, int k, T value) { return atomic_max(&index(buf, i, j, k), value); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_max(&index(buf, i, j, k, l), value); }

template<template<typename> class A, typename T>
inline CUDA_CALLABLE T* address(const A<T>& buf, int i) { return &index(buf, i); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j) { return &index(buf, i, j); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j, int k) { return &index(buf, i, j, k); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j, int k, int l) { return &index(buf, i, j, k, l); }
template<template<typename> class A, typename T>
inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, T value)
{
    FP_VERIFY_FWD_1(value)

    index(buf, i) = value;
}

template<template<typename> class A, typename T>
inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, T value)
{
    FP_VERIFY_FWD_2(value)

    index(buf, i, j) = value;
}

template<template<typename> class A, typename T>
inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, int k, T value)
{
    FP_VERIFY_FWD_3(value)

    index(buf, i, j, k) = value;
}

template<template<typename> class A, typename T>
inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, int k, int l, T value)
{
    FP_VERIFY_FWD_4(value)

    index(buf, i, j, k, l) = value;
}

template<typename T>
inline CUDA_CALLABLE void store(T* address, T value)
{
    FP_VERIFY_FWD(value)

    *address = value;
}

template<typename T>
inline CUDA_CALLABLE T load(T* address)
{
    T value = *address;
    FP_VERIFY_FWD(value)

    return value;
}
// select operator that checks whether an array is null: returns a if arr is
// null, b otherwise
template <typename T1, typename T2>
CUDA_CALLABLE inline T2 select(const array_t<T1>& arr, const T2& a, const T2& b) { return arr.data?b:a; }

template <typename T1, typename T2>
CUDA_CALLABLE inline void adj_select(const array_t<T1>& arr, const T2& a, const T2& b, const array_t<T1>& adj_cond, T2& adj_a, T2& adj_b, const T2& adj_ret)
{
    if (arr.data)
        adj_b += adj_ret;
    else
        adj_a += adj_ret;
}
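
// Illustrative sketch (not part of the API): select() implements
// "b if arr else a", mirroring truthiness of an array handle, and
// adj_select() routes the incoming adjoint to whichever operand was chosen.
//
//     wp::array_t<float> null_arr;              // data == nullptr
//     // select(null_arr, 1.f, 2.f) == 1.f
//     // select(a, 1.f, 2.f) == 2.f  (for any non-null array a)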
// stub for the case where we have a nested array inside a struct and
// atomic add the whole struct onto an array (e.g. during the backward pass)
template <typename T>
CUDA_CALLABLE inline void atomic_add(array_t<T>*, array_t<T>) {}

// for float and vector types this is just an alias for an atomic add
template <typename T>
CUDA_CALLABLE inline void adj_atomic_add(T* buf, T value) { atomic_add(buf, value); }

// for integral types we do not accumulate gradients
CUDA_CALLABLE inline void adj_atomic_add(int8* buf, int8 value) { }
CUDA_CALLABLE inline void adj_atomic_add(uint8* buf, uint8 value) { }
CUDA_CALLABLE inline void adj_atomic_add(int16* buf, int16 value) { }
CUDA_CALLABLE inline void adj_atomic_add(uint16* buf, uint16 value) { }
CUDA_CALLABLE inline void adj_atomic_add(int32* buf, int32 value) { }
CUDA_CALLABLE inline void adj_atomic_add(uint32* buf, uint32 value) { }
CUDA_CALLABLE inline void adj_atomic_add(int64* buf, int64 value) { }
CUDA_CALLABLE inline void adj_atomic_add(uint64* buf, uint64 value) { }
CUDA_CALLABLE inline void adj_atomic_add(bool* buf, bool value) { }
// only generate gradients for T types
template<typename T>
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_t<T>& adj_buf, int& adj_i, const T& adj_output)
{
    if (buf.grad)
        adj_atomic_add(&index_grad(buf, i), adj_output);
}

template<typename T>
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const array_t<T>& adj_buf, int& adj_i, int& adj_j, const T& adj_output)
{
    if (buf.grad)
        adj_atomic_add(&index_grad(buf, i, j), adj_output);
}

template<typename T>
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output)
{
    if (buf.grad)
        adj_atomic_add(&index_grad(buf, i, j, k), adj_output);
}

template<typename T>
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, int l, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output)
{
    if (buf.grad)
        adj_atomic_add(&index_grad(buf, i, j, k, l), adj_output);
}

template<typename T>
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value)
{
    if (buf.grad)
        adj_value += index_grad(buf, i);

    FP_VERIFY_ADJ_1(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value)
{
    if (buf.grad)
        adj_value += index_grad(buf, i, j);

    FP_VERIFY_ADJ_2(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value)
{
    if (buf.grad)
        adj_value += index_grad(buf, i, j, k);

    FP_VERIFY_ADJ_3(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value)
{
    if (buf.grad)
        adj_value += index_grad(buf, i, j, k, l);

    FP_VERIFY_ADJ_4(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_store(const T* address, T value, const T& adj_address, T& adj_value)
{
    // nop; generic store() operations are not differentiable, only array_store() is
    FP_VERIFY_ADJ(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_load(const T* address, const T& adj_address, T& adj_value)
{
    // nop; generic load() operations are not differentiable
}
template<typename T>
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_value += index_grad(buf, i);

    FP_VERIFY_ADJ_1(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_value += index_grad(buf, i, j);

    FP_VERIFY_ADJ_2(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_value += index_grad(buf, i, j, k);

    FP_VERIFY_ADJ_3(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_value += index_grad(buf, i, j, k, l);

    FP_VERIFY_ADJ_4(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_value -= index_grad(buf, i);

    FP_VERIFY_ADJ_1(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_value -= index_grad(buf, i, j);

    FP_VERIFY_ADJ_2(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_value -= index_grad(buf, i, j, k);

    FP_VERIFY_ADJ_3(value, adj_value)
}

template<typename T>
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_value -= index_grad(buf, i, j, k, l);

    FP_VERIFY_ADJ_4(value, adj_value)
}
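
// Note on the adjoints above: atomic_add(buf, i, value) accumulates value
// into element i of buf, so in the reverse pass the adjoint of value is the
// gradient stored at that element (buf.grad); atomic_sub flips the sign.
// Index arguments and the returned previous value receive no gradient.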
// generic array types that do not support gradient computation (indexedarray, etc.)
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, const A2<T>& adj_buf, int& adj_i, const T& adj_output) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, const A2<T>& adj_buf, int& adj_i, int& adj_j, const T& adj_output) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, int l, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output) {}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value) {}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {}
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {}
// generic handler for scalar values
template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_atomic_minmax(&index(buf, i), &index_grad(buf, i), value, adj_value);

    FP_VERIFY_ADJ_1(value, adj_value)
}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_atomic_minmax(&index(buf, i, j), &index_grad(buf, i, j), value, adj_value);

    FP_VERIFY_ADJ_2(value, adj_value)
}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_atomic_minmax(&index(buf, i, j, k), &index_grad(buf, i, j, k), value, adj_value);

    FP_VERIFY_ADJ_3(value, adj_value)
}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_atomic_minmax(&index(buf, i, j, k, l), &index_grad(buf, i, j, k, l), value, adj_value);

    FP_VERIFY_ADJ_4(value, adj_value)
}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_atomic_minmax(&index(buf, i), &index_grad(buf, i), value, adj_value);

    FP_VERIFY_ADJ_1(value, adj_value)
}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_atomic_minmax(&index(buf, i, j), &index_grad(buf, i, j), value, adj_value);

    FP_VERIFY_ADJ_2(value, adj_value)
}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_atomic_minmax(&index(buf, i, j, k), &index_grad(buf, i, j, k), value, adj_value);

    FP_VERIFY_ADJ_3(value, adj_value)
}

template<template<typename> class A1, template<typename> class A2, typename T>
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret)
{
    if (buf.grad)
        adj_atomic_minmax(&index(buf, i, j, k, l), &index_grad(buf, i, j, k, l), value, adj_value);

    FP_VERIFY_ADJ_4(value, adj_value)
}
} // namespace wp