Spaces:
Sleeping
Sleeping
| /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. | |
| * NVIDIA CORPORATION and its licensors retain all intellectual property | |
| * and proprietary rights in and to this software, related documentation | |
| * and any modifications thereto. Any use, reproduction, disclosure or | |
| * distribution of this software and related documentation without an express | |
| * license agreement from NVIDIA CORPORATION is strictly prohibited. | |
| */ | |
| namespace wp | |
| { | |
// Flattens the 2D coordinate (i, j) into a linear offset: consecutive values
// of i are `stride` elements apart, consecutive values of j are adjacent.
CUDA_CALLABLE inline int dense_index(int stride, int i, int j)
{
    const int row_start = i*stride;
    return row_start + j;
}
// Linear offset of the logical element (i, j) for a matrix described by
// (rows, cols). With transpose == false the offset is i*cols + j; with
// transpose == true the roles are swapped and the offset is j*rows + i.
template <bool transpose>
CUDA_CALLABLE inline int dense_index(int rows, int cols, int i, int j)
{
    return transpose ? (j*rows + i) : (i*cols + j);
}
| template <bool t1, bool t2, bool add> | |
| CUDA_CALLABLE inline void dense_gemm_impl(int m, int n, int p, const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C) | |
| { | |
| for (int i=0; i < m; i++) | |
| { | |
| for (int j=0; j < n; ++j) | |
| { | |
| float sum = 0.0f; | |
| for (int k=0; k < p; ++k) | |
| { | |
| sum += A[dense_index<t1>(m, p, i, k)]*B[dense_index<t2>(p, n, k, j)]; | |
| } | |
| if (add) | |
| C[i*n + j] += sum; | |
| else | |
| C[i*n + j] = sum; | |
| } | |
| } | |
| } | |
| template <bool add=false> | |
| CUDA_CALLABLE inline void dense_gemm(int m, int n, int p, int t1, int t2, const array_t<float>& A, const array_t<float>& B, array_t<float>& C) | |
| { | |
| if (t1 == 0 && t2 == 0) | |
| dense_gemm_impl<false, false, add>(m, n, p, A.data, B.data, C.data); | |
| else if (t1 == 1 && t2 == 0) | |
| dense_gemm_impl<true, false, add>(m, n, p, A.data, B.data, C.data); | |
| else if (t1 == 0 && t2 == 1) | |
| dense_gemm_impl<false, true, add>(m, n, p, A.data, B.data, C.data); | |
| else if (t1 == 1 && t2 == 1) | |
| dense_gemm_impl<true, true, add>(m, n, p, A.data, B.data, C.data); | |
| } | |
| void CUDA_CALLABLE inline dense_chol(int n, const array_t<float>& A, float regularization, array_t<float>& L) | |
| { | |
| for (int j=0; j < n; ++j) | |
| { | |
| float s = A.data[dense_index(n, j, j)] + regularization; | |
| for (int k=0; k < j; ++k) | |
| { | |
| float r = L.data[dense_index(n, j, k)]; | |
| s -= r*r; | |
| } | |
| s = sqrt(s); | |
| const float invS = 1.0f/s; | |
| L.data[dense_index(n, j, j)] = s; | |
| for (int i=j+1; i < n; ++i) | |
| { | |
| s = A.data[dense_index(n, i, j)]; | |
| for (int k=0; k < j; ++k) | |
| { | |
| s -= L.data[dense_index(n, i, k)]*L.data[dense_index(n, j, k)]; | |
| } | |
| L.data[dense_index(n, i, j)] = s*invS; | |
| } | |
| } | |
| } | |
| // Solves (L*L^T)x = b given the Cholesky factor L | |
| CUDA_CALLABLE inline void dense_subs(int n, const array_t<float>& L, const array_t<float>& b, array_t<float>& x) | |
| { | |
| // forward substitution | |
| for (int i=0; i < n; ++i) | |
| { | |
| float s = b.data[i]; | |
| for (int j=0; j < i; ++j) | |
| { | |
| s -= L.data[dense_index(n, i, j)]*x.data[j]; | |
| } | |
| x.data[i] = s/L.data[dense_index(n, i, i)]; | |
| } | |
| // backward substitution | |
| for (int i=n-1; i >= 0; --i) | |
| { | |
| float s = x.data[i]; | |
| for (int j=i+1; j < n; ++j) | |
| { | |
| s -= L.data[dense_index(n, j, i)]*x.data[j]; | |
| } | |
| x.data[i] = s/L.data[dense_index(n, i, i)]; | |
| } | |
| } | |
| CUDA_CALLABLE inline void dense_solve(int n, const array_t<float>& A, const array_t<float>& L, const array_t<float>& b, array_t<float>& x) | |
| { | |
| dense_subs(n, L, b, x); | |
| } | |
| // CUDA_CALLABLE inline void print_matrix(const char* name, int m, int n, const float* data) | |
| // { | |
| // printf("%s = [", name); | |
| // for (int i=0; i < m; ++i) | |
| // { | |
| // for (int j=0; j < n; ++j) | |
| // { | |
| // printf("%f ", data[dense_index(n, i, j)]); | |
| // } | |
| // printf(";\n"); | |
| // } | |
| // printf("]\n"); | |
| // } | |
| // adjoint methods | |
| CUDA_CALLABLE inline void adj_dense_gemm( | |
| int m, int n, int p, int t1, int t2, const array_t<float>& A, const array_t<float>& B, array_t<float>& C, | |
| int adj_m, int adj_n, int adj_p, int adj_t1, int adj_t2, array_t<float>& adj_A, array_t<float>& adj_B, const array_t<float>& adj_C) | |
| { | |
| // print_matrix("A", m, p, A); | |
| // print_matrix("B", p, n, B); | |
| // printf("t1: %d t2: %d\n", t1, t2); | |
| if (t1) | |
| { | |
| dense_gemm<true>(p, m, n, 0, 1, B, adj_C, adj_A); | |
| dense_gemm<true>(p, n, m, int(!t1), 0, A, adj_C, adj_B); | |
| } | |
| else | |
| { | |
| dense_gemm<true>(m, p, n, 0, int(!t2), adj_C, B, adj_A); | |
| dense_gemm<true>(p, n, m, int(!t1), 0, A, adj_C, adj_B); | |
| } | |
| } | |
| CUDA_CALLABLE inline void adj_dense_chol( | |
| int n, const array_t<float>& A, float regularization, array_t<float>& L, | |
| int adj_n, const array_t<float>& adj_A, float adj_regularization, array_t<float>& adj_L) | |
| { | |
| // nop, use dense_solve to differentiate through (A^-1)b = x | |
| } | |
| CUDA_CALLABLE inline void adj_dense_subs( | |
| int n, const array_t<float>& L, const array_t<float>& b, array_t<float>& x, | |
| int adj_n, const array_t<float>& adj_L, const array_t<float>& adj_b, array_t<float>& adj_x) | |
| { | |
| // nop, use dense_solve to differentiate through (A^-1)b = x | |
| } | |
| CUDA_CALLABLE inline void adj_dense_solve(int n, | |
| const array_t<float>& A, const array_t<float>& L, const array_t<float>& b, const array_t<float>& x, | |
| int adj_n, array_t<float>& adj_A, array_t<float>& adj_L, array_t<float>& adj_b, const array_t<float>& adj_x) | |
| { | |
| // see https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pwp, section 2.3.1 | |
| dense_subs(n, L, adj_x, adj_b); | |
| // A* = -adj_b*x^T | |
| for (int i=0; i < n; ++i) | |
| { | |
| for (int j=0; j < n; ++j) | |
| { | |
| adj_A.data[dense_index(n, i, j)] += -adj_b.data[i]*x.data[j]; | |
| } | |
| } | |
| } | |
| template <typename F> | |
| CUDA_CALLABLE inline void mlp(const array_t<float>& weights, const array_t<float>& bias, F activation, int index, const array_t<float>& x, array_t<float>& out) | |
| { | |
| const int m = weights.shape[0]; | |
| const int n = weights.shape[1]; | |
| const int b = x.shape[1]; | |
| for (int i=0; i < m; ++i) | |
| { | |
| float tmp = bias.data[i]; | |
| for(int j=0; j < n; ++j) | |
| { | |
| tmp += weights.data[i*n + j]*x.data[index + b*j]; | |
| } | |
| out.data[index + b*i] = activation(tmp); | |
| } | |
| } | |
| template <typename F, typename AdjF> | |
| CUDA_CALLABLE inline void adj_mlp(const array_t<float>& weights, const array_t<float>& bias, F activation, int index, const array_t<float>& x, array_t<float>& out, | |
| array_t<float>& adj_weights, array_t<float>& adj_bias, AdjF adj_activation, int adj_index, array_t<float>& adj_x, array_t<float>& adj_out) | |
| { | |
| const int m = weights.shape[0]; | |
| const int n = weights.shape[1]; | |
| const int b = x.shape[1]; | |
| for (int i=0; i < m; ++i) | |
| { | |
| // recompute forward pass so we don't have to store pre-activation outputs | |
| float tmp = bias.data[i]; | |
| for(int j=0; j < n; ++j) | |
| { | |
| tmp += weights.data[i*n + j]*x.data[index + b*j]; | |
| } | |
| // adjoint w.r.t to acivation | |
| float adj_f = 0.0f; | |
| if (adj_out.data) | |
| adj_activation(tmp, adj_f, adj_out.data[index + b*i]); | |
| for (int j=0; j < n; ++j) | |
| { | |
| // adjoint w.r.t M_i | |
| if (adj_weights.data) | |
| atomic_add(&adj_weights.data[i*n + j], x.data[index + b*j]*adj_f); // todo: reduce these atomic stores using warp/block level reductions | |
| // adjoint w.r.t x | |
| if (adj_x.data) | |
| atomic_add(&adj_x.data[index + b*j], weights.data[i*n + j]*adj_f); | |
| } | |
| // adjoint w.r.t b | |
| if (adj_bias.data) | |
| atomic_add(&adj_bias.data[i], adj_f); | |
| } | |
| } | |
| // template <typename F> | |
| // CUDA_CALLABLE inline void mlp(const array_t<float>& weights, const array_t<float>& bias, F activation, int m, int n, int b, int index, const array_t<float>& x, array_t<float>& out) | |
| // { | |
| // x += index*n; | |
| // out += index*m; | |
| // for (int i=0; i < m; ++i) | |
| // { | |
| // float tmp = bias[i]; | |
| // for(int j=0; j < n; ++j) | |
| // { | |
| // tmp += weights[i*n + j]*x[j]; | |
| // } | |
| // out[i] = activation(tmp); | |
| // } | |
| // } | |
| // template <typename F, typename AdjF> | |
| // CUDA_CALLABLE inline void adj_mlp(const array_t<float>& weights, const array_t<float>& bias, F activation, int m, int n, int b, int index, const array_t<float>& x, const array_t<float>& out, | |
| // array_t<float>& adj_weights, array_t<float>& adj_bias, AdjF adj_activation, int adj_m, int adj_n, int adj_b, int adj_index, array_t<float>& adj_x, array_t<float>& adj_out) | |
| // { | |
| // x += index*n; | |
| // out += index*m; | |
| // adj_x += index*n; | |
| // adj_out += index*m; | |
| // for (int i=0; i < m; ++i) | |
| // { | |
| // // recompute forward pass so we don't have to store pre-activation outputs | |
| // float tmp = bias[i]; | |
| // for(int j=0; j < n; ++j) | |
| // { | |
| // tmp += weights[i*n + j]*x[index + b*j]; | |
| // } | |
| // // adjoint w.r.t to acivation | |
| // float adj_f = 0.0f; | |
| // adj_activation(tmp, adj_f, adj_out[index + b*i]); | |
| // for (int j=0; j < n; ++j) | |
| // { | |
| // // adjoint w.r.t M_i | |
| // adj_weights[i*n + j] += x[j]*adj_f; | |
| // // adjoint w.r.t x | |
| // adj_x[index + b*j] += weights[i*n + j]*adj_f; | |
| // } | |
| // // adjoint w.r.t b | |
| // adj_bias[i] += adj_f; | |
| // } | |
| // } | |
| } // namespace wp |