Spaces:

qbhf2
/

GarmentCode

Sleeping

File size: 9,615 Bytes

66c9c8a

/** Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

#pragma once

namespace wp
{


// Flat offset of element (i, j) in a row-major matrix whose consecutive rows
// are `stride` elements apart.
CUDA_CALLABLE inline int dense_index(int stride, int i, int j)
{
    const int row_start = stride*i;
    return row_start + j;
}

// Flat offset of logical element (i, j) of a rows x cols operand.
//   transpose == false: the operand is stored row-major as rows x cols  -> i*cols + j
//   transpose == true : the operand is stored row-major as cols x rows  -> j*rows + i
// i.e. with `transpose` set the logical matrix is the transpose of what is in memory.
template <bool transpose>
CUDA_CALLABLE inline int dense_index(int rows, int cols, int i, int j)
{
    return transpose ? (j*rows + i) : (i*cols + j);
}



// Serial dense matrix product C (=|+=) op(A) * op(B), executed by the calling thread.
//
//   op(A) is m x p, op(B) is p x n, C is m x n and always row-major.
//   t1 / t2 : when true the corresponding operand is read through the transposed
//             index mapping of dense_index<true>() (memory holds p x m / n x p).
//   add     : true accumulates into C, false overwrites C.
//
// The pointers are declared __restrict__, so A, B and C are promised not to overlap.
template <bool t1, bool t2, bool add>
CUDA_CALLABLE inline void dense_gemm_impl(int m, int n, int p, const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C)
{
    for (int row = 0; row < m; ++row)
    {
        for (int col = 0; col < n; ++col)
        {
            // dot product of row `row` of op(A) with column `col` of op(B)
            float acc = 0.0f;

            for (int inner = 0; inner < p; ++inner)
            {
                acc += A[dense_index<t1>(m, p, row, inner)]*B[dense_index<t2>(p, n, inner, col)];
            }

            float& dst = C[row*n + col];

            // `add` is a compile-time constant, so C is only read when accumulating
            dst = add ? dst + acc : acc;
        }
    }
}


// Runtime front-end for dense_gemm_impl(): C (=|+=) op(A) * op(B).
//
//   m, n, p : op(A) is m x p, op(B) is p x n, C is m x n
//   t1, t2  : 0 = use the operand as stored, 1 = use its transpose
//   add     : template flag, true accumulates into C instead of overwriting it
//
// Any flag value other than 0 or 1 matches no specialisation and the call does nothing.
template <bool add=false>
CUDA_CALLABLE inline void dense_gemm(int m, int n, int p, int t1, int t2, const array_t<float>& A, const array_t<float>& B, array_t<float>& C)
{
    if (t1 == 0)
    {
        if (t2 == 0)
            dense_gemm_impl<false, false, add>(m, n, p, A.data, B.data, C.data);
        else if (t2 == 1)
            dense_gemm_impl<false, true, add>(m, n, p, A.data, B.data, C.data);
    }
    else if (t1 == 1)
    {
        if (t2 == 0)
            dense_gemm_impl<true, false, add>(m, n, p, A.data, B.data, C.data);
        else if (t2 == 1)
            dense_gemm_impl<true, true, add>(m, n, p, A.data, B.data, C.data);
    }
}




void  CUDA_CALLABLE inline dense_chol(int n, const array_t<float>& A, float regularization, array_t<float>& L)
{
    for (int j=0; j < n; ++j)
    {
        float s = A.data[dense_index(n, j, j)] + regularization;

        for (int k=0; k < j; ++k)
        {
            float r = L.data[dense_index(n, j, k)];
            s -= r*r;
        }

        s = sqrt(s);
        const float invS = 1.0f/s;

        L.data[dense_index(n, j, j)] = s;

        for (int i=j+1; i < n; ++i)
        {
            s = A.data[dense_index(n, i, j)];
            
            for (int k=0; k < j; ++k)
            {
                s -= L.data[dense_index(n, i, k)]*L.data[dense_index(n, j, k)];
            }

            L.data[dense_index(n, i, j)] = s*invS;
        }
    }
}




// Solves (L*L^T) x = b for x, given the lower-triangular Cholesky factor L.
//
//   n : system size (L is indexed row-major with stride n)
//   L : factor; only the diagonal and the entries below it are read
//   b : right-hand side, n entries
//   x : output, n entries; also used as scratch for the intermediate vector
CUDA_CALLABLE inline void dense_subs(int n, const array_t<float>& L, const array_t<float>& b, array_t<float>& x)
{
    // stage 1 (forward): solve L*y = b top-down, storing y in x
    for (int row = 0; row < n; ++row)
    {
        float rhs = b.data[row];

        for (int col = 0; col < row; ++col)
        {
            rhs -= L.data[dense_index(n, row, col)]*x.data[col];
        }

        x.data[row] = rhs/L.data[dense_index(n, row, row)];
    }

    // stage 2 (backward): solve L^T*x = y bottom-up, in place.
    // L^T(row, col) is read as L(col, row).
    int row = n - 1;
    while (row >= 0)
    {
        float rhs = x.data[row];

        for (int col = row + 1; col < n; ++col)
        {
            rhs -= L.data[dense_index(n, col, row)]*x.data[col];
        }

        x.data[row] = rhs/L.data[dense_index(n, row, row)];
        --row;
    }
}

// Solves A*x = b using a previously computed Cholesky factor L of A.
// Only L is used for the computation; A is part of the signature but is not read.
CUDA_CALLABLE inline void dense_solve(int n, const array_t<float>& A, const array_t<float>& L, const array_t<float>& b, array_t<float>& x)
{
    (void)A;    // intentionally unused, the factor L carries everything that is needed

    dense_subs(n, L, b, x);
}


// CUDA_CALLABLE inline void print_matrix(const char* name, int m, int n, const float* data)
// {
//     printf("%s = [", name);

//     for (int i=0; i < m; ++i)
//     {
//         for (int j=0; j < n; ++j)
//         {
//             printf("%f ", data[dense_index(n, i, j)]);
//         }

//         printf(";\n");
//     }

//     printf("]\n");
// }

// adjoint methods
// Adjoint (reverse-mode) of dense_gemm(): given the gradient adj_C of
// C = op(A)*op(B), accumulates the gradients of A and B into adj_A and adj_B.
//
//   m, n, p : op(A) is m x p, op(B) is p x n, C and adj_C are m x n
//   t1, t2  : transpose flags of the forward call (non-zero means transposed)
//   A, B    : forward inputs, in the same memory layout the forward call used
//             (A is stored p x m when t1 is set, B is stored n x p when t2 is set)
//   adj_A, adj_B : gradient accumulators, same layout as A and B respectively
//   adj_C   : incoming gradient
//   C and the integer adj_* arguments are not used.
//
// With G = adj_C the four cases are:
//   t1 == 0 : adj_A (m x p) += G * op(B)^T
//   t1 != 0 : adj_A (p x m) += op(B) * G^T
//   t2 == 0 : adj_B (p x n) += op(A)^T * G
//   t2 != 0 : adj_B (n x p) += G^T * op(A)
// Each gradient must be produced in the storage layout of its own operand, which
// is why the operand order flips when that operand was transposed in the forward pass.
CUDA_CALLABLE inline void adj_dense_gemm(
    int m, int n, int p, int t1, int t2, const array_t<float>& A, const array_t<float>& B, array_t<float>& C,
    int adj_m, int adj_n, int adj_p, int adj_t1, int adj_t2, array_t<float>& adj_A, array_t<float>& adj_B, const array_t<float>& adj_C)
{
    // normalise the flags so they can be forwarded to dense_gemm(), which only accepts 0 or 1
    const int ta = t1 ? 1 : 0;
    const int tb = t2 ? 1 : 0;

    // gradient w.r.t. A
    if (ta)
    {
        // op(B) must honour t2: B is stored n x p when t2 is set
        dense_gemm<true>(p, m, n, tb, 1, B, adj_C, adj_A);
    }
    else
    {
        dense_gemm<true>(m, p, n, 0, 1 - tb, adj_C, B, adj_A);
    }

    // gradient w.r.t. B
    if (tb)
    {
        // B is stored n x p, so the result has to be G^T * op(A) (n x p),
        // not op(A)^T * G (p x n) which would land transposed in adj_B
        dense_gemm<true>(n, p, m, 1, ta, adj_C, A, adj_B);
    }
    else
    {
        dense_gemm<true>(p, n, m, 1 - ta, 0, A, adj_C, adj_B);
    }
}


// Adjoint counterpart of dense_chol(). The body is intentionally empty: no
// gradient is propagated through the factorisation itself, and none of the
// arguments (including adj_A and adj_L) are read or written.
CUDA_CALLABLE inline void adj_dense_chol(
    int n, const array_t<float>& A, float regularization, array_t<float>& L,
    int adj_n, const array_t<float>& adj_A, float adj_regularization, array_t<float>& adj_L)
{
    // nop, use dense_solve to differentiate through (A^-1)b = x
    // (adj_dense_solve is the function in this file that produces adj_A and adj_b)
}

// Adjoint counterpart of dense_subs(). The body is intentionally empty: no
// gradient is propagated through the triangular substitutions, and none of the
// arguments (including adj_L, adj_b and adj_x) are read or written.
CUDA_CALLABLE inline void adj_dense_subs(
    int n, const array_t<float>& L, const array_t<float>& b, array_t<float>& x,
    int adj_n, const array_t<float>& adj_L, const array_t<float>& adj_b, array_t<float>& adj_x)
{
    // nop, use dense_solve to differentiate through (A^-1)b = x
    // (adj_dense_solve is the function in this file that produces adj_A and adj_b)
}


// Adjoint (reverse-mode) of dense_solve(), i.e. of x = A^-1 * b.
//
//   n      : system size
//   L      : Cholesky factor of A, reused to apply A^-1 to the incoming gradient
//   x      : solution produced by the forward solve
//   adj_x  : incoming gradient w.r.t. x
//   adj_b  : receives A^-1 * adj_x; it is written by dense_subs(), so any
//            previous content is overwritten rather than accumulated into
//   adj_A  : accumulates -adj_b * x^T (row-major, stride n)
//   A, b, adj_n and adj_L are not used.
//
// Reference: M. Giles, "An extended collection of matrix derivative results for
// forward and reverse mode algorithmic differentiation", section 2.3.1,
// https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
CUDA_CALLABLE inline void adj_dense_solve(int n,
    const array_t<float>& A, const array_t<float>& L, const array_t<float>& b, const array_t<float>& x,
    int adj_n, array_t<float>& adj_A, array_t<float>& adj_L, array_t<float>& adj_b, const array_t<float>& adj_x)
{
    // adj_b = (L*L^T)^-1 * adj_x
    dense_subs(n, L, adj_x, adj_b);

    // adj_A += -adj_b * x^T (outer product)
    for (int row = 0; row < n; ++row)
    {
        const float neg_adj_b = -adj_b.data[row];

        for (int col = 0; col < n; ++col)
        {
            adj_A.data[dense_index(n, row, col)] += neg_adj_b*x.data[col];
        }
    }
}


// Evaluates one fully connected layer, out = activation(weights*x + bias), for a
// single column of x / out.
//
//   weights    : read row-major as shape[0] x shape[1] (outputs x inputs)
//   bias       : one value per output row
//   activation : callable applied to each pre-activation value
//   index      : column of x and out processed by this call
//   x          : input; element j of the column is read at x.data[index + x.shape[1]*j]
//   out        : output; element i of the column is written at out.data[index + x.shape[1]*i]
//                (the column stride of out is taken from x.shape[1], not from out)
template <typename F>
CUDA_CALLABLE inline void mlp(const array_t<float>& weights, const array_t<float>& bias, F activation, int index, const array_t<float>& x, array_t<float>& out)
{
    const int num_outputs = weights.shape[0];
    const int num_inputs = weights.shape[1];
    const int batch = x.shape[1];

    for (int row = 0; row < num_outputs; ++row)
    {
        // start from the bias, then add the dot product of this weight row with the input column
        float pre_activation = bias.data[row];

        for (int col = 0; col < num_inputs; ++col)
        {
            pre_activation += weights.data[row*num_inputs + col]*x.data[index + batch*col];
        }

        out.data[index + batch*row] = activation(pre_activation);
    }
}

// Adjoint (reverse-mode) of mlp() for a single column `index` of x / out.
//
//   weights, bias, x : forward inputs, indexed exactly as in mlp()
//   activation, out  : forward activation and output; not used here, the
//                      pre-activation values are recomputed instead
//   adj_weights      : if non-null, accumulates x_j * adj_f into entry (i, j)
//   adj_bias         : if non-null, accumulates adj_f into entry i
//   adj_activation   : called as adj_activation(pre_activation, adj_f, adj_out_i);
//                      expected to write the gradient w.r.t. the pre-activation into adj_f
//   adj_index        : unused
//   adj_x            : if non-null, accumulates weights(i, j) * adj_f into element j of the column
//   adj_out          : incoming gradient w.r.t. out; may be null
//
// All accumulation goes through atomic_add.
template <typename F, typename AdjF>
CUDA_CALLABLE inline void adj_mlp(const array_t<float>& weights, const array_t<float>& bias, F activation, int index, const array_t<float>& x, array_t<float>& out,
                                  array_t<float>& adj_weights, array_t<float>& adj_bias, AdjF adj_activation, int adj_index, array_t<float>& adj_x, array_t<float>& adj_out)
{
    // Without an incoming gradient adj_f is identically zero, so every
    // contribution below would be zero. Return early to skip the forward
    // recomputation and the atomics, and to avoid 0*inf / 0*NaN products
    // leaking NaNs into the gradient buffers.
    if (!adj_out.data)
        return;

    const int m = weights.shape[0];
    const int n = weights.shape[1];
    const int b = x.shape[1];

    for (int i=0; i < m; ++i)
    {
        // recompute forward pass so we don't have to store pre-activation outputs
        float tmp = bias.data[i];

        for(int j=0; j < n; ++j)
        {
            tmp += weights.data[i*n + j]*x.data[index + b*j];
        }

        // adjoint w.r.t. the activation
        float adj_f = 0.0f;
        adj_activation(tmp, adj_f, adj_out.data[index + b*i]);

        for (int j=0; j < n; ++j)
        {
            // adjoint w.r.t. M_i
            if (adj_weights.data)
                atomic_add(&adj_weights.data[i*n + j], x.data[index + b*j]*adj_f);    // todo: reduce these atomic stores using warp/block level reductions

            // adjoint w.r.t. x
            if (adj_x.data)
                atomic_add(&adj_x.data[index + b*j], weights.data[i*n + j]*adj_f);
        }

        // adjoint w.r.t. b
        if (adj_bias.data)
            atomic_add(&adj_bias.data[i], adj_f);
    }
}


// template <typename F>
// CUDA_CALLABLE inline void mlp(const array_t<float>& weights, const array_t<float>& bias, F activation, int m, int n, int b, int index, const array_t<float>& x, array_t<float>& out)
// {
//     x += index*n;
//     out += index*m;


//     for (int i=0; i < m; ++i)
//     {
//         float tmp = bias[i];

//         for(int j=0; j < n; ++j)
//         {
//             tmp += weights[i*n + j]*x[j];
//         }

//         out[i] = activation(tmp);
//     }
// }

// template <typename F, typename AdjF>
// CUDA_CALLABLE inline void adj_mlp(const array_t<float>& weights, const array_t<float>& bias, F activation, int m, int n, int b, int index, const array_t<float>& x, const array_t<float>& out,
//                                   array_t<float>& adj_weights, array_t<float>& adj_bias, AdjF adj_activation, int adj_m, int adj_n, int adj_b, int adj_index, array_t<float>& adj_x, array_t<float>& adj_out)
// {
//     x += index*n;
//     out += index*m;

//     adj_x += index*n;
//     adj_out += index*m;

//     for (int i=0; i < m; ++i)
//     {
//         // recompute forward pass so we don't have to store pre-activation outputs
//         float tmp = bias[i];

//         for(int j=0; j < n; ++j)
//         {
//             tmp += weights[i*n + j]*x[index + b*j];            
//         }

//         // adjoint w.r.t. the activation
//         float adj_f = 0.0f;
//         adj_activation(tmp, adj_f, adj_out[index + b*i]);

//         for (int j=0; j < n; ++j)
//         {
//             // adjoint w.r.t M_i
//             adj_weights[i*n + j] += x[j]*adj_f;

//             // adjoint w.r.t x
//             adj_x[index + b*j] += weights[i*n + j]*adj_f;
//         }

//         // adjoint w.r.t b
//         adj_bias[i] += adj_f;
//     }
// }

} // namespace wp