File size: 6,601 Bytes
d1d4335 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include <vector>
#include "fbgemm/FbgemmBuild.h"
#include "fbgemm/UtilsAvx2.h"
#include "fbgemm/spmmUtilsAvx2.h"
namespace fbgemm {
/**
 * Plain (unblocked) Compressed Sparse Row matrix.
 * Non-zeros of row i are stored at indices [rowPtr[i], rowPtr[i+1]) of
 * colIdx/values.
 */
template <typename T>
struct FBGEMM_API CSRMatrix {
std::vector<int> rowPtr; // per-row start offsets into colIdx/values
std::vector<int> colIdx; // column index of each stored non-zero
std::vector<T> values; // value of each stored non-zero
};
/**
 * Tiled block CSR format
 * Partial blocks are zero-filled
 *
 * The matrix is first partitioned into column tiles of width COLTILE; each
 * tile is then stored in block-CSR form with ROW_BLOCK x COL_BLOCK blocks.
 */
template <typename T = std::int8_t, int ROW_BLOCK = 1, int COL_BLOCK = 4>
struct FBGEMM_API BCSRMatrix {
using DTYPE = T;
static constexpr int RB = ROW_BLOCK; // Block size for rows
static constexpr int CB = COL_BLOCK; // Block size for cols
// We only tile in column dimension currently
// COLTILE must be a multiple of COL_BLOCK
static constexpr int COLTILE = 4000;
std::vector<int> rowBPtr; // rowPtr for blocks
std::vector<int> colBIdx; // colIdx for blocks
std::vector<DTYPE> values; // blocks stored contiguously, zero-padded at edges
// Sum of all elements in a row
// (used for int8 requantization; see fbgemmSparseDenseInt8MM)
std::vector<int32_t> row_offsets;
int R; // number of rows of the dense matrix
int C; // number of columns of the dense matrix
// Sets the matrix dimensions and zero-initializes one row-offset per row;
// call pack() afterwards to populate the sparse structure.
BCSRMatrix(int Rows, int Cols) {
R = Rows;
C = Cols;
row_offsets.resize(R, 0);
}
/**
 * @brief pack from dense to tiled block CSR format
 * @param src is the source matrix with data type DTYPE; it must hold the
 *        R x C elements given at construction time
 * @param ld is the leading dimension of src
 */
void pack(const DTYPE* src, size_t ld);
/**
 * @brief pack from dense to tiled block CSR format
 * @param src is the source matrix with data type DTYPE; it must hold the
 *        R x C elements given at construction time
 *
 * leading dim of the matrix is assumed to be equal to C
 */
void pack(const DTYPE* src);
/**
 * @brief unpack from tiled block CSR to dense
 * @param dst should be able to hold R*C elements of type DTYPE
 * @param ld is the leading dimension of dst
 */
void unpack(DTYPE* dst, size_t ld);
/*
 * @brief unpack from tiled block CSR to dense
 * @param dst should be able to hold R*C elements of type DTYPE
 *
 * leading dimension of the matrix is assumed to be equal to C
 */
void unpack(DTYPE* dst);
};
/**
 * @brief Convert a dense R x C matrix (leading dimension ld) to CSR.
 */
template <typename T>
FBGEMM_API std::unique_ptr<CSRMatrix<T>>
fbgemmDenseToCSR(int R, int C, const T* inp, int ld);
/**
 * @brief Convert a dense R x C matrix to CSR; leading dimension is assumed
 *        to be C.
 */
template <typename T>
FBGEMM_API std::unique_ptr<CSRMatrix<T>>
fbgemmDenseToCSR(int R, int C, const T* inp);
/**
 * @brief Convert a dense R x C matrix (leading dimension ld) to tiled
 *        block CSR with RB x CB blocks (see BCSRMatrix).
 */
template <typename T = std::int8_t, int RB = 1, int CB = 4>
FBGEMM_API std::unique_ptr<BCSRMatrix<T, RB, CB>>
fbgemmDenseToBCSR(int R, int C, const T* inp, int ld);
/**
 * @brief Convert a dense R x C matrix to tiled block CSR; leading dimension
 *        is assumed to be C.
 */
template <typename T = std::int8_t, int RB = 1, int CB = 4>
FBGEMM_API std::unique_ptr<BCSRMatrix<T, RB, CB>>
fbgemmDenseToBCSR(int R, int C, const T* inp);
/**
 * @brief Sparse(CSR, fp32) x dense(fp32) matrix multiplication:
 *        C = A * B (or C += A * B when accum is true), where A is the
 *        M x K sparse matrix given by row_ptr/col_idx/values.
 *
 * @param M number of rows of the sparse A matrix (and of C)
 * @param N number of columns of B (and of C)
 * @param row_ptr CSR row pointers of A (M + 1 entries)
 * @param col_idx CSR column indices of A
 * @param values CSR non-zero values of A
 * @param B dense right-hand-side matrix
 * @param ldb leading dimension of B
 * @param C output matrix
 * @param ldc leading dimension of C
 * @param accum Controls accumulation.
 * true means we're accumulating to the C Matrix.
 *
 * Note on matrix order and layout:
 * Unlike other fbgemm functions that follow PyTorch convention where A
 * matrix is activation (so in uint8_t for quantized FC/Conv or fp32) and B
 * matrix is weight (so in int8_t for quantized FC/Conv or fp32), here A is
 * weight matrix. This is because we mostly target sparsity in weights and for
 * row-major layout it's more efficient to have A as a sparse matrix: for each
 * non-zero of A at ith row and kth column, we can access kth row of B, whose
 * elements are contiguous in memory. If B matrix was sparse, for each non-zero
 * of B at kth row and jth column, we would've needed to access kth column of A,
 * whose elements are not contiguous in memory with C/C++'s row-major layout.
 * Alternatively, we can call this function as if we're computing
 * C^T = B^T * A^T while maintaining PyTorch's convention that the lefthand
 * side matrix B is activation. If B matrix is in column-major layout, we don't
 * need to do an extra transposition. The C matrix will be output in
 * column-major layout, so if we have a back-to-back Sparse-Dense matrix-matrix
 * multiplications, B matrices of subsequent matrices will be already in
 * column-major layout. Refer to SparseDenseMMFP32Benchmark.cc for an example.
 *
 */
FBGEMM_API void SparseDenseMM(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);
/**
 * @brief Sparse(BCSR, int8 weights) x dense(uint8 activations) matrix
 *        multiplication with fused requantization to uint8 output,
 *        dispatched to the best available ISA-specific kernel.
 *
 * @tparam FUSE_RELU apply ReLU as part of requantization
 * @tparam Q_GRAN quantization granularity (per-tensor / per-row / etc.)
 * @param N number of columns of B (and of the output)
 * @param bcsr sparse weight matrix in tiled block CSR format
 * @param B dense uint8 activation matrix
 * @param ldb leading dimension of B
 * @param C_i32 int32 scratch/accumulation buffer
 * @param C_u8 requantized uint8 output
 * @param ldc leading dimension of the output
 * @param rParams requantization parameters (scales, zero points, ...)
 * @param accum true accumulates into the output instead of overwriting
 * @param thread_id id of the calling thread, in [0, num_threads)
 * @param num_threads total number of threads cooperating on this GEMM
 */
template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
FBGEMM_API void fbgemmSparseDenseInt8MM(
    int N,
    const std::unique_ptr<BCSRMatrix<>>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);
// ISA-specific kernel declarations. These are implementation details used by
// the dispatching entry points above (SparseDenseMM, fbgemmSparseDenseInt8MM);
// prefer the public functions, which select the kernel at runtime.
namespace internal {
// AVX2 fp32 kernel for SparseDenseMM; same parameter contract.
void SparseDenseMMAvx2(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);
// AVX-512 fp32 kernel for SparseDenseMM; same parameter contract.
void SparseDenseMMAvx512(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);
// AVX2 int8 kernel for fbgemmSparseDenseInt8MM; same parameter contract.
template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
void SparseDenseInt8MMAvx2(
    int N,
    const std::unique_ptr<BCSRMatrix<>>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);
// AVX-512 int8 kernel for fbgemmSparseDenseInt8MM; same parameter contract.
template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
void SparseDenseInt8MMAvx512(
    int N,
    const std::unique_ptr<BCSRMatrix<>>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);
// AVX-512 int8 sparse matrix x dense vector kernel (no N/ldc: B is a vector).
template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
void SparseDenseInt8MVAvx512(
    const std::unique_ptr<BCSRMatrix<>>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);
} // namespace internal
} // namespace fbgemm
|