|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
|
|
#include <assert.h>
#include <cpuinfo.h>

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <new>
#include <stdexcept>
#include <type_traits>

#include "SimdUtils.h"
#include "Types.h"
#include "Utils.h"
|
|
|
|
|
namespace fbgemm { |
|
|
|
|
|
/// Conversion functor interface: turns a source value of type F into the
/// packed element type T.
///
/// Only the call operator is declared here; no definition is visible in this
/// header, so each (T, F) combination that is actually used must be defined or
/// specialized elsewhere. PackedGemmMatrixB default-constructs its converter
/// and calls it with a `float` argument while packing.
template <typename T>
struct TypeConverter {
  /// Converts one source value to T. Declaration only.
  template <typename F>
  T operator()(F) const;
};
|
|
|
|
|
// Byte alignment requested from fbgemmAlignedAlloc for the packed-matrix
// buffer (see PackedGemmMatrixB::initializeMemory).
// NOTE(review): 64 presumably matches a cache line / 512-bit vector width;
// confirm before changing.
#define PMAT_ALIGNMENT 64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename C = TypeConverter<T>> |
|
|
class PackedGemmMatrixB { |
|
|
public: |
|
|
using value_type = T; |
|
|
using size_type = uint64_t; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PackedGemmMatrixB( |
|
|
const matrix_op_t trans, |
|
|
const int nrow, |
|
|
const int ncol, |
|
|
const float alpha, |
|
|
const float* smat, |
|
|
const int brow = 512) |
|
|
: nrow_(nrow), ncol_(ncol), brow_(brow), kernel_ncol_blocks_(2) { |
|
|
#ifdef FBGEMM_ENABLE_KLEIDIAI |
|
|
if constexpr (std::is_same<T, float16>::value) { |
|
|
kernel_ncol_blocks_ = 1; |
|
|
} |
|
|
#endif |
|
|
initializeParam(); |
|
|
initializeMemory(); |
|
|
|
|
|
this->packFromSrc(trans, alpha, smat); |
|
|
} |
|
|
|
|
|
PackedGemmMatrixB( |
|
|
const int nrow, |
|
|
const int ncol, |
|
|
const int brow, |
|
|
const int last_brow, |
|
|
const int bcol, |
|
|
const int nbrow, |
|
|
const int nbcol, |
|
|
const uint64_t size) |
|
|
: nrow_(nrow), |
|
|
ncol_(ncol), |
|
|
brow_(brow), |
|
|
last_brow_(last_brow), |
|
|
bcol_(bcol), |
|
|
nbrow_(nbrow), |
|
|
nbcol_(nbcol), |
|
|
size_(size), |
|
|
kernel_ncol_blocks_(2) { |
|
|
#ifdef FBGEMM_ENABLE_KLEIDIAI |
|
|
if constexpr (std::is_same<T, float16>::value) { |
|
|
kernel_ncol_blocks_ = 1; |
|
|
} |
|
|
#endif |
|
|
initializeMemory(); |
|
|
} |
|
|
|
|
|
PackedGemmMatrixB( |
|
|
const int nrow, |
|
|
const int ncol, |
|
|
const int brow, |
|
|
const int last_brow, |
|
|
const int bcol, |
|
|
const int nbrow, |
|
|
const int nbcol, |
|
|
const uint64_t size, |
|
|
const int kernel_ncol_blocks, |
|
|
void* pmat) |
|
|
: nrow_(nrow), |
|
|
ncol_(ncol), |
|
|
brow_(brow), |
|
|
last_brow_(last_brow), |
|
|
bcol_(bcol), |
|
|
nbrow_(nbrow), |
|
|
nbcol_(nbcol), |
|
|
size_(size), |
|
|
kernel_ncol_blocks_(kernel_ncol_blocks) { |
|
|
#ifdef FBGEMM_ENABLE_KLEIDIAI |
|
|
if constexpr (std::is_same<T, float16>::value) { |
|
|
kernel_ncol_blocks_ = 1; |
|
|
} |
|
|
#endif |
|
|
pmat_ = static_cast<T*>(pmat); |
|
|
packed_ = true; |
|
|
pmat_passed_in = true; |
|
|
} |
|
|
PackedGemmMatrixB(const PackedGemmMatrixB&) = delete; |
|
|
PackedGemmMatrixB(PackedGemmMatrixB&&) = delete; |
|
|
PackedGemmMatrixB& operator=(const PackedGemmMatrixB&) = delete; |
|
|
PackedGemmMatrixB& operator=(PackedGemmMatrixB&&) = delete; |
|
|
|
|
|
void initializeParam() { |
|
|
if (!cpuinfo_initialize()) { |
|
|
throw std::runtime_error("Failed to initialize cpuinfo!"); |
|
|
} |
|
|
bcol_ = (isZmm(fbgemmInstructionSet()) |
|
|
? simd_info<inst_set_t::avx512>::WIDTH_32BIT_ELEMS |
|
|
: simd_info<inst_set_t::avx2>::WIDTH_32BIT_ELEMS) * |
|
|
kernelNumColBlocks(); |
|
|
|
|
|
|
|
|
nbrow_ = (numRows() + blockRowSize() - 1) / blockRowSize(); |
|
|
last_brow_ = ((nrow_ % blockRowSize()) == 0) ? blockRowSize() |
|
|
: (nrow_ % blockRowSize()); |
|
|
nbcol_ = (numCols() + blockColSize() - 1) / blockColSize(); |
|
|
|
|
|
if (numCols() != blockColSize() * nbcol_) { |
|
|
#ifdef VLOG |
|
|
VLOG(0) << "Packer warning: ncol(" << numCols() |
|
|
<< ") is not a multiple of internal block size (" |
|
|
<< blockColSize() << ")"; |
|
|
VLOG(0) << "lefover is not super optimized hence overhead will inccur"; |
|
|
#endif |
|
|
} |
|
|
} |
|
|
|
|
|
void setPacked(bool p) { |
|
|
packed_ = p; |
|
|
} |
|
|
|
|
|
bool packed() const { |
|
|
return packed_; |
|
|
} |
|
|
|
|
|
void initializeMemory() { |
|
|
|
|
|
size_ = (blockRowSize() * nbrow_) * (blockColSize() * nbcol_); |
|
|
pmat_ = static_cast<T*>( |
|
|
fbgemmAlignedAlloc(PMAT_ALIGNMENT, matSize() * sizeof(T))); |
|
|
memset(pmat_, 0, matSize() * sizeof(T)); |
|
|
} |
|
|
|
|
|
~PackedGemmMatrixB() { |
|
|
if (pmat_passed_in == false) { |
|
|
fbgemmAlignedFree(pmat_); |
|
|
} |
|
|
} |
|
|
|
|
|
void unpackFromSrc(const matrix_op_t trans, T* src_mat) { |
|
|
bool tr = (trans == matrix_op_t::Transpose); |
|
|
for (int i = 0; i < numRows(); i++) { |
|
|
for (int j = 0; j < numCols(); j++) { |
|
|
pmat_[tr ? i + numRows() * j : i * numCols() + j] = src_mat[addr(i, j)]; |
|
|
} |
|
|
} |
|
|
packed_ = false; |
|
|
} |
|
|
|
|
|
void unpack(T* origin_buf, const matrix_op_t trans) { |
|
|
assert(packed_); |
|
|
bool tr = (trans == matrix_op_t::Transpose); |
|
|
for (int i = 0; i < numRows(); i++) { |
|
|
for (int j = 0; j < numCols(); j++) { |
|
|
origin_buf[tr ? i + numRows() * j : i * numCols() + j] = |
|
|
pmat_[addr(i, j)]; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
uint64_t addr(const int r_, const int c_) const { |
|
|
uint64_t r = (uint64_t)r_; |
|
|
uint64_t c = (uint64_t)c_; |
|
|
|
|
|
uint64_t block_row_id = r / blockRowSize(); |
|
|
uint64_t brow_offset = |
|
|
(block_row_id * nbcol_) * (blockRowSize() * blockColSize()); |
|
|
uint64_t block_col_id = c / blockColSize(); |
|
|
uint64_t bcol_offset = block_col_id * |
|
|
((static_cast<int64_t>(block_row_id) != nbrow_ - 1) |
|
|
? (blockRowSize() * blockColSize()) |
|
|
: (last_brow_ * blockColSize())); |
|
|
uint64_t block_offset = brow_offset + bcol_offset; |
|
|
uint64_t inblock_offset = |
|
|
r % blockRowSize() * blockColSize() + c % blockColSize(); |
|
|
|
|
|
uint64_t index = block_offset + inblock_offset; |
|
|
assert(static_cast<int64_t>(index) < matSize()); |
|
|
return index; |
|
|
} |
|
|
|
|
|
void |
|
|
packFromSrc(const matrix_op_t trans, const float alpha, const float* smat) { |
|
|
bool tr = (trans == matrix_op_t::Transpose); |
|
|
|
|
|
for (int i = 0; i < numRows(); i++) { |
|
|
for (int j = 0; j < numCols(); j++) { |
|
|
float src = alpha * |
|
|
((tr == false) ? smat[i * numCols() + j] : smat[i + numRows() * j]); |
|
|
pmat_[addr(i, j)] = C()(src); |
|
|
} |
|
|
} |
|
|
packed_ = true; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
void packFromSrc(const matrix_op_t trans, const T* smat) { |
|
|
bool tr = (trans == matrix_op_t::Transpose); |
|
|
for (int i = 0; i < numRows(); ++i) { |
|
|
for (int j = 0; j < numCols(); ++j) { |
|
|
pmat_[addr(i, j)] = smat[tr ? i + numRows() * j : i * numCols() + j]; |
|
|
} |
|
|
} |
|
|
packed_ = true; |
|
|
} |
|
|
|
|
|
const T& operator()(const int r, const int c) const { |
|
|
const auto a = addr(r, c); |
|
|
assert(r < numRows()); |
|
|
assert(c < numCols()); |
|
|
assert(static_cast<int64_t>(a) < this->matSize()); |
|
|
return pmat_[a]; |
|
|
} |
|
|
|
|
|
int matSize() const { |
|
|
return size_; |
|
|
} |
|
|
int numRows() const { |
|
|
return nrow_; |
|
|
} |
|
|
int numCols() const { |
|
|
return ncol_; |
|
|
} |
|
|
int lastBrow() const { |
|
|
return last_brow_; |
|
|
} |
|
|
int numBrow() const { |
|
|
return nbrow_; |
|
|
} |
|
|
int numBcol() const { |
|
|
return nbcol_; |
|
|
} |
|
|
T* pmat() const { |
|
|
return pmat_; |
|
|
} |
|
|
int blockRowSize() const { |
|
|
return brow_; |
|
|
} |
|
|
int blockColSize() const { |
|
|
return bcol_; |
|
|
} |
|
|
int kernelNumColBlocks() const { |
|
|
return kernel_ncol_blocks_; |
|
|
} |
|
|
|
|
|
const value_type* data() const { |
|
|
return pmat_; |
|
|
} |
|
|
|
|
|
uint64_t size() const { |
|
|
return size_ / sizeof(value_type); |
|
|
} |
|
|
|
|
|
int nrow_, ncol_; |
|
|
int brow_, last_brow_, bcol_; |
|
|
int nbrow_, nbcol_; |
|
|
uint64_t size_; |
|
|
int kernel_ncol_blocks_; |
|
|
T* pmat_; |
|
|
bool packed_{false}; |
|
|
bool pmat_passed_in{false}; |
|
|
}; |
|
|
|
|
|
#ifndef _M_X64 |
|
|
|
|
|
template <> |
|
|
FBGEMM_API |
|
|
PackedGemmMatrixB<float16, TypeConverter<float16>>::PackedGemmMatrixB( |
|
|
const matrix_op_t trans, |
|
|
const int nrow, |
|
|
const int ncol, |
|
|
const float alpha, |
|
|
const float* smat, |
|
|
const int brow); |
|
|
|
|
|
template <> |
|
|
FBGEMM_API |
|
|
PackedGemmMatrixB<float16, TypeConverter<float16>>::PackedGemmMatrixB( |
|
|
const int nrow, |
|
|
const int ncol, |
|
|
const int brow, |
|
|
const int last_brow, |
|
|
const int bcol, |
|
|
const int nbrow, |
|
|
const int nbcol, |
|
|
const uint64_t size); |
|
|
|
|
|
#endif |
|
|
|
|
|
} |
|
|
|