|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
#include <assert.h>

#include <cpuinfo.h>

#include <array>
#include <cstdint>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <type_traits>
#include <typeinfo>
#include <vector>

#include "SimdUtils.h"
#include "Types.h"
#include "Utils.h"
|
|
|
|
|
|
namespace fbgemm {
|
|
|
|
|
|
// Functor that converts a source scalar (typically float) into the packed
// storage type T. Only the declaration is visible here; concrete
// conversions (e.g. float -> float16) are provided by explicit
// specializations elsewhere -- presumably with rounding for narrowing
// conversions; confirm against the specializations.
template <typename T>
struct TypeConverter {
  // Convert a value of type F to T.
  template <typename F>
  T operator()(F) const;
};
|
|
|
|
|
|
// Byte alignment used when allocating the packed buffer (see
// initializeMemory's fbgemmAlignedAlloc call).
#define PMAT_ALIGNMENT 64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename C = TypeConverter<T>>
|
|
|
class PackedGemmMatrixB {
|
|
|
public:
|
|
|
using value_type = T;
|
|
|
using size_type = uint64_t;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PackedGemmMatrixB(
|
|
|
const matrix_op_t trans,
|
|
|
const int nrow,
|
|
|
const int ncol,
|
|
|
const float alpha,
|
|
|
const float* smat,
|
|
|
const int brow = 512)
|
|
|
: nrow_(nrow), ncol_(ncol), brow_(brow), kernel_ncol_blocks_(2) {
|
|
|
#ifdef FBGEMM_ENABLE_KLEIDIAI
|
|
|
if (std::is_same<T, float16>::value) {
|
|
|
kernel_ncol_blocks_ = 1;
|
|
|
}
|
|
|
#endif
|
|
|
initializeParam();
|
|
|
initializeMemory();
|
|
|
|
|
|
this->packFromSrc(trans, alpha, smat);
|
|
|
}
|
|
|
|
|
|
PackedGemmMatrixB(
|
|
|
const int nrow,
|
|
|
const int ncol,
|
|
|
const int brow,
|
|
|
const int last_brow,
|
|
|
const int bcol,
|
|
|
const int nbrow,
|
|
|
const int nbcol,
|
|
|
const uint64_t size)
|
|
|
: nrow_(nrow),
|
|
|
ncol_(ncol),
|
|
|
brow_(brow),
|
|
|
last_brow_(last_brow),
|
|
|
bcol_(bcol),
|
|
|
nbrow_(nbrow),
|
|
|
nbcol_(nbcol),
|
|
|
size_(size),
|
|
|
kernel_ncol_blocks_(2) {
|
|
|
#ifdef FBGEMM_ENABLE_KLEIDIAI
|
|
|
if (std::is_same<T, float16>::value) {
|
|
|
kernel_ncol_blocks_ = 1;
|
|
|
}
|
|
|
#endif
|
|
|
initializeMemory();
|
|
|
}
|
|
|
|
|
|
PackedGemmMatrixB(
|
|
|
const int nrow,
|
|
|
const int ncol,
|
|
|
const int brow,
|
|
|
const int last_brow,
|
|
|
const int bcol,
|
|
|
const int nbrow,
|
|
|
const int nbcol,
|
|
|
const uint64_t size,
|
|
|
const int kernel_ncol_blocks,
|
|
|
void* pmat)
|
|
|
: nrow_(nrow),
|
|
|
ncol_(ncol),
|
|
|
brow_(brow),
|
|
|
last_brow_(last_brow),
|
|
|
bcol_(bcol),
|
|
|
nbrow_(nbrow),
|
|
|
nbcol_(nbcol),
|
|
|
size_(size),
|
|
|
kernel_ncol_blocks_(kernel_ncol_blocks) {
|
|
|
#ifdef FBGEMM_ENABLE_KLEIDIAI
|
|
|
if (std::is_same<T, float16>::value) {
|
|
|
kernel_ncol_blocks_ = 1;
|
|
|
}
|
|
|
#endif
|
|
|
pmat_ = static_cast<T*>(pmat);
|
|
|
packed_ = true;
|
|
|
pmat_passed_in = true;
|
|
|
}
|
|
|
|
|
|
void initializeParam() {
|
|
|
if (!cpuinfo_initialize()) {
|
|
|
throw std::runtime_error("Failed to initialize cpuinfo!");
|
|
|
}
|
|
|
bcol_ = (isZmm(fbgemmInstructionSet())
|
|
|
? simd_info<inst_set_t::avx512>::WIDTH_32BIT_ELEMS
|
|
|
: simd_info<inst_set_t::avx2>::WIDTH_32BIT_ELEMS) *
|
|
|
kernelNumColBlocks();
|
|
|
|
|
|
|
|
|
nbrow_ = (numRows() + blockRowSize() - 1) / blockRowSize();
|
|
|
last_brow_ = ((nrow_ % blockRowSize()) == 0) ? blockRowSize()
|
|
|
: (nrow_ % blockRowSize());
|
|
|
nbcol_ = (numCols() + blockColSize() - 1) / blockColSize();
|
|
|
|
|
|
if (numCols() != blockColSize() * nbcol_) {
|
|
|
#ifdef VLOG
|
|
|
VLOG(0) << "Packer warning: ncol(" << numCols()
|
|
|
<< ") is not a multiple of internal block size ("
|
|
|
<< blockColSize() << ")";
|
|
|
VLOG(0) << "lefover is not super optimized hence overhead will inccur";
|
|
|
#endif
|
|
|
}
|
|
|
}
|
|
|
|
|
|
void setPacked(bool p) {
|
|
|
packed_ = p;
|
|
|
}
|
|
|
|
|
|
bool packed() const {
|
|
|
return packed_;
|
|
|
}
|
|
|
|
|
|
void initializeMemory() {
|
|
|
|
|
|
size_ = (blockRowSize() * nbrow_) * (blockColSize() * nbcol_);
|
|
|
pmat_ = static_cast<T*>(
|
|
|
fbgemmAlignedAlloc(PMAT_ALIGNMENT, matSize() * sizeof(T)));
|
|
|
memset(pmat_, 0, matSize() * sizeof(T));
|
|
|
}
|
|
|
|
|
|
~PackedGemmMatrixB() {
|
|
|
if (pmat_passed_in == false) {
|
|
|
fbgemmAlignedFree(pmat_);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
void unpackFromSrc(const matrix_op_t trans, T* src_mat) {
|
|
|
bool tr = (trans == matrix_op_t::Transpose);
|
|
|
for (int i = 0; i < numRows(); i++) {
|
|
|
for (int j = 0; j < numCols(); j++) {
|
|
|
pmat_[tr ? i + numRows() * j : i * numCols() + j] = src_mat[addr(i, j)];
|
|
|
}
|
|
|
}
|
|
|
packed_ = false;
|
|
|
}
|
|
|
|
|
|
void unpack(T* origin_buf, const matrix_op_t trans) {
|
|
|
assert(packed_);
|
|
|
bool tr = (trans == matrix_op_t::Transpose);
|
|
|
for (int i = 0; i < numRows(); i++) {
|
|
|
for (int j = 0; j < numCols(); j++) {
|
|
|
origin_buf[tr ? i + numRows() * j : i * numCols() + j] =
|
|
|
pmat_[addr(i, j)];
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
uint64_t addr(const int r_, const int c_) const {
|
|
|
uint64_t r = (uint64_t)r_;
|
|
|
uint64_t c = (uint64_t)c_;
|
|
|
|
|
|
uint64_t block_row_id = r / blockRowSize(),
|
|
|
brow_offset =
|
|
|
(block_row_id * nbcol_) * (blockRowSize() * blockColSize());
|
|
|
uint64_t block_col_id = c / blockColSize(),
|
|
|
bcol_offset = block_col_id *
|
|
|
((static_cast<int64_t>(block_row_id) != nbrow_ - 1)
|
|
|
? (blockRowSize() * blockColSize())
|
|
|
: (last_brow_ * blockColSize()));
|
|
|
uint64_t block_offset = brow_offset + bcol_offset;
|
|
|
uint64_t inblock_offset =
|
|
|
r % blockRowSize() * blockColSize() + c % blockColSize();
|
|
|
|
|
|
uint64_t index = block_offset + inblock_offset;
|
|
|
assert(static_cast<int64_t>(index) < matSize());
|
|
|
return index;
|
|
|
}
|
|
|
|
|
|
void
|
|
|
packFromSrc(const matrix_op_t trans, const float alpha, const float* smat) {
|
|
|
bool tr = (trans == matrix_op_t::Transpose);
|
|
|
|
|
|
for (int i = 0; i < numRows(); i++) {
|
|
|
for (int j = 0; j < numCols(); j++) {
|
|
|
float src = alpha *
|
|
|
((tr == false) ? smat[i * numCols() + j] : smat[i + numRows() * j]);
|
|
|
pmat_[addr(i, j)] = C()(src);
|
|
|
}
|
|
|
}
|
|
|
packed_ = true;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void packFromSrc(const matrix_op_t trans, const T* smat) {
|
|
|
bool tr = (trans == matrix_op_t::Transpose);
|
|
|
for (int i = 0; i < numRows(); ++i) {
|
|
|
for (int j = 0; j < numCols(); ++j) {
|
|
|
pmat_[addr(i, j)] = smat[tr ? i + numRows() * j : i * numCols() + j];
|
|
|
}
|
|
|
}
|
|
|
packed_ = true;
|
|
|
}
|
|
|
|
|
|
const T& operator()(const int r, const int c) const {
|
|
|
const auto a = addr(r, c);
|
|
|
assert(r < numRows());
|
|
|
assert(c < numCols());
|
|
|
assert(static_cast<int64_t>(a) < this->matSize());
|
|
|
return pmat_[a];
|
|
|
}
|
|
|
|
|
|
int matSize() const {
|
|
|
return size_;
|
|
|
}
|
|
|
int numRows() const {
|
|
|
return nrow_;
|
|
|
}
|
|
|
int numCols() const {
|
|
|
return ncol_;
|
|
|
}
|
|
|
int lastBrow() const {
|
|
|
return last_brow_;
|
|
|
}
|
|
|
int numBrow() const {
|
|
|
return nbrow_;
|
|
|
}
|
|
|
int numBcol() const {
|
|
|
return nbcol_;
|
|
|
}
|
|
|
T* pmat() const {
|
|
|
return pmat_;
|
|
|
}
|
|
|
inline int blockRowSize() const {
|
|
|
return brow_;
|
|
|
}
|
|
|
inline int blockColSize() const {
|
|
|
return bcol_;
|
|
|
}
|
|
|
inline int kernelNumColBlocks() const {
|
|
|
return kernel_ncol_blocks_;
|
|
|
}
|
|
|
|
|
|
const value_type* data() const {
|
|
|
return pmat_;
|
|
|
}
|
|
|
|
|
|
uint64_t size() const {
|
|
|
return size_ / sizeof(value_type);
|
|
|
}
|
|
|
|
|
|
int nrow_, ncol_;
|
|
|
int brow_, last_brow_, bcol_;
|
|
|
int nbrow_, nbcol_;
|
|
|
uint64_t size_;
|
|
|
int kernel_ncol_blocks_;
|
|
|
T* pmat_;
|
|
|
bool packed_{false};
|
|
|
bool pmat_passed_in{false};
|
|
|
};
|
|
|
|
|
|
#ifndef _M_X64

// Explicit specialization declarations for the fp16 packed matrix; the
// definitions live in a translation unit elsewhere in the project.
// (Excluded under _M_X64 -- presumably handled differently in the MSVC
// x64 build; confirm against the build configuration.)
template <>
FBGEMM_API
PackedGemmMatrixB<float16, TypeConverter<float16>>::PackedGemmMatrixB(
    const matrix_op_t trans,
    const int nrow,
    const int ncol,
    const float alpha,
    const float* smat,
    const int brow);

template <>
FBGEMM_API
PackedGemmMatrixB<float16, TypeConverter<float16>>::PackedGemmMatrixB(
    const int nrow,
    const int ncol,
    const int brow,
    const int last_brow,
    const int bcol,
    const int nbrow,
    const int nbcol,
    const uint64_t size);

#endif
|
|
|
|
|
|
}
|
|
|
|