|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cassert>
|
|
|
#include <cmath>
|
|
|
#include <limits>
|
|
|
#include <memory>
|
|
|
#include <type_traits>
|
|
|
#include "./ConvUtils.h"
|
|
|
#include "./FbgemmBuild.h"
|
|
|
#include "./FbgemmEmbedding.h"
|
|
|
#include "./FbgemmI8DepthwiseAvx2.h"
|
|
|
#include "./FbgemmI8DirectconvAvx2.h"
|
|
|
#include "./FbgemmI8Spmdm.h"
|
|
|
#include "./FloatConversion.h"
|
|
|
#include "./QuantUtilsAvx2.h"
|
|
|
#include "./Types.h"
|
|
|
#include "./Utils.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
// Everything in this section is compiled only when
// FBGEMM_MEASURE_TIME_BREAKDOWN is defined.
#include <chrono>
#include <iostream>

// Global accumulators; they are only declared here and must be defined in
// exactly one translation unit.
// NOTE(review): presumably one elapsed-time total per phase; the unit is not
// visible from this header -- confirm at the definition site.
extern double packing_time;
extern double computing_time;
extern double kernel_time;
extern double postprocessing_time;
extern double run_time;
#endif // FBGEMM_MEASURE_TIME_BREAKDOWN
|
|
|
|
|
|
namespace fbgemm {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename T,
|
|
|
typename accT,
|
|
|
inst_set_t instSet,
|
|
|
typename int8Type = void>
|
|
|
struct PackingTraits;
|
|
|
|
|
|
|
|
|
#include "./PackingTraits-inl.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename PT, typename inpType, typename accType = std::int32_t>
|
|
|
class PackMatrix {
|
|
|
public:
|
|
|
PackMatrix() = delete;
|
|
|
PackMatrix(const PackMatrix&) = delete;
|
|
|
PackMatrix& operator==(const PackMatrix&) = delete;
|
|
|
PackMatrix(PackMatrix&&) = delete;
|
|
|
PackMatrix& operator==(PackMatrix&& rhs) noexcept = delete;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PackMatrix(
|
|
|
std::int32_t rows,
|
|
|
std::int32_t cols,
|
|
|
inpType* pmat,
|
|
|
int groups = 1,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isPrePacked() const {
|
|
|
return static_cast<const PT*>(this)->isPrePacked();
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static constexpr bool isA() {
|
|
|
return PT::isA();
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int packedBufferSize(
|
|
|
int rows = 0,
|
|
|
int cols = 0,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
|
|
|
FBGEMM_PUSH_WARNING_AND_DISABLE("-Winfinite-recursion")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t* getRowOffsetBuffer() const {
|
|
|
return static_cast<const PT*>(this)->getRowOffsetBuffer();
|
|
|
}
|
|
|
FBGEMM_POP_WARNING
|
|
|
|
|
|
FBGEMM_PUSH_WARNING_AND_DISABLE("-Winfinite-recursion")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isThisLastKBlock(int block_id) const {
|
|
|
return static_cast<const PT*>(this)->isThisLastKBlock(block_id);
|
|
|
}
|
|
|
FBGEMM_POP_WARNING
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack(const block_type_t& block) {
|
|
|
static_cast<PT*>(this)->pack(block);
|
|
|
}
|
|
|
|
|
|
std::int32_t numRows() const {
|
|
|
return nrows_;
|
|
|
}
|
|
|
|
|
|
std::int32_t numCols() const {
|
|
|
return ncols_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t blockRowSize() const {
|
|
|
return brow_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t blockColSize() const {
|
|
|
return bcol_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t blockRows() const {
|
|
|
return nbrow_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t blockCols() const {
|
|
|
return nbcol_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t numPackedRows() const {
|
|
|
return packedBlock_.row_size;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t numPackedCols() const {
|
|
|
return packedBlock_.col_size;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t packedRowStart() const {
|
|
|
return packedBlock_.row_start;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t packedColStart() const {
|
|
|
return packedBlock_.col_start;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inpType* getBuf(std::int32_t rowBlockNum = 0, std::int32_t colBlockNum = 0) {
|
|
|
return buf_ + blockRowSize() * blockColSize() * rowBlockNum +
|
|
|
blockRowSize() * blockColSize() * blockCols() * colBlockNum;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void printPackedMatrix(std::string name) {
|
|
|
static_cast<PT*>(this)->printPackedMatrix(name);
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t lastBrow() const {
|
|
|
return last_brow_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t lastBcol() const {
|
|
|
return last_bcol_;
|
|
|
}
|
|
|
|
|
|
int numGroups() const {
|
|
|
return G_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isThereColRemainder() const {
|
|
|
return last_bcol_ != blockColSize();
|
|
|
}
|
|
|
|
|
|
virtual ~PackMatrix() {
|
|
|
if (bufAllocatedHere_) {
|
|
|
fbgemmAlignedFree(buf_);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
protected:
|
|
|
|
|
|
|
|
|
|
|
|
void packedBlock(const block_type_t& block) {
|
|
|
packedBlock_ = block;
|
|
|
nbrow_ = (numPackedRows() + blockRowSize() - 1) / blockRowSize();
|
|
|
nbcol_ = (numPackedCols() + blockColSize() - 1) / blockColSize();
|
|
|
|
|
|
last_brow_ = ((numPackedRows() % blockRowSize()) == 0)
|
|
|
? blockRowSize()
|
|
|
: (numPackedRows() % blockRowSize());
|
|
|
last_bcol_ = ((numPackedCols() % blockColSize()) == 0)
|
|
|
? blockColSize()
|
|
|
: (numPackedCols() % blockColSize());
|
|
|
}
|
|
|
|
|
|
inpType* buf_;
|
|
|
std::int32_t brow_;
|
|
|
std::int32_t bcol_;
|
|
|
std::int32_t nbrow_;
|
|
|
std::int32_t nbcol_;
|
|
|
bool bufAllocatedHere_{false};
|
|
|
const BlockingFactors*
|
|
|
blocking_params;
|
|
|
|
|
|
private:
|
|
|
std::int32_t nrows_, ncols_;
|
|
|
int G_;
|
|
|
block_type_t packedBlock_;
|
|
|
std::int32_t last_brow_, last_bcol_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename accT = std::int32_t>
|
|
|
class FBGEMM_API PackAMatrix final
|
|
|
: public PackMatrix<PackAMatrix<T, accT>, T, accT> {
|
|
|
public:
|
|
|
using This = PackAMatrix<T, accT>;
|
|
|
using BaseType = PackMatrix<This, T, accT>;
|
|
|
using inpType = T;
|
|
|
using accType = accT;
|
|
|
|
|
|
PackAMatrix() = delete;
|
|
|
|
|
|
PackAMatrix(
|
|
|
matrix_op_t trans,
|
|
|
std::int32_t nRow,
|
|
|
std::int32_t nCol,
|
|
|
const inpType* smat,
|
|
|
std::int32_t ld,
|
|
|
inpType* pmat = nullptr,
|
|
|
int groups = 1,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isPrePacked() const {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static constexpr bool isA() {
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t* getRowOffsetBuffer() const {
|
|
|
return nullptr;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t addr(std::int32_t i, std::int32_t j) const;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack(const block_type_t& block);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void printPackedMatrix(std::string name);
|
|
|
|
|
|
private:
|
|
|
matrix_op_t trans_;
|
|
|
const T* smat_;
|
|
|
std::int32_t ld_;
|
|
|
std::int32_t row_interleave_B_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename accT = std::int32_t>
|
|
|
class FBGEMM_API PackBMatrix final
|
|
|
: public PackMatrix<PackBMatrix<T, accT>, T, accT> {
|
|
|
public:
|
|
|
using This = PackBMatrix<T, accT>;
|
|
|
using BaseType = PackMatrix<This, T, accT>;
|
|
|
using inpType = T;
|
|
|
using accType = accT;
|
|
|
|
|
|
PackBMatrix() = delete;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PackBMatrix(
|
|
|
matrix_op_t trans,
|
|
|
std::int32_t nRow,
|
|
|
std::int32_t nCol,
|
|
|
const inpType* smat,
|
|
|
std::int32_t ld,
|
|
|
inpType* pmat = nullptr,
|
|
|
int groups = 1,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isPrePacked() const {
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static constexpr bool isA() {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isThisLastKBlock(int block_id) const {
|
|
|
return (BaseType::blockRows() - 1) == block_id;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t addr(std::int32_t i, std::int32_t j) const;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack(const block_type_t& block, const BlockingFactors* params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void printPackedMatrix(
|
|
|
std::string name,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool metaEquals(const PackBMatrix<T, accT>& that) const;
|
|
|
|
|
|
|
|
|
|
|
|
bool equals(const PackBMatrix<T, accT>& that) const;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void unpack(T* origin_buf, const BlockingFactors* params = nullptr);
|
|
|
|
|
|
~PackBMatrix() {}
|
|
|
|
|
|
private:
|
|
|
matrix_op_t trans_;
|
|
|
const T* smat_;
|
|
|
std::int32_t ld_;
|
|
|
std::int32_t row_interleave_;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack_unpack_(
|
|
|
const block_type_t& block,
|
|
|
T* unpack_buf,
|
|
|
T* pack_buf,
|
|
|
bool ispack,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
|
|
|
class FBGEMM_API PackWeightMatrixForGConv {
|
|
|
public:
|
|
|
using This = PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>;
|
|
|
using inpType = T;
|
|
|
using accType = accT;
|
|
|
|
|
|
PackWeightMatrixForGConv() = delete;
|
|
|
PackWeightMatrixForGConv(const PackWeightMatrixForGConv&) = delete;
|
|
|
PackWeightMatrixForGConv& operator==(const PackWeightMatrixForGConv&) =
|
|
|
delete;
|
|
|
|
|
|
PackWeightMatrixForGConv(PackWeightMatrixForGConv&&) = delete;
|
|
|
PackWeightMatrixForGConv& operator==(PackWeightMatrixForGConv&&) =
|
|
|
delete;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PackWeightMatrixForGConv(
|
|
|
matrix_op_t trans,
|
|
|
const conv_param_t<SPATIAL_DIM>& conv_param,
|
|
|
const inpType* sdata,
|
|
|
inpType* pdata = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int numOfGroupsTogether(const conv_param_t<SPATIAL_DIM>& conv_param);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void unpack(T* origin_buf);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inpType* getBuf() {
|
|
|
return pdata_;
|
|
|
}
|
|
|
|
|
|
~PackWeightMatrixForGConv() {
|
|
|
if (bufAllocatedHere_) {
|
|
|
fbgemmAlignedFree(pdata_);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
private:
|
|
|
matrix_op_t trans_;
|
|
|
const conv_param_t<SPATIAL_DIM> conv_param_;
|
|
|
const T* sdata_;
|
|
|
T* pdata_;
|
|
|
bool bufAllocatedHere_{false};
|
|
|
|
|
|
int GTogether_;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack_unpack_(const T* src, T* dst, bool ispack);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int unpacked_index_(int t, int r, int s, int k, int g, int c, bool tr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int packed_index_(int t, int r, int s, int k, int g, int c);
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
int SPATIAL_DIM = 2,
|
|
|
typename T = std::int8_t,
|
|
|
typename accT = std::int32_t>
|
|
|
class FBGEMM_API PackWeightsForConv {
|
|
|
public:
|
|
|
using This = PackWeightsForConv<SPATIAL_DIM, T, accT>;
|
|
|
using inpType = T;
|
|
|
using accType = accT;
|
|
|
|
|
|
PackWeightsForConv() = delete;
|
|
|
|
|
|
PackWeightsForConv(
|
|
|
const conv_param_t<SPATIAL_DIM>& conv_param,
|
|
|
const inpType* sdata,
|
|
|
const BlockingFactors* blocking_params = nullptr);
|
|
|
|
|
|
std::shared_ptr<PackBMatrix<T, accT>> getPackedWForIm2col() {
|
|
|
return W_im2col_packed_;
|
|
|
}
|
|
|
|
|
|
std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWForDepthwise() {
|
|
|
return W_dw_packed_;
|
|
|
}
|
|
|
|
|
|
std::shared_ptr<PackedDirectConvMatrix> getPackedWForDirectconv() {
|
|
|
return W_dc_packed_;
|
|
|
}
|
|
|
|
|
|
std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
|
|
|
getPackedWForGroupwise() {
|
|
|
return W_gconv_packed_;
|
|
|
}
|
|
|
|
|
|
std::shared_ptr<PackBMatrix<T, accT>> getPackedWForPointwise() {
|
|
|
return W_pointwise_packed_;
|
|
|
}
|
|
|
|
|
|
int inputChannels() {
|
|
|
return conv_param_.IC;
|
|
|
}
|
|
|
|
|
|
int outputChannels() {
|
|
|
return conv_param_.OC;
|
|
|
}
|
|
|
|
|
|
std::array<int, SPATIAL_DIM> kernelDims() {
|
|
|
return conv_param_.K;
|
|
|
}
|
|
|
|
|
|
int groups() {
|
|
|
return conv_param_.G;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isPackingCompliant(const conv_param_t<SPATIAL_DIM>& conv_p);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::string mismatchingParams(const conv_param_t<SPATIAL_DIM>& conv_p);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void unpack(T* origin_buf);
|
|
|
|
|
|
private:
|
|
|
const conv_param_t<SPATIAL_DIM> conv_param_;
|
|
|
|
|
|
std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
|
|
|
|
|
|
std::shared_ptr<PackedDepthWiseConvMatrix> W_dw_packed_;
|
|
|
|
|
|
std::shared_ptr<PackedDirectConvMatrix> W_dc_packed_;
|
|
|
|
|
|
|
|
|
std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
|
|
|
W_gconv_packed_;
|
|
|
|
|
|
std::shared_ptr<PackBMatrix<T, accT>> W_pointwise_packed_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
|
|
|
class FBGEMM_API PackAWithIm2Col
|
|
|
: public PackMatrix<PackAWithIm2Col<T, accT, SPATIAL_DIM>, T, accT> {
|
|
|
public:
|
|
|
using This = PackAWithIm2Col<T, accT, SPATIAL_DIM>;
|
|
|
using BaseType = PackMatrix<This, T, accT>;
|
|
|
using inpType = T;
|
|
|
using accType = accT;
|
|
|
|
|
|
PackAWithIm2Col() = delete;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PackAWithIm2Col(
|
|
|
const conv_param_t<SPATIAL_DIM>& conv_param,
|
|
|
const T* sdata,
|
|
|
inpType* pmat = nullptr,
|
|
|
std::int32_t a_zero_pt = 0,
|
|
|
std::int32_t* row_offset = nullptr,
|
|
|
bool b_symmetric = false,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isPrePacked() const {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static constexpr bool isA() {
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack(const block_type_t& block);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t* getRowOffsetBuffer() const {
|
|
|
return row_offset_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void printPackedMatrix(std::string name);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);
|
|
|
|
|
|
~PackAWithIm2Col() {
|
|
|
if (rowOffsetAllocatedHere) {
|
|
|
fbgemmAlignedFree(row_offset_);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
private:
|
|
|
const conv_param_t<SPATIAL_DIM> conv_p_;
|
|
|
const T* sdata_;
|
|
|
std::int32_t a_zero_pt_;
|
|
|
std::int32_t* row_offset_{nullptr};
|
|
|
bool rowOffsetAllocatedHere{false};
|
|
|
std::int32_t row_interleave_B_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename accT = std::int32_t>
|
|
|
class FBGEMM_API PackAWithRowOffset final
|
|
|
: public PackMatrix<PackAWithRowOffset<T, accT>, T, accT> {
|
|
|
public:
|
|
|
using This = PackAWithRowOffset<T, accT>;
|
|
|
using BaseType = PackMatrix<This, T, accT>;
|
|
|
using inpType = T;
|
|
|
using accType = accT;
|
|
|
|
|
|
PackAWithRowOffset() = delete;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PackAWithRowOffset(
|
|
|
matrix_op_t trans,
|
|
|
std::uint32_t nRow,
|
|
|
std::uint32_t nCol,
|
|
|
const T* smat,
|
|
|
std::uint32_t ld,
|
|
|
inpType* pmat = nullptr,
|
|
|
int groups = 1,
|
|
|
std::int32_t* row_offset = nullptr,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isPrePacked() const {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static constexpr bool isA() {
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t addr(std::int32_t i, std::int32_t j) const;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack(const block_type_t& block);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t* getRowOffsetBuffer() const {
|
|
|
return row_offset_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void printPackedMatrix(std::string name);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);
|
|
|
|
|
|
~PackAWithRowOffset() {
|
|
|
if (rowOffsetAllocatedHere) {
|
|
|
fbgemmAlignedFree(row_offset_);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
private:
|
|
|
matrix_op_t trans_;
|
|
|
const T* smat_;
|
|
|
std::uint32_t ld_;
|
|
|
std::int32_t* row_offset_{nullptr};
|
|
|
bool rowOffsetAllocatedHere{false};
|
|
|
std::int32_t row_interleave_B_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename accT = std::int32_t>
|
|
|
class FBGEMM_API PackAWithQuantRowOffset final
|
|
|
: public PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT> {
|
|
|
public:
|
|
|
using This = PackAWithQuantRowOffset<T, accT>;
|
|
|
using BaseType = PackMatrix<This, T, accT>;
|
|
|
using inpType = T;
|
|
|
using accType = accT;
|
|
|
|
|
|
PackAWithQuantRowOffset() = delete;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PackAWithQuantRowOffset(
|
|
|
matrix_op_t trans,
|
|
|
std::int32_t nRow,
|
|
|
std::int32_t nCol,
|
|
|
const float* smat,
|
|
|
std::int32_t ld,
|
|
|
inpType* pmat = nullptr,
|
|
|
float scale = 1.0f,
|
|
|
std::int32_t zero_pt = 0,
|
|
|
int groups = 1,
|
|
|
std::int32_t* row_offset = nullptr,
|
|
|
const BlockingFactors* params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isPrePacked() const {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static constexpr bool isA() {
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t addr(std::int32_t i, std::int32_t j) const;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack(const block_type_t& block);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::int32_t* getRowOffsetBuffer() const {
|
|
|
return row_offset_;
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void printPackedMatrix(std::string name);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);
|
|
|
|
|
|
~PackAWithQuantRowOffset() {
|
|
|
if (rowOffsetAllocatedHere) {
|
|
|
fbgemmAlignedFree(row_offset_);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
private:
|
|
|
matrix_op_t trans_;
|
|
|
const float* smat_;
|
|
|
std::int32_t ld_;
|
|
|
float scale_;
|
|
|
std::int32_t zero_pt_;
|
|
|
std::int32_t* row_offset_{nullptr};
|
|
|
bool rowOffsetAllocatedHere{false};
|
|
|
std::int32_t row_interleave_B_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename outT = std::uint8_t, typename inT = std::uint8_t>
|
|
|
class FBGEMM_API DoNothing {
|
|
|
public:
|
|
|
using outType = outT;
|
|
|
using inpType = inT;
|
|
|
DoNothing() {}
|
|
|
template <inst_set_t instSet>
|
|
|
int f(
|
|
|
outType* ,
|
|
|
inpType* ,
|
|
|
const block_type_t& ,
|
|
|
int ,
|
|
|
int ) const {
|
|
|
return 0;
|
|
|
}
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename outT = std::int32_t,
|
|
|
typename inT = std::int32_t,
|
|
|
typename nextOPType = DoNothing<outT, outT>>
|
|
|
class FBGEMM_API memCopy {
|
|
|
public:
|
|
|
using outType = outT;
|
|
|
using inpType = inT;
|
|
|
explicit memCopy(nextOPType& nextop) : nextop_(nextop) {}
|
|
|
template <inst_set_t instSet>
|
|
|
inline int f(
|
|
|
outType* out,
|
|
|
inpType* inp,
|
|
|
const block_type_t& block,
|
|
|
int ld_out,
|
|
|
int ld_in) const;
|
|
|
|
|
|
private:
|
|
|
nextOPType& nextop_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename outT = std::int32_t,
|
|
|
typename inT = std::int32_t,
|
|
|
typename nextOPType = DoNothing<outT, outT>>
|
|
|
class ScaleOP {
|
|
|
public:
|
|
|
using outType = outT;
|
|
|
using inpType = inT;
|
|
|
explicit ScaleOP(inpType scalingFactor) : scalingFactor_(scalingFactor) {}
|
|
|
|
|
|
template <inst_set_t instSet>
|
|
|
inline int f(
|
|
|
outType* out,
|
|
|
inpType* inp,
|
|
|
const block_type_t& block,
|
|
|
int ld_out,
|
|
|
int ld_in) const;
|
|
|
|
|
|
private:
|
|
|
inpType scalingFactor_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename outT = std::int32_t,
|
|
|
typename inT = std::int32_t,
|
|
|
typename nextOPType = DoNothing<outT, outT>>
|
|
|
class ReluOutput {
|
|
|
public:
|
|
|
using outType = outT;
|
|
|
using inpType = inT;
|
|
|
explicit ReluOutput(inpType zero_pt) : zero_pt_(zero_pt) {}
|
|
|
|
|
|
template <inst_set_t instSet>
|
|
|
inline int f(
|
|
|
outType* out,
|
|
|
inpType* inp,
|
|
|
const block_type_t& block,
|
|
|
int ld_out,
|
|
|
int ld_in) const;
|
|
|
|
|
|
private:
|
|
|
inpType zero_pt_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename outT = std::int32_t,
|
|
|
typename inT = std::int32_t,
|
|
|
typename nextOPType = DoNothing<inT, inT>>
|
|
|
class FBGEMM_API DoSpmdmOnInpBuffer {
|
|
|
public:
|
|
|
using outType = outT;
|
|
|
using inpType = inT;
|
|
|
DoSpmdmOnInpBuffer(
|
|
|
nextOPType& nextop,
|
|
|
const std::uint8_t* A,
|
|
|
int lda,
|
|
|
const CompressedSparseColumn& B_csc,
|
|
|
int groups = 1)
|
|
|
: nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc), groups_(groups) {}
|
|
|
|
|
|
template <inst_set_t instSet>
|
|
|
inline int f(
|
|
|
outT* out,
|
|
|
inT* inp,
|
|
|
const block_type_t& block,
|
|
|
int ld_out,
|
|
|
int ld_in) const;
|
|
|
|
|
|
private:
|
|
|
nextOPType& nextop_;
|
|
|
const std::uint8_t* A_;
|
|
|
const int lda_;
|
|
|
const CompressedSparseColumn& B_csc_;
|
|
|
const int groups_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename outT = std::int32_t,
|
|
|
typename inT = std::int32_t,
|
|
|
typename nextOPType = DoNothing<inT, inT>>
|
|
|
class FBGEMM_API DoSConvOnInpBuffer {
|
|
|
public:
|
|
|
using outType = outT;
|
|
|
using inpType = inT;
|
|
|
DoSConvOnInpBuffer(
|
|
|
nextOPType& nextop,
|
|
|
const std::uint8_t* A,
|
|
|
const conv_param_t<>& conv_p,
|
|
|
std::int32_t A_zero_point,
|
|
|
const CompressedSparseColumn& B_csc)
|
|
|
: nextop_(nextop),
|
|
|
A_(A),
|
|
|
conv_p_(conv_p),
|
|
|
A_zero_point_(A_zero_point),
|
|
|
B_csc_(B_csc) {}
|
|
|
|
|
|
template <inst_set_t instSet>
|
|
|
inline int f(
|
|
|
outT* out,
|
|
|
inT* inp,
|
|
|
const block_type_t& block,
|
|
|
int ld_out,
|
|
|
int ld_in) const;
|
|
|
|
|
|
private:
|
|
|
nextOPType& nextop_;
|
|
|
const std::uint8_t* A_;
|
|
|
const conv_param_t<> conv_p_;
|
|
|
const std::int32_t A_zero_point_;
|
|
|
const CompressedSparseColumn& B_csc_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
bool FUSE_RELU,
|
|
|
QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
|
|
|
typename BIAS_TYPE = std::int32_t,
|
|
|
typename outT = std::uint8_t,
|
|
|
typename inT = std::int32_t,
|
|
|
typename nextOPType = DoNothing<outT, outT>>
|
|
|
class FBGEMM_API ReQuantizeOutput {
|
|
|
public:
|
|
|
static constexpr int RELU_FUSED = FUSE_RELU;
|
|
|
static constexpr QuantizationGranularity QGRANType = Q_GRAN;
|
|
|
using BIAS_T = BIAS_TYPE;
|
|
|
using outType = outT;
|
|
|
using inpType = inT;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ReQuantizeOutput(
|
|
|
nextOPType& nextop,
|
|
|
const float* C_multiplier,
|
|
|
std::int32_t C_zero_point,
|
|
|
std::int32_t Aq_zero_point,
|
|
|
const std::int32_t* Bq_zero_point,
|
|
|
const std::int32_t* row_offsets,
|
|
|
const std::int32_t* col_offsets,
|
|
|
const BIAS_T* bias,
|
|
|
std::uint32_t nCol,
|
|
|
int groups = 1,
|
|
|
const float* act_times_w_scale = nullptr)
|
|
|
: nextop_(nextop),
|
|
|
C_multiplier_(C_multiplier),
|
|
|
C_zero_point_(C_zero_point),
|
|
|
Aq_zero_point_(Aq_zero_point),
|
|
|
Bq_zero_point_(Bq_zero_point),
|
|
|
q_row_offsets_(row_offsets),
|
|
|
q_col_offsets_(col_offsets),
|
|
|
bias_(bias),
|
|
|
ncols_(nCol),
|
|
|
groups_(groups),
|
|
|
act_times_w_scale_(act_times_w_scale) {}
|
|
|
|
|
|
template <inst_set_t instSet>
|
|
|
inline int f(
|
|
|
outT* out,
|
|
|
const inT* inp,
|
|
|
const block_type_t& block,
|
|
|
int ld_out,
|
|
|
int ld_in) const;
|
|
|
|
|
|
const float* getCMultiplier() const {
|
|
|
return C_multiplier_;
|
|
|
}
|
|
|
std::int32_t getAZeroPoint() const {
|
|
|
return Aq_zero_point_;
|
|
|
}
|
|
|
std::int32_t getCZeroPoint() const {
|
|
|
return C_zero_point_;
|
|
|
}
|
|
|
const std::int32_t* getBZeroPoint() const {
|
|
|
return Bq_zero_point_;
|
|
|
}
|
|
|
const std::int32_t* getRowOffsets() const {
|
|
|
return q_row_offsets_;
|
|
|
}
|
|
|
const std::int32_t* getColOffsets() const {
|
|
|
return q_col_offsets_;
|
|
|
}
|
|
|
const BIAS_T* getBias() const {
|
|
|
return bias_;
|
|
|
}
|
|
|
std::uint32_t getNCols() const {
|
|
|
return ncols_;
|
|
|
}
|
|
|
const float* getActWScale() const {
|
|
|
return act_times_w_scale_;
|
|
|
}
|
|
|
|
|
|
void setRowOffsets(const std::int32_t* row_offsets) {
|
|
|
q_row_offsets_ = row_offsets;
|
|
|
}
|
|
|
|
|
|
private:
|
|
|
nextOPType& nextop_;
|
|
|
const float* C_multiplier_;
|
|
|
std::int32_t C_zero_point_;
|
|
|
std::int32_t Aq_zero_point_;
|
|
|
const std::int32_t* Bq_zero_point_;
|
|
|
const std::int32_t* q_row_offsets_;
|
|
|
const std::int32_t* q_col_offsets_;
|
|
|
const BIAS_T* bias_;
|
|
|
std::uint32_t ncols_;
|
|
|
int groups_;
|
|
|
const float* act_times_w_scale_;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
bool FUSE_RELU,
|
|
|
QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
|
|
|
typename outT = float,
|
|
|
typename inT = std::int32_t,
|
|
|
typename nextOPType = DoNothing<outT, outT>>
|
|
|
class FBGEMM_API ReQuantizeForFloat {
|
|
|
public:
|
|
|
using outType = outT;
|
|
|
using inpType = inT;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ReQuantizeForFloat(
|
|
|
nextOPType& nextop,
|
|
|
float Aq_scale,
|
|
|
const float* Bq_scale,
|
|
|
std::int32_t Aq_zero_point,
|
|
|
const std::int32_t* Bq_zero_point,
|
|
|
const std::int32_t* row_offsets,
|
|
|
const std::int32_t* col_offsets,
|
|
|
const float* bias,
|
|
|
std::uint32_t nCol,
|
|
|
int groups = 1)
|
|
|
: nextop_(nextop),
|
|
|
Aq_scale_(Aq_scale),
|
|
|
Bq_scale_(Bq_scale),
|
|
|
Aq_zero_point_(Aq_zero_point),
|
|
|
Bq_zero_point_(Bq_zero_point),
|
|
|
q_row_offsets_(row_offsets),
|
|
|
q_col_offsets_(col_offsets),
|
|
|
bias_(bias),
|
|
|
ncols_(nCol),
|
|
|
groups_(groups) {}
|
|
|
|
|
|
template <inst_set_t instSet>
|
|
|
inline int f(
|
|
|
outT* out,
|
|
|
inT* inp,
|
|
|
const block_type_t& block,
|
|
|
int ld_out,
|
|
|
int ld_in) const;
|
|
|
|
|
|
private:
|
|
|
nextOPType& nextop_;
|
|
|
float Aq_scale_;
|
|
|
const float* Bq_scale_;
|
|
|
std::int32_t Aq_zero_point_;
|
|
|
const std::int32_t* Bq_zero_point_;
|
|
|
const std::int32_t* q_row_offsets_;
|
|
|
const std::int32_t* q_col_offsets_;
|
|
|
const float* bias_;
|
|
|
std::uint32_t ncols_;
|
|
|
int groups_;
|
|
|
};
|
|
|
|
|
|
|
|
|
#include "./OutputProcessing-inl.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename packingAMatrix,
|
|
|
typename packingBMatrix,
|
|
|
typename cT,
|
|
|
typename processOutputType>
|
|
|
FBGEMM_API void fbgemmPacked(
|
|
|
PackMatrix<
|
|
|
packingAMatrix,
|
|
|
typename packingAMatrix::inpType,
|
|
|
typename packingAMatrix::accType>& packA,
|
|
|
PackMatrix<
|
|
|
packingBMatrix,
|
|
|
typename packingBMatrix::inpType,
|
|
|
typename packingBMatrix::accType>& packB,
|
|
|
cT* C,
|
|
|
std::int32_t* C_buffer,
|
|
|
std::uint32_t ldc,
|
|
|
const processOutputType& outProcess,
|
|
|
int thread_id,
|
|
|
int num_threads,
|
|
|
const BlockingFactors* blocking_params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename packed_W,
|
|
|
typename outType,
|
|
|
bool FUSE_RELU,
|
|
|
QuantizationGranularity Q_GRAN,
|
|
|
int SPATIAL_DIM = 2,
|
|
|
typename BIAS_TYPE = std::int32_t>
|
|
|
FBGEMM_API void fbgemmGroupwiseConv(
|
|
|
const conv_param_t<SPATIAL_DIM>& conv_param,
|
|
|
const std::uint8_t* activations,
|
|
|
std::int32_t a_zero_point,
|
|
|
std::int32_t* rowOffsetBuf,
|
|
|
packed_W& packed_weights,
|
|
|
outType* out,
|
|
|
std::int32_t* outBuffer,
|
|
|
const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
|
|
|
int thread_id,
|
|
|
int num_threads);
|
|
|
|
|
|
template <
|
|
|
int SPATIAL_DIM,
|
|
|
QuantizationGranularity Q_GRAN,
|
|
|
bool FUSE_RELU,
|
|
|
typename BIAS_TYPE = std::int32_t>
|
|
|
FBGEMM_API void fbgemmDirectConv(
|
|
|
const conv_param_t<SPATIAL_DIM>& conv_p,
|
|
|
const uint8_t* Aint8,
|
|
|
PackedDirectConvMatrix& Bint8_tr,
|
|
|
uint8_t* C,
|
|
|
int32_t* C_buffer,
|
|
|
const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
|
|
|
const BIAS_TYPE* bias,
|
|
|
int thread_id,
|
|
|
int num_threads);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <int SPATIAL_DIM = 2>
|
|
|
FBGEMM_API int rowOffsetBufferSizeGConv(
|
|
|
const conv_param_t<SPATIAL_DIM>& conv_param);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
|
|
|
bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <int SPATIAL_DIM>
|
|
|
FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <int SPATIAL_DIM>
|
|
|
FBGEMM_API bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Declaration only; the definition is not in this header.
 *
 * NOTE(review): name suggests it reports whether the running CPU is
 * supported by the library -- confirm the exact criteria at the definition.
 */
FBGEMM_API bool fbgemmSupportedCPU();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <
|
|
|
typename processOutputType,
|
|
|
int SPATIAL_DIM = 2,
|
|
|
typename ACC_T = std::int32_t>
|
|
|
FBGEMM_API int fbgemmConv(
|
|
|
const conv_param_t<SPATIAL_DIM>& conv_p,
|
|
|
const std::uint8_t* activations,
|
|
|
PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights,
|
|
|
typename processOutputType::outType* out,
|
|
|
std::int32_t* outBuffer,
|
|
|
processOutputType& outProcess,
|
|
|
int thread_id,
|
|
|
int num_threads,
|
|
|
const BlockingFactors* blocking_params = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
|
|
|
FBGEMM_API optimized_conv_t
|
|
|
ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
|
|
|
}
|
|
|
|