|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
#include <cstdint>
|
|
|
#include <functional>
|
|
|
#include <memory>
|
|
|
#include <vector>
|
|
|
|
|
|
#include "fbgemm/FbgemmBuild.h"
|
|
|
#include "fbgemm/UtilsAvx2.h"
|
|
|
#include "fbgemm/spmmUtilsAvx2.h"
|
|
|
|
|
|
namespace fbgemm {
|
|
|
|
|
|
template <typename T>
|
|
|
struct FBGEMM_API CSRMatrix {
|
|
|
std::vector<int> rowPtr;
|
|
|
std::vector<int> colIdx;
|
|
|
std::vector<T> values;
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T = std::int8_t, int ROW_BLOCK = 1, int COL_BLOCK = 4>
|
|
|
struct FBGEMM_API BCSRMatrix {
|
|
|
using DTYPE = T;
|
|
|
static constexpr int RB = ROW_BLOCK;
|
|
|
static constexpr int CB = COL_BLOCK;
|
|
|
|
|
|
|
|
|
static constexpr int COLTILE = 4000;
|
|
|
std::vector<int> rowBPtr;
|
|
|
std::vector<int> colBIdx;
|
|
|
std::vector<DTYPE> values;
|
|
|
|
|
|
std::vector<int32_t> row_offsets;
|
|
|
int R;
|
|
|
int C;
|
|
|
|
|
|
BCSRMatrix(int Rows, int Cols) {
|
|
|
R = Rows;
|
|
|
C = Cols;
|
|
|
row_offsets.resize(R, 0);
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack(const DTYPE* src, size_t ld);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void pack(const DTYPE* src);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void unpack(DTYPE* dst, size_t ld);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void unpack(DTYPE* dst);
|
|
|
};
|
|
|
|
|
|
template <typename T>
|
|
|
FBGEMM_API std::unique_ptr<CSRMatrix<T>>
|
|
|
fbgemmDenseToCSR(int R, int C, const T* inp, int ld);
|
|
|
|
|
|
template <typename T>
|
|
|
FBGEMM_API std::unique_ptr<CSRMatrix<T>>
|
|
|
fbgemmDenseToCSR(int R, int C, const T* inp);
|
|
|
|
|
|
template <typename T = std::int8_t, int RB = 1, int CB = 4>
|
|
|
FBGEMM_API std::unique_ptr<BCSRMatrix<T, RB, CB>>
|
|
|
fbgemmDenseToBCSR(int R, int C, const T* inp, int ld);
|
|
|
|
|
|
template <typename T = std::int8_t, int RB = 1, int CB = 4>
|
|
|
FBGEMM_API std::unique_ptr<BCSRMatrix<T, RB, CB>>
|
|
|
fbgemmDenseToBCSR(int R, int C, const T* inp);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Public float entry point taking a sparse operand as three raw arrays
 *        and dense operands B and C.
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): the parameter names suggest C = A * B, where A is an M-row
 * CSR matrix given by row_ptr/col_idx/values, B has N columns with row stride
 * ldb, and C is M x N with row stride ldc; with @p accum set the product is
 * presumably added to the existing contents of C -- confirm against the
 * definition.
 *
 * @param M       Row-count style dimension.
 * @param N       Column-count style dimension.
 * @param row_ptr Row pointer array of the sparse operand (read-only).
 * @param col_idx Column index array of the sparse operand (read-only).
 * @param values  Float values of the sparse operand (read-only).
 * @param B       Dense float input (read-only).
 * @param ldb     Leading dimension associated with @p B.
 * @param C       Float output buffer.
 * @param ldc     Leading dimension associated with @p C.
 * @param accum   Accumulation flag; defaults to false.
 */
FBGEMM_API void SparseDenseMM(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);
|
|
|
|
|
|
template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
|
|
|
FBGEMM_API void fbgemmSparseDenseInt8MM(
|
|
|
int N,
|
|
|
const std::unique_ptr<BCSRMatrix<>>& bcsr,
|
|
|
const uint8_t* B,
|
|
|
int ldb,
|
|
|
int32_t* C_i32,
|
|
|
uint8_t* C_u8,
|
|
|
int ldc,
|
|
|
trRequantizationParams_t& rParams,
|
|
|
bool accum = false,
|
|
|
int thread_id = 0,
|
|
|
int num_threads = 1);
|
|
|
|
|
|
namespace internal {
|
|
|
|
|
|
/**
 * @brief Float sparse-times-dense routine with the same parameter list as the
 *        public SparseDenseMM.
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): by its name this is the AVX2 implementation that the public
 * entry point dispatches to -- confirm against the definition.
 *
 * @param M       Row-count style dimension.
 * @param N       Column-count style dimension.
 * @param row_ptr Row pointer array of the sparse operand (read-only).
 * @param col_idx Column index array of the sparse operand (read-only).
 * @param values  Float values of the sparse operand (read-only).
 * @param B       Dense float input (read-only).
 * @param ldb     Leading dimension associated with @p B.
 * @param C       Float output buffer.
 * @param ldc     Leading dimension associated with @p C.
 * @param accum   Accumulation flag; defaults to false.
 */
void SparseDenseMMAvx2(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);
|
|
|
|
|
|
/**
 * @brief Float sparse-times-dense routine with the same parameter list as the
 *        public SparseDenseMM.
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): by its name this is the AVX-512 implementation that the
 * public entry point dispatches to -- confirm against the definition.
 *
 * @param M       Row-count style dimension.
 * @param N       Column-count style dimension.
 * @param row_ptr Row pointer array of the sparse operand (read-only).
 * @param col_idx Column index array of the sparse operand (read-only).
 * @param values  Float values of the sparse operand (read-only).
 * @param B       Dense float input (read-only).
 * @param ldb     Leading dimension associated with @p B.
 * @param C       Float output buffer.
 * @param ldc     Leading dimension associated with @p C.
 * @param accum   Accumulation flag; defaults to false.
 */
void SparseDenseMMAvx512(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);
|
|
|
|
|
|
template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
|
|
|
void SparseDenseInt8MMAvx2(
|
|
|
int N,
|
|
|
const std::unique_ptr<BCSRMatrix<>>& bcsr,
|
|
|
const uint8_t* B,
|
|
|
int ldb,
|
|
|
int32_t* C_i32,
|
|
|
uint8_t* C_u8,
|
|
|
int ldc,
|
|
|
trRequantizationParams_t& rParams,
|
|
|
bool accum = false,
|
|
|
int thread_id = 0,
|
|
|
int num_threads = 1);
|
|
|
|
|
|
template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
|
|
|
void SparseDenseInt8MMAvx512(
|
|
|
int N,
|
|
|
const std::unique_ptr<BCSRMatrix<>>& bcsr,
|
|
|
const uint8_t* B,
|
|
|
int ldb,
|
|
|
int32_t* C_i32,
|
|
|
uint8_t* C_u8,
|
|
|
int ldc,
|
|
|
trRequantizationParams_t& rParams,
|
|
|
bool accum = false,
|
|
|
int thread_id = 0,
|
|
|
int num_threads = 1);
|
|
|
|
|
|
template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
|
|
|
void SparseDenseInt8MVAvx512(
|
|
|
const std::unique_ptr<BCSRMatrix<>>& bcsr,
|
|
|
const uint8_t* B,
|
|
|
int ldb,
|
|
|
int32_t* C_i32,
|
|
|
uint8_t* C_u8,
|
|
|
trRequantizationParams_t& rParams,
|
|
|
bool accum = false,
|
|
|
int thread_id = 0,
|
|
|
int num_threads = 1);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|