|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
#include "./FbgemmBuild.h"
|
|
|
#include "./QuantUtilsAvx2.h"
|
|
|
#include "./QuantUtilsNeon.h"
|
|
|
#include "./Types.h"
|
|
|
#include "./Utils.h"
|
|
|
|
|
|
#include <algorithm>
|
|
|
#include <cassert>
|
|
|
#include <cmath>
|
|
|
#include <cstdint>
|
|
|
#include <limits>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace fbgemm {
|
|
|
|
|
|
/// Choose the quantization parameters (scale, zero point) that map the real
/// range [min, max] onto the integer range [qmin, qmax].
///
/// @param min lower bound of the observed real-valued range.
/// @param max upper bound of the observed real-valued range.
/// @param qmin smallest representable quantized value.
/// @param qmax largest representable quantized value.
/// @param preserve_sparsity if true, presumably keeps real 0 exactly
///        representable so zero stays zero after quantization — confirm
///        against the implementation (only the declaration is visible here).
/// @param force_scale_power_of_two if true, presumably constrains the chosen
///        scale to a power of two — confirm against the implementation.
FBGEMM_API TensorQuantizationParams ChooseQuantizationParams(
    float min,
    float max,
    std::int32_t qmin,
    std::int32_t qmax,
    bool preserve_sparsity = false,
    bool force_scale_power_of_two = false);

/// Decompose a real-valued requantization multiplier into a fixed-point
/// integer multiplier and a right shift, i.e. find quantized_multiplier and
/// right_shift such that
///   real_multiplier ~= quantized_multiplier / 2^right_shift
/// at the requested precision.
///
/// @param real_multiplier the real multiplier to approximate.
/// @param[out] quantized_multiplier receives the fixed-point multiplier.
/// @param[out] right_shift receives the corresponding shift amount.
/// @param requantization_multiplier_precision bit precision of the
///        fixed-point approximation (default 32).
FBGEMM_API void ChooseRequantizationMultiplier(
    float real_multiplier,
    std::int32_t* quantized_multiplier,
    int* right_shift,
    int requantization_multiplier_precision = 32);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Clamp `src` to the representable range of a `precision`-bit integer —
/// [0, 2^precision - 1] when unsigned, [-2^(precision-1), 2^(precision-1)-1]
/// when signed — and narrow the result to T2.
///
/// @tparam T1 arithmetic type of the input value.
/// @tparam T2 integral result type (defaults to std::uint8_t).
/// @param src value to clamp.
/// @param precision target bit width of the quantized domain.
/// @param is_signed whether the target domain is signed.
///
/// NOTE(review): the bounds are computed with 64-bit shifts but stored in
/// std::int32_t, so they wrap for precision >= 32 (unsigned) or > 32
/// (signed). The NO_SANITIZE annotation indicates this overflow is known and
/// tolerated; the asserts below bound valid usage — confirm callers never
/// exceed int32-representable precisions before widening use.
template <typename T1, typename T2 = std::uint8_t>
NO_SANITIZE("signed-integer-overflow")
T2 clamp(T1 src, int precision, bool is_signed = false) {
  std::int32_t min = is_signed ? -(1LL << (precision - 1)) : 0;
  std::int32_t max =
      is_signed ? ((1LL << (precision - 1)) - 1) : (1LL << precision) - 1;

  // Sanity-check that both the input type and the result type can represent
  // the clamp bounds, so the narrowing conversion on return cannot lose data.
  assert(min >= std::numeric_limits<T1>::lowest());
  assert(min >= std::numeric_limits<T2>::lowest());
  assert(max <= std::numeric_limits<T1>::max());
  assert(max <= std::numeric_limits<T2>::max());

  // Clamp in T1's domain; the implicit conversion to T2 is safe per the
  // asserts above.
  return std::min<T1>(std::max<T1>(src, min), max);
}
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, bool LEGACY = true>
|
|
|
T Quantize(
|
|
|
float src,
|
|
|
std::int32_t zero_point,
|
|
|
float scale,
|
|
|
int result_precision,
|
|
|
bool result_is_signed = std::is_signed<T>::value) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
float inv_scale = 1.0f / scale;
|
|
|
|
|
|
float transformed_val = src * inv_scale;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (LEGACY) {
|
|
|
transformed_val = std::nearbyint(zero_point + transformed_val);
|
|
|
} else {
|
|
|
transformed_val = zero_point + std::nearbyint(transformed_val);
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return clamp<double, T>(transformed_val, result_precision, result_is_signed);
|
|
|
}
|
|
|
|
|
|
template <typename T, bool LEGACY = true>
|
|
|
T Quantize(float src, const TensorQuantizationParams& qparams) {
|
|
|
return Quantize<T, LEGACY>(
|
|
|
src, qparams.zero_point, qparams.scale, qparams.precision);
|
|
|
}
|
|
|
|
|
|
/// Array form of Quantize: quantize `len` floats from `src` into `dst`.
/// May be called from multiple threads; `thread_id` / `num_threads`
/// presumably partition the work across callers (as the inline Dequantize
/// below does) — confirm against the implementation, which is not visible
/// in this header.
template <typename T, bool LEGACY = true>
FBGEMM_API void Quantize(
    const float* src,
    T* dst,
    std::int64_t len,
    const TensorQuantizationParams& qparams,
    int thread_id = 0,
    int num_threads = 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Groupwise quantization of a K x C x X tensor whose memory layout is given
/// by LAYOUT (KCX or similar; see layout_t). Channels are divided into G
/// groups, each quantized with its own scale / zero point from `scales` /
/// `zero_points`. The exact length and indexing of `scales`/`zero_points`
/// is defined by the implementation — TODO confirm before relying on it.
///
/// @param src input float tensor.
/// @param K outer dimension (e.g. output channels).
/// @param C channel dimension being grouped.
/// @param X remaining (spatial) dimension.
/// @param G number of groups C is split into.
/// @param scales per-group scales.
/// @param zero_points per-group zero points.
/// @param dst output quantized tensor, same logical shape as `src`.
template <typename T, layout_t LAYOUT = layout_t::KCX>
FBGEMM_API void QuantizeGroupwise(
    const float* src,
    int K,
    int C,
    int X,
    int G,
    const float* scales,
    const std::int32_t* zero_points,
    T* dst);
|
|
|
|
|
|
template <typename T>
|
|
|
float Dequantize(T src, const TensorQuantizationParams& qparams) {
|
|
|
return qparams.scale * (src - qparams.zero_point);
|
|
|
}
|
|
|
|
|
|
template <typename T>
|
|
|
void Dequantize(
|
|
|
const T* src,
|
|
|
float* dst,
|
|
|
std::int64_t len,
|
|
|
const TensorQuantizationParams& qparams,
|
|
|
int thread_id = 0,
|
|
|
int num_threads = 1) {
|
|
|
int64_t i_begin, i_end;
|
|
|
fbgemmPartition1D(thread_id, num_threads, len, i_begin, i_end);
|
|
|
for (int64_t i = i_begin; i < i_end; i++) {
|
|
|
dst[i] = Dequantize(src[i], qparams);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
template <typename T>
|
|
|
float FusedQuantizeDequantize(
|
|
|
float src,
|
|
|
const TensorQuantizationParams& qparams) {
|
|
|
T q = Quantize<T, false>(
|
|
|
src, qparams.zero_point, qparams.scale, qparams.precision);
|
|
|
return Dequantize<T>(q, qparams);
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Array form of FusedQuantizeDequantize: round-trip `len` floats from `src`
/// through quantize+dequantize into `dst`. `thread_id` / `num_threads`
/// presumably partition the work as in the inline array Dequantize above —
/// confirm against the implementation. The role of `noise_ratio` is defined
/// by the implementation, which is not visible here — TODO confirm.
template <typename T>
FBGEMM_API void FusedQuantizeDequantize(
    const float* src,
    float* dst,
    std::int64_t len,
    const TensorQuantizationParams& qparams,
    int thread_id = 0,
    int num_threads = 1,
    float noise_ratio = 0.0f);

/// Fixed-point multiply used by requantization: computes a rounded, saturated
/// (a * b) >> right_shift, per the implementation (declaration only here).
/// Used together with the (multiplier, right_shift) pair produced by
/// ChooseRequantizationMultiplier.
FBGEMM_API std::int64_t
SaturatingRoundingMulWithShift(std::int32_t a, std::int32_t b, int right_shift);
|
|
|
|
|
|
template <typename T>
|
|
|
T Requantize(
|
|
|
std::int32_t src,
|
|
|
std::int32_t zero_point,
|
|
|
std::int32_t multiplier,
|
|
|
int right_shift,
|
|
|
int result_precision,
|
|
|
bool result_is_signed = false) {
|
|
|
std::int64_t quantized_down =
|
|
|
zero_point + SaturatingRoundingMulWithShift(src, multiplier, right_shift);
|
|
|
return clamp<std::int64_t, T>(
|
|
|
quantized_down, result_precision, result_is_signed);
|
|
|
}
|
|
|
|
|
|
template <typename T>
|
|
|
T RequantizeFixedPoint(
|
|
|
std::int32_t src,
|
|
|
const RequantizationParams& params) {
|
|
|
return Requantize<T>(
|
|
|
src,
|
|
|
params.target_qparams.zero_point,
|
|
|
params.multiplier,
|
|
|
params.right_shift,
|
|
|
params.target_qparams.precision);
|
|
|
}
|
|
|
|
|
|
/// Array form of RequantizeFixedPoint: requantize `len` int32 values from
/// `src` into `dst` using the fixed-point parameters in `params`.
/// `thread_id` / `num_threads` presumably partition the work as in the
/// inline array Dequantize above — confirm against the implementation.
template <typename T>
FBGEMM_API void RequantizeFixedPoint(
    const std::int32_t* src,
    T* dst,
    std::int64_t len,
    const RequantizationParams& params,
    int thread_id = 0,
    int num_threads = 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
T Requantize(
|
|
|
std::int32_t src,
|
|
|
std::int32_t zero_point,
|
|
|
float multiplier,
|
|
|
int result_precision,
|
|
|
bool result_is_signed = false) {
|
|
|
long quantized_down = zero_point + std::lrintf(src * multiplier);
|
|
|
return clamp<long, T>(quantized_down, result_precision, result_is_signed);
|
|
|
}
|
|
|
|
|
|
template <typename T>
|
|
|
T Requantize(
|
|
|
std::int32_t src,
|
|
|
const RequantizationParams& params) {
|
|
|
return Requantize<T>(
|
|
|
src,
|
|
|
params.target_qparams.zero_point,
|
|
|
params.real_multiplier,
|
|
|
params.target_qparams.precision);
|
|
|
}
|
|
|
|
|
|
/// Array form of the floating-point Requantize: requantize `len` int32
/// values from `src` into `dst` using `params`. `thread_id` / `num_threads`
/// presumably partition the work as in the inline array Dequantize above —
/// confirm against the implementation.
template <typename T>
FBGEMM_API void Requantize(
    const std::int32_t* src,
    T* dst,
    std::int64_t len,
    const RequantizationParams& params,
    int thread_id = 0,
    int num_threads = 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Rowwise-quantize a matrix of float/half values to `bit_rate` bits per
/// element. The "SBHalf" naming and the `scale_bias_last` parameter on the
/// inverse below indicate each output row fuses a half-precision scale and
/// bias with the quantized payload — exact row layout is defined by the
/// implementation (declaration only here).
template <typename InputType>
FBGEMM_API void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf(
    int bit_rate,
    const InputType* input,
    size_t input_rows,
    int input_columns,
    std::uint8_t* output);

/// Inverse of the above: expand n-bit rowwise-quantized rows (with fused
/// half-precision scale/bias) back to float or half. `scale_bias_last`
/// selects whether the scale/bias pair sits after (true) or before (false)
/// the quantized payload within each row.
template <typename OutputType>
FBGEMM_API void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf(
    int bit_rate,
    const uint8_t* input,
    size_t input_rows,
    int input_columns,
    OutputType* output,
    bool scale_bias_last = true);

/// Rowwise 8-bit quantization of float/half input; per the "SBFloat" naming,
/// each row fuses single-precision scale/bias with the uint8 payload —
/// confirm the exact layout against the implementation.
template <typename InputType>
FBGEMM_API void FloatOrHalfToFused8BitRowwiseQuantizedSBFloat(
    const InputType* input,
    size_t input_rows,
    int input_columns,
    std::uint8_t* output);

/// Inverse of the above: expand fused 8-bit rowwise-quantized rows (float
/// scale/bias) back to float or half.
template <typename OutputType>
FBGEMM_API void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf(
    const uint8_t* input,
    size_t input_rows,
    int input_columns,
    OutputType* output);

/// Reference-path counterpart of
/// FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf (the "Ref" suffix
/// presumably marks the non-vectorized implementation — confirm).
template <typename InputType>
FBGEMM_API void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef(
    int bit_rate,
    const InputType* input,
    size_t input_rows,
    int input_columns,
    std::uint8_t* output);

/// Reference-path counterpart of
/// FloatOrHalfToFused8BitRowwiseQuantizedSBFloat.
template <typename InputType>
FBGEMM_API void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef(
    const InputType* input,
    size_t input_rows,
    int input_columns,
    std::uint8_t* output);

/// Reference-path counterpart of
/// FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf. When
/// `is_uint16_t_of_type_bf16` is true, a uint16_t OutputType is presumably
/// interpreted as bfloat16 storage rather than float16 — confirm against
/// the implementation.
template <typename OutputType, bool is_uint16_t_of_type_bf16 = false>
FBGEMM_API void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef(
    int bit_rate,
    const uint8_t* input,
    size_t input_rows,
    int input_columns,
    OutputType* output,
    bool scale_bias_last = true);

/// Reference-path counterpart of
/// Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf.
template <typename OutputType>
FBGEMM_API void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef(
    const uint8_t* input,
    size_t input_rows,
    int input_columns,
    OutputType* output);
|
|
|
|
|
|
}
|
|
|
|