|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
|
|
#include "./FbgemmBuild.h"
#include "./QuantUtilsAvx2.h"
#include "./QuantUtilsNeon.h"
#include "./Types.h"
#include "./Utils.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <type_traits> // std::is_signed_v (used below) — previously only transitively included
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace fbgemm { |
|
|
|
|
|
/// Compute quantization parameters (scale, zero_point) mapping the real
/// interval [min, max] onto the integer interval [qmin, qmax].
///
/// @param min,max observed range of the real-valued data.
/// @param qmin,qmax target quantized range (e.g. 0..255 for uint8).
/// @param preserve_sparsity presumably forces real 0 to map exactly onto an
///        integer so zeros survive the round trip — confirm in the
///        implementation.
/// @param force_scale_power_of_two presumably rounds the scale to a power of
///        two — confirm in the implementation.
FBGEMM_API TensorQuantizationParams ChooseQuantizationParams(
    float min,
    float max,
    std::int32_t qmin,
    std::int32_t qmax,
    bool preserve_sparsity = false,
    bool force_scale_power_of_two = false);
|
|
|
|
|
/// Decompose a real-valued requantization factor into an integer multiplier
/// and a right shift so that
///   real_multiplier ~= quantized_multiplier * 2^(-right_shift),
/// for use with the fixed-point Requantize() path.
///
/// @param real_multiplier factor to approximate.
/// @param quantized_multiplier [out] fixed-point integer multiplier.
/// @param right_shift [out] right-shift amount to apply after multiplying.
/// @param requantization_multiplier_precision bits of precision used for the
///        fixed-point multiplier.
FBGEMM_API void ChooseRequantizationMultiplier(
    float real_multiplier,
    std::int32_t* quantized_multiplier,
    int* right_shift,
    int requantization_multiplier_precision = 32);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T1, typename T2 = std::uint8_t> |
|
|
NO_SANITIZE("signed-integer-overflow") |
|
|
T2 clamp(T1 src, int precision, bool is_signed = false) { |
|
|
std::int32_t min = is_signed ? -(1LL << (precision - 1)) : 0; |
|
|
std::int32_t max = |
|
|
is_signed ? ((1LL << (precision - 1)) - 1) : (1LL << precision) - 1; |
|
|
|
|
|
|
|
|
assert(min >= std::numeric_limits<T1>::lowest()); |
|
|
assert(min >= std::numeric_limits<T2>::lowest()); |
|
|
assert(max <= std::numeric_limits<T1>::max()); |
|
|
assert(max <= std::numeric_limits<T2>::max()); |
|
|
|
|
|
return std::min<T1>(std::max<T1>(src, min), max); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename T, bool LEGACY = true> |
|
|
T Quantize( |
|
|
float src, |
|
|
std::int32_t zero_point, |
|
|
float scale, |
|
|
int result_precision, |
|
|
bool result_is_signed = std::is_signed_v<T>) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
float inv_scale = 1.0f / scale; |
|
|
|
|
|
float transformed_val = src * inv_scale; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if constexpr (LEGACY) { |
|
|
transformed_val = std::nearbyint(zero_point + transformed_val); |
|
|
} else { |
|
|
transformed_val = zero_point + std::nearbyint(transformed_val); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return clamp<double, T>(transformed_val, result_precision, result_is_signed); |
|
|
} |
|
|
|
|
|
template <typename T, bool LEGACY = true> |
|
|
T Quantize(float src, const TensorQuantizationParams& qparams) { |
|
|
return Quantize<T, LEGACY>( |
|
|
src, qparams.zero_point, qparams.scale, qparams.precision); |
|
|
} |
|
|
|
|
|
/// Quantize `len` floats from `src` into `dst` using `qparams`.
/// `thread_id`/`num_threads` select the 1-D slice of [0, len) this call
/// processes, so work can be split across threads (same partitioning
/// pattern as the array Dequantize below).
template <typename T, bool LEGACY = true>
FBGEMM_API void Quantize(
    const float* src,
    T* dst,
    std::int64_t len,
    const TensorQuantizationParams& qparams,
    int thread_id = 0,
    int num_threads = 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Quantize a K x C x X tensor groupwise: the channels are split into G
/// groups and each group uses its own scale / zero_point.
///
/// @tparam LAYOUT memory layout of `src`/`dst` (KCX or KXC).
/// @param K outer dimension size.
/// @param C number of channels (presumably divisible by G — confirm).
/// @param X inner (e.g. spatial) dimension size.
/// @param G number of quantization groups.
/// @param scales,zero_points per-group parameters (presumably G entries
///        each — confirm against the implementation).
template <typename T, layout_t LAYOUT = layout_t::KCX>
FBGEMM_API void QuantizeGroupwise(
    const float* src,
    int K,
    int C,
    int X,
    int G,
    const float* scales,
    const std::int32_t* zero_points,
    T* dst);
|
|
|
|
|
template <typename T> |
|
|
float Dequantize(T src, const TensorQuantizationParams& qparams) { |
|
|
return qparams.scale * (src - qparams.zero_point); |
|
|
} |
|
|
|
|
|
template <typename T> |
|
|
void Dequantize( |
|
|
const T* src, |
|
|
float* dst, |
|
|
std::int64_t len, |
|
|
const TensorQuantizationParams& qparams, |
|
|
int thread_id = 0, |
|
|
int num_threads = 1) { |
|
|
int64_t i_begin = 0, i_end = 0; |
|
|
fbgemmPartition1D(thread_id, num_threads, len, i_begin, i_end); |
|
|
for (int64_t i = i_begin; i < i_end; i++) { |
|
|
dst[i] = Dequantize(src[i], qparams); |
|
|
} |
|
|
} |
|
|
|
|
|
template <typename T> |
|
|
float FusedQuantizeDequantize( |
|
|
float src, |
|
|
const TensorQuantizationParams& qparams) { |
|
|
T q = Quantize<T, false>( |
|
|
src, qparams.zero_point, qparams.scale, qparams.precision); |
|
|
return Dequantize<T>(q, qparams); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Quantize-then-dequantize (fake quantization) of `len` floats from `src`
/// into `dst`. `thread_id`/`num_threads` select the slice of [0, len)
/// handled by this call. `noise_ratio` is presumably the fraction of
/// elements passed through without quantization — confirm in the
/// implementation.
template <typename T>
FBGEMM_API void FusedQuantizeDequantize(
    const float* src,
    float* dst,
    std::int64_t len,
    const TensorQuantizationParams& qparams,
    int thread_id = 0,
    int num_threads = 1,
    float noise_ratio = 0.0f);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Fixed-point multiply of `a` by `b` with saturation, followed by a
/// rounding right shift — the integer-only core used by the fixed-point
/// Requantize() path.
FBGEMM_API std::int64_t
SaturatingRoundingMulWithShift(std::int32_t a, std::int32_t b, int right_shift);
|
|
|
|
|
template <typename T> |
|
|
T Requantize( |
|
|
std::int32_t src, |
|
|
std::int32_t zero_point, |
|
|
std::int32_t multiplier, |
|
|
int right_shift, |
|
|
int result_precision, |
|
|
bool result_is_signed = false) { |
|
|
std::int64_t quantized_down = |
|
|
zero_point + SaturatingRoundingMulWithShift(src, multiplier, right_shift); |
|
|
return clamp<std::int64_t, T>( |
|
|
quantized_down, result_precision, result_is_signed); |
|
|
} |
|
|
|
|
|
template <typename T> |
|
|
T RequantizeFixedPoint( |
|
|
std::int32_t src, |
|
|
const RequantizationParams& params) { |
|
|
return Requantize<T>( |
|
|
src, |
|
|
params.target_qparams.zero_point, |
|
|
params.multiplier, |
|
|
params.right_shift, |
|
|
params.target_qparams.precision); |
|
|
} |
|
|
|
|
|
/// Requantize `len` int32 values into `dst` with the fixed-point multiplier
/// and shift in `params`. `thread_id`/`num_threads` select the slice of
/// [0, len) handled by this call.
template <typename T>
FBGEMM_API void RequantizeFixedPoint(
    const std::int32_t* src,
    T* dst,
    std::int64_t len,
    const RequantizationParams& params,
    int thread_id = 0,
    int num_threads = 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
|
T Requantize( |
|
|
std::int32_t src, |
|
|
std::int32_t zero_point, |
|
|
float multiplier, |
|
|
int result_precision, |
|
|
bool result_is_signed = false) { |
|
|
long quantized_down = zero_point + std::lrintf(src * multiplier); |
|
|
return clamp<long, T>(quantized_down, result_precision, result_is_signed); |
|
|
} |
|
|
|
|
|
template <typename T> |
|
|
T Requantize( |
|
|
std::int32_t src, |
|
|
const RequantizationParams& params) { |
|
|
return Requantize<T>( |
|
|
src, |
|
|
params.target_qparams.zero_point, |
|
|
params.real_multiplier, |
|
|
params.target_qparams.precision); |
|
|
} |
|
|
|
|
|
/// Requantize `len` int32 values into `dst` using the real-valued
/// multiplier in `params`. `thread_id`/`num_threads` select the slice of
/// [0, len) handled by this call.
template <typename T>
FBGEMM_API void Requantize(
    const std::int32_t* src,
    T* dst,
    std::int64_t len,
    const RequantizationParams& params,
    int thread_id = 0,
    int num_threads = 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Rowwise-quantize `input` (float or half) to `bit_rate` bits per value.
/// Per the "SBHalf" name, each output row presumably carries its scale and
/// bias fused as float16 — confirm the exact row layout (and the required
/// output row stride) against the implementation.
template <typename InputType>
FBGEMM_API void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf(
    int bit_rate,
    const InputType* input,
    size_t input_rows,
    int input_columns,
    std::uint8_t* output);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Dequantize rows produced by FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf
/// back to float or half. `scale_bias_last` selects whether the fused
/// per-row scale/bias is stored after the quantized data (true) or before
/// it — confirm placement details against the implementation.
template <typename OutputType>
FBGEMM_API void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf(
    int bit_rate,
    const uint8_t* input,
    size_t input_rows,
    int input_columns,
    OutputType* output,
    bool scale_bias_last = true);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Rowwise 8-bit quantization; per the "SBFloat" name each output row
/// presumably carries its scale and bias fused as float32. When
/// `rowwise_min_max` is non-null it presumably supplies precomputed per-row
/// min/max instead of scanning the input — confirm in the implementation.
template <typename InputType>
FBGEMM_API void FloatOrHalfToFused8BitRowwiseQuantizedSBFloat(
    const InputType* input,
    size_t input_rows,
    int input_columns,
    std::uint8_t* output,
    const InputType* rowwise_min_max = nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Dequantize rows produced by FloatOrHalfToFused8BitRowwiseQuantizedSBFloat
/// back to float or half.
template <typename OutputType>
FBGEMM_API void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf(
    const uint8_t* input,
    size_t input_rows,
    int input_columns,
    OutputType* output);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Reference (scalar) implementation of
/// FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf; presumably used for
/// testing and as a fallback when no vectorized kernel applies — confirm.
template <typename InputType>
FBGEMM_API void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef(
    int bit_rate,
    const InputType* input,
    size_t input_rows,
    int input_columns,
    std::uint8_t* output);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Reference (scalar) implementation of
/// FloatOrHalfToFused8BitRowwiseQuantizedSBFloat.
template <typename InputType>
FBGEMM_API void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef(
    const InputType* input,
    size_t input_rows,
    int input_columns,
    std::uint8_t* output);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Reference (scalar) implementation of
/// FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf.
/// @tparam is_uint16_t_of_type_bf16 when OutputType is uint16_t, presumably
///         interprets it as bfloat16 rather than fp16 — confirm against the
///         implementation.
template <typename OutputType, bool is_uint16_t_of_type_bf16 = false>
FBGEMM_API void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef(
    int bit_rate,
    const uint8_t* input,
    size_t input_rows,
    int input_columns,
    OutputType* output,
    bool scale_bias_last = true);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Reference (scalar) implementation of
/// Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf.
template <typename OutputType>
FBGEMM_API void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef(
    const uint8_t* input,
    size_t input_rows,
    int input_columns,
    OutputType* output);
|
|
|
|
|
} |
|
|
|