JSX_TTS / torch /include /ATen /quantized /Quantizer.h

Upload 5875 files

9dd3461 almost 3 years ago

9.19 kB

	#pragma once

	#include <c10/core/QScheme.h>
	#include <c10/core/MemoryFormat.h>
	#include <c10/macros/Macros.h>
	#include <c10/util/Exception.h>
	#include <c10/util/intrusive_ptr.h>
	#include <c10/core/ScalarType.h>
	#include <c10/core/TensorOptions.h>

	#include <ATen/Tensor.h>
	#include <ATen/TensorUtils.h>

	#include <ATen/core/QuantizerBase.h>

	#include <cmath>
	#include <functional>
	#include <memory>
	#include <utility>

	namespace at {

	/**
	 * UnknownQuantizer is a placeholder for functions that quantize in two
	 * steps: the tensor is first allocated with an unknown quantizer, and the
	 * quantization kernel then decides what the final quantizer will be.
	 *
	 * Every virtual member is only declared here; the definitions are not part
	 * of this header.
	 */
	struct TORCH_API UnknownQuantizer : public Quantizer {
	  // Forwards the scalar type to the Quantizer base; no other state is kept.
	  explicit UnknownQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}

	  // Scheme identification and comparison.
	  QScheme qscheme() const override;
	  bool equalTo(QuantizerPtr other) const override;

	  // Conversion entry points required by the Quantizer interface.
	  Tensor quantize(const Tensor& tensor) override;
	  Tensor dequantize(const Tensor& qtensor) override;
	  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
	};

	/**
	 * UniformQuantizer is the parent class for all uniform quantizers.
	 * These quantization schemes map float values uniformly to quantized
	 * values. For example, the affine quantizer is the most commonly used
	 * scheme in this category.
	 */
	struct TORCH_API UniformQuantizer : public Quantizer {
	  // Adds no state of its own; only forwards the scalar type to Quantizer.
	  explicit UniformQuantizer(ScalarType scalar_type)
	      : Quantizer(scalar_type) {}
	};

	/**
	 * NonUniformQuantizer is the parent class for all non-uniform quantizers.
	 * These quantization schemes may map float values non-uniformly to
	 * quantized values. K-means quantization is a representative example in
	 * this category.
	 */
	struct TORCH_API NonUniformQuantizer : public Quantizer {
	  // Adds no state of its own; only forwards the scalar type to Quantizer.
	  explicit NonUniformQuantizer(ScalarType scalar_type)
	      : Quantizer(scalar_type) {}
	};

	// There is also StochasticQuantizer which is uniform but not affine

	/**
	 * AffineQuantizer quantizes with an affine transformation.
	 *
	 * Quantize:
	 *   Y = clamp(round(X / scale + zero_point), min, max)
	 * Dequantize:
	 *   X = (Y - zero_point) * scale
	 */
	struct TORCH_API AffineQuantizer : public UniformQuantizer {
	  // Adds no state of its own; scale and zero point live in the subclasses.
	  explicit AffineQuantizer(ScalarType scalar_type)
	      : UniformQuantizer(scalar_type) {}
	};

	// Note that we will not have Symmetric Quantizer in backend to reduce
	// complications in quantized kernel implementation.

	/**
	 * PerTensorAffineQuantizer stores a single scale and a single zero_point,
	 * which are used for all the values in the Tensor.
	 */
	struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer {
	  /**
	   * @param scalar_type forwarded to the AffineQuantizer base.
	   * @param scale       stored unchanged and returned by scale().
	   * @param zero_point  stored unchanged and returned by zero_point().
	   */
	  explicit PerTensorAffineQuantizer(
	      ScalarType scalar_type,
	      double scale,
	      int64_t zero_point)
	      : AffineQuantizer(scalar_type), scale_(scale), zero_point_(zero_point) {}

	  // Declared here; the definitions are not part of this header.
	  Tensor quantize(const Tensor& tensor) override;
	  Tensor dequantize(const Tensor& qtensor) override;
	  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

	  QScheme qscheme() const override {
	    return kPerTensorAffine;
	  }

	  double scale() const {
	    return scale_;
	  }

	  int64_t zero_point() const {
	    return zero_point_;
	  }

	  /**
	   * Returns true only when `other` is non-null, reports kPerTensorAffine,
	   * and has the same scalar type, scale and zero point as this quantizer.
	   * The scale is compared with exact floating-point equality.
	   */
	  bool equalTo(QuantizerPtr other) const override {
	    if (!other.get() || other->qscheme() != kPerTensorAffine) {
	      return false;
	    }
	    // The qscheme check above is what justifies the unchecked downcast.
	    const auto* rhs = static_cast<PerTensorAffineQuantizer*>(other.get());
	    return scalar_type() == rhs->scalar_type() &&
	        scale() == rhs->scale() &&
	        zero_point() == rhs->zero_point();
	  }

	 private:
	  const double scale_;
	  // We use int64_t for consistency with Python
	  const int64_t zero_point_;
	};

	/**
	 * PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer
	 * except that it has an independent scale and zero_point parameter
	 * for each channel.
	 *
	 * Also note that per-channel quantization is mostly applied to the output
	 * channels of weights, since per-input-channel quantization of weights or
	 * per-channel quantization of activations can't be efficiently supported
	 * on most processors: it requires each multiplication result within a
	 * single dot-product to have a different scale.
	 */
	struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer {
	  /**
	   * @param scalar_type forwarded to the AffineQuantizer base.
	   * @param scales      stored and returned by scales().
	   * @param zero_points stored and returned by zero_points().
	   * @param axis        stored and returned by axis().
	   *
	   * The Tensor arguments are taken by value, so they are moved into the
	   * members rather than copied a second time.
	   */
	  explicit PerChannelAffineQuantizer(
	      ScalarType scalar_type,
	      Tensor scales,
	      Tensor zero_points,
	      int64_t axis)
	      : AffineQuantizer(scalar_type),
	        scales_(std::move(scales)),
	        zero_points_(std::move(zero_points)),
	        axis_(axis) {}

	  QScheme qscheme() const override {
	    return kPerChannelAffine;
	  }

	  Tensor scales() const {
	    return scales_;
	  }

	  Tensor zero_points() const {
	    return zero_points_;
	  }

	  int64_t axis() const {
	    return axis_;
	  }

	  // Declared here; the definitions are not part of this header.
	  Tensor quantize(const Tensor& tensor) override;
	  Tensor dequantize(const Tensor& qtensor) override;
	  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

	  /**
	   * Returns true only when `other` is non-null, reports kPerChannelAffine,
	   * and matches this quantizer's scalar type, scales, zero points (both
	   * compared via Tensor::equal) and axis.
	   */
	  bool equalTo(QuantizerPtr other) const override {
	    if (!other.get() || other->qscheme() != kPerChannelAffine) {
	      return false;
	    }
	    // The qscheme check above is what justifies the unchecked downcast.
	    const auto* rhs = static_cast<PerChannelAffineQuantizer*>(other.get());
	    return scalar_type() == rhs->scalar_type() &&
	        scales().equal(rhs->scales()) &&
	        zero_points().equal(rhs->zero_points()) &&
	        axis() == rhs->axis();
	  }

	 protected:
	  Tensor scales_;
	  Tensor zero_points_;
	  const int64_t axis_;
	};

	/**
	 * PerChannelAffineFloatQParamsQuantizer is the same as
	 * PerChannelAffineQuantizer except that it expects both scale and zero
	 * point to be floating point values.
	 *
	 * This quantizer uses the kPerChannelAffineFloatQParams qscheme, which is
	 * a variant of kPerChannelAffine.
	 *
	 * The quantize equation in this case is:
	 *   Xq = (Xf - zero_point) * inv_scale, where inv_scale = 1.0/scale
	 *
	 * Note: a floating point zero point is useful in cases where 0 doesn't
	 * need to be exactly represented in the quantized space. We can get
	 * additional precision by using floating point values for the zero point.
	 */
	struct TORCH_API PerChannelAffineFloatQParamsQuantizer
	    : public PerChannelAffineQuantizer {
	  // All arguments are forwarded to PerChannelAffineQuantizer. The by-value
	  // Tensors are moved so the base constructor does not copy them again.
	  explicit PerChannelAffineFloatQParamsQuantizer(
	      ScalarType scalar_type,
	      Tensor scales,
	      Tensor zero_points,
	      int64_t axis)
	      : PerChannelAffineQuantizer(
	            scalar_type,
	            std::move(scales),
	            std::move(zero_points),
	            axis) {}

	  QScheme qscheme() const override {
	    return kPerChannelAffineFloatQParams;
	  }

	  // Declared here; the definitions are not part of this header.
	  Tensor quantize(const Tensor& tensor) override;
	  Tensor dequantize(const Tensor& qtensor) override;
	  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

	  /**
	   * Returns true only when `other` is non-null, reports
	   * kPerChannelAffineFloatQParams, and matches this quantizer's scalar
	   * type, scales, zero points (both compared via Tensor::equal) and axis.
	   */
	  bool equalTo(QuantizerPtr other) const override {
	    if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) {
	      return false;
	    }
	    // The qscheme check above is what justifies the unchecked downcast.
	    const auto* rhs =
	        static_cast<PerChannelAffineFloatQParamsQuantizer*>(other.get());
	    return scalar_type() == rhs->scalar_type() &&
	        scales().equal(rhs->scales()) &&
	        zero_points().equal(rhs->zero_points()) &&
	        axis() == rhs->axis();
	  }
	};

	// Internal utility for reaching the QTensorImpl of `self`.
	// Use it only when writing low-level setters/getters for QTensorImpl
	// fields; everywhere else, go through the setters/getters built on top of
	// it. It may be called repeatedly, so its implementation must stay cheap.
	TORCH_API QTensorImpl* get_qtensorimpl(
	    const TensorBase& self);

	// Factory returning a QuantizerPtr for the given scale, zero point and
	// scalar type. The parameters are double and int64_t because those are the
	// only argument types currently available in the native function API.
	TORCH_API QuantizerPtr make_per_tensor_affine_quantizer(
	    double scale,
	    int64_t zero_point,
	    ScalarType scalar_type);

	// Factory returning a QuantizerPtr built from per-channel `scales` and
	// `zero_points`, the channel `axis`, and the quantized `scalar_type`.
	// NOTE(review): which concrete quantizer is returned (integer vs. float
	// qparams) is decided in the implementation, not visible here - confirm.
	TORCH_API QuantizerPtr make_per_channel_affine_quantizer(
	    const Tensor& scales, const Tensor& zero_points,
	    int64_t axis, ScalarType scalar_type);

	// Factory returning a QuantizerPtr for the given scalar type; presumably
	// wraps an UnknownQuantizer - confirm against the implementation.
	TORCH_API QuantizerPtr make_unknown_quantizer(
	    ScalarType scalar_type);

	// Creates a quantized Tensor from the arguments a normal Tensor would take
	// (`sizes`, `options`) plus the `quantizer` to attach to it.
	TORCH_API Tensor new_qtensor(
	    IntArrayRef sizes, const TensorOptions& options, QuantizerPtr quantizer);

	// Presumably installs `quantizer` on the quantized tensor `self`; the
	// trailing underscore suggests an in-place update - confirm against the
	// implementation.
	TORCH_API void set_quantizer_(
	    const Tensor& self,
	    ConstQuantizerPtr quantizer);

	// Overload taking explicit `strides`. Returns a Tensor for the
	// caller-provided buffer `data`, described by `sizes`, `strides` and
	// `options`, with per-tensor affine parameters `scale` and `zeroPoint`.
	// NOTE(review): presumably the buffer is wrapped without copying and
	// `deleter` is invoked on `data` when the storage is released - confirm.
	TORCH_API Tensor from_blob_quantized_per_tensor_affine(
	    void* data, IntArrayRef sizes, IntArrayRef strides,
	    std::function<void(void*)> deleter,
	    const float scale, const int64_t zeroPoint,
	    const TensorOptions& options);

	// Overload without explicit strides. Returns a Tensor for the
	// caller-provided buffer `data`, described by `sizes` and `options`, with
	// per-tensor affine parameters `scale` and `zeroPoint`.
	// NOTE(review): presumably assumes a contiguous layout and invokes
	// `deleter` on `data` when the storage is released - confirm.
	TORCH_API Tensor from_blob_quantized_per_tensor_affine(
	    void* data, IntArrayRef sizes,
	    std::function<void(void*)> deleter,
	    const float scale, const int64_t zeroPoint,
	    const TensorOptions& options);

	// Per-channel counterpart of from_blob_quantized_per_tensor_affine.
	// Returns a Tensor for the caller-provided buffer `data`, described by
	// `sizes` and `options`, with per-channel `scales` and `zero_points` along
	// `axis`.
	// NOTE(review): presumably invokes `deleter` on `data` when the storage is
	// released - confirm against the implementation.
	TORCH_API Tensor from_blob_quantized_per_channel_affine(
	    void* data, IntArrayRef sizes,
	    std::function<void(void*)> deleter,
	    const Tensor& scales, const Tensor& zero_points,
	    const int64_t axis,
	    const TensorOptions& options);

	} // namespace at