Add files using upload-large-folder tool

dc9bb20 verified about 2 months ago

15.9 kB

	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	from bitsandbytes.optim.optimizer import Optimizer2State


	class Adam(Optimizer2State):
	    def __init__(
	        self,
	        params,
	        lr=1e-3,
	        betas=(0.9, 0.999),
	        eps=1e-8,
	        weight_decay=0,
	        amsgrad=False,
	        optim_bits=32,
	        args=None,
	        min_8bit_size=4096,
	        percentile_clipping=100,
	        block_wise=True,
	        is_paged=False,
	    ):
	        """
	        Adam optimizer with a caller-selected optimizer-state bit width.

	        Registers itself with the `Optimizer2State` base class under the optimizer name `"adam"` and
	        forwards every argument except `amsgrad`.

	        Arguments:
	            params (`torch.tensor`):
	                The parameters that the optimizer updates.
	            lr (`float`, defaults to 1e-3):
	                Learning rate.
	            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
	                Decay rates for the first and second-order moment estimates.
	            eps (`float`, defaults to 1e-8):
	                Small constant that guards against division by zero.
	            weight_decay (`float`, defaults to 0.0):
	                Weight decay coefficient.
	            amsgrad (`bool`, defaults to `False`):
	                Flag for the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the
	                maximum of past squared gradients. It is accepted here but is not passed on to the base
	                class constructor.
	            optim_bits (`int`, defaults to 32):
	                Number of bits used for the optimizer state.
	            args (`object`, defaults to `None`):
	                Object carrying additional arguments.
	            min_8bit_size (`int`, defaults to 4096):
	                Minimum number of elements a parameter tensor needs for 8-bit optimization.
	            percentile_clipping (`int`, defaults to 100):
	                Percentile at which gradients are clipped; the threshold adapts automatically by
	                tracking the last 100 gradient norms, which improves stability.
	            block_wise (`bool`, defaults to `True`):
	                Whether each block of a tensor is quantized independently to reduce outlier effects
	                and improve stability.
	            is_paged (`bool`, defaults to `False`):
	                Whether the optimizer is a paged optimizer.
	        """
	        optimizer_name = "adam"
	        super().__init__(
	            optimizer_name,
	            params, lr, betas, eps, weight_decay,
	            optim_bits,
	            args, min_8bit_size, percentile_clipping, block_wise,
	            is_paged=is_paged,
	        )


	class Adam8bit(Optimizer2State):
	    def __init__(
	        self,
	        params,
	        lr=1e-3,
	        betas=(0.9, 0.999),
	        eps=1e-8,
	        weight_decay=0,
	        amsgrad=False,
	        optim_bits=32,
	        args=None,
	        min_8bit_size=4096,
	        percentile_clipping=100,
	        block_wise=True,
	        is_paged=False,
	    ):
	        """
	        Adam optimizer that always requests 8-bit optimizer state.

	        Registers itself with the `Optimizer2State` base class under the optimizer name `"adam"`,
	        passing a fixed bit width of 8 in place of `optim_bits`.

	        Arguments:
	            params (`torch.tensor`):
	                The parameters that the optimizer updates.
	            lr (`float`, defaults to 1e-3):
	                Learning rate.
	            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
	                Decay rates for the first and second-order moment estimates.
	            eps (`float`, defaults to 1e-8):
	                Small constant that guards against division by zero.
	            weight_decay (`float`, defaults to 0.0):
	                Weight decay coefficient.
	            amsgrad (`bool`, defaults to `False`):
	                Flag for the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam. Not supported by
	                this class: any truthy value is rejected.
	            optim_bits (`int`, defaults to 32):
	                Present only so the signature matches the other Adam classes. It must be left at 32;
	                the value itself is never forwarded because 8 bits are always used.
	            args (`object`, defaults to `None`):
	                Object carrying additional arguments.
	            min_8bit_size (`int`, defaults to 4096):
	                Minimum number of elements a parameter tensor needs for 8-bit optimization.
	            percentile_clipping (`int`, defaults to 100):
	                Percentile at which gradients are clipped; the threshold adapts automatically by
	                tracking the last 100 gradient norms, which improves stability.
	            block_wise (`bool`, defaults to `True`):
	                Whether each block of a tensor is quantized independently to reduce outlier effects
	                and improve stability.
	            is_paged (`bool`, defaults to `False`):
	                Whether the optimizer is a paged optimizer.

	        Raises:
	            ValueError: If `amsgrad` is truthy, or if `optim_bits` is anything other than 32.
	        """
	        # Reject options this class cannot honour; amsgrad is checked before optim_bits.
	        if amsgrad:
	            raise ValueError("Adam8bit does not support amsgrad=True")
	        if optim_bits != 32:
	            # Only the signature default (32) is tolerated, purely for signature compatibility;
	            # the state is 8-bit regardless, so any other explicit request is an error.
	            raise ValueError("Adam8bit only supports optim_bits=32 (default value for compatibility)")

	        optimizer_name = "adam"
	        state_bits = 8  # fixed: this class is 8-bit by definition
	        super().__init__(
	            optimizer_name,
	            params, lr, betas, eps, weight_decay,
	            state_bits,
	            args, min_8bit_size, percentile_clipping, block_wise,
	            is_paged=is_paged,
	        )


	class Adam32bit(Optimizer2State):
	    def __init__(
	        self,
	        params,
	        lr=1e-3,
	        betas=(0.9, 0.999),
	        eps=1e-8,
	        weight_decay=0,
	        amsgrad=False,
	        optim_bits=32,
	        args=None,
	        min_8bit_size=4096,
	        percentile_clipping=100,
	        block_wise=True,
	        is_paged=False,
	    ):
	        """
	        Adam optimizer that always requests 32-bit optimizer state.

	        Registers itself with the `Optimizer2State` base class under the optimizer name `"adam"`,
	        passing a fixed bit width of 32 in place of `optim_bits`.

	        Arguments:
	            params (`torch.tensor`):
	                The parameters that the optimizer updates.
	            lr (`float`, defaults to 1e-3):
	                Learning rate.
	            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
	                Decay rates for the first and second-order moment estimates.
	            eps (`float`, defaults to 1e-8):
	                Small constant that guards against division by zero.
	            weight_decay (`float`, defaults to 0.0):
	                Weight decay coefficient.
	            amsgrad (`bool`, defaults to `False`):
	                Flag for the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the
	                maximum of past squared gradients. It is accepted here but is not passed on to the base
	                class constructor.
	            optim_bits (`int`, defaults to 32):
	                Number of bits of the optimizer state. Accepted for signature compatibility; the
	                constructor always forwards 32 regardless of this value.
	            args (`object`, defaults to `None`):
	                Object carrying additional arguments.
	            min_8bit_size (`int`, defaults to 4096):
	                Minimum number of elements a parameter tensor needs for 8-bit optimization.
	            percentile_clipping (`int`, defaults to 100):
	                Percentile at which gradients are clipped; the threshold adapts automatically by
	                tracking the last 100 gradient norms, which improves stability.
	            block_wise (`bool`, defaults to `True`):
	                Whether each block of a tensor is quantized independently to reduce outlier effects
	                and improve stability.
	            is_paged (`bool`, defaults to `False`):
	                Whether the optimizer is a paged optimizer.
	        """
	        optimizer_name = "adam"
	        state_bits = 32  # fixed: this class is 32-bit by definition
	        super().__init__(
	            optimizer_name,
	            params, lr, betas, eps, weight_decay,
	            state_bits,
	            args, min_8bit_size, percentile_clipping, block_wise,
	            is_paged=is_paged,
	        )


	class PagedAdam(Optimizer2State):
	    def __init__(
	        self,
	        params,
	        lr=1e-3,
	        betas=(0.9, 0.999),
	        eps=1e-8,
	        weight_decay=0,
	        amsgrad=False,
	        optim_bits=32,
	        args=None,
	        min_8bit_size=4096,
	        percentile_clipping=100,
	        block_wise=True,
	        is_paged=False,
	    ):
	        """
	        Paged Adam optimizer with a caller-selected optimizer-state bit width.

	        Registers itself with the `Optimizer2State` base class under the optimizer name `"adam"` and
	        always passes `is_paged=True`.

	        Arguments:
	            params (`torch.tensor`):
	                The parameters that the optimizer updates.
	            lr (`float`, defaults to 1e-3):
	                Learning rate.
	            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
	                Decay rates for the first and second-order moment estimates.
	            eps (`float`, defaults to 1e-8):
	                Small constant that guards against division by zero.
	            weight_decay (`float`, defaults to 0.0):
	                Weight decay coefficient.
	            amsgrad (`bool`, defaults to `False`):
	                Flag for the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the
	                maximum of past squared gradients. It is accepted here but is not passed on to the base
	                class constructor.
	            optim_bits (`int`, defaults to 32):
	                Number of bits used for the optimizer state.
	            args (`object`, defaults to `None`):
	                Object carrying additional arguments.
	            min_8bit_size (`int`, defaults to 4096):
	                Minimum number of elements a parameter tensor needs for 8-bit optimization.
	            percentile_clipping (`int`, defaults to 100):
	                Percentile at which gradients are clipped; the threshold adapts automatically by
	                tracking the last 100 gradient norms, which improves stability.
	            block_wise (`bool`, defaults to `True`):
	                Whether each block of a tensor is quantized independently to reduce outlier effects
	                and improve stability.
	            is_paged (`bool`, defaults to `False`):
	                Accepted for signature compatibility; the constructor always forwards `True`
	                regardless of this value.
	        """
	        optimizer_name = "adam"
	        super().__init__(
	            optimizer_name,
	            params, lr, betas, eps, weight_decay,
	            optim_bits,
	            args, min_8bit_size, percentile_clipping, block_wise,
	            is_paged=True,  # paged by definition; the is_paged argument is not consulted
	        )


	class PagedAdam8bit(Optimizer2State):
	    def __init__(
	        self,
	        params,
	        lr=1e-3,
	        betas=(0.9, 0.999),
	        eps=1e-8,
	        weight_decay=0,
	        amsgrad=False,
	        optim_bits=32,
	        args=None,
	        min_8bit_size=4096,
	        percentile_clipping=100,
	        block_wise=True,
	        is_paged=False,
	    ):
	        """
	        Paged Adam optimizer that always requests 8-bit optimizer state.

	        Registers itself with the `Optimizer2State` base class under the optimizer name `"adam"`,
	        passing a fixed bit width of 8 and `is_paged=True`.

	        Arguments:
	            params (`torch.tensor`):
	                The parameters that the optimizer updates.
	            lr (`float`, defaults to 1e-3):
	                Learning rate.
	            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
	                Decay rates for the first and second-order moment estimates.
	            eps (`float`, defaults to 1e-8):
	                Small constant that guards against division by zero.
	            weight_decay (`float`, defaults to 0.0):
	                Weight decay coefficient.
	            amsgrad (`bool`, defaults to `False`):
	                Flag for the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam. Not supported by
	                this class: any truthy value is rejected.
	            optim_bits (`int`, defaults to 32):
	                Present only so the signature matches the other Adam classes. It must be left at 32;
	                the value itself is never forwarded because 8 bits are always used.
	            args (`object`, defaults to `None`):
	                Object carrying additional arguments.
	            min_8bit_size (`int`, defaults to 4096):
	                Minimum number of elements a parameter tensor needs for 8-bit optimization.
	            percentile_clipping (`int`, defaults to 100):
	                Percentile at which gradients are clipped; the threshold adapts automatically by
	                tracking the last 100 gradient norms, which improves stability.
	            block_wise (`bool`, defaults to `True`):
	                Whether each block of a tensor is quantized independently to reduce outlier effects
	                and improve stability.
	            is_paged (`bool`, defaults to `False`):
	                Accepted for signature compatibility; the constructor always forwards `True`
	                regardless of this value.

	        Raises:
	            ValueError: If `amsgrad` is truthy, or if `optim_bits` is anything other than 32.
	        """
	        # Reject options this class cannot honour; amsgrad is checked before optim_bits.
	        if amsgrad:
	            raise ValueError("PagedAdam8bit does not support amsgrad=True")
	        if optim_bits != 32:
	            # Only the signature default (32) is tolerated, purely for signature compatibility;
	            # the state is 8-bit regardless, so any other explicit request is an error.
	            raise ValueError("PagedAdam8bit only supports optim_bits=32 (default value for compatibility)")

	        optimizer_name = "adam"
	        state_bits = 8  # fixed: this class is 8-bit by definition
	        super().__init__(
	            optimizer_name,
	            params, lr, betas, eps, weight_decay,
	            state_bits,
	            args, min_8bit_size, percentile_clipping, block_wise,
	            is_paged=True,  # paged by definition; the is_paged argument is not consulted
	        )


	class PagedAdam32bit(Optimizer2State):
	    def __init__(
	        self,
	        params,
	        lr=1e-3,
	        betas=(0.9, 0.999),
	        eps=1e-8,
	        weight_decay=0,
	        amsgrad=False,
	        optim_bits=32,
	        args=None,
	        min_8bit_size=4096,
	        percentile_clipping=100,
	        block_wise=True,
	        is_paged=False,
	    ):
	        """
	        Paged Adam optimizer that always requests 32-bit optimizer state.

	        Registers itself with the `Optimizer2State` base class under the optimizer name `"adam"`,
	        passing a fixed bit width of 32 and `is_paged=True`.

	        Arguments:
	            params (`torch.tensor`):
	                The parameters that the optimizer updates.
	            lr (`float`, defaults to 1e-3):
	                Learning rate.
	            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
	                Decay rates for the first and second-order moment estimates.
	            eps (`float`, defaults to 1e-8):
	                Small constant that guards against division by zero.
	            weight_decay (`float`, defaults to 0.0):
	                Weight decay coefficient.
	            amsgrad (`bool`, defaults to `False`):
	                Flag for the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the
	                maximum of past squared gradients. It is accepted here but is not passed on to the base
	                class constructor.
	            optim_bits (`int`, defaults to 32):
	                Number of bits of the optimizer state. Accepted for signature compatibility; the
	                constructor always forwards 32 regardless of this value.
	            args (`object`, defaults to `None`):
	                Object carrying additional arguments.
	            min_8bit_size (`int`, defaults to 4096):
	                Minimum number of elements a parameter tensor needs for 8-bit optimization.
	            percentile_clipping (`int`, defaults to 100):
	                Percentile at which gradients are clipped; the threshold adapts automatically by
	                tracking the last 100 gradient norms, which improves stability.
	            block_wise (`bool`, defaults to `True`):
	                Whether each block of a tensor is quantized independently to reduce outlier effects
	                and improve stability.
	            is_paged (`bool`, defaults to `False`):
	                Accepted for signature compatibility; the constructor always forwards `True`
	                regardless of this value.
	        """
	        optimizer_name = "adam"
	        state_bits = 32  # fixed: this class is 32-bit by definition
	        super().__init__(
	            optimizer_name,
	            params, lr, betas, eps, weight_decay,
	            state_bits,
	            args, min_8bit_size, percentile_clipping, block_wise,
	            is_paged=True,  # paged by definition; the is_paged argument is not consulted
	        )