# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import types

import torch

def get_fused_adam_class():
    """
    Look for the FusedAdam optimizer from apex. We first try to load the
    "contrib" interface, which is a bit faster than the main interface,
    but is technically deprecated.
    """
    try:
        # The "deprecated" interface in recent versions of apex is a bit
        # faster than the main interface, since we don't use the apex
        # optimizer. This can be installed by passing the
        # `--deprecated_fused_adam` option when building apex.
        global fused_adam_cuda
        import importlib

        fused_adam_cuda = importlib.import_module("fused_adam_cuda")

        return FusedAdamV1
    except ImportError:
        try:
            # fallback to the newer interface
            from apex.multi_tensor_apply import multi_tensor_applier
            from apex.optimizers import FusedAdam as _FusedAdam  # noqa

            if multi_tensor_applier.available:
                return FusedAdamV2
        except ImportError:
            pass
    return None
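
# Usage sketch (illustrative, not part of the original source): pick the
# fused optimizer when apex provides one and fall back to the stock PyTorch
# Adam otherwise. ``model`` and the hyperparameters are assumed names/values.
#
#     fused_adam_cls = get_fused_adam_class()
#     optimizer_cls = fused_adam_cls if fused_adam_cls is not None else torch.optim.Adam
#     optimizer = optimizer_cls(model.parameters(), lr=1e-3, betas=(0.9, 0.999))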


class FusedAdamV1(torch.optim.Optimizer):
    """
    Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
    ``python setup.py install --cuda_ext --cpp_ext``.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Compared to the original version in Apex, the fairseq version casts grads
    and params to FP32 internally to support ``--memory-efficient-fp16``.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False) NOT SUPPORTED in FusedAdam!
        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
            adds eps to the bias-corrected second moment estimate before
            evaluating square root instead of adding it to the square root of
            second moment estimate as in the original paper. (default: False)

    .. _Adam: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        eps_inside_sqrt=False,
        weight_decay=0.0,
        max_grad_norm=0.0,
        amsgrad=False,
        use_fp16_stats=False,
    ):
        global fused_adam_cuda
        import importlib

        fused_adam_cuda = importlib.import_module("fused_adam_cuda")

        if amsgrad:
            raise RuntimeError("FusedAdam does not support the AMSGrad variant.")

        defaults = {
            "lr": lr,
            "bias_correction": bias_correction,
            "betas": betas,
            "eps": eps,
            "weight_decay": weight_decay,
            "max_grad_norm": max_grad_norm,
        }
        super().__init__(params, defaults)
        self.eps_mode = 0 if eps_inside_sqrt else 1

        self.use_fp16_stats = use_fp16_stats
        self.FLOAT16_MAX = 65504.0

    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return True

    @property
    def supports_step_with_scale(self):
        return True

    def step(self, closure=None, grads=None, scale=1.0, grad_norms=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            grads (list of tensors, optional): weight gradients to use for the
                optimizer update. If gradients have type torch.half, parameters
                are expected to be in type torch.float. (default: None)
            scale (float, optional): factor to divide gradient tensor values
                by before applying to weights. (default: 1)
            grad_norms (list, optional): precomputed gradient norms (already
                multiplied by ``scale``), used for clipping when
                ``max_grad_norm`` is set for a group. (default: None)
        """
        loss = None
        if closure is not None:
            loss = closure()

        if grads is None:
            grads_group = [None] * len(self.param_groups)
        # backward compatibility
        # assuming a list/generator of parameters means a single group
        elif isinstance(grads, types.GeneratorType):
            grads_group = [grads]
        elif type(grads[0]) != list:
            grads_group = [grads]
        else:
            grads_group = grads

        if grad_norms is None:
            grad_norms = [None] * len(self.param_groups)

        for group, grads_this_group, grad_norm in zip(
            self.param_groups, grads_group, grad_norms
        ):
            if grads_this_group is None:
                grads_this_group = [None] * len(group["params"])

            # compute combined scale factor for this group
            combined_scale = scale
            if group.get("max_grad_norm", 0) > 0:
                # note: the incoming norm is in fact norm * scale
                clip = ((grad_norm / scale) + 1e-6) / group["max_grad_norm"]
                if clip > 1:
                    combined_scale = clip * scale

            bias_correction = 1 if group.get("bias_correction", 1) else 0

            for p, grad in zip(group["params"], grads_this_group):
                # note: p.grad should never be set for correct operation of
                # the mixed-precision optimizer, which sometimes sends None
                # gradients
                if p.grad is None and grad is None:
                    continue
                if grad is None:
                    grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        "FusedAdam does not support sparse gradients, "
                        "please consider SparseAdam instead"
                    )

                if p.device.type == "cpu":
                    p_data_fp32 = p.data.cuda(non_blocking=True).float()
                    out_p = torch.tensor([], dtype=torch.float)
                else:
                    p_data_fp32 = p.data.float()
                    out_p = p.data

                state = self.state[p]

                # State initialization
                dtype = torch.float16 if self.use_fp16_stats else p_data_fp32.dtype
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p_data_fp32, dtype=dtype)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p_data_fp32, dtype=dtype)
                    if self.use_fp16_stats:
                        state["exp_avg_scale"] = 1.0
                        state["exp_avg_sq_scale"] = 1.0
                else:
                    device = p_data_fp32.device
                    state["exp_avg"] = state["exp_avg"].to(device, dtype)
                    state["exp_avg_sq"] = state["exp_avg_sq"].to(device, dtype)

                exp_avg = state["exp_avg"]
                exp_avg_sq = state["exp_avg_sq"]
                if self.use_fp16_stats:
                    assert exp_avg.dtype == torch.float16
                    exp_avg = exp_avg.float() * state["exp_avg_scale"]
                    exp_avg_sq = exp_avg_sq.float() * state["exp_avg_sq_scale"]
                beta1, beta2 = group["betas"]

                if "step" not in state:
                    state["step"] = group["step"]
                state["step"] += 1

                with torch.cuda.device(p_data_fp32.device):
                    fused_adam_cuda.adam(
                        p_data_fp32,
                        out_p,
                        exp_avg,
                        exp_avg_sq,
                        grad,
                        group["lr"],
                        beta1,
                        beta2,
                        group["eps"],
                        combined_scale,
                        state["step"],
                        self.eps_mode,
                        bias_correction,
                        group["weight_decay"],
                    )

                if p.device.type == "cpu":
                    p.data.copy_(p_data_fp32, non_blocking=True)

                if self.use_fp16_stats:

                    def inf_norm(t):
                        return torch.norm(t, float("inf"))

                    # from github.com/openai/jukebox/blob/master/jukebox/utils/fp16.py
                    state["exp_avg_scale"], state["exp_avg_sq_scale"] = (
                        1e-8 + inf_norm(exp_avg) / self.FLOAT16_MAX,
                        1e-8 + inf_norm(exp_avg_sq) / self.FLOAT16_MAX,
                    )
                    state["exp_avg"], state["exp_avg_sq"] = (
                        (exp_avg / state["exp_avg_scale"]).half(),
                        (exp_avg_sq / state["exp_avg_sq_scale"]).half(),
                    )

        return loss
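
# Training-loop sketch (illustrative, not from the original source): with
# memory-efficient fp16 training, the caller scales the loss before backward
# and passes the same factor to ``step``, letting the fused kernel unscale
# gradients in place. ``model``, ``criterion``, ``batch``, and ``loss_scale``
# are assumed names.
#
#     loss = criterion(model(batch))
#     (loss * loss_scale).backward()
#     optimizer.step(scale=loss_scale)
#     optimizer.zero_grad()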


try:
    from apex.multi_tensor_apply import multi_tensor_applier
    from apex.optimizers import FusedAdam

    class FusedAdamV2(FusedAdam):
        """
        Compared to the original version in Apex, the fairseq version casts grads
        and params to FP32 internally to support ``--memory-efficient-fp16``.
        """

        def __init__(self, *args, use_fp16_stats=False, **kwargs):
            if use_fp16_stats:
                raise NotImplementedError(
                    "--fp16-adam-stats is only supported with FusedAdamV1"
                )
            super().__init__(*args, **kwargs)
            if not hasattr(self, "multi_tensor_adam"):
                raise Exception(
                    "Apex installation is outdated. Please install an updated version of apex."
                )

        @property
        def supports_memory_efficient_fp16(self):
            return True

        @property
        def supports_flat_params(self):
            return True

        def step(
            self,
            closure=None,
            grads=None,
            output_params=None,
            scale=None,
            grad_norms=None,
        ):
            """Performs a single optimization step."""
            loss = None
            if closure is not None:
                loss = closure()

            for group in self.param_groups:
                bias_correction = 1 if group["bias_correction"] else 0
                beta1, beta2 = group["betas"]

                # assume the same step across the group for now to simplify
                # things; a per-parameter step can easily be supported by
                # making it a tensor, or by passing a list into the kernel
                if "step" in group:
                    group["step"] += 1
                else:
                    group["step"] = 1

                # create lists for multi-tensor apply
                g_16, p_16, orig_p_16, m_16, v_16 = [], [], [], [], []
                g_32, p_32, m_32, v_32 = [], [], [], []

                for p in group["params"]:
                    if p.grad is None:
                        continue
                    if p.grad.data.is_sparse:
                        raise RuntimeError(
                            "FusedAdam does not support sparse gradients, "
                            "please consider SparseAdam instead"
                        )

                    state = self.state[p]
                    # State initialization
                    if len(state) == 0:
                        # Exponential moving average of gradient values
                        state["exp_avg"] = torch.zeros_like(
                            p.data, dtype=torch.float
                        )
                        # Exponential moving average of squared gradient values
                        state["exp_avg_sq"] = torch.zeros_like(
                            p.data, dtype=torch.float
                        )
                    else:
                        state["exp_avg"] = state["exp_avg"].to(
                            device=p.data.device, dtype=torch.float
                        )
                        state["exp_avg_sq"] = state["exp_avg_sq"].to(
                            device=p.data.device, dtype=torch.float
                        )

                    if p.dtype == torch.float16:
                        g_16.append(p.grad.data.float())
                        p_16.append(p.data.float())
                        orig_p_16.append(p.data)
                        m_16.append(state["exp_avg"])
                        v_16.append(state["exp_avg_sq"])
                    elif p.dtype == torch.float32:
                        g_32.append(p.grad.data)
                        p_32.append(p.data)
                        m_32.append(state["exp_avg"])
                        v_32.append(state["exp_avg_sq"])
                    else:
                        raise RuntimeError("FusedAdam only supports fp16 and fp32.")

                with torch.cuda.device(p.device):
                    if len(g_16) > 0:
                        multi_tensor_applier(
                            self.multi_tensor_adam,
                            self._dummy_overflow_buf,
                            [g_16, p_16, m_16, v_16],
                            group["lr"],
                            beta1,
                            beta2,
                            group["eps"],
                            group["step"],
                            self.adam_w_mode,
                            bias_correction,
                            group["weight_decay"],
                        )
                        # write the fp32 results back into the original fp16 params
                        for orig_p, p in zip(orig_p_16, p_16):
                            orig_p.copy_(p.data)
                    if len(g_32) > 0:
                        multi_tensor_applier(
                            self.multi_tensor_adam,
                            self._dummy_overflow_buf,
                            [g_32, p_32, m_32, v_32],
                            group["lr"],
                            beta1,
                            beta2,
                            group["eps"],
                            group["step"],
                            self.adam_w_mode,
                            bias_correction,
                            group["weight_decay"],
                        )

            return loss

except ImportError:
    pass
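
# For reference (illustrative, not from the original source): each
# ``multi_tensor_applier`` call above replaces a per-tensor Python loop like
# the following with a few fused CUDA kernel launches over chunked tensor
# lists; ``adam_w_mode`` selects decoupled (AdamW-style) weight decay, which
# is omitted here for brevity.
#
#     for g, p, m, v in zip(g_32, p_32, m_32, v_32):
#         m.mul_(beta1).add_(g, alpha=1 - beta1)
#         v.mul_(beta2).addcmul_(g, g, value=1 - beta2)
#         denom = (v / (1 - beta2 ** step)).sqrt().add_(eps)
#         p.add_(m / (1 - beta1 ** step) / denom, alpha=-lr)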