# Extension of MXNet Module
import logging
import mxnet as mx
import numpy as np
import mxnet.ndarray as nd
from collections import OrderedDict
from mxnet.module import Module


def nd_global_norm(t_list):
    """Computes the global norm of multiple tensors.

    Given a tuple or list of tensors t_list, this operation returns the global norm
    of the elements in all tensors in t_list. The global norm is computed as:

    ``global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))``

    Any entries in t_list that are None are ignored.

    Parameters
    ----------
    t_list : list or tuple
        The NDArray list

    Returns
    -------
    ret : NDArray
        The global norm. The shape of the NDArray will be (1,)

    Examples
    --------
    >>> x = mx.nd.ones((2, 3))
    >>> y = mx.nd.ones((5, 6))
    >>> z = mx.nd.ones((4, 2, 3))
    >>> print(nd_global_norm([x, y, z]).asscalar())
    7.74597
    >>> xnone = None
    >>> ret = nd_global_norm([x, y, z, xnone])
    >>> print(ret.asscalar())
    7.74597
    """
    ret = None
    for arr in t_list:
        if arr is not None:
            if ret is None:
                ret = nd.square(nd.norm(arr))
            else:
                ret += nd.square(nd.norm(arr))
    ret = nd.sqrt(ret)
    return ret


class MyModule(Module):
    """Some enhancements to mx.mod.Module.
    """
    def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
                 logger=logging, context=mx.context.gpu(), work_load_list=None,
                 fixed_param_names=None, state_names=None, name=None):
        self._name = name
        super(MyModule, self).__init__(symbol=symbol, data_names=data_names,
                                       label_names=label_names, logger=logger,
                                       context=context, work_load_list=work_load_list,
                                       fixed_param_names=fixed_param_names,
                                       state_names=state_names)
        self._tmp_grads = None

    def clip_by_global_norm(self, max_norm=1.0):
        """Clips gradient norm.

        The norm is computed over all gradients together, as if they were
        concatenated into a single vector. Gradients are modified in-place.

        The method was first introduced in
        `[ICML2013] On the difficulty of training recurrent neural networks`.

        Parameters
        ----------
        max_norm : float or int
            The maximum clipping threshold of the gradient norm.

        Returns
        -------
        norm_val : float
            The computed norm of the gradients.

        Examples
        --------
        An example of using clip_by_global_norm to clip the gradient before
        updating the parameters::

            >>> # Get the gradient via back-propagation
            >>> net.forward_backward(data_batch=data_batch)
            >>> norm_val = net.clip_by_global_norm(max_norm=1.0)
            >>> net.update()
        """
        assert self.binded and self.params_initialized and self.optimizer_initialized
        norm_val = self.global_grad_norm()
        if norm_val > max_norm:
            ratio = max_norm / float(norm_val)
            for grads in self._exec_group.grad_arrays:
                for grad in grads:
                    grad *= ratio
        return norm_val

    def global_grad_norm(self):
        """Calculates the global gradient norm.

        The L2 norm is computed over all gradients together, as if they were
        concatenated into a single vector. Can be used to debug the optimization process.
        See http://videolectures.net/deeplearning2015_goodfellow_network_optimization/

        Returns
        -------
        norm_val : float
            The computed norm of the gradients.
        Examples
        --------
        An example of using global_grad_norm to calculate the gradient norm after
        back-propagation::

            >>> # Get the gradient via back-propagation
            >>> net.forward_backward(data_batch=data_batch)
            >>> norm_val = net.global_grad_norm()
            >>> print(norm_val)
        """
        assert self.binded and self.params_initialized and self.optimizer_initialized
        # Note: averaging the per-executor norms makes the estimate differ slightly
        # from the single-device value when multiple GPUs are used.
        norm_val = 0.0
        for exe in self._exec_group.execs:
            norm_val += nd_global_norm(exe.grad_arrays).asscalar()
        norm_val /= float(len(self._exec_group.execs))
        norm_val *= self._optimizer.rescale_grad
        return norm_val

    def debug_norm_all(self, debug_gnorm=True):
        if debug_gnorm:
            for k, v, grad_v in zip(self._param_names,
                                    self._exec_group.param_arrays,
                                    self._exec_group.grad_arrays):
                logging.debug("%s: v-norm: %g, g-norm: %g"
                              % (k, nd.norm(v[0]).asnumpy()[0],
                                 nd.norm(grad_v[0]).asnumpy()[0]))
        else:
            for k, v in zip(self._param_names, self._exec_group.param_arrays):
                logging.debug("%s: v-norm: %g" % (k, nd.norm(v[0]).asnumpy()[0]))

    def summary(self, level=2):
        """Summarizes the network parameters.

        Parameters
        ----------
        level : int, optional
            Level of the summarization logs to print.
            The log becomes more verbose with a higher summary level.

            - Level = 0
                Print the total param number + aux param number
            - Level = 1
                Print the shapes of all parameters + the total number of parameters
            - Level = 2
                Print the shapes of the data/states plus all the information in Level 1
        """
        self.logger.info("Summary of %s" % self._name)
        assert self.binded and self.params_initialized
        assert 0 <= level <= 2, \
            "Level must be between 0 and 2, level=%d is not supported" % level

        def _log_var(key, value, typ="param"):
            if typ == "param":
                if key in self._fixed_param_names:
                    self.logger.info("   %s: %s, %d, req = %s, fixed"
                                     % (key, str(value.shape), np.prod(value.shape),
                                        self._exec_group.grad_req[key]))
                else:
                    self.logger.info("   %s: %s, %d, req = %s"
                                     % (key, str(value.shape), np.prod(value.shape),
                                        self._exec_group.grad_req[key]))
            elif typ == "data" or typ == "aux":
                self.logger.info("   %s: %s, %d"
                                 % (key, str(value.shape), np.prod(value.shape)))

        total_param_num = 0
        total_fixed_param_num = 0
        total_aux_param_num = 0
        if level >= 2:
            if len(self.data_names) == 0:
                self.logger.info("Data: None")
            else:
                self.logger.info("Data:")
                for k, v in zip(self.data_names, self.data_shapes):
                    _log_var(k, v, typ="data")
            if len(self._state_names) == 0:
                self.logger.info("State: None")
            else:
                self.logger.info("State:")
                for k in self._state_names:
                    v = self._exec_group.execs[0].arg_dict[k]
                    _log_var(k, v, typ="data")
        if level >= 1:
            if len(self._param_names) == 0:
                self.logger.info("Param: None")
            else:
                self.logger.info("Params:")
                for k in self._param_names:
                    v = self._arg_params[k]
                    _log_var(k, v)
                    if k in self._fixed_param_names:
                        total_fixed_param_num += np.prod(v.shape)
                    else:
                        total_param_num += np.prod(v.shape)
            if len(self._aux_names) == 0:
                self.logger.info("Aux States: None")
            else:
                self.logger.info("Aux States:")
                for k in self._aux_names:
                    v = self._aux_params[k]
                    _log_var(k, v, typ="aux")
                    total_aux_param_num += np.prod(v.shape)
        else:
            for k in self._param_names:
                v = self._arg_params[k]
                total_param_num += np.prod(v.shape)
            for k in self._aux_names:
                v = self._aux_params[k]
                total_aux_param_num += np.prod(v.shape)
        self.logger.info("Total Param Num (excluding fixed ones): " + str(total_param_num))
        self.logger.info("Total Fixed Param Num: " + str(total_fixed_param_num))
        self.logger.info("Total Aux Param Num: " + str(total_aux_param_num))

    def get_output_dict(self):
        outputs = self.get_outputs()
        return OrderedDict([(k, v) for k, v in zip(self._output_names, outputs)])
    def clear_grad(self):
        assert self.binded and self.params_initialized and self.optimizer_initialized
        # Clear the gradient buffers in-place
        for grads in self._exec_group.grad_arrays:
            for grad in grads:
                grad[:] = 0

    def save_tmp_grad(self):
        # Copy the current gradients into a temporary buffer, allocating it on first use
        if self._tmp_grads is None:
            self._tmp_grads = []
            for grads in self._exec_group.grad_arrays:
                vec = []
                for grad in grads:
                    vec.append(grad.copyto(grad.context))
                self._tmp_grads.append(vec)
        else:
            for i, grads in enumerate(self._exec_group.grad_arrays):
                for j, grad in enumerate(grads):
                    self._tmp_grads[i][j][:] = grad

    def acc_grad_with_tmp(self):
        # Accumulate the saved temporary gradients into the current gradients
        assert self._tmp_grads is not None
        for i, grads in enumerate(self._exec_group.grad_arrays):
            for j, grad in enumerate(grads):
                grad += self._tmp_grads[i][j]

    def load_params_allow_missing(self, fname):
        """Loads model parameters from file, allowing parameters of this module
        to be missing from the file.

        Parameters
        ----------
        fname : str
            Path to input param file.

        Examples
        --------
        >>> # An example of loading module parameters.
        >>> mod.load_params_allow_missing('myfile')
        """
        logging.info("Load Param From %s" % fname)
        save_dict = mx.nd.load(fname)
        arg_params = {}
        aux_params = {}
        for k, value in save_dict.items():
            arg_type, name = k.split(':', 1)
            if arg_type == 'arg':
                if name in self._param_names:
                    logging.info("set %s" % name)
                    arg_params[name] = value
            elif arg_type == 'aux':
                if name in self._aux_names:
                    logging.info("set %s" % name)
                    aux_params[name] = value
            else:
                raise ValueError("Invalid param file " + fname)
        self.set_params(arg_params, aux_params, allow_missing=True)
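

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only, not part of the original module):
    # the toy symbol, shapes, optimizer settings, and names below are placeholder
    # assumptions chosen just to exercise summary(), clip_by_global_norm() and update().
    logging.basicConfig(level=logging.INFO)
    data = mx.sym.Variable('data')
    fc = mx.sym.FullyConnected(data=data, num_hidden=10, name='fc')
    out = mx.sym.SoftmaxOutput(data=fc, name='softmax')
    net = MyModule(out, data_names=('data',), label_names=('softmax_label',),
                   context=mx.cpu(), name='demo_net')
    net.bind(data_shapes=[('data', (32, 100))],
             label_shapes=[('softmax_label', (32,))])
    net.init_params(initializer=mx.init.Xavier())
    net.init_optimizer(optimizer='sgd', optimizer_params={'learning_rate': 0.1})
    net.summary(level=2)
    batch = mx.io.DataBatch(data=[mx.nd.ones((32, 100))],
                            label=[mx.nd.zeros((32,))])
    net.forward_backward(batch)
    # Clip the global gradient norm before applying the parameter update.
    norm_val = net.clip_by_global_norm(max_norm=1.0)
    net.update()
    print('gradient norm before clipping: %g' % norm_val)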