# Extension of MXNet Module
import logging
import mxnet as mx
import numpy as np
import mxnet.ndarray as nd
from collections import OrderedDict
from mxnet.module import Module
def nd_global_norm(t_list):
"""Computes the global norm of multiple tensors.
Given a tuple or list of tensors t_list, this operation returns the global norm of the elements
in all tensors in t_list. The global norm is computed as:
``global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))``
Any entries in t_list that are None are ignored.
Parameters
----------
t_list: list or tuple
The NDArray list
Returns
-------
ret: NDArray
The global norm. The shape of the NDArray will be (1,)
Examples
--------
>>> x = mx.nd.ones((2, 3))
>>> y = mx.nd.ones((5, 6))
>>> z = mx.nd.ones((4, 2, 3))
>>> print(nd_global_norm([x, y, z]).asscalar())
7.74597
>>> xnone = None
>>> ret = nd_global_norm([x, y, z, xnone])
>>> print(ret.asscalar())
7.74597
"""
ret = None
for arr in t_list:
if arr is not None:
if ret is None:
ret = nd.square(nd.norm(arr))
else:
ret += nd.square(nd.norm(arr))
ret = nd.sqrt(ret)
return ret
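# Illustrative self-check (not part of the original module): nd_global_norm should agree
# with a plain NumPy evaluation of sqrt(sum(l2norm(t) ** 2 for t in t_list)).
def _check_nd_global_norm_against_numpy():
    x, y, z = nd.ones((2, 3)), nd.ones((5, 6)), nd.ones((4, 2, 3))
    mx_val = nd_global_norm([x, y, z]).asscalar()
    # Same quantity computed with NumPy: sqrt(6 + 30 + 24) ~= 7.74597
    np_val = np.sqrt(sum(np.sum(np.square(a.asnumpy())) for a in (x, y, z)))
    assert abs(mx_val - np_val) < 1e-4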
class MyModule(Module):
"""Some enhancement to the mx.mod.Module
"""
def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
logger=logging, context=mx.context.gpu(), work_load_list=None,
fixed_param_names=None, state_names=None, name=None):
self._name = name
super(MyModule, self).__init__(symbol=symbol,
data_names=data_names,
label_names=label_names,
logger=logger,
context=context,
work_load_list=work_load_list,
fixed_param_names=fixed_param_names,
state_names=state_names)
self._tmp_grads = None
def clip_by_global_norm(self, max_norm=1.0):
"""Clips gradient norm.
The norm is computed over all gradients together, as if they were
concatenated into a single vector. Gradients are modified in-place.
The method was first proposed in
`[ICML2013] On the difficulty of training recurrent neural networks`
Parameters
----------
max_norm : float or int
The maximum clipping threshold of the gradient norm.
Returns
-------
norm_val : float
The global norm of the gradients, computed before clipping.
Examples
--------
An example of using clip_by_global_norm to clip the gradients before updating the parameters::
>>> #Get the gradient via back-propagation
>>> net.forward_backward(data_batch=data_batch)
>>> norm_val = net.clip_by_global_norm(max_norm=1.0)
>>> net.update()
"""
assert self.binded and self.params_initialized and self.optimizer_initialized
norm_val = self.global_grad_norm()
if norm_val > max_norm:
ratio = max_norm / float(norm_val)
for grads in self._exec_group.grad_arrays:
for grad in grads:
grad *= ratio
return norm_val
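# A small numeric sketch (illustrative only) of the rescaling above: when the global norm
# exceeds max_norm, every gradient is multiplied by max_norm / norm, so the clipped
# gradients have a global norm of exactly max_norm.
# >>> g1, g2 = nd.ones((2, 3)) * 4, nd.ones((5,)) * 4
# >>> norm = nd_global_norm([g1, g2]).asscalar()            # sqrt(16*6 + 16*5) ~= 13.27
# >>> ratio = 1.0 / norm                                    # with max_norm = 1.0
# >>> nd_global_norm([g1 * ratio, g2 * ratio]).asscalar()   # ~= 1.0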
def global_grad_norm(self):
"""Calculate global gradient norm.
The L2 norm is computed over all gradients together, as if they were
concatenated into a single vector.
Could be used to debug the optimization process.
See http://videolectures.net/deeplearning2015_goodfellow_network_optimization/
Returns
-------
norm_val : float
The computed norm of the gradients.
Examples
--------
An example of using global_grad_norm to calculate the gradient norm after back-propagation::
>>> #Get the gradient via back-propagation
>>> net.forward_backward(data_batch=data_batch)
>>> norm_val = net.global_grad_norm()
>>> print(norm_val)
"""
assert self.binded and self.params_initialized and self.optimizer_initialized
# NOTE: averaging the per-executor norms below makes the estimate differ from the true global norm when multiple GPUs are used
norm_val = 0.0
for exe in self._exec_group.execs:
norm_val += nd_global_norm(exe.grad_arrays).asscalar()
norm_val /= float(len(self._exec_group.execs))
norm_val *= self._optimizer.rescale_grad
return norm_val
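# Note on the averaging above (illustrative): with several executors (one per GPU), the
# method returns the mean of the per-executor global norms, which is generally not equal
# to the global norm of all gradients concatenated, e.g.:
# >>> a, b = nd.ones((3,)) * 3, nd.ones((3,)) * 4          # gradients on two devices
# >>> (nd_global_norm([a]).asscalar() + nd_global_norm([b]).asscalar()) / 2   # ~= 6.06
# >>> nd_global_norm([a, b]).asscalar()                                       # ~= 8.66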
def debug_norm_all(self, debug_gnorm=True):
if debug_gnorm:
for k, v, grad_v in zip(self._param_names, self._exec_group.param_arrays,
self._exec_group.grad_arrays):
logging.debug("%s: v-norm: %g, g-norm: %g"
%(k,
nd.norm(v[0]).asnumpy()[0],
nd.norm(grad_v[0]).asnumpy()[0]))
else:
for k, v in zip(self._param_names, self._exec_group.param_arrays):
logging.debug("%s: v-norm: %g"
%(k,
nd.norm(v[0]).asnumpy()[0]))
def summary(self, level=2):
"""Summarize the network parameters.
Parameters
----------
level : int, optional
Level of the summarization logs to print.
The log becomes more verbose with higher summary level.
- Level = 0
Print the total param number + aux param number
- Level = 1
Print the shapes of all parameters + the total number of parameters
- Level = 2
Print the shapes of the data/state in addition to everything printed at Level 1
"""
self.logger.info("Summary of %s" %self._name)
assert self.binded and self.params_initialized
assert 0 <= level <= 2, \
"Level must be between 0 and 2, level=%d is not supported" % level
def _log_var(key, value, typ="param"):
if typ == "param":
if key in self._fixed_param_names:
self.logger.info(" %s: %s, %d, req = %s, fixed"
% (key,
str(value.shape),
np.prod(value.shape),
self._exec_group.grad_req[key]))
else:
self.logger.info(" %s: %s, %d, req = %s"
% (key,
str(value.shape),
np.prod(value.shape),
self._exec_group.grad_req[key]))
elif typ == "data" or typ == "aux":
self.logger.info(" %s: %s, %d"
% (key,
str(value.shape),
np.prod(value.shape)))
total_param_num = 0
total_fixed_param_num = 0
total_aux_param_num = 0
if level >= 2:
if len(self.data_names) == 0:
self.logger.info("Data: None")
else:
self.logger.info("Data:")
for k, v in zip(self.data_names, self.data_shapes):
_log_var(k, v, typ="data")
if len(self._state_names) == 0:
self.logger.info("State: None")
else:
self.logger.info("State:")
for k in self._state_names:
v = self._exec_group.execs[0].arg_dict[k]
_log_var(k, v, typ="data")
if level >= 1:
if len(self._param_names) == 0:
self.logger.info("Param: None")
else:
self.logger.info("Params:")
for k in self._param_names:
v = self._arg_params[k]
_log_var(k, v)
if k in self._fixed_param_names:
total_fixed_param_num += np.prod(v.shape)
else:
total_param_num += np.prod(v.shape)
if len(self._aux_names) == 0:
self.logger.info("Aux States: None")
else:
self.logger.info("Aux States: ")
for k in self._aux_names:
v = self._aux_params[k]
_log_var(k, v, typ="aux")
total_aux_param_num += np.prod(v.shape)
else:
for k in self._param_names:
v = self._arg_params[k]
total_param_num += np.prod(v.shape)
for k in self._aux_names:
v = self._aux_params[k]
total_aux_param_num += np.prod(v.shape)
self.logger.info("Total Param Num (exclude fixed ones): " + str(total_param_num))
self.logger.info("Total Fixed Param Num: " + str(total_fixed_param_num))
self.logger.info("Total Aux Param Num: " + str(total_aux_param_num))
def get_output_dict(self):
outputs = self.get_outputs()
return OrderedDict([(k, v) for k, v in zip(self._output_names, outputs)])
def clear_grad(self):
assert self.binded and self.params_initialized and self.optimizer_initialized
# clear the gradient
for grads in self._exec_group.grad_arrays:
for grad in grads:
grad[:] = 0
def save_tmp_grad(self):
if self._tmp_grads is None:
self._tmp_grads = []
for grads in self._exec_group.grad_arrays:
vec = []
for grad in grads:
vec.append(grad.copyto(grad.context))
self._tmp_grads.append(vec)
else:
for i, grads in enumerate(self._exec_group.grad_arrays):
for j, grad in enumerate(grads):
self._tmp_grads[i][j][:] = grad
def acc_grad_with_tmp(self):
assert self._tmp_grads is not None
for i, grads in enumerate(self._exec_group.grad_arrays):
for j, grad in enumerate(grads):
grad += self._tmp_grads[i][j]
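# A sketch of how these helpers can be combined for gradient accumulation over two
# batches before a single update (`net`, `batch_a` and `batch_b` are placeholder names):
# >>> net.forward_backward(data_batch=batch_a)
# >>> net.save_tmp_grad()                          # stash the gradients of the first batch
# >>> net.forward_backward(data_batch=batch_b)     # overwrites the gradient buffers
# >>> net.acc_grad_with_tmp()                      # add the stashed gradients back in
# >>> net.update()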
def load_params_allow_missing(self, fname):
"""Loads model parameters from file.
Parameters
----------
fname : str
Path to input param file.
Examples
--------
>>> # An example of loading module parameters.
>>> mod.load_params_allow_missing('myfile')
"""
logging.info("Load Param From %s" %fname)
save_dict = mx.nd.load(fname)
arg_params = {}
aux_params = {}
for k, value in save_dict.items():
arg_type, name = k.split(':', 1)
if arg_type == 'arg':
if name in self._param_names:
logging.info("set %s" %name)
arg_params[name] = value
elif arg_type == 'aux':
if name in self._aux_names:
logging.info("set %s" % name)
aux_params[name] = value
else:
raise ValueError("Invalid param file " + fname)
self.set_params(arg_params, aux_params, allow_missing=True)
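# For reference (illustrative only): parameter files written by Module.save_params store
# each array under an 'arg:' or 'aux:' prefix, which is what the k.split(':', 1) above
# relies on. The file name below is just an example path.
def _demo_param_file_key_convention(fname='/tmp/demo_params.nd'):
    # Write a tiny parameter file using the standard naming convention, then read it back.
    mx.nd.save(fname, {'arg:fc_weight': nd.zeros((4, 4)),
                       'aux:bn_moving_mean': nd.zeros((4,))})
    return sorted(mx.nd.load(fname).keys())   # ['arg:fc_weight', 'aux:bn_moving_mean']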