| | """Some standard gradient-based stochastic optimizers. |
| | |
| | These are just standard routines that don't make any use of autograd, |
| | though you could take gradients of these functions too if you want |
| | to do meta-optimization. |
| | |
| | These routines can optimize functions whose inputs are structured |
| | objects, such as dicts of numpy arrays.""" |
| |
|
| | import autograd.numpy as np |
| | from autograd.misc import flatten |
| | from autograd.wrap_util import wraps |


def unflatten_optimizer(optimize):
    """Takes an optimizer that operates on flat 1D numpy arrays and returns a
    wrapped version that handles trees of nested containers (lists/tuples/dicts)
    with arrays/scalars at the leaves."""

    @wraps(optimize)
    def _optimize(grad, x0, callback=None, *args, **kwargs):
        _x0, unflatten = flatten(x0)
        _grad = lambda x, i: flatten(grad(unflatten(x), i))[0]
        if callback:
            _callback = lambda x, i, g: callback(unflatten(x), i, unflatten(g))
        else:
            _callback = None
        return unflatten(optimize(_grad, _x0, _callback, *args, **kwargs))

    return _optimize
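

# A minimal sketch (illustrative only, not part of this module's API) of the
# flatten/unflatten round trip the decorator above relies on: a nested
# container is packed into one 1D array and can be rebuilt exactly.
def _flatten_roundtrip_example():
    params = {'weights': np.ones((2, 3)), 'bias': np.zeros(3)}
    flat, unflatten = flatten(params)    # flat is a 1D array with 9 entries
    rebuilt = unflatten(flat)            # same nested structure and shapes back
    return flat.shape, rebuilt['weights'].shape    # (9,), (2, 3)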


@unflatten_optimizer
def sgd(grad, x, callback=None, num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum.
    grad() must have signature grad(x, i), where i is the iteration number."""
    velocity = np.zeros(len(x))
    for i in range(num_iters):
        g = grad(x, i)
        if callback:
            callback(x, i, g)
        # Exponentially weighted momentum: keep a fraction `mass` of the old
        # velocity and mix in the (negated) new gradient.
        velocity = mass * velocity - (1.0 - mass) * g
        x = x + step_size * velocity
    return x
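

# A minimal usage sketch (hypothetical, not part of this module): minimize the
# quadratic f(x) = sum((x - 3)^2) with sgd. The iteration index i is unused
# here, but the optimizer always passes it, so grad must accept it.
def _sgd_quadratic_example():
    grad_f = lambda x, i: 2.0 * (x - 3.0)
    # Converges toward approximately [3., 3., 3.].
    return sgd(grad_f, np.zeros(3), num_iters=500, step_size=0.1)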


@unflatten_optimizer
def rmsprop(grad, x, callback=None, num_iters=100, step_size=0.1, gamma=0.9, eps=10**-8):
    """Root mean squared prop: scale each step by a running average of the
    squared gradient (Tieleman & Hinton, Coursera Neural Networks, Lecture 6.5)."""
    avg_sq_grad = np.ones(len(x))
    for i in range(num_iters):
        g = grad(x, i)
        if callback:
            callback(x, i, g)
        # Exponential moving average of the squared gradient.
        avg_sq_grad = avg_sq_grad * gamma + g**2 * (1 - gamma)
        x = x - step_size * g / (np.sqrt(avg_sq_grad) + eps)
    return x
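

# A minimal sketch (illustrative only) of a stochastic gradient: grad(x, i) can
# use the iteration index i to pick a different data point or minibatch each
# step. The names and the toy "dataset" below are made up for illustration.
def _rmsprop_stochastic_example():
    targets = np.array([1.0, 2.0, 3.0, 4.0])

    def grad_f(x, i):
        t = targets[i % len(targets)]   # cycle through the toy data
        return 2.0 * (x - t)            # gradient of (x - t)^2

    # x drifts toward the mean of the targets (2.5), up to step-size noise.
    return rmsprop(grad_f, np.zeros(1), num_iters=400, step_size=0.05)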


@unflatten_optimizer
def adam(grad, x, callback=None, num_iters=100, step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.
    It's basically RMSprop with momentum and some bias-correction terms."""
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = grad(x, i)
        if callback:
            callback(x, i, g)
        m = (1 - b1) * g + b1 * m          # first moment estimate
        v = (1 - b2) * (g**2) + b2 * v     # second moment estimate
        mhat = m / (1 - b1 ** (i + 1))     # bias correction
        vhat = v / (1 - b2 ** (i + 1))
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return x
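

# A minimal sketch (hypothetical names, not part of this module) showing the
# structured-input support: thanks to @unflatten_optimizer, adam can optimize a
# dict of arrays directly, and the callback sees the same nested structure.
def _adam_structured_example():
    def grad_f(params, i):
        # Gradient of 0.5*||w||^2 + 0.5*||b - 1||^2, keyed like the input dict.
        return {'w': params['w'], 'b': params['b'] - 1.0}

    def print_progress(params, i, g):
        if i % 100 == 0:
            print(i, params['w'], params['b'])

    init = {'w': np.ones(2), 'b': np.zeros(2)}
    # 'w' tends toward zeros and 'b' toward ones.
    return adam(grad_f, init, callback=print_progress, num_iters=300, step_size=0.05)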