# optimization/backend/src/optimization_logic.py
# Author: Joel Woodfield
# Commit b883adb: "Fix bug in nesterov optimizer"
import numpy as np
from sympy import lambdify, Expr
def _gradient_values(fx, fy, x: float, y: float) -> list:
return [float(fx(x, y)), float(fy(x, y))]
def _hessian_values(fxx, fxy, fyy, x: float, y: float) -> list:
return [
[float(fxx(x, y)), float(fxy(x, y))],
[float(fxy(x, y)), float(fyy(x, y))],
]
def gd_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """
    Run heavy-ball gradient descent on a univariate function.

    Assumes function is valid and in terms of x. The momentum term is
    the previous position change scaled by `momentum` (zero on the
    first step). Returns the full trajectory: positions, function
    values, and first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    for _ in range(steps - 1):
        # Momentum uses the last recorded displacement; none exists yet
        # on the first iteration.
        velocity = momentum * (xs[-1] - xs[-2]) if len(xs) > 1 else 0
        x = x - learning_rate * df(x) + velocity
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def gd_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """
    Perform heavy-ball gradient descent on a bivariate function.

    Assumes function is valid and in terms of x and y. The momentum
    term is the previous position change scaled by `momentum` (zero on
    the first step). Returns the full trajectory: positions, function
    values, gradients, and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])
    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]
    x = x0
    y = y0
    for i in range(steps - 1):
        if i == 0:
            mx = 0
            my = 0
        else:
            mx = momentum * (x_values[-1] - x_values[-2])
            my = momentum * (y_values[-1] - y_values[-2])
        # Evaluate BOTH partial derivatives at the current point before
        # moving. Updating x first and then calling fy(x, y) would make
        # the y-step use a gradient at a mixed (new-x, old-y) point —
        # the same sequential-update bug already fixed in
        # nesterov_bivariate.
        gx = fx(x, y)
        gy = fy(x, y)
        x = x - learning_rate * gx + mx
        y = y - learning_rate * gy + my
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))
    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }
def nesterov_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """
    Run Nesterov accelerated gradient descent on a univariate function.

    Assumes function is valid and in terms of x. The derivative is
    evaluated at the momentum look-ahead point rather than at the
    current iterate. Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    for _ in range(steps - 1):
        # Momentum is the last recorded displacement; zero on the first step.
        velocity = momentum * (xs[-1] - xs[-2]) if len(xs) > 1 else 0
        ahead = x + velocity
        x = ahead - learning_rate * df(ahead)
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def nesterov_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """
    Run Nesterov accelerated gradient descent on a bivariate function.

    Assumes function is valid and in terms of x and y. Both partial
    derivatives are evaluated at the momentum look-ahead point rather
    than at the current iterate. Returns positions, function values,
    gradients, and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    for _ in range(steps - 1):
        # Momentum is the last recorded displacement; zero on the first step.
        if len(xs) > 1:
            vx = momentum * (xs[-1] - xs[-2])
            vy = momentum * (ys[-1] - ys[-2])
        else:
            vx = 0
            vy = 0
        ax = x + vx
        ay = y + vy
        # Both partials are taken at the same (unchanged) look-ahead pair.
        x = ax - learning_rate * fx(ax, ay)
        y = ay - learning_rate * fy(ax, ay)
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def newton_univariate(
    function: Expr,
    x0: float,
    steps: int,
) -> dict:
    """
    Perform Newton's method on a univariate function.

    Assumes function is valid and in terms of x. Stops early if the
    second derivative vanishes (the Newton step x - f'/f'' is then
    undefined), mirroring the singular-Hessian handling in
    newton_bivariate. Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    f_prime = lambdify('x', function.diff('x'), modules=['numpy'])
    f_prime_prime = lambdify('x', function.diff('x', 2), modules=['numpy'])
    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]
    x = x0
    for i in range(steps - 1):
        curvature = f_prime_prime(x)
        if curvature == 0:
            # zero second derivative - Newton step undefined, cannot proceed
            break
        x = x - f_prime(x) / curvature
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))
    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }
def newton_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    steps: int,
) -> dict:
    """
    Perform Newton's method on a bivariate function.

    Assumes function is valid and in terms of x and y. Each step solves
    H * delta = grad and moves by -delta; iteration stops early if the
    Hessian is singular. Returns positions, function values, gradients,
    and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    for _ in range(steps - 1):
        hess = np.array(_hessian_values(fxx, fxy, fyy, x, y))
        grad = np.array(_gradient_values(fx, fy, x, y))
        try:
            # Solve H * delta = grad instead of inverting H explicitly.
            delta = np.linalg.solve(hess, grad)
        except np.linalg.LinAlgError:
            # singular hessian - cannot proceed
            break
        x = x - delta[0]
        y = y - delta[1]
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def adagrad_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run AdaGrad on a univariate function.

    Assumes function is valid and in terms of x. The effective step
    size shrinks as squared derivatives accumulate; epsilon keeps the
    denominator away from zero. Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    accumulated = 0  # running sum of squared derivatives
    for _ in range(steps - 1):
        slope = df(x)
        accumulated += slope ** 2
        x = x - (learning_rate / (np.sqrt(accumulated + epsilon))) * slope
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def adagrad_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run AdaGrad on a bivariate function.

    Assumes function is valid and in terms of x and y. Each coordinate
    keeps its own accumulator of squared partial derivatives, so steps
    shrink independently per axis. Returns positions, function values,
    gradients, and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    acc_x = 0  # accumulated squared x-derivatives
    acc_y = 0  # accumulated squared y-derivatives
    for _ in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        acc_x += gx ** 2
        acc_y += gy ** 2
        x = x - (learning_rate / (np.sqrt(acc_x + epsilon))) * gx
        y = y - (learning_rate / (np.sqrt(acc_y + epsilon))) * gy
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def rmsprop_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run RMSProp on a univariate function.

    Assumes function is valid and in terms of x. Unlike AdaGrad, the
    squared-derivative history decays exponentially (rate beta), so the
    step size can recover. Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    sq_avg = 0  # exponentially weighted average of squared derivatives
    for _ in range(steps - 1):
        slope = df(x)
        sq_avg = beta * sq_avg + (1 - beta) * slope ** 2
        x = x - (learning_rate / (np.sqrt(sq_avg + epsilon))) * slope
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def rmsprop_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run RMSProp on a bivariate function.

    Assumes function is valid and in terms of x and y. Each coordinate
    keeps its own exponentially decaying average (rate beta) of squared
    partial derivatives. Returns positions, function values, gradients,
    and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    sq_avg_x = 0  # EWA of squared x-derivatives
    sq_avg_y = 0  # EWA of squared y-derivatives
    for _ in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        sq_avg_x = beta * sq_avg_x + (1 - beta) * gx ** 2
        sq_avg_y = beta * sq_avg_y + (1 - beta) * gy ** 2
        x = x - (learning_rate / (np.sqrt(sq_avg_x + epsilon))) * gx
        y = y - (learning_rate / (np.sqrt(sq_avg_y + epsilon))) * gy
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def adadelta_univariate(
    function: Expr,
    x0: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run AdaDelta on a univariate function.

    Assumes function is valid and in terms of x. No learning rate is
    needed: the step size is the ratio of the running RMS of past
    updates to the running RMS of past derivatives. Returns positions,
    function values, and first/second derivatives at every visited
    point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    grad_avg = 0    # EWA of squared derivatives
    update_avg = 0  # EWA of squared updates
    for _ in range(steps - 1):
        slope = df(x)
        grad_avg = beta * grad_avg + (1 - beta) * slope ** 2
        step = - (np.sqrt(update_avg + epsilon) / np.sqrt(grad_avg + epsilon)) * slope
        x = x + step
        # The update average is refreshed AFTER taking the step, using
        # the step just made.
        update_avg = beta * update_avg + (1 - beta) * step ** 2
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def adadelta_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run AdaDelta on a bivariate function.

    Assumes function is valid and in terms of x and y. Each coordinate
    keeps its own decaying averages of squared partial derivatives and
    squared updates; no learning rate is needed. Returns positions,
    function values, gradients, and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    grad_avg_x = 0    # EWA of squared x-derivatives
    grad_avg_y = 0    # EWA of squared y-derivatives
    update_avg_x = 0  # EWA of squared x-updates
    update_avg_y = 0  # EWA of squared y-updates
    for _ in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        grad_avg_x = beta * grad_avg_x + (1 - beta) * gx ** 2
        grad_avg_y = beta * grad_avg_y + (1 - beta) * gy ** 2
        step_x = - (np.sqrt(update_avg_x + epsilon) / np.sqrt(grad_avg_x + epsilon)) * gx
        step_y = - (np.sqrt(update_avg_y + epsilon) / np.sqrt(grad_avg_y + epsilon)) * gy
        x = x + step_x
        y = y + step_y
        # Update averages are refreshed AFTER stepping, using the steps
        # just made.
        update_avg_x = beta * update_avg_x + (1 - beta) * step_x ** 2
        update_avg_y = beta * update_avg_y + (1 - beta) * step_y ** 2
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def adam_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    beta1: float,
    beta2: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run the Adam optimizer on a univariate function.

    Assumes function is valid and in terms of x. Keeps bias-corrected
    exponential moving averages of the derivative (first moment) and
    its square (second moment). Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    first_moment = 0
    second_moment = 0
    # t is the 1-based Adam timestep used for bias correction.
    for t in range(1, steps):
        slope = df(x)
        first_moment = beta1 * first_moment + (1 - beta1) * slope
        second_moment = beta2 * second_moment + (1 - beta2) * slope ** 2
        # Bias correction counteracts the zero initialisation of the moments.
        m_hat = first_moment / (1 - beta1 ** t)
        v_hat = second_moment / (1 - beta2 ** t)
        x = x - (learning_rate / (np.sqrt(v_hat) + epsilon)) * m_hat
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def adam_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    beta1: float,
    beta2: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run the Adam optimizer on a bivariate function.

    Assumes function is valid and in terms of x and y. Each coordinate
    keeps its own bias-corrected first and second moment estimates.
    Returns positions, function values, gradients, and Hessians at
    every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    m1_x = 0  # first moment, x
    m1_y = 0  # first moment, y
    m2_x = 0  # second moment, x
    m2_y = 0  # second moment, y
    # t is the 1-based Adam timestep used for bias correction.
    for t in range(1, steps):
        gx = fx(x, y)
        gy = fy(x, y)
        m1_x = beta1 * m1_x + (1 - beta1) * gx
        m1_y = beta1 * m1_y + (1 - beta1) * gy
        m2_x = beta2 * m2_x + (1 - beta2) * gx ** 2
        m2_y = beta2 * m2_y + (1 - beta2) * gy ** 2
        # Bias correction counteracts the zero initialisation of the moments.
        m1_x_hat = m1_x / (1 - beta1 ** t)
        m1_y_hat = m1_y / (1 - beta1 ** t)
        m2_x_hat = m2_x / (1 - beta2 ** t)
        m2_y_hat = m2_y / (1 - beta2 ** t)
        x = x - (learning_rate / (np.sqrt(m2_x_hat) + epsilon)) * m1_x_hat
        y = y - (learning_rate / (np.sqrt(m2_y_hat) + epsilon)) * m1_y_hat
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }