# optimization/backend/src/optimization_logic.py
# Author: Joel Woodfield
# Commit b883adb: "Fix bug in nesterov optimizer"
import numpy as np
from sympy import lambdify, Expr
def _gradient_values(fx, fy, x: float, y: float) -> list:
return [float(fx(x, y)), float(fy(x, y))]
def _hessian_values(fxx, fxy, fyy, x: float, y: float) -> list:
return [
[float(fxx(x, y)), float(fxy(x, y))],
[float(fxy(x, y)), float(fyy(x, y))],
]
def gd_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """
    Run heavy-ball gradient descent on a univariate function.

    Assumes function is valid and in terms of x. The momentum term is
    the previous position change scaled by `momentum` (zero on the
    first step). Returns the full trajectory: positions, function
    values, and first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    for _ in range(steps - 1):
        # Momentum uses the last recorded displacement; none exists yet
        # on the first iteration.
        velocity = momentum * (xs[-1] - xs[-2]) if len(xs) > 1 else 0
        x = x - learning_rate * df(x) + velocity
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def gd_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """
    Perform heavy-ball gradient descent on a bivariate function.

    Assumes function is valid and in terms of x and y. The momentum
    term is the previous position change scaled by `momentum` (zero on
    the first step). Returns the full trajectory: positions, function
    values, gradients, and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])
    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]
    x = x0
    y = y0
    for i in range(steps - 1):
        if i == 0:
            mx = 0
            my = 0
        else:
            mx = momentum * (x_values[-1] - x_values[-2])
            my = momentum * (y_values[-1] - y_values[-2])
        # Evaluate BOTH partial derivatives at the current point before
        # moving. Updating x first and then calling fy(x, y) would make
        # the y-step use a gradient at a mixed (new-x, old-y) point —
        # the same sequential-update bug already fixed in
        # nesterov_bivariate.
        gx = fx(x, y)
        gy = fy(x, y)
        x = x - learning_rate * gx + mx
        y = y - learning_rate * gy + my
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))
    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }
def nesterov_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """
    Run Nesterov accelerated gradient descent on a univariate function.

    Assumes function is valid and in terms of x. The derivative is
    evaluated at the momentum look-ahead point rather than at the
    current iterate. Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    for _ in range(steps - 1):
        # Momentum is the last recorded displacement; zero on the first step.
        velocity = momentum * (xs[-1] - xs[-2]) if len(xs) > 1 else 0
        ahead = x + velocity
        x = ahead - learning_rate * df(ahead)
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def nesterov_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """
    Run Nesterov accelerated gradient descent on a bivariate function.

    Assumes function is valid and in terms of x and y. Both partial
    derivatives are evaluated at the momentum look-ahead point rather
    than at the current iterate. Returns positions, function values,
    gradients, and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    for _ in range(steps - 1):
        # Momentum is the last recorded displacement; zero on the first step.
        if len(xs) > 1:
            vx = momentum * (xs[-1] - xs[-2])
            vy = momentum * (ys[-1] - ys[-2])
        else:
            vx = 0
            vy = 0
        ax = x + vx
        ay = y + vy
        # Both partials are taken at the same (unchanged) look-ahead pair.
        x = ax - learning_rate * fx(ax, ay)
        y = ay - learning_rate * fy(ax, ay)
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def newton_univariate(
    function: Expr,
    x0: float,
    steps: int,
) -> dict:
    """
    Perform Newton's method on a univariate function.

    Assumes function is valid and in terms of x. Stops early if the
    second derivative vanishes (the Newton step x - f'/f'' is then
    undefined), mirroring the singular-Hessian handling in
    newton_bivariate. Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    f_prime = lambdify('x', function.diff('x'), modules=['numpy'])
    f_prime_prime = lambdify('x', function.diff('x', 2), modules=['numpy'])
    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]
    x = x0
    for i in range(steps - 1):
        curvature = f_prime_prime(x)
        if curvature == 0:
            # zero second derivative - Newton step undefined, cannot proceed
            break
        x = x - f_prime(x) / curvature
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))
    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }
def newton_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    steps: int,
) -> dict:
    """
    Perform Newton's method on a bivariate function.

    Assumes function is valid and in terms of x and y. Each step solves
    H * delta = grad and moves by -delta; iteration stops early if the
    Hessian is singular. Returns positions, function values, gradients,
    and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    for _ in range(steps - 1):
        hess = np.array(_hessian_values(fxx, fxy, fyy, x, y))
        grad = np.array(_gradient_values(fx, fy, x, y))
        try:
            # Solve H * delta = grad instead of inverting H explicitly.
            delta = np.linalg.solve(hess, grad)
        except np.linalg.LinAlgError:
            # singular hessian - cannot proceed
            break
        x = x - delta[0]
        y = y - delta[1]
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def adagrad_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run AdaGrad on a univariate function.

    Assumes function is valid and in terms of x. The effective step
    size shrinks as squared derivatives accumulate; epsilon keeps the
    denominator away from zero. Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    accumulated = 0  # running sum of squared derivatives
    for _ in range(steps - 1):
        slope = df(x)
        accumulated += slope ** 2
        x = x - (learning_rate / (np.sqrt(accumulated + epsilon))) * slope
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def adagrad_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run AdaGrad on a bivariate function.

    Assumes function is valid and in terms of x and y. Each coordinate
    keeps its own accumulator of squared partial derivatives, so steps
    shrink independently per axis. Returns positions, function values,
    gradients, and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    acc_x = 0  # accumulated squared x-derivatives
    acc_y = 0  # accumulated squared y-derivatives
    for _ in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        acc_x += gx ** 2
        acc_y += gy ** 2
        x = x - (learning_rate / (np.sqrt(acc_x + epsilon))) * gx
        y = y - (learning_rate / (np.sqrt(acc_y + epsilon))) * gy
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def rmsprop_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run RMSProp on a univariate function.

    Assumes function is valid and in terms of x. Unlike AdaGrad, the
    squared-derivative history decays exponentially (rate beta), so the
    step size can recover. Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    sq_avg = 0  # exponentially weighted average of squared derivatives
    for _ in range(steps - 1):
        slope = df(x)
        sq_avg = beta * sq_avg + (1 - beta) * slope ** 2
        x = x - (learning_rate / (np.sqrt(sq_avg + epsilon))) * slope
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def rmsprop_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run RMSProp on a bivariate function.

    Assumes function is valid and in terms of x and y. Each coordinate
    keeps its own exponentially decaying average (rate beta) of squared
    partial derivatives. Returns positions, function values, gradients,
    and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    sq_avg_x = 0  # EWA of squared x-derivatives
    sq_avg_y = 0  # EWA of squared y-derivatives
    for _ in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        sq_avg_x = beta * sq_avg_x + (1 - beta) * gx ** 2
        sq_avg_y = beta * sq_avg_y + (1 - beta) * gy ** 2
        x = x - (learning_rate / (np.sqrt(sq_avg_x + epsilon))) * gx
        y = y - (learning_rate / (np.sqrt(sq_avg_y + epsilon))) * gy
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def adadelta_univariate(
    function: Expr,
    x0: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run AdaDelta on a univariate function.

    Assumes function is valid and in terms of x. No learning rate is
    needed: the step size is the ratio of the running RMS of past
    updates to the running RMS of past derivatives. Returns positions,
    function values, and first/second derivatives at every visited
    point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    grad_avg = 0    # EWA of squared derivatives
    update_avg = 0  # EWA of squared updates
    for _ in range(steps - 1):
        slope = df(x)
        grad_avg = beta * grad_avg + (1 - beta) * slope ** 2
        step = - (np.sqrt(update_avg + epsilon) / np.sqrt(grad_avg + epsilon)) * slope
        x = x + step
        # The update average is refreshed AFTER taking the step, using
        # the step just made.
        update_avg = beta * update_avg + (1 - beta) * step ** 2
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def adadelta_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run AdaDelta on a bivariate function.

    Assumes function is valid and in terms of x and y. Each coordinate
    keeps its own decaying averages of squared partial derivatives and
    squared updates; no learning rate is needed. Returns positions,
    function values, gradients, and Hessians at every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    grad_avg_x = 0    # EWA of squared x-derivatives
    grad_avg_y = 0    # EWA of squared y-derivatives
    update_avg_x = 0  # EWA of squared x-updates
    update_avg_y = 0  # EWA of squared y-updates
    for _ in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        grad_avg_x = beta * grad_avg_x + (1 - beta) * gx ** 2
        grad_avg_y = beta * grad_avg_y + (1 - beta) * gy ** 2
        step_x = - (np.sqrt(update_avg_x + epsilon) / np.sqrt(grad_avg_x + epsilon)) * gx
        step_y = - (np.sqrt(update_avg_y + epsilon) / np.sqrt(grad_avg_y + epsilon)) * gy
        x = x + step_x
        y = y + step_y
        # Update averages are refreshed AFTER stepping, using the steps
        # just made.
        update_avg_x = beta * update_avg_x + (1 - beta) * step_x ** 2
        update_avg_y = beta * update_avg_y + (1 - beta) * step_y ** 2
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }
def adam_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    beta1: float,
    beta2: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run the Adam optimizer on a univariate function.

    Assumes function is valid and in terms of x. Keeps bias-corrected
    exponential moving averages of the derivative (first moment) and
    its square (second moment). Returns positions, function values, and
    first/second derivatives at every visited point.
    """
    f = lambdify('x', function, modules=['numpy'])
    df = lambdify('x', function.diff('x'), modules=['numpy'])
    d2f = lambdify('x', function.diff('x', 2), modules=['numpy'])

    xs = [x0]
    ys = [f(x0)]
    dys = [df(x0)]
    d2ys = [d2f(x0)]

    x = x0
    first_moment = 0
    second_moment = 0
    # t is the 1-based Adam timestep used for bias correction.
    for t in range(1, steps):
        slope = df(x)
        first_moment = beta1 * first_moment + (1 - beta1) * slope
        second_moment = beta2 * second_moment + (1 - beta2) * slope ** 2
        # Bias correction counteracts the zero initialisation of the moments.
        m_hat = first_moment / (1 - beta1 ** t)
        v_hat = second_moment / (1 - beta2 ** t)
        x = x - (learning_rate / (np.sqrt(v_hat) + epsilon)) * m_hat
        xs.append(x)
        ys.append(f(x))
        dys.append(df(x))
        d2ys.append(d2f(x))

    return {
        "x": xs,
        "y": ys,
        "derivative": dys,
        "secondDerivative": d2ys,
    }
def adam_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    beta1: float,
    beta2: float,
    epsilon: float,
    steps: int,
) -> dict:
    """
    Run the Adam optimizer on a bivariate function.

    Assumes function is valid and in terms of x and y. Each coordinate
    keeps its own bias-corrected first and second moment estimates.
    Returns positions, function values, gradients, and Hessians at
    every visited point.
    """
    f = lambdify(('x', 'y'), function, modules=['numpy'])
    fx = lambdify(('x', 'y'), function.diff('x'), modules=['numpy'])
    fy = lambdify(('x', 'y'), function.diff('y'), modules=['numpy'])
    fxx = lambdify(('x', 'y'), function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(('x', 'y'), function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(('x', 'y'), function.diff('x', 'y'), modules=['numpy'])

    xs, ys = [x0], [y0]
    zs = [f(x0, y0)]
    grads = [_gradient_values(fx, fy, x0, y0)]
    hessians = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x, y = x0, y0
    m1_x = 0  # first moment, x
    m1_y = 0  # first moment, y
    m2_x = 0  # second moment, x
    m2_y = 0  # second moment, y
    # t is the 1-based Adam timestep used for bias correction.
    for t in range(1, steps):
        gx = fx(x, y)
        gy = fy(x, y)
        m1_x = beta1 * m1_x + (1 - beta1) * gx
        m1_y = beta1 * m1_y + (1 - beta1) * gy
        m2_x = beta2 * m2_x + (1 - beta2) * gx ** 2
        m2_y = beta2 * m2_y + (1 - beta2) * gy ** 2
        # Bias correction counteracts the zero initialisation of the moments.
        m1_x_hat = m1_x / (1 - beta1 ** t)
        m1_y_hat = m1_y / (1 - beta1 ** t)
        m2_x_hat = m2_x / (1 - beta2 ** t)
        m2_y_hat = m2_y / (1 - beta2 ** t)
        x = x - (learning_rate / (np.sqrt(m2_x_hat) + epsilon)) * m1_x_hat
        y = y - (learning_rate / (np.sqrt(m2_y_hat) + epsilon)) * m1_y_hat
        xs.append(x)
        ys.append(y)
        zs.append(f(x, y))
        grads.append(_gradient_values(fx, fy, x, y))
        hessians.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": xs,
        "y": ys,
        "z": zs,
        "gradient": grads,
        "hessian": hessians,
    }