import numpy as np
from sympy import Expr, lambdify


def _gradient_values(fx, fy, x: float, y: float) -> list:
    """Evaluate the gradient [df/dx, df/dy] at (x, y) as plain floats."""
    return [float(fx(x, y)), float(fy(x, y))]


def _hessian_values(fxx, fxy, fyy, x: float, y: float) -> list:
    """Evaluate the (symmetric) 2x2 Hessian at (x, y) as nested lists of floats."""
    return [
        [float(fxx(x, y)), float(fxy(x, y))],
        [float(fxy(x, y)), float(fyy(x, y))],
    ]


def _univariate_callables(function: Expr):
    """Lambdify a univariate expression in x.

    Returns (f, f', f'') as numpy-backed callables.
    """
    f = lambdify('x', function, modules=['numpy'])
    f_prime = lambdify('x', function.diff('x'), modules=['numpy'])
    f_prime_prime = lambdify('x', function.diff('x', 2), modules=['numpy'])
    return f, f_prime, f_prime_prime


def _bivariate_callables(function: Expr):
    """Lambdify a bivariate expression in x and y.

    Returns (f, fx, fy, fxx, fyy, fxy) as numpy-backed callables.
    """
    args = ('x', 'y')
    f = lambdify(args, function, modules=['numpy'])
    fx = lambdify(args, function.diff('x'), modules=['numpy'])
    fy = lambdify(args, function.diff('y'), modules=['numpy'])
    fxx = lambdify(args, function.diff('x', 2), modules=['numpy'])
    fyy = lambdify(args, function.diff('y', 2), modules=['numpy'])
    fxy = lambdify(args, function.diff('x', 'y'), modules=['numpy'])
    return f, fx, fy, fxx, fyy, fxy


def gd_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """Perform gradient descent with heavy-ball momentum on a univariate function.

    Assumes `function` is a valid sympy expression in terms of x.
    The momentum term is `momentum * (x_t - x_{t-1})` (zero on the first step).

    Returns a dict with keys "x", "y", "derivative", "secondDerivative",
    each a list of length `steps` tracing the iterates (including x0).
    """
    f, f_prime, f_prime_prime = _univariate_callables(function)

    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]

    x = x0
    for i in range(steps - 1):
        # No previous displacement on the first iteration.
        if i == 0:
            m = 0
        else:
            m = momentum * (x_values[-1] - x_values[-2])
        x = x - learning_rate * f_prime(x) + m
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))

    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }


def gd_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """Perform gradient descent with heavy-ball momentum on a bivariate function.

    Assumes `function` is a valid sympy expression in terms of x and y.

    Returns a dict with keys "x", "y", "z", "gradient", "hessian",
    each a list of length `steps` tracing the iterates (including the start).
    """
    f, fx, fy, fxx, fyy, fxy = _bivariate_callables(function)

    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x = x0
    y = y0
    for i in range(steps - 1):
        if i == 0:
            mx = 0
            my = 0
        else:
            mx = momentum * (x_values[-1] - x_values[-2])
            my = momentum * (y_values[-1] - y_values[-2])
        # Evaluate both partials at the current point BEFORE moving, so x and
        # y are updated simultaneously (previously fy saw the already-updated x).
        gx = fx(x, y)
        gy = fy(x, y)
        x = x - learning_rate * gx + mx
        y = y - learning_rate * gy + my
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }


def nesterov_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """Perform Nesterov accelerated gradient descent on a univariate function.

    The gradient is evaluated at the momentum look-ahead point `x + m`
    rather than at the current iterate.

    Returns a dict with keys "x", "y", "derivative", "secondDerivative".
    """
    f, f_prime, f_prime_prime = _univariate_callables(function)

    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]

    x = x0
    for i in range(steps - 1):
        if i == 0:
            m = 0
        else:
            m = momentum * (x_values[-1] - x_values[-2])
        x_lookahead = x + m
        x = x_lookahead - learning_rate * f_prime(x_lookahead)
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))

    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }


def nesterov_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    momentum: float,
    steps: int,
) -> dict:
    """Perform Nesterov accelerated gradient descent on a bivariate function.

    Both partial derivatives are evaluated at the look-ahead point
    (x + mx, y + my) before either coordinate is updated.

    Returns a dict with keys "x", "y", "z", "gradient", "hessian".
    """
    f, fx, fy, fxx, fyy, fxy = _bivariate_callables(function)

    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x = x0
    y = y0
    for i in range(steps - 1):
        if i == 0:
            mx = 0
            my = 0
        else:
            mx = momentum * (x_values[-1] - x_values[-2])
            my = momentum * (y_values[-1] - y_values[-2])
        x_lookahead = x + mx
        y_lookahead = y + my
        x = x_lookahead - learning_rate * fx(x_lookahead, y_lookahead)
        y = y_lookahead - learning_rate * fy(x_lookahead, y_lookahead)
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }


def newton_univariate(
    function: Expr,
    x0: float,
    steps: int,
) -> dict:
    """Perform Newton's method on a univariate function.

    Update rule: x <- x - f'(x) / f''(x). Stops early if the second
    derivative vanishes (step undefined), mirroring the singular-Hessian
    handling in `newton_bivariate`.

    Returns a dict with keys "x", "y", "derivative", "secondDerivative".
    """
    f, f_prime, f_prime_prime = _univariate_callables(function)

    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]

    x = x0
    for i in range(steps - 1):
        curvature = f_prime_prime(x)
        if curvature == 0:
            # Zero second derivative - Newton step undefined, cannot proceed.
            break
        x = x - f_prime(x) / curvature
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))

    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }


def newton_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    steps: int,
) -> dict:
    """Perform Newton's method on a bivariate function.

    Solves H * delta = grad each step instead of explicitly inverting the
    Hessian; stops early if the Hessian is singular.

    Returns a dict with keys "x", "y", "z", "gradient", "hessian".
    """
    f, fx, fy, fxx, fyy, fxy = _bivariate_callables(function)

    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x = x0
    y = y0
    for i in range(steps - 1):
        hessian = np.array(
            [
                [fxx(x, y), fxy(x, y)],
                [fxy(x, y), fyy(x, y)],
            ],
        )
        grad = np.array([fx(x, y), fy(x, y)])
        try:
            # delta = hessian^-1 * grad
            delta = np.linalg.solve(hessian, grad)
        except np.linalg.LinAlgError:
            # singular hessian - cannot proceed
            break
        x = x - delta[0]
        y = y - delta[1]
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }


def adagrad_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    epsilon: float,
    steps: int,
) -> dict:
    """Perform AdaGrad on a univariate function.

    The step size is scaled by the accumulated sum of squared gradients;
    `epsilon` guards against division by zero.

    Returns a dict with keys "x", "y", "derivative", "secondDerivative".
    """
    f, f_prime, f_prime_prime = _univariate_callables(function)

    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]

    x = x0
    v = 0  # accumulated squared gradients
    for i in range(steps - 1):
        g = f_prime(x)
        v += g ** 2
        x = x - (learning_rate / (np.sqrt(v + epsilon))) * g
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))

    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }


def adagrad_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    epsilon: float,
    steps: int,
) -> dict:
    """Perform AdaGrad on a bivariate function (per-coordinate accumulators).

    Returns a dict with keys "x", "y", "z", "gradient", "hessian".
    """
    f, fx, fy, fxx, fyy, fxy = _bivariate_callables(function)

    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x = x0
    y = y0
    # accumulated squared gradients
    vx = 0
    vy = 0
    for i in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        vx += gx ** 2
        vy += gy ** 2
        x = x - (learning_rate / (np.sqrt(vx + epsilon))) * gx
        y = y - (learning_rate / (np.sqrt(vy + epsilon))) * gy
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }


def rmsprop_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """Perform RMSProp on a univariate function.

    `beta` is the decay rate of the exponentially weighted average of
    squared gradients.

    Returns a dict with keys "x", "y", "derivative", "secondDerivative".
    """
    f, f_prime, f_prime_prime = _univariate_callables(function)

    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]

    x = x0
    v = 0  # exponentially weighted average of squared gradients
    for i in range(steps - 1):
        g = f_prime(x)
        v = beta * v + (1 - beta) * g ** 2
        x = x - (learning_rate / (np.sqrt(v + epsilon))) * g
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))

    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }


def rmsprop_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """Perform RMSProp on a bivariate function (per-coordinate averages).

    Returns a dict with keys "x", "y", "z", "gradient", "hessian".
    """
    f, fx, fy, fxx, fyy, fxy = _bivariate_callables(function)

    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x = x0
    y = y0
    # exponentially weighted average of squared gradients
    vx = 0
    vy = 0
    for i in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        vx = beta * vx + (1 - beta) * gx ** 2
        vy = beta * vy + (1 - beta) * gy ** 2
        x = x - (learning_rate / (np.sqrt(vx + epsilon))) * gx
        y = y - (learning_rate / (np.sqrt(vy + epsilon))) * gy
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }


def adadelta_univariate(
    function: Expr,
    x0: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """Perform AdaDelta on a univariate function.

    No explicit learning rate: the step is scaled by the ratio of the RMS
    of past updates to the RMS of past gradients.

    Returns a dict with keys "x", "y", "derivative", "secondDerivative".
    """
    f, f_prime, f_prime_prime = _univariate_callables(function)

    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]

    x = x0
    v = 0  # exponentially weighted average of squared gradients
    s = 0  # exponentially weighted average of squared updates
    for i in range(steps - 1):
        g = f_prime(x)
        v = beta * v + (1 - beta) * g ** 2
        delta_x = - (np.sqrt(s + epsilon) / np.sqrt(v + epsilon)) * g
        x = x + delta_x
        # Update the squared-update average AFTER applying the step.
        s = beta * s + (1 - beta) * delta_x ** 2
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))

    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }


def adadelta_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    beta: float,
    epsilon: float,
    steps: int,
) -> dict:
    """Perform AdaDelta on a bivariate function (per-coordinate averages).

    Returns a dict with keys "x", "y", "z", "gradient", "hessian".
    """
    f, fx, fy, fxx, fyy, fxy = _bivariate_callables(function)

    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x = x0
    y = y0
    # exponentially weighted average of squared gradients
    vx = 0
    vy = 0
    # exponentially weighted average of squared updates
    sx = 0
    sy = 0
    for i in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        vx = beta * vx + (1 - beta) * gx ** 2
        vy = beta * vy + (1 - beta) * gy ** 2
        delta_x = - (np.sqrt(sx + epsilon) / np.sqrt(vx + epsilon)) * gx
        delta_y = - (np.sqrt(sy + epsilon) / np.sqrt(vy + epsilon)) * gy
        x = x + delta_x
        y = y + delta_y
        sx = beta * sx + (1 - beta) * delta_x ** 2
        sy = beta * sy + (1 - beta) * delta_y ** 2
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }


def adam_univariate(
    function: Expr,
    x0: float,
    learning_rate: float,
    beta1: float,
    beta2: float,
    epsilon: float,
    steps: int,
) -> dict:
    """Perform Adam on a univariate function.

    `beta1`/`beta2` are the decay rates of the first/second moment
    estimates; both moments are bias-corrected by 1 - beta^t.

    Returns a dict with keys "x", "y", "derivative", "secondDerivative".
    """
    f, f_prime, f_prime_prime = _univariate_callables(function)

    x_values = [x0]
    y_values = [f(x0)]
    derivative_values = [f_prime(x0)]
    second_derivative_values = [f_prime_prime(x0)]

    x = x0
    m = 0  # first moment
    v = 0  # second moment
    for i in range(steps - 1):
        g = f_prime(x)
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g ** 2
        # Bias correction: timestep t = i + 1.
        m_hat = m / (1 - beta1 ** (i + 1))
        v_hat = v / (1 - beta2 ** (i + 1))
        x = x - (learning_rate / (np.sqrt(v_hat) + epsilon)) * m_hat
        x_values.append(x)
        y_values.append(f(x))
        derivative_values.append(f_prime(x))
        second_derivative_values.append(f_prime_prime(x))

    return {
        "x": x_values,
        "y": y_values,
        "derivative": derivative_values,
        "secondDerivative": second_derivative_values,
    }


def adam_bivariate(
    function: Expr,
    x0: float,
    y0: float,
    learning_rate: float,
    beta1: float,
    beta2: float,
    epsilon: float,
    steps: int,
) -> dict:
    """Perform Adam on a bivariate function (per-coordinate moments).

    Returns a dict with keys "x", "y", "z", "gradient", "hessian".
    """
    f, fx, fy, fxx, fyy, fxy = _bivariate_callables(function)

    x_values = [x0]
    y_values = [y0]
    z_values = [f(x0, y0)]
    gradient_values = [_gradient_values(fx, fy, x0, y0)]
    hessian_values = [_hessian_values(fxx, fxy, fyy, x0, y0)]

    x = x0
    y = y0
    # first moments
    mx = 0
    my = 0
    # second moments
    vx = 0
    vy = 0
    for i in range(steps - 1):
        gx = fx(x, y)
        gy = fy(x, y)
        mx = beta1 * mx + (1 - beta1) * gx
        my = beta1 * my + (1 - beta1) * gy
        vx = beta2 * vx + (1 - beta2) * gx ** 2
        vy = beta2 * vy + (1 - beta2) * gy ** 2
        # Bias correction: timestep t = i + 1.
        mx_hat = mx / (1 - beta1 ** (i + 1))
        my_hat = my / (1 - beta1 ** (i + 1))
        vx_hat = vx / (1 - beta2 ** (i + 1))
        vy_hat = vy / (1 - beta2 ** (i + 1))
        x = x - (learning_rate / (np.sqrt(vx_hat) + epsilon)) * mx_hat
        y = y - (learning_rate / (np.sqrt(vy_hat) + epsilon)) * my_hat
        x_values.append(x)
        y_values.append(y)
        z_values.append(f(x, y))
        gradient_values.append(_gradient_values(fx, fy, x, y))
        hessian_values.append(_hessian_values(fxx, fxy, fyy, x, y))

    return {
        "x": x_values,
        "y": y_values,
        "z": z_values,
        "gradient": gradient_values,
        "hessian": hessian_values,
    }