# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import unittest

import numpy as np

import warp as wp
from warp.tests.unittest_utils import *

wp.init()


@wp.kernel
def load_store_half(f32: wp.array(dtype=wp.float32), f16: wp.array(dtype=wp.float16)):
    tid = wp.tid()

    # check conversion from f32->f16 (see the standalone NumPy rounding sketch at the end of this file)
    a = wp.float16(f32[tid])
    b = f16[tid]

    wp.expect_eq(a, b)

    # check stores
    f16[tid] = a


def test_fp16_conversion(test, device):
    s = [1.0, 2.0, 3.0, -3.14159]

    np_f32 = np.array(s, dtype=np.float32)
    np_f16 = np.array(s, dtype=np.float16)

    wp_f32 = wp.array(s, dtype=wp.float32, device=device)
    wp_f16 = wp.array(s, dtype=wp.float16, device=device)

    assert_np_equal(np_f32, wp_f32.numpy())
    assert_np_equal(np_f16, wp_f16.numpy())

    wp.launch(load_store_half, dim=len(s), inputs=[wp_f32, wp_f16], device=device)

    # check that stores worked
    assert_np_equal(np_f16, wp_f16.numpy())


@wp.kernel
def value_load_store_half(f16_value: wp.float16, f16_array: wp.array(dtype=wp.float16)):
    wp.expect_eq(f16_value, f16_array[0])

    # check stores
    f16_array[0] = f16_value


def test_fp16_kernel_parameter(test, device):
    """Test the ability to pass fp16 values into kernels as parameters."""
    s = [1.0, 2.0, 3.0, -3.14159]

    for test_val in s:
        np_f16 = np.array([test_val], dtype=np.float16)
        wp_f16 = wp.array([test_val], dtype=wp.float16, device=device)

        wp.launch(value_load_store_half, (1,), inputs=[wp.float16(test_val), wp_f16], device=device)

        # check that stores worked
        assert_np_equal(np_f16, wp_f16.numpy())

        # Do the same thing, but pass test_val in as a Python float to test automatic conversion
        wp_f16 = wp.array([test_val], dtype=wp.float16, device=device)

        wp.launch(value_load_store_half, (1,), inputs=[test_val, wp_f16], device=device)

        assert_np_equal(np_f16, wp_f16.numpy())


@wp.kernel
def mul_half(input: wp.array(dtype=wp.float16), output: wp.array(dtype=wp.float16)):
    tid = wp.tid()

    # convert to compute type fp32
    x = wp.float(input[tid]) * 2.0

    # store back as fp16
    output[tid] = wp.float16(x)


def test_fp16_grad(test, device):
    rng = np.random.default_rng(123)

    # checks that gradients are correctly propagated for
    # fp16 arrays, even when intermediate calculations
    # are performed in, e.g., fp32

    s = rng.random(size=15).astype(np.float16)

    input = wp.array(s, dtype=wp.float16, device=device, requires_grad=True)
    output = wp.zeros_like(input)

    tape = wp.Tape()
    with tape:
        wp.launch(mul_half, dim=len(s), inputs=[input, output], device=device)

    ones = wp.array(np.ones(len(output)), dtype=wp.float16, device=device)

    tape.backward(grads={output: ones})

    assert_np_equal(input.grad.numpy(), np.ones(len(s)) * 2.0)


class TestFp16(unittest.TestCase):
    pass


devices = []
if wp.is_cpu_available():
    devices.append("cpu")
for cuda_device in get_unique_cuda_test_devices():
    if cuda_device.arch >= 70:
        devices.append(cuda_device)

add_function_test(TestFp16, "test_fp16_conversion", test_fp16_conversion, devices=devices)
add_function_test(TestFp16, "test_fp16_grad", test_fp16_grad, devices=devices)
add_function_test(TestFp16, "test_fp16_kernel_parameter", test_fp16_kernel_parameter, devices=devices)


if __name__ == "__main__":
    wp.build.clear_kernel_cache()
    unittest.main(verbosity=2)
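

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test suite): shows, with plain
# NumPy, the IEEE-754 round-to-nearest behavior that the fp32 -> fp16
# conversion checks above rely on. It assumes that Warp's wp.float16() cast
# rounds the same way as NumPy's astype(np.float16). The helper name
# _fp16_roundtrip_demo is hypothetical and is never called by the tests, so
# it has no effect on test discovery or execution.
def _fp16_roundtrip_demo():
    values = np.array([1.0, 2.0, 3.0, -3.14159], dtype=np.float32)
    halves = values.astype(np.float16)  # round each value to the nearest representable fp16

    # Small integers and exact powers of two round-trip unchanged.
    assert halves[0] == 1.0 and halves[1] == 2.0 and halves[2] == 3.0

    # fp16 has a 10-bit mantissa, so values in [2, 4) are spaced 2**-9 apart;
    # -3.14159 therefore rounds to the nearest such step, -3.140625.
    assert halves[3] == np.float16(-3.140625)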