# Copyright (c) 2023 NVIDIA CORPORATION.  All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""Sample node that simulates flocking behaviors by animating prim attributes."""

import math
import traceback
import numpy as np

import carb.settings
import omni.kit.app
import omni.graph.core as og
import omni.usd
import usdrt
import warp as wp

import omni.warp.nodes
from omni.warp.nodes.ogn.OgnSamplePrimFlockingDatabase import OgnSamplePrimFlockingDatabase


# Warp device on which the node is evaluated; the flocking simulation kernels
# are launched on it
MAIN_DEVICE = "cuda:0"

# Warp device on which the color update kernel is launched and from which the
# color attributes are selected
COLOR_DEVICE = "cpu"


#   Kernels
# -----------------------------------------------------------------------------


@wp.struct
class Boid:
    """Per-boid simulation state that is not stored on the prims."""

    # current velocity, integrated by the `boids` kernel
    vel: wp.vec3f
    # pair of angles locating the wander target on a sphere in front of the boid
    wander_angles: wp.vec2f
    # divisor applied to the accumulated force when updating the velocity
    mass: float
    # group index, used to look up the group biases and the color ramp
    group: int


@wp.struct
class Obstacle:
    """Spherical obstacle that the boids steer around."""

    # center of the sphere
    pos: wp.vec3f
    # radius of the sphere (the `boids` kernel adds its own padding on top)
    radius: float


@wp.struct
class World:
    """Global simulation parameters passed to the `boids` kernel."""

    # lower corner of the box that the boids are pushed back into
    lower: wp.vec3f
    # upper corner of the box that the boids are pushed back into
    upper: wp.vec3f
    # id of the hash grid used for neighbor queries
    grid: wp.uint64
    # seed for the per-thread random number generator
    seed: int
    # bias between groups, indexed as [own group, other group]; positive values
    # attract (cohesion), other values repel (decohesion)
    biases: wp.mat33f
    # obstacles to avoid
    obstacles: wp.array(dtype=Obstacle)


@wp.kernel(enable_backward=False)
def copy_positions(dst: wp.array(dtype=wp.vec3f), src: wp.fabricarray(dtype=wp.vec3d)):
    # convert one double-precision position from `src` into a single-precision
    # entry of `dst`, one element per thread
    i = wp.tid()
    p = src[i]
    x = float(p[0])
    y = float(p[1])
    z = float(p[2])
    dst[i] = wp.vec3f(x, y, z)


@wp.kernel(enable_backward=False)
def assign_colors(
    glows: wp.array(dtype=float),
    groups: wp.array(dtype=int),
    color_ramps: wp.array2d(dtype=wp.vec3f),
    colors: wp.fabricarrayarray(dtype=wp.vec3f)
):
    # write the first color element of each prim by blending along the
    # four-entry color ramp of its group, driven by the glow value
    tid = wp.tid()

    level = glows[tid]
    ramp = groups[tid]

    # pick the ramp segment [first, first + 1] and the blend weight within it
    first = int(0)
    weight = float(0.0)
    if level < 0.4:
        first = 0
        weight = level / 0.4
    elif level < 0.8:
        first = 1
        weight = (level - 0.4) / 0.4
    else:
        first = 2
        weight = (level - 0.8) / 0.2

    colors[tid][0] = (1.0 - weight) * color_ramps[ramp, first] + weight * color_ramps[ramp, first + 1]


@wp.func
def intersect_ray_sphere(origin: wp.vec3f, dir: wp.vec3f, center: wp.vec3f, radius: float):
    # Returns the distance along the ray from `origin` to the first point where
    # it enters the sphere, or a negative value when there is no hit in front
    # of the origin.
    # NOTE(review): the math assumes `dir` is unit length - confirm at call sites.

    to_sphere = center - origin

    # distance along the ray to the point closest to the sphere center
    tc = wp.dot(to_sphere, dir)

    # the sphere center is behind the ray origin
    if tc < 0.0:
        return tc

    # squared distance from the sphere center to the ray; rounding can make
    # this slightly negative when the ray points straight at the center, so
    # clamp it instead of letting a NaN propagate and hide a head-on hit
    d_sq = wp.max(wp.length_sq(to_sphere) - tc * tc, 0.0)

    # the ray passes outside of the sphere
    r_sq = radius * radius
    if d_sq > r_sq:
        return -999999.0

    # half-length of the chord that the ray cuts through the sphere
    ts = wp.sqrt(r_sq - d_sq)

    return tc - ts


@wp.kernel(enable_backward=False)
def boids(
    boids: wp.array(dtype=Boid),
    world: World,
    dt: float,
    positions: wp.fabricarray(dtype=wp.vec3d),
    orientations: wp.fabricarray(dtype=wp.quatf),
    glows: wp.array(dtype=float),
):
    # Advances one boid per thread by one time step: accumulates the steering
    # forces (obstacles, wander, flocking, boundaries), integrates velocity and
    # position, re-orients the boid along its velocity, and updates its glow.
    #
    # NOTE(review): neighbor positions and velocities are read from the same
    # arrays that every thread writes its own entry to at the end of this
    # kernel, so a neighbor's values may come from before or after its update
    # within a launch - confirm this is acceptable for the sample.
    tid = wp.tid()

    boid = boids[tid]
    
    old_pos = positions[tid]
    old_rot = orientations[tid]

    # positions are stored in double precision, the simulation runs in single precision
    pos = wp.vec3(float(old_pos[0]), float(old_pos[1]), float(old_pos[2]))
    vel = boid.vel

    # the boid faces along its local X axis
    forward = wp.quat_rotate(old_rot, wp.vec3f(1.0, 0.0, 0.0))

    force = wp.vec3f(0.0)

    # obstacle avoidance
    depenetration_force = 100.0
    avoidance_dist = 20.0
    avoidance_force = 20.0
    obstacles = world.obstacles
    num_obstacles = obstacles.shape[0]
    for i in range(num_obstacles):
        obstacle = obstacles[i]
        to_obstacle = obstacle.pos - pos
        # use padded radius
        radius = obstacle.radius + 2.0
        if wp.length(to_obstacle) < radius:
            # depenetration: push straight away from the obstacle center
            force += depenetration_force * wp.normalize(-to_obstacle)
        else:
            # avoidance: if the forward ray hits the padded sphere within
            # avoidance_dist, push outwards from the obstacle center through
            # the hit point, more strongly the closer the hit is
            t = intersect_ray_sphere(pos, forward, obstacle.pos, radius)
            if t > 0.0 and t < avoidance_dist:
                intersection_point = pos + t * forward
                out = intersection_point - obstacle.pos
                force += avoidance_force * (1.0 - t / avoidance_dist) * wp.normalize(out)

    # wander: steer towards a point on a sphere of radius r that is centered
    # ahead of the boid along its local X axis
    r = 10.0
    s0 = wp.sin(boid.wander_angles[0])
    c0 = wp.cos(boid.wander_angles[0])
    s1 = wp.sin(boid.wander_angles[1])
    c1 = wp.cos(boid.wander_angles[1])
    p = wp.vec3f(r * s0 * s1, r * s0 * c1, r * c0)
    offset = r + 1.0
    target = pos + wp.quat_rotate(old_rot, wp.vec3f(offset, 0.0, 0.0) + p)

    wander_force = 7.0
    force += wander_force * wp.normalize(target - pos)

    # per-thread random number generator, seeded from the world seed
    state = wp.rand_init(world.seed, tid)

    # randomly jitter the wander angles for the next step
    angle0 = boid.wander_angles[0] + wp.pi * (0.1 - 0.2 * wp.randf(state))
    angle1 = boid.wander_angles[1] + wp.pi * (0.1 - 0.2 * wp.randf(state))
    boid.wander_angles = wp.vec2f(angle0, angle1)

    cohesion_radius = 15.0
    cohesion_force = 20.0

    separation_radius = 10.0
    separation_force = 100.0

    # flocking: gather the neighbors within the cohesion radius
    query = wp.hash_grid_query(world.grid, pos, cohesion_radius)
    num_neighbors = int(0)
    num_align_neighbors = int(0)
    # the cohesion/decohesion "counts" are sums of bias weights, hence floats
    num_cohesion_neighbors = float(0)
    num_decohesion_neighbors = float(0)
    cohesion_pos_sum = wp.vec3f(0.0)
    decohesion_pos_sum = wp.vec3f(0.0)
    vel_sum = wp.vec3f(0.0)
    for index in query:
        if index != tid:
            other = boids[index]
            other_pos64 = positions[index]
            other_pos = wp.vec3f(float(other_pos64[0]), float(other_pos64[1]), float(other_pos64[2]))
            dist = wp.length(pos - other_pos)

            # the query can return candidates beyond the radius, so filter by distance
            if dist < cohesion_radius:
                to_other = wp.normalize(other_pos - pos)
                # separation: push away from close neighbors regardless of group
                if dist < separation_radius:
                    force -= separation_force * (1.0 - dist / separation_radius) * to_other
                # cohesion: positive bias attracts, otherwise the neighbor repels;
                # positions are weighted by the magnitude of the bias
                bias = world.biases[boid.group, other.group]
                if bias > 0.0:
                    cohesion_pos_sum += bias * other_pos
                    num_cohesion_neighbors += bias
                else:
                    decohesion_pos_sum -= bias * other_pos
                    num_decohesion_neighbors -= bias
                # alignment: only with members of the same group
                if other.group == boid.group:
                    vel_sum += bias * other.vel
                    num_align_neighbors += 1
                num_neighbors += 1

    # align: steer towards the average velocity of same-group neighbors
    if num_align_neighbors > 0:
        vel_avg = vel_sum / float(num_align_neighbors)
        force += vel_avg - vel

    # cohere: steer towards the weighted average position of attracting neighbors
    if num_cohesion_neighbors > 0.0:
        cohesion_pos_avg = cohesion_pos_sum / float(num_cohesion_neighbors)
        force += cohesion_force * wp.normalize(cohesion_pos_avg - pos)

    # decohere (group separation): steer away from the weighted average
    # position of repelling neighbors
    if num_decohesion_neighbors > 0.0:
        decohesion_pos_avg = decohesion_pos_sum / float(num_decohesion_neighbors)
        force += cohesion_force * wp.normalize(pos - decohesion_pos_avg)

    # boundaries: push back towards the inside of the world box; note that the
    # Y axis uses a weaker push at the upper bound and a stronger one at the
    # lower bound than the other axes
    boundary_force = 20.0
    if pos[0] >= world.upper[0]:
        force += wp.vec3f(-boundary_force, 0.0, 0.0)
    if pos[0] <= world.lower[0]:
        force += wp.vec3f(boundary_force, 0.0, 0.0)
    if pos[1] >= world.upper[1]:
        force += wp.vec3f(0.0, -0.5 * boundary_force, 0.0)
    if pos[1] <= world.lower[1]:
        force += wp.vec3f(0.0, 5.0 * boundary_force, 0.0)
    if pos[2] >= world.upper[2]:
        force += wp.vec3f(0.0, 0.0, -boundary_force)
    if pos[2] <= world.lower[2]:
        force += wp.vec3f(0.0, 0.0, boundary_force)

    # integrate the velocity
    vel += dt * force / boid.mass

    # clamp speed
    max_speed = 15.0
    speed_sq = wp.length_sq(vel)
    if speed_sq > max_speed * max_speed:
        vel = max_speed * wp.normalize(vel)

    # update position
    pos += dt * vel
    positions[tid] = wp.vec3d(wp.float64(pos[0]), wp.float64(pos[1]), wp.float64(pos[2]))

    # update orientation by the rotation that takes the old forward direction
    # onto the new velocity
    dq = wp.quat_between_vectors(forward, vel)
    orientations[tid] = wp.normalize(dq * orientations[tid])

    # save velocity (and the updated wander angles stored on the boid)
    boid.vel = vel
    boids[tid] = boid

    # update glow as an exponentially weighted moving average to keep it smooth;
    # the target glow grows with the neighbor count and saturates at 25 neighbors
    glow = wp.min(1.0, float(num_neighbors) / 25.0)
    glow_alpha = 0.25
    glows[tid] = glow_alpha * glow + (1.0 - glow_alpha) * glows[tid]


#   Internal State
# ------------------------------------------------------------------------------


class InternalState:
    """Internal state for the node.

    Holds the USDRT stage handle, the Warp arrays that drive the flocking
    simulation, and the bookkeeping values advanced on each evaluation.
    Everything except ``initialized`` is created by ``initialize()``.
    """

    def __init__(self) -> None:
        # only set to True once initialize() has run to completion
        self.initialized = False

    @staticmethod
    def _parse_version(version_string):
        """Returns the leading numeric components of a version string.

        Pre-release and build suffixes (e.g. ``"7.3.0-rc.1"`` or
        ``"7.3.0+build"``) are ignored instead of raising, so that such
        versions can still be compared against a required minimum.

        Args:
            version_string: Dotted version string.

        Returns:
            Tuple of ints, e.g. ``(7, 3, 0)``; empty if the string does not
            start with a number.
        """
        core = version_string.strip()
        for separator in ("-", "+"):
            core = core.split(separator, 1)[0]

        components = []
        for token in core.split("."):
            digits = ""
            for ch in token:
                if ch not in "0123456789":
                    break
                digits += ch
            if not digits:
                break
            components.append(int(digits))
            if len(digits) != len(token):
                # trailing non-numeric text ends the numeric part, e.g. "0rc1"
                break

        return tuple(components)

    def initialize(self, device):
        """Sets up the Fabric attributes and allocates the simulation data.

        Args:
            device: Warp device used for the flocking simulation.

        Raises:
            RuntimeError: If the USDRT extension is not enabled or is older
                than version 7.3.0.
            ValueError: If the simulation device and ``COLOR_DEVICE`` are two
                different CUDA devices.
        """

        # requirement checks
        ext_mgr = omni.kit.app.get_app().get_extension_manager()

        # make sure USDRT is enabled
        usdrt_ext_name = "usdrt.scenegraph"
        if not ext_mgr.is_extension_enabled(usdrt_ext_name):
            raise RuntimeError(f"This sample requires the '{usdrt_ext_name}' extension to be enabled")

        # check USDRT version to make sure we have a working SelectPrims()
        usdrt_ext_id = ext_mgr.get_enabled_extension_id(usdrt_ext_name)
        usdrt_version_string = ext_mgr.get_extension_dict(usdrt_ext_id)["package"]["version"]
        usdrt_version = self._parse_version(usdrt_version_string)
        if usdrt_version < (7, 3, 0):
            raise RuntimeError(f"USDRT version 7.3.0 is required, found {usdrt_version_string}.  Please update to a newer version of Kit to run this sample.")

        # check if FSD is enabled
        settings = carb.settings.get_settings()
        is_fsd_enabled = settings.get_as_bool("/app/useFabricSceneDelegate")
        if not is_fsd_enabled:
            print("***")
            print("*** Flocking demo warning: The Fabric Scene Delegate is not enabled.")
            print("*** Some features, like color animation, may not work.")
            print("*** You can enable FSD in Preferences->Rendering.")
            print("***")

        stage_id = omni.usd.get_context().get_stage_id()

        usdrt_stage = usdrt.Usd.Stage.Attach(stage_id)

        # import to Fabric (the traversal itself is what matters, not the prims)
        for _ in usdrt_stage.Traverse():
            pass

        # set up for Fabric interop
        boid_root = usdrt_stage.GetPrimAtPath(usdrt.Sdf.Path("/World/Boids"))
        boid_prims = boid_root.GetChildren()
        for prim in boid_prims:
            pos = prim.GetAttribute("xformOp:translate").Get()
            prim.CreateAttribute("_worldPosition", usdrt.Sdf.ValueTypeNames.Double3, True).Set(pos)
            prim.CreateAttribute("_worldOrientation", usdrt.Sdf.ValueTypeNames.Quatf, True).Set(usdrt.Gf.Quatf(1, 0, 0, 0))

            # create a custom tag for the boids (could use applied schema too)
            prim.CreateAttribute("BoidTag", usdrt.Sdf.ValueTypeNames.AppliedSchemaTypeTag, True)

        num_boids = len(boid_prims)

        self.stage = usdrt_stage

        # selection criteria used on every evaluation to find the boid prims
        self.require_schemas = ["BoidTag"]

        self.transform_attrs = [
            (usdrt.Sdf.ValueTypeNames.Double3, "_worldPosition", usdrt.Usd.Access.ReadWrite),
            (usdrt.Sdf.ValueTypeNames.Quatf, "_worldOrientation", usdrt.Usd.Access.ReadWrite),
        ]

        self.color_attrs = [
            (usdrt.Sdf.ValueTypeNames.Float3Array, "primvars:_emissive", usdrt.Usd.Access.ReadWrite),
        ]

        npboids = np.zeros(num_boids, dtype=Boid.numpy_dtype())

        # random initial velocities of magnitude 20 in the XZ plane
        angles = math.pi - 2 * math.pi * np.random.rand(num_boids)
        vx = 20 * np.sin(angles)
        vz = 20 * np.cos(angles)
        npboids["vel"][:, 0] = vx
        npboids["vel"][:, 2] = vz

        npboids["wander_angles"][:, 0] = math.pi * np.random.rand(num_boids)
        npboids["wander_angles"][:, 1] = 2 * math.pi * np.random.rand(num_boids)

        min_mass = 1.0
        max_mass = 2.0
        npboids["mass"][:] = min_mass + (max_mass - min_mass) * np.random.rand(num_boids)

        # we can have up to 3 groups currently, but that can be easily extended
        self.num_groups = 2
        npboids["group"] = np.random.randint(self.num_groups, size=num_boids)

        num_obstacles = 3
        npobstacles = np.zeros(num_obstacles, dtype=Obstacle.numpy_dtype())
        npobstacles["pos"][0] = (-20, 30, -40)
        npobstacles["radius"][0] = 40
        npobstacles["pos"][1] = (90, 30, 30)
        npobstacles["radius"][1] = 30
        npobstacles["pos"][2] = (-100, 30, 60)
        npobstacles["radius"][2] = 25

        self.grid = wp.HashGrid(dim_x=32, dim_y=32, dim_z=32, device=device)

        # groups start out repelling each other and attracted to themselves
        biases = wp.mat33f(-1.0)
        for i in range(self.num_groups):
            biases[i, i] = 1.0

        world = World()
        world.lower = (-120, 20, -90)
        world.upper = (120, 40, 90)
        world.grid = self.grid.id
        world.seed = 0
        world.biases = biases
        world.obstacles = wp.array(npobstacles, dtype=Obstacle, device=device)
        self.world = world

        self.num_boids = num_boids
        self.boids = wp.array(npboids, dtype=Boid, device=device)

        # color ramps per group
        color_ramps = [
            [[0.3, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.5, 0.0], [1.0, 1.0, 0.5]],
            [[0.0, 0.0, 0.3], [0.0, 0.0, 1.0], [0.0, 0.5, 1.0], [0.5, 1.0, 1.0]],
            [[0.0, 0.3, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.5], [0.8, 1.0, 0.8]],
        ]

        # copy of positions used for updating the spatial grid
        self.positions = wp.zeros(num_boids, dtype=wp.vec3f, device=device)

        # color ramps are only used on the COLOR_DEVICE
        self.color_ramps_c = wp.array(color_ramps, dtype=wp.vec3f, device=COLOR_DEVICE)

        # keep a copy of group assignments on the COLOR_DEVICE
        self.groups_c = wp.array(npboids["group"], device=COLOR_DEVICE)

        # if we use different devices, the glow array must be copied on each update
        if COLOR_DEVICE == device:
            # use the same glow array on each device, no copying needed
            self.glows_c = wp.zeros(num_boids, dtype=float, device=device)
            self.glows_m = self.glows_c
        elif COLOR_DEVICE == "cpu" or device == "cpu":
            # use a pinned host array for async copying glows between devices
            glows_h = wp.zeros(num_boids, dtype=float, device="cpu", pinned=True)
            if COLOR_DEVICE == "cpu":
                self.glows_c = glows_h
                self.glows_m = wp.zeros_like(glows_h, device=device)
            else:
                self.glows_c = wp.zeros_like(glows_h, device=COLOR_DEVICE)
                self.glows_m = glows_h
        else:
            # two different CUDA devices, which is currently not supported in
            # Kit; fail before allocating anything on either device
            raise ValueError("Multiple GPUs not supported yet")

        self.time = 0.0

        # simulation time range between two updates of the group biases
        self.min_group_think = 3.0
        self.max_group_think = 10.0
        self.next_group_think = self.min_group_think + (self.max_group_think - self.min_group_think) * np.random.rand()

        self.frameno = 0

        self.initialized = True


#   Compute
# ------------------------------------------------------------------------------


def _rethink_group_biases(state) -> None:
    """Occasionally re-rolls the biases between two distinct random groups."""

    # nothing to do with a single group or before the next scheduled update
    if not (state.num_groups > 1 and state.time >= state.next_group_think):
        return

    # pick two different random groups
    first = np.random.randint(state.num_groups)
    second = np.random.randint(state.num_groups)
    while second == first:
        second = np.random.randint(state.num_groups)

    # bias towards separation between the groups, but also allow attraction
    state.world.biases[first, second] = 1.0 - 5.0 * np.random.rand()
    state.world.biases[second, first] = 1.0 - 5.0 * np.random.rand()

    think_range = state.max_group_think - state.min_group_think
    state.next_group_think += state.min_group_think + think_range * np.random.rand()


def compute(db: OgnSamplePrimFlockingDatabase) -> None:
    """Evaluates the node.

    Steps the flocking simulation once on the current Warp device, then
    updates the boid colors on the COLOR_DEVICE.
    """

    state = db.internal_state
    device = wp.get_device()

    if not state.initialized:
        state.initialize(device)

    state.frameno += 1

    # select the boid prims together with their transform attributes
    xform_selection = state.stage.SelectPrims(
        require_applied_schemas=state.require_schemas,
        require_attrs=state.transform_attrs,
        device=str(device),
    )
    fabric_positions = wp.fabricarray(data=xform_selection, attrib="_worldPosition")
    fabric_orientations = wp.fabricarray(data=xform_selection, attrib="_worldOrientation")

    # a fixed time step keeps the simulation stable
    dt = 1.0 / 60.0
    state.time += dt

    # the spatial grid needs a contiguous vec3f array, so convert the positions first
    wp.launch(copy_positions, dim=state.num_boids, inputs=[state.positions, fabric_positions])

    # the grid cell radius should be a bit bigger than the query radius
    cell_radius = 20.0
    state.grid.build(state.positions, cell_radius)

    state.world.seed = state.frameno

    # step the flocking simulation
    wp.launch(
        boids,
        dim=state.num_boids,
        inputs=[state.boids, state.world, dt, fabric_positions, fabric_orientations, state.glows_m],
    )

    # start an async copy of the glows to the color device and remember the
    # stream so that it can be synchronized later
    pending_stream = None
    if COLOR_DEVICE != device:
        pending_stream = device.stream if device.is_cuda else wp.get_stream(COLOR_DEVICE)
        wp.copy(state.glows_c, state.glows_m, stream=pending_stream)

    # select the boid prims together with their color attributes
    color_selection = state.stage.SelectPrims(
        require_applied_schemas=state.require_schemas,
        require_attrs=state.color_attrs,
        device=COLOR_DEVICE,
    )
    fabric_colors = wp.fabricarray(data=color_selection, attrib="primvars:_emissive")

    # occasionally update whether groups are attracted to or repelled from each other
    _rethink_group_biases(state)

    # wait for the async glow copy to complete before reading the glows
    if pending_stream is not None:
        wp.synchronize_stream(pending_stream)

    # update colors
    wp.launch(
        assign_colors,
        dim=state.num_boids,
        inputs=[state.glows_c, state.groups_c, state.color_ramps_c, fabric_colors],
        device=COLOR_DEVICE,
    )


#   Node Entry Point
# ------------------------------------------------------------------------------

class OgnSamplePrimFlocking:
    """Node."""

    @staticmethod
    def internal_state() -> InternalState:
        """Creates the per-node internal state object."""
        return InternalState()

    @staticmethod
    def compute(db: OgnSamplePrimFlockingDatabase) -> None:
        """Evaluates the node on MAIN_DEVICE and logs any failure on the node."""
        main_device = wp.get_device(MAIN_DEVICE)

        try:
            with wp.ScopedDevice(main_device):
                compute(db)
        except Exception:
            # report the failure on the node and leave the execution output untouched
            db.log_error(traceback.format_exc())
        else:
            # Fire the execution for the downstream nodes.
            db.outputs.execOut = og.ExecutionAttributeState.ENABLED