/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

#include "warp.h"
#include "cuda_util.h"
#include "hashgrid.h"
#include "sort.h"
#include "string.h"

using namespace wp;

#include <map>

namespace
{
    // host-side copy of hash grid descriptors, maps GPU grid address (id) to a CPU desc
    std::map<uint64_t, HashGrid> g_hash_grid_descriptors;

} // anonymous namespace


namespace wp
{

// Look up the host-side descriptor for a grid created on the device.
// Returns true and copies the descriptor into `grid` if `id` is known,
// false otherwise (e.g. for host-created grids, which are not registered here).
bool hash_grid_get_descriptor(uint64_t id, HashGrid& grid)
{
    const auto iter = g_hash_grid_descriptors.find(id);
    if (iter == g_hash_grid_descriptors.end())
        return false;

    grid = iter->second;
    return true;
}

// Register (or overwrite) the host-side descriptor for a device grid.
void hash_grid_add_descriptor(uint64_t id, const HashGrid& grid)
{
    g_hash_grid_descriptors[id] = grid;
}

// Remove the host-side descriptor for a device grid.
void hash_grid_rem_descriptor(uint64_t id)
{
    g_hash_grid_descriptors.erase(id);
}

// implemented in hashgrid.cu
void hash_grid_rebuild_device(const HashGrid& grid, const wp::vec3* points, int num_points);

} // namespace wp


// host methods

// Create a CPU hash grid with the given cell resolution and return its
// opaque id (the raw HashGrid pointer cast to uint64_t).
// Point storage is allocated lazily by hash_grid_reserve_host().
uint64_t hash_grid_create_host(int dim_x, int dim_y, int dim_z)
{
    HashGrid* grid = new HashGrid();
    memset(grid, 0, sizeof(HashGrid));

    grid->dim_x = dim_x;
    grid->dim_y = dim_y;
    grid->dim_z = dim_z;

    // NOTE(review): dim_x*dim_y*dim_z is computed in 32-bit int and could
    // overflow for very large resolutions — callers are expected to pass
    // modest dimensions
    const int num_cells = dim_x * dim_y * dim_z;
    grid->cell_starts = (int*)alloc_host(num_cells * sizeof(int));
    grid->cell_ends = (int*)alloc_host(num_cells * sizeof(int));

    return (uint64_t)(grid);
}

// Destroy a CPU hash grid previously created with hash_grid_create_host(),
// releasing all of its allocations.
void hash_grid_destroy_host(uint64_t id)
{
    HashGrid* grid = (HashGrid*)(id);

    free_host(grid->point_ids);
    free_host(grid->point_cells);
    free_host(grid->cell_starts);
    free_host(grid->cell_ends);

    delete grid;
}

// Ensure the CPU grid has capacity for at least `num_points` points.
// Grows geometrically (1.5x) to amortize reallocation; each point buffer is
// double-sized to provide the auxiliary ping-pong storage the radix sort needs.
void hash_grid_reserve_host(uint64_t id, int num_points)
{
    HashGrid* grid = (HashGrid*)(id);

    if (num_points > grid->max_points)
    {
        free_host(grid->point_cells);
        free_host(grid->point_ids);

        const int num_to_alloc = num_points * 3 / 2;
        grid->point_cells = (int*)alloc_host(2 * num_to_alloc * sizeof(int));  // *2 for auxiliary radix buffers
        grid->point_ids = (int*)alloc_host(2 * num_to_alloc * sizeof(int));    // *2 for auxiliary radix buffers

        grid->max_points = num_to_alloc;
    }

    grid->num_points = num_points;
}

// Rebuild the CPU grid over `points`:
//   1. hash each point to a cell index
//   2. sort point ids by cell index
//   3. scan the sorted cell array to compute per-cell [start, end) ranges
void hash_grid_update_host(uint64_t id, float cell_width, const wp::vec3* points, int num_points)
{
    HashGrid* grid = (HashGrid*)(id);

    hash_grid_reserve_host(id, num_points);

    grid->cell_width = cell_width;
    grid->cell_width_inv = 1.0f / cell_width;

    // calculate cell for each position
    for (int i = 0; i < num_points; ++i)
    {
        grid->point_cells[i] = hash_grid_index(*grid, points[i]);
        grid->point_ids[i] = i;
    }

    // sort indices
    radix_sort_pairs_host(grid->point_cells, grid->point_ids, num_points);

    const int num_cells = grid->dim_x * grid->dim_y * grid->dim_z;

    memset(grid->cell_starts, 0, sizeof(int) * num_cells);
    memset(grid->cell_ends, 0, sizeof(int) * num_cells);

    // compute cell start / end
    for (int i = 0; i < num_points; ++i)
    {
        // scan the particle-cell array to find the start and end
        const int c = grid->point_cells[i];

        if (i == 0)
        {
            grid->cell_starts[c] = 0;
        }
        else
        {
            const int p = grid->point_cells[i - 1];

            if (c != p)
            {
                grid->cell_starts[c] = i;
                grid->cell_ends[p] = i;
            }
        }

        if (i == num_points - 1)
        {
            grid->cell_ends[c] = i + 1;
        }
    }
}


// device methods

// Create a GPU hash grid in the given CUDA context (or the current one when
// `context` is null). The returned id is the device address of the uploaded
// HashGrid struct; a host-side copy is registered in the descriptor map.
uint64_t hash_grid_create_device(void* context, int dim_x, int dim_y, int dim_z)
{
    ContextGuard guard(context);

    HashGrid grid;
    memset(&grid, 0, sizeof(HashGrid));

    grid.context = context ? context : cuda_context_get_current();

    grid.dim_x = dim_x;
    grid.dim_y = dim_y;
    grid.dim_z = dim_z;

    const int num_cells = dim_x * dim_y * dim_z;
    grid.cell_starts = (int*)alloc_device(WP_CURRENT_CONTEXT, num_cells * sizeof(int));
    grid.cell_ends = (int*)alloc_device(WP_CURRENT_CONTEXT, num_cells * sizeof(int));

    // upload to device
    HashGrid* grid_device = (HashGrid*)(alloc_device(WP_CURRENT_CONTEXT, sizeof(HashGrid)));
    memcpy_h2d(WP_CURRENT_CONTEXT, grid_device, &grid, sizeof(HashGrid));

    uint64_t grid_id = (uint64_t)(grid_device);
    hash_grid_add_descriptor(grid_id, grid);

    return grid_id;
}

// Destroy a GPU hash grid, freeing device allocations and unregistering
// its host-side descriptor. A no-op when `id` is unknown.
void hash_grid_destroy_device(uint64_t id)
{
    HashGrid grid;
    if (hash_grid_get_descriptor(id, grid))
    {
        ContextGuard guard(grid.context);

        free_device(WP_CURRENT_CONTEXT, grid.point_ids);
        free_device(WP_CURRENT_CONTEXT, grid.point_cells);
        free_device(WP_CURRENT_CONTEXT, grid.cell_starts);
        free_device(WP_CURRENT_CONTEXT, grid.cell_ends);

        free_device(WP_CURRENT_CONTEXT, (HashGrid*)id);

        hash_grid_rem_descriptor(id);
    }
}

// Ensure the GPU grid has capacity for at least `num_points` points.
// Mirrors hash_grid_reserve_host() (1.5x growth, *2 radix scratch) and also
// pre-sizes the device radix sort so no allocation happens during CUDA graph
// capture. Both the device- and host-side descriptors are updated.
void hash_grid_reserve_device(uint64_t id, int num_points)
{
    HashGrid grid;

    if (hash_grid_get_descriptor(id, grid))
    {
        if (num_points > grid.max_points)
        {
            ContextGuard guard(grid.context);

            free_device(WP_CURRENT_CONTEXT, grid.point_cells);
            free_device(WP_CURRENT_CONTEXT, grid.point_ids);

            const int num_to_alloc = num_points * 3 / 2;
            grid.point_cells = (int*)alloc_device(WP_CURRENT_CONTEXT, 2 * num_to_alloc * sizeof(int));  // *2 for auxiliary radix buffers
            grid.point_ids = (int*)alloc_device(WP_CURRENT_CONTEXT, 2 * num_to_alloc * sizeof(int));    // *2 for auxiliary radix buffers

            grid.max_points = num_to_alloc;

            // ensure we pre-size our sort routine to avoid
            // allocations during graph capture
            radix_sort_reserve(WP_CURRENT_CONTEXT, num_to_alloc);

            // update device side grid descriptor, todo: this is
            // slightly redundant since it is performed again
            // inside hash_grid_update_device(), but since
            // reserve can be called from Python we need to make
            // sure it is consistent
            memcpy_h2d(WP_CURRENT_CONTEXT, (HashGrid*)id, &grid, sizeof(HashGrid));

            // update host side grid descriptor
            hash_grid_add_descriptor(id, grid);
        }
    }
}

// Rebuild the GPU grid over `points` (device pointer): reserve capacity,
// update cell-width parameters, launch the device rebuild, then sync the
// descriptor back to both the device struct and the host-side map.
void hash_grid_update_device(uint64_t id, float cell_width, const wp::vec3* points, int num_points)
{
    // ensure we have enough memory reserved for update
    // this must be done before retrieving the descriptor
    // below since it may update it
    hash_grid_reserve_device(id, num_points);

    // host grid must be static so that we can
    // perform host->device memcpy from this variable
    // and have it safely recorded inside CUDA graphs
    static HashGrid grid;

    if (hash_grid_get_descriptor(id, grid))
    {
        ContextGuard guard(grid.context);

        grid.num_points = num_points;
        grid.cell_width = cell_width;
        grid.cell_width_inv = 1.0f / cell_width;

        hash_grid_rebuild_device(grid, points, num_points);

        // update device side grid descriptor
        memcpy_h2d(WP_CURRENT_CONTEXT, (HashGrid*)id, &grid, sizeof(HashGrid));

        // update host side grid descriptor
        hash_grid_add_descriptor(id, grid);
    }
}


#if !WP_ENABLE_CUDA

namespace wp
{

// stub for CPU-only builds — the real implementation lives in hashgrid.cu
void hash_grid_rebuild_device(const HashGrid& grid, const wp::vec3* points, int num_points)
{
}

} // namespace wp

#endif // !WP_ENABLE_CUDA