| #pragma once |
|
|
| #include <gpu/common.h> |
| #include <gpu/gpu_memory.h> |
|
|
| namespace cubvh { |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
|
|
| |
| |
| __device__ inline uint32_t _hash_city32_step(uint32_t hash_val, uint32_t key) { |
| hash_val += key * 0x9E3779B9u; |
| hash_val ^= hash_val >> 16; |
| hash_val *= 0x85EBCA6Bu; |
| hash_val ^= hash_val >> 13; |
| hash_val *= 0xC2B2AE35u; |
| hash_val ^= hash_val >> 16; |
| return hash_val; |
| } |
|
|
| struct CityHash { |
| |
| |
| __device__ inline static int hash(const int* key, int key_dim, int capacity) { |
| uint32_t h = 0u; |
| for (int i = 0; i < key_dim; ++i) { |
| h = _hash_city32_step(h, (uint32_t)key[i]); |
| } |
| int signed_h = (int)h; |
| |
| int slot = signed_h % capacity; |
| if (slot < 0) slot += capacity; |
| return slot; |
| } |
| }; |
|
|
| __device__ inline bool _vec_equal(const int* a, const int* b, int dim) { |
| for (int i = 0; i < dim; ++i) { |
| if (a[i] != b[i]) return false; |
| } |
| return true; |
| } |
|
|
| |
| __device__ inline int _search_hash_table( |
| const int* __restrict__ table_kvs, |
| const int* __restrict__ coords, |
| const int* __restrict__ query_key, |
| int table_capacity, |
| int num_dims |
| ) { |
| |
| |
| |
| int slot = CityHash::hash(query_key, num_dims, table_capacity); |
| const int begin = slot; |
| int attempts = 0; |
| while (attempts < table_capacity) { |
| const int marker = table_kvs[slot * 2 + 0]; |
| if (marker == -1) { |
| return -1; |
| } |
| const int vec_idx = table_kvs[slot * 2 + 1]; |
| if (vec_idx != -1) { |
| const int* candidate = &coords[vec_idx * num_dims]; |
| if (_vec_equal(candidate, query_key, num_dims)) { |
| return vec_idx; |
| } |
| } |
| slot = (slot + 1) % table_capacity; |
| if (slot == begin) return -1; |
| ++attempts; |
| } |
| return -1; |
| } |
|
|
| |
|
|
| |
| |
| __global__ void prepare_key_value_pairs_kernel(int* table_kvs, int capacity) { |
| const int tid = blockIdx.x * blockDim.x + threadIdx.x; |
| if (tid < capacity) { |
| table_kvs[2 * tid + 0] = -1; |
| table_kvs[2 * tid + 1] = -1; |
| } |
| } |
|
|
| |
| |
| |
| |
| __global__ void insert_kernel( |
| int* __restrict__ table_kvs, |
| const int* __restrict__ coords, |
| int num_keys, |
| int num_dims, |
| int table_capacity |
| ) { |
| const int idx = blockIdx.x * blockDim.x + threadIdx.x; |
| if (idx >= num_keys) return; |
| const int* key = &coords[idx * num_dims]; |
| int slot = CityHash::hash(key, num_dims, table_capacity); |
| const int begin = slot; |
| int attempts = 0; |
| while (attempts < table_capacity) { |
| int* marker_ptr = &table_kvs[slot * 2 + 0]; |
| const int prev = atomicCAS(marker_ptr, -1, slot); |
| if (prev == -1) { |
| table_kvs[slot * 2 + 1] = idx; |
| return; |
| } |
| slot = (slot + 1) % table_capacity; |
| if (slot == begin) return; |
| ++attempts; |
| } |
| } |
|
|
|
|
| |
| |
| |
| |
| |
| __global__ void search_kernel( |
| const int* __restrict__ table_kvs, |
| const int* __restrict__ coords, |
| const int* __restrict__ queries, |
| int num_queries, |
| int table_capacity, |
| int num_dims, |
| int* __restrict__ out_indices |
| ) { |
| const int idx = blockIdx.x * blockDim.x + threadIdx.x; |
| if (idx >= num_queries) return; |
| const int* q = &queries[idx * num_dims]; |
| out_indices[idx] = _search_hash_table(table_kvs, coords, q, table_capacity, num_dims); |
| } |
|
|
|
|
| |
| struct HashTableInt { |
| |
| |
| GPUMemory<int> table_kvs; |
| int capacity = 0; |
| const int* d_coords = nullptr; |
| int num_coords = 0; |
| int num_dims = 3; |
|
|
| void resize(int new_capacity) { |
| capacity = new_capacity; |
| table_kvs.resize((size_t)capacity * 2); |
| } |
|
|
| |
| void set_num_dims(int d) { num_dims = d; } |
|
|
| |
| |
| void prepare(cudaStream_t stream) { |
| if (capacity <= 0) return; |
| const uint32_t tpb = 256u; |
| const uint32_t blocks = (uint32_t)div_round_up(capacity, (int)tpb); |
| prepare_key_value_pairs_kernel<<<blocks, tpb, 0, stream>>>(table_kvs.data(), capacity); |
| } |
|
|
| |
| |
| void insert(const int* d_coords_in, int n_keys, cudaStream_t stream) { |
| d_coords = d_coords_in; |
| num_coords = n_keys; |
| if (capacity <= 0 || n_keys <= 0) return; |
| const uint32_t tpb = 256u; |
| const uint32_t blocks = (uint32_t)div_round_up(n_keys, (int)tpb); |
| insert_kernel<<<blocks, tpb, 0, stream>>>(table_kvs.data(), d_coords, n_keys, num_dims, capacity); |
| } |
|
|
|
|
| |
| |
| void build(const int* d_coords_in, int n_keys, cudaStream_t stream) { |
| |
| int desired_capacity = n_keys * 2; |
| if (desired_capacity < 16) desired_capacity = 16; |
| resize(desired_capacity); |
| prepare(stream); |
| insert(d_coords_in, n_keys, stream); |
| } |
|
|
| |
| |
| |
| void search(const int* d_queries, int n_queries, int* d_out_indices, cudaStream_t stream) const { |
| if (capacity <= 0 || n_queries <= 0) return; |
| const uint32_t tpb = 256u; |
| const uint32_t blocks = (uint32_t)div_round_up(n_queries, (int)tpb); |
| search_kernel<<<blocks, tpb, 0, stream>>>(table_kvs.data(), d_coords, d_queries, n_queries, capacity, num_dims, d_out_indices); |
| } |
| }; |
|
|
| } |