| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <cooperative_groups.h> |
| #include <cooperative_groups/memcpy_async.h> |
|
|
| namespace cg = cooperative_groups; |
|
|
| |
| |
| |
| |
| |
| #define COLS_PER_CLUSTER_BLOCK_MAX 256u |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| #define INPUT_BITS_MAX 32768u |
|
|
| extern "C" { |
|
|
| struct FusedPtrs { |
| unsigned long long syn_bit; |
| unsigned long long syn_perm; |
| unsigned long long boost; |
| unsigned long long active_duty; |
| unsigned long long inhibition_threshold; |
| unsigned long long seg_cell_id; |
| unsigned long long seg_syn_count; |
| unsigned long long syn_presyn; |
| unsigned long long tm_syn_perm; |
| unsigned long long cell_seg_count; |
| unsigned long long cell_active_a; |
| unsigned long long cell_active_b; |
| unsigned long long cell_winner_a; |
| unsigned long long cell_winner_b; |
| unsigned long long inputs; |
| unsigned long long cols_out; |
| unsigned long long anom_out; |
| unsigned long long barrier_counters; |
| unsigned long long step_scratch; |
| }; |
|
|
| struct FusedConfig { |
| |
| unsigned int input_bits; |
| unsigned int n_columns; |
| unsigned int synapses_per_col; |
| float conn_thr; |
| float sp_inc; |
| float sp_dec; |
| float sparsity_target; |
| float duty_alpha; |
| float thr_adapt_rate; |
| |
| unsigned int cells_per_column; |
| unsigned int n_cells; |
| unsigned int bits_words; |
| unsigned int max_segments_per_cell; |
| unsigned int synapses_per_segment; |
| unsigned int activation_threshold; |
| unsigned int learning_threshold; |
| unsigned int max_new_synapses; |
| int conn_thr_i16; |
| int perm_inc_i16; |
| int perm_dec_i16; |
| int predicted_seg_dec_i16; |
| int initial_perm_i16; |
| |
| unsigned int T; |
| unsigned int learn; |
| unsigned int iter_seed; |
| unsigned int cooperative_grid_sync; |
| }; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| __device__ static inline void fused_grid_barrier(cg::grid_group grid, |
| unsigned int * , |
| unsigned int , |
| unsigned int , |
| unsigned int ) { |
| #if __CUDA_ARCH__ >= 900 |
| |
| auto cluster = cg::this_cluster(); |
| cluster.sync(); |
| #else |
| |
| |
| |
| grid.sync(); |
| #endif |
| } |
|
|
| __device__ static inline unsigned int warp_sum_u32(unsigned int v) { |
| for (int off = 16; off > 0; off >>= 1) { |
| v += __shfl_down_sync(0xffffffffu, v, off); |
| } |
| return v; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| __device__ static inline |
| void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) { |
| cg::grid_group grid = cg::this_grid(); |
| |
| const unsigned int * __restrict__ syn_bit = (const unsigned int*)P.syn_bit; |
| float * __restrict__ syn_perm = (float*)P.syn_perm; |
| float * __restrict__ boost = (float*)P.boost; |
| float * __restrict__ active_duty = (float*)P.active_duty; |
| float * __restrict__ inhibition_threshold = (float*)P.inhibition_threshold; |
| unsigned int * __restrict__ seg_cell_id = (unsigned int*)P.seg_cell_id; |
| unsigned int * __restrict__ seg_syn_count = (unsigned int*)P.seg_syn_count; |
| unsigned int * __restrict__ syn_presyn = (unsigned int*)P.syn_presyn; |
| short * __restrict__ tm_syn_perm = (short*)P.tm_syn_perm; |
| unsigned int * __restrict__ cell_seg_count = (unsigned int*)P.cell_seg_count; |
| unsigned int * __restrict__ cell_active_a = (unsigned int*)P.cell_active_a; |
| unsigned int * __restrict__ cell_active_b = (unsigned int*)P.cell_active_b; |
| unsigned int * __restrict__ cell_winner_a = (unsigned int*)P.cell_winner_a; |
| unsigned int * __restrict__ cell_winner_b = (unsigned int*)P.cell_winner_b; |
| const unsigned char * __restrict__ inputs = (const unsigned char*)P.inputs; |
| unsigned char * __restrict__ cols_out = (unsigned char*)P.cols_out; |
| float * __restrict__ anom_out = (float*)P.anom_out; |
| unsigned int * __restrict__ barrier_counters = (unsigned int*)P.barrier_counters; |
| unsigned int * __restrict__ step_scratch = (unsigned int*)P.step_scratch; |
|
|
| const unsigned int tid = threadIdx.x; |
| const unsigned int lane = tid & 31u; |
| const unsigned int warp = tid >> 5; |
| const unsigned int warps_per_block = blockDim.x >> 5; |
| const unsigned int gwarp = blockIdx.x * warps_per_block + warp; |
| const unsigned int n_warps = gridDim.x * warps_per_block; |
|
|
| const unsigned int n_cols = cfg.n_columns; |
| const unsigned int col_lo = (gwarp * n_cols) / n_warps; |
| const unsigned int col_hi = ((gwarp + 1) * n_cols) / n_warps; |
|
|
| unsigned int phase = 0u; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #if __CUDA_ARCH__ >= 900 |
| |
| auto cluster = cg::this_cluster(); |
| const unsigned int cluster_block_rank = cluster.block_rank(); |
| const unsigned int cluster_sz = cluster.num_blocks(); |
| #else |
| |
| const unsigned int cluster_block_rank = blockIdx.x; |
| const unsigned int cluster_sz = gridDim.x; |
| #endif |
|
|
| |
| |
| const unsigned int cols_per_block = |
| (n_cols + cluster_sz - 1u) / cluster_sz; |
| const unsigned int my_col_start = |
| cluster_block_rank * cols_per_block; |
| const unsigned int my_col_end = |
| (my_col_start + cols_per_block < n_cols) |
| ? (my_col_start + cols_per_block) : n_cols; |
|
|
| #if __CUDA_ARCH__ >= 900 |
| |
| |
| |
| __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX]; |
| __shared__ float s_boost [COLS_PER_CLUSTER_BLOCK_MAX]; |
| __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX]; |
| #endif |
|
|
| |
| |
| |
| |
| |
| |
| |
| #if __CUDA_ARCH__ >= 900 |
| __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX]; |
| #endif |
|
|
| #if __CUDA_ARCH__ >= 900 |
| |
| |
| for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) { |
| const unsigned int off = c - my_col_start; |
| s_inhib_thr [off] = inhibition_threshold[c]; |
| s_boost [off] = boost[c]; |
| s_active_duty[off] = active_duty[c]; |
| } |
|
|
| |
| |
| cluster.sync(); |
| #else |
| |
| |
| grid.sync(); |
| #endif |
|
|
| const unsigned int S = cfg.synapses_per_col; |
| const unsigned int cpc = cfg.cells_per_column; |
| const unsigned int SPS = cfg.synapses_per_segment; |
| const unsigned int MSC = cfg.max_segments_per_cell; |
|
|
| |
| for (unsigned int t = 0u; t < cfg.T; t++) { |
| const unsigned int inp_off = t * cfg.input_bits; |
| const unsigned int col_base_out = t * n_cols; |
|
|
| unsigned int * curr_active = (t & 1u) ? cell_active_b : cell_active_a; |
| unsigned int * prev_active = (t & 1u) ? cell_active_a : cell_active_b; |
| unsigned int * curr_winner = (t & 1u) ? cell_winner_b : cell_winner_a; |
| unsigned int * prev_winner = (t & 1u) ? cell_winner_a : cell_winner_b; |
|
|
| |
| const unsigned int my_cell_lo = col_lo * cpc; |
| const unsigned int my_cell_hi = col_hi * cpc; |
| if (cpc == 32u) { |
| |
| for (unsigned int c = col_lo + lane; c < col_hi; c += 32u) { |
| curr_active[c] = 0u; |
| curr_winner[c] = 0u; |
| } |
| } else { |
| for (unsigned int cell = my_cell_lo + lane; cell < my_cell_hi; cell += 32u) { |
| unsigned int w = cell >> 5; |
| unsigned int m = 1u << (cell & 31u); |
| atomicAnd(&curr_active[w], ~m); |
| atomicAnd(&curr_winner[w], ~m); |
| } |
| } |
|
|
| |
| if (blockIdx.x == 0u && tid == 0u) { |
| step_scratch[0] = 0u; |
| step_scratch[1] = 0u; |
| } |
|
|
| |
| |
| |
| __threadfence(); |
| fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync); |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| #if __CUDA_ARCH__ >= 900 |
| const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX); |
| if (use_input_tile) { |
| auto tb = cg::this_thread_block(); |
| cg::memcpy_async(tb, s_input_tile, |
| inputs + inp_off, |
| cfg.input_bits); |
| cg::wait(tb); |
| cluster.sync(); |
| } |
| #else |
| const bool use_input_tile = false; |
| #endif |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| for (unsigned int c = col_lo; c < col_hi; c++) { |
| unsigned int base = c * S; |
| unsigned int local = 0u; |
| for (unsigned int s = lane; s < S; s += 32u) { |
| unsigned int b = syn_bit[base + s]; |
| float p = syn_perm[base + s]; |
| |
| |
| #if __CUDA_ARCH__ >= 900 |
| unsigned int inp_byte = use_input_tile |
| ? (unsigned int)s_input_tile[b] |
| : (unsigned int)inputs[inp_off + b]; |
| #else |
| unsigned int inp_byte = (unsigned int)inputs[inp_off + b]; |
| #endif |
| unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u; |
| local += hit; |
| } |
| unsigned int overlap = warp_sum_u32(local); |
| overlap = __shfl_sync(0xffffffffu, overlap, 0); |
|
|
| |
| #if __CUDA_ARCH__ >= 900 |
| |
| const unsigned int owner_block = c / cols_per_block; |
| const unsigned int owner_offset = c - owner_block * cols_per_block; |
| float boost_val = cluster.map_shared_rank(s_boost, owner_block)[owner_offset]; |
| float thr = cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset]; |
| #else |
| |
| float boost_val = boost[c]; |
| float thr = inhibition_threshold[c]; |
| #endif |
|
|
| float boosted = (float)overlap * boost_val; |
| unsigned int is_active = (boosted > thr) ? 1u : 0u; |
|
|
| if (lane == 0) { |
| cols_out[col_base_out + c] = (unsigned char)is_active; |
| if (is_active) { |
| atomicAdd(&step_scratch[0], 1u); |
| } |
| } |
|
|
| |
| |
| if (cfg.learn && is_active) { |
| for (unsigned int s = lane; s < S; s += 32u) { |
| unsigned int b = syn_bit[base + s]; |
| float p = syn_perm[base + s]; |
| #if __CUDA_ARCH__ >= 900 |
| unsigned int inp_byte = use_input_tile |
| ? (unsigned int)s_input_tile[b] |
| : (unsigned int)inputs[inp_off + b]; |
| #else |
| unsigned int inp_byte = (unsigned int)inputs[inp_off + b]; |
| #endif |
| if (inp_byte != 0u) { |
| p += cfg.sp_inc; |
| if (p > 1.0f) p = 1.0f; |
| } else { |
| p -= cfg.sp_dec; |
| if (p < 0.0f) p = 0.0f; |
| } |
| syn_perm[base + s] = p; |
| } |
| } |
|
|
| |
| |
| if (lane == 0) { |
| #if __CUDA_ARCH__ >= 900 |
| float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset]; |
| #else |
| float ad = active_duty[c]; |
| #endif |
| float sample = is_active ? 1.0f : 0.0f; |
| ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample; |
|
|
| #if __CUDA_ARCH__ >= 900 |
| |
| cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad; |
| #endif |
| active_duty[c] = ad; |
|
|
| |
| float err = ad - cfg.sparsity_target; |
| float new_thr = thr + cfg.thr_adapt_rate * err * 100.0f; |
| if (new_thr < 0.1f) new_thr = 0.1f; |
| if (new_thr > 1000.0f) new_thr = 1000.0f; |
|
|
| #if __CUDA_ARCH__ >= 900 |
| |
| cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr; |
| #endif |
| inhibition_threshold[c] = new_thr; |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| #if __CUDA_ARCH__ >= 900 |
| cluster.sync(); |
| #endif |
|
|
| |
| |
| |
| __threadfence(); |
| fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync); |
|
|
| |
| |
| |
| for (unsigned int c = col_lo; c < col_hi; c++) { |
| unsigned int col_active = cols_out[col_base_out + c]; |
| if (col_active == 0u) continue; |
|
|
| unsigned int base_cell = c * cpc; |
| unsigned int any_predicted = 0u; |
| unsigned int best_seg_id_for_grow = 0xFFFFFFFFu; |
| unsigned int best_pot_count = 0u; |
|
|
| for (unsigned int k = 0u; k < cpc; k++) { |
| unsigned int cell = base_cell + k; |
| unsigned int n_segs_here = cell_seg_count[cell]; |
| if (n_segs_here > MSC) n_segs_here = MSC; |
| if (n_segs_here == 0u) continue; |
|
|
| unsigned int seg_base_id = cell * MSC; |
| unsigned int cell_is_predictive = 0u; |
|
|
| for (unsigned int ls = 0u; ls < n_segs_here; ls++) { |
| unsigned int seg = seg_base_id + ls; |
| unsigned int n_syn = seg_syn_count[seg]; |
| if (n_syn == 0u) continue; |
| unsigned int syn_base = seg * SPS; |
|
|
| unsigned int l_conn = 0u; |
| unsigned int l_pot = 0u; |
| for (unsigned int s = lane; s < n_syn; s += 32u) { |
| unsigned int presyn = syn_presyn[syn_base + s]; |
| unsigned int w = prev_active[presyn >> 5]; |
| unsigned int bit = (w >> (presyn & 31u)) & 1u; |
| if (bit) { |
| l_pot += 1u; |
| int p = (int)tm_syn_perm[syn_base + s]; |
| if (p >= cfg.conn_thr_i16) l_conn += 1u; |
| } |
| } |
| unsigned int tot_conn = warp_sum_u32(l_conn); |
| unsigned int tot_pot = warp_sum_u32(l_pot); |
| tot_conn = __shfl_sync(0xffffffffu, tot_conn, 0); |
| tot_pot = __shfl_sync(0xffffffffu, tot_pot, 0); |
|
|
| if (tot_conn >= cfg.activation_threshold) cell_is_predictive = 1u; |
| if (tot_pot >= cfg.learning_threshold && tot_pot > best_pot_count) { |
| best_pot_count = tot_pot; |
| best_seg_id_for_grow = seg; |
| } |
|
|
| |
| if (cfg.learn && tot_conn >= cfg.activation_threshold) { |
| for (unsigned int s = lane; s < n_syn; s += 32u) { |
| unsigned int presyn = syn_presyn[syn_base + s]; |
| unsigned int w = prev_active[presyn >> 5]; |
| unsigned int bit = (w >> (presyn & 31u)) & 1u; |
| int p = (int)tm_syn_perm[syn_base + s]; |
| if (bit) { |
| int np = p + cfg.perm_inc_i16; |
| if (np > 32767) np = 32767; |
| tm_syn_perm[syn_base + s] = (short)np; |
| } else { |
| int np = p - cfg.perm_dec_i16; |
| if (np < 0) np = 0; |
| tm_syn_perm[syn_base + s] = (short)np; |
| } |
| } |
| } |
| } |
|
|
| if (cell_is_predictive) { |
| any_predicted = 1u; |
| if (lane == 0) { |
| unsigned int w = cell >> 5; |
| unsigned int m = 1u << (cell & 31u); |
| atomicOr(&curr_active[w], m); |
| atomicOr(&curr_winner[w], m); |
| } |
| } |
| } |
|
|
| |
| if (!any_predicted) { |
| if (lane == 0) { |
| for (unsigned int k = 0u; k < cpc; k++) { |
| unsigned int cell = base_cell + k; |
| unsigned int w = cell >> 5; |
| unsigned int m = 1u << (cell & 31u); |
| atomicOr(&curr_active[w], m); |
| } |
| unsigned int win = base_cell; |
| unsigned int ww = win >> 5; |
| unsigned int wm = 1u << (win & 31u); |
| atomicOr(&curr_winner[ww], wm); |
| atomicAdd(&step_scratch[1], 1u); |
| } |
|
|
| if (cfg.learn) { |
| unsigned int target_seg; |
| unsigned int existing_syn; |
| if (best_seg_id_for_grow != 0xFFFFFFFFu) { |
| |
| target_seg = best_seg_id_for_grow; |
| existing_syn = seg_syn_count[target_seg]; |
| target_seg = __shfl_sync(0xffffffffu, target_seg, 0); |
| existing_syn = __shfl_sync(0xffffffffu, existing_syn, 0); |
|
|
| |
| unsigned int syn_base = target_seg * SPS; |
| for (unsigned int s = lane; s < existing_syn; s += 32u) { |
| unsigned int presyn = syn_presyn[syn_base + s]; |
| unsigned int w = prev_active[presyn >> 5]; |
| unsigned int bit = (w >> (presyn & 31u)) & 1u; |
| int p = (int)tm_syn_perm[syn_base + s]; |
| if (bit) { |
| int np = p + cfg.perm_inc_i16; |
| if (np > 32767) np = 32767; |
| tm_syn_perm[syn_base + s] = (short)np; |
| } else { |
| int np = p - cfg.perm_dec_i16; |
| if (np < 0) np = 0; |
| tm_syn_perm[syn_base + s] = (short)np; |
| } |
| } |
| } else { |
| |
| unsigned int new_seg = 0u; |
| if (lane == 0) { |
| unsigned int winner_cell = base_cell; |
| unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u); |
| if (slot >= MSC) slot = slot % MSC; |
| new_seg = winner_cell * MSC + slot; |
| seg_cell_id[new_seg] = winner_cell; |
| seg_syn_count[new_seg] = 0u; |
| } |
| target_seg = __shfl_sync(0xffffffffu, new_seg, 0); |
| existing_syn = 0u; |
| } |
|
|
| |
| unsigned int room = (SPS > existing_syn) ? (SPS - existing_syn) : 0u; |
| unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room; |
| if (lane == 0 && max_grow > 0u) { |
| unsigned int syn_base = target_seg * SPS; |
| unsigned int grown = 0u; |
| unsigned int start_off = (c * 2654435761u + cfg.iter_seed + t) % cfg.bits_words; |
| for (unsigned int w_off = 0u; |
| w_off < cfg.bits_words && grown < max_grow; |
| w_off++) { |
| unsigned int widx = (start_off + w_off) % cfg.bits_words; |
| unsigned int word = prev_winner[widx]; |
| while (word != 0u && grown < max_grow) { |
| unsigned int bit_pos = __ffs(word) - 1u; |
| word &= ~(1u << bit_pos); |
| unsigned int cell_id = widx * 32u + bit_pos; |
| if (cell_id >= cfg.n_cells) continue; |
| bool exists = false; |
| for (unsigned int es = 0u; es < existing_syn + grown; es++) { |
| if (syn_presyn[syn_base + es] == cell_id) { exists = true; break; } |
| } |
| if (exists) continue; |
| unsigned int write_idx = existing_syn + grown; |
| if (write_idx >= SPS) break; |
| syn_presyn[syn_base + write_idx] = cell_id; |
| tm_syn_perm[syn_base + write_idx] = (short)cfg.initial_perm_i16; |
| grown++; |
| } |
| } |
| if (grown > 0u) { |
| seg_syn_count[target_seg] = existing_syn + grown; |
| } |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| __threadfence(); |
| fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync); |
|
|
| |
| if (blockIdx.x == 0u && tid == 0u) { |
| unsigned int total = step_scratch[0]; |
| unsigned int bad = step_scratch[1]; |
| float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f; |
| anom_out[t] = anom; |
| } |
| } |
| } |
|
|
| |
| __global__ __launch_bounds__(256, 2) |
| void htm_fused_step(FusedPtrs P, FusedConfig cfg) { |
| htm_fused_step_body(P, cfg); |
| } |
|
|
| |
| |
| |
| __global__ __launch_bounds__(256, 2) |
| void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) { |
| const FusedPtrs P = P_arr[blockIdx.y]; |
| htm_fused_step_body(P, cfg); |
| } |
|
|
| } |
|
|