Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

App Files Files Community

feather-a10g-large-runtime / overlay /htm_rust /src /gpu /kernels /sp_overlap.cu

icarus112

Upload folder using huggingface_hub

1c59946 verified 9 days ago

raw

history blame contribute delete

2.91 kB

	// SP overlap kernel.
	//
	// For each column c (one CUDA block), compute:
	// overlap[c] = sum over its synapse list of {inp[bit[c][s]] && perm[c][s] >= conn_thr}
	// boosted[c] = overlap[c] * boost[c]
	// raw_overlap[c] = overlap[c] (also returned so host can drive duty cycle)
	//
	// Memory layout (flat, column-major with per-column stride = synapses_per_col):
	// syn_bit[c * S + s] : u32 index into input SDR
	// syn_perm[c * S + s] : f32 permanence in [0, 1]
	// boost[c] : f32
	// inp[b] : u8 0/1
	// Output:
	// raw[c] : u32
	// boosted[c] : f32
	//
	// Launch:
	// grid = n_columns
	// block = 128 (or 256) — one warp-sweep across synapses; many warps give
	// parallel reduction across S (typically S=40).
	//
	// At S=40 this is completely latency-bound; we coalesce reads and do a
	// warp-shuffle reduction. For clarity we use a simple block-wide shared-mem
	// reduction which is sufficient for S <= 1024 and has zero correctness risk.

	extern "C" __global__
	void sp_overlap(
	const unsigned char * __restrict__ inp, // (input_bits,)
	const unsigned int * __restrict__ syn_bit, // (n_columns * S,)
	const float * __restrict__ syn_perm,// (n_columns * S,)
	const float * __restrict__ boost, // (n_columns,)
	float conn_thr,
	unsigned int synapses_per_col, // S
	unsigned int n_columns,
	unsigned int * __restrict__ raw_out, // (n_columns,)
	float * __restrict__ boosted_out // (n_columns,)
	) {
	const unsigned int c = blockIdx.x;
	if (c >= n_columns) return;

	const unsigned int base = c * synapses_per_col;
	const unsigned int tid = threadIdx.x;
	const unsigned int bsz = blockDim.x;

	// Per-thread partial count.
	unsigned int local = 0;
	for (unsigned int s = tid; s < synapses_per_col; s += bsz) {
	unsigned int b = syn_bit[base + s];
	float p = syn_perm[base + s];
	// Branchless: only counts when input active AND perm connected.
	// Using (inp != 0) to tolerate u8 layout.
	unsigned int hit = ((inp[b] != 0) && (p >= conn_thr)) ? 1u : 0u;
	local += hit;
	}

	// Block-wide reduction in shared memory.
	__shared__ unsigned int smem[32];

	// Warp-level reduction via shuffle.
	unsigned int lane = tid & 31;
	unsigned int warp = tid >> 5;
	for (int off = 16; off > 0; off >>= 1) {
	local += __shfl_down_sync(0xffffffff, local, off);
	}
	if (lane == 0) smem[warp] = local;
	__syncthreads();

	if (warp == 0) {
	unsigned int v = (tid < (bsz + 31) / 32) ? smem[lane] : 0;
	for (int off = 16; off > 0; off >>= 1) {
	v += __shfl_down_sync(0xffffffff, v, off);
	}
	if (tid == 0) {
	raw_out[c] = v;
	boosted_out[c] = (float)v * boost[c];
	}
	}
	}