Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

App Files Files Community

feather-a10g-large-runtime / overlay /htm_rust /src /gpu /kernels /sp_topk.cu

icarus112

Upload folder using huggingface_hub

1c59946 verified 16 days ago

raw

history blame contribute delete

4.3 kB

	// Top-K column selection.
	//
	// Inputs:
	// boosted[n_columns] : f32 score
	// Output:
	// active_mask[n_columns] : u8 0/1, exactly k ones
	//
	// Tie-breaking: when scores are equal, the LOWER column index wins (matches
	// CPU reference `select_nth_unstable_by` with secondary index comparator).
	//
	// Strategy: a single-block implementation. n_columns is typically 2048, which
	// fits comfortably in shared memory. One candidate design (NOT what the kernel
	// below implements; see the last paragraph of this header) is a bitonic/radix
	// top-k over the (score, -index) key. At k≈41 of n=2048 the simplest correct
	// form of that design is a thresholding pass:
	//
	// 1. Radix-like bucket pass to find the k-th largest score.
	// 2. Mark winners = strictly-greater-than-threshold AND ties until count hits k.
	//
	// For strict index-ordered tie-break we materialise a 64-bit key:
	// key = (float_to_sortable_u32(score) << 32) | (0xffffffff - index)
	// Larger key = (higher score) OR (same score, smaller index).
	//
	// Then we find the k-th largest 64-bit key via radix-select and mark all
	// columns whose key >= threshold. This is O(n_cols * log k) and well under
	// 100 μs for n=2048, k=41 on sm_86.
	//
	// For simplicity and correctness this kernel uses a single-block parallel
	// selection sort variant (find max → mark → zero → repeat, k iterations).
	// At k=41 this is 41 passes of 2048 threads = ~2048*41 = 84K ops, trivially
	// fast.

	// Warp-level argmax over (score, index) pairs.
	//
	// After the call, lane 0 of the warp holds the highest score among the
	// participating lanes and, for equal scores, the lowest index. Other lanes
	// hold partial results and must not be used.
	//
	//   score, index : this lane's candidate (in) / reduced candidate (out)
	//   member_mask  : bitmask of the lanes of this warp that exist in the block;
	//                  every one of them must call this function
	//   n_lanes      : number of lanes set in member_mask (1..32, contiguous from 0)
	//   lane         : this thread's lane id (0..31)
	static __device__ __forceinline__ void sp_topk_warp_argmax(
	    float &score,
	    int &index,
	    unsigned int member_mask,
	    unsigned int n_lanes,
	    unsigned int lane
	) {
	    for (unsigned int off = 16; off > 0; off >>= 1) {
	        const float other_s = __shfl_down_sync(member_mask, score, off);
	        const int other_i = __shfl_down_sync(member_mask, index, off);
	        // A value shuffled in from a lane outside the member mask is
	        // undefined, so only accept candidates from lanes that really exist.
	        const bool from_member = (lane + off) < n_lanes;
	        if (from_member &&
	            (other_s > score || (other_s == score && other_i < index))) {
	            score = other_s;
	            index = other_i;
	        }
	    }
	}

	// Top-K column selection by k rounds of block-wide argmax.
	//
	// Each round finds the best remaining column (highest score; lowest index on
	// ties), sets its byte in active_out to 1 and retires it from further rounds.
	//
	// Parameters:
	//   scores      : n_columns input scores (read-only)
	//   n_columns   : number of columns
	//   k           : number of columns to select; values above n_columns are
	//                 clamped to n_columns
	//   active_out  : n_columns output bytes; fully overwritten with 0/1
	//
	// Launch requirements:
	//   * one 1-D thread block of at most 1024 threads (only threadIdx.x is used);
	//     any additional blocks in the grid exit immediately
	//   * dynamic shared memory of at least n_columns * sizeof(float) bytes
	//
	// NaN scores never win a comparison and are therefore never selected; if
	// fewer than k columns have a non-NaN score, fewer than k bytes are set.
	extern "C" __global__
	void sp_topk_select(
	    const float * __restrict__ scores,        // (n_columns,)
	    unsigned int n_columns,
	    unsigned int k,
	    unsigned char * __restrict__ active_out   // (n_columns,)
	) {
	    // The algorithm is single-block. Extra blocks would redo the same work on
	    // the same buffers and their zero-initialisation could race with another
	    // block's marking, so they leave before touching memory. The branch is
	    // uniform per block, which keeps the barriers below safe.
	    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) {
	        return;
	    }

	    // Dynamic shared memory: working copy of the scores, n_columns floats.
	    extern __shared__ float smem[];
	    float * work = smem;

	    // Static shared memory: one (score, index) partial result per warp.
	    __shared__ float warp_s[32];
	    __shared__ int warp_i[32];

	    const unsigned int tid = threadIdx.x;
	    const unsigned int bsz = blockDim.x;
	    const unsigned int lane = tid & 31;
	    const unsigned int warp = tid >> 5;
	    const unsigned int nwarps = (bsz + 31) / 32;

	    // The last warp is partial when blockDim.x is not a multiple of 32. Only
	    // existing lanes may be named in the mask passed to __shfl_down_sync.
	    const unsigned int lanes_left = bsz - (warp << 5);
	    const unsigned int n_lanes = (lanes_left < 32u) ? lanes_left : 32u;
	    const unsigned int member_mask =
	        (n_lanes == 32u) ? 0xffffffffu : ((1u << n_lanes) - 1u);

	    // Retired entries are overwritten with a quiet NaN. NaN fails both `>` and
	    // `==`, so a retired column can never be picked again, while columns whose
	    // real score is -inf remain selectable (in index order).
	    const float retired = __int_as_float(0x7fc00000);

	    // Never run more rounds than there are columns.
	    const unsigned int rounds = (k < n_columns) ? k : n_columns;

	    // Load scores into shared memory and clear the output mask.
	    for (unsigned int c = tid; c < n_columns; c += bsz) {
	        work[c] = scores[c];
	        active_out[c] = 0;
	    }
	    __syncthreads();

	    for (unsigned int round = 0; round < rounds; ++round) {
	        // Per-thread scan: best (score, index) over this thread's columns.
	        float best_s = -INFINITY;
	        int best_i = (int)n_columns;  // sentinel larger than any valid index

	        for (unsigned int c = tid; c < n_columns; c += bsz) {
	            const float s = work[c];
	            if (s > best_s || (s == best_s && (int)c < best_i)) {
	                best_s = s;
	                best_i = (int)c;
	            }
	        }

	        // Stage 1: reduce within each warp; lane 0 publishes the warp result.
	        sp_topk_warp_argmax(best_s, best_i, member_mask, n_lanes, lane);
	        if (lane == 0) {
	            warp_s[warp] = best_s;
	            warp_i[warp] = best_i;
	        }
	        __syncthreads();

	        // Stage 2: warp 0 reduces the per-warp results. Lanes without a
	        // corresponding warp contribute the "nothing found" sentinel.
	        if (warp == 0) {
	            float s = (lane < nwarps) ? warp_s[lane] : -INFINITY;
	            int i = (lane < nwarps) ? warp_i[lane] : (int)n_columns;
	            sp_topk_warp_argmax(s, i, member_mask, n_lanes, lane);

	            // i == n_columns means no selectable column is left.
	            if (lane == 0 && i < (int)n_columns) {
	                active_out[i] = 1;
	                work[i] = retired;
	            }
	        }
	        // Publishes the retired entry to all threads before the next scan and
	        // keeps the next round's warp_s / warp_i writes behind warp 0's reads.
	        __syncthreads();
	    }
	}