// TM anomaly kernel. // // Computes: // n_active = sum of sp_active_mask // anomaly = unpredicted_count / n_active (if n_active > 0) // = 0 (else) // // Launch: single block, 256 threads. extern "C" __global__ void tm_anomaly( const unsigned char * __restrict__ sp_active_mask, const unsigned int * __restrict__ unpredicted_count, float * __restrict__ anomaly_out, // (1,) or (t_slot,) unsigned int t_slot, unsigned int n_cols ) { const unsigned int tid = threadIdx.x; __shared__ unsigned int n_active_s; if (tid == 0) n_active_s = 0u; __syncthreads(); unsigned int local = 0u; for (unsigned int i = tid; i < n_cols; i += blockDim.x) { if (sp_active_mask[i]) local += 1u; } // Warp reduce. for (int off = 16; off > 0; off >>= 1) { local += __shfl_down_sync(0xffffffffu, local, off); } if ((tid & 31u) == 0) { atomicAdd(&n_active_s, local); } __syncthreads(); if (tid == 0) { unsigned int total = n_active_s; unsigned int bad = unpredicted_count[0]; float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f; anomaly_out[t_slot] = anom; } }