File size: 1,272 Bytes
1c59946 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | // TM anomaly kernel.
//
// Computes:
// n_active = sum of sp_active_mask
// anomaly = unpredicted_count / n_active (if n_active > 0)
// = 0 (else)
//
// Launch: single block, 256 threads.
extern "C" __global__
void tm_anomaly(
const unsigned char * __restrict__ sp_active_mask,
const unsigned int * __restrict__ unpredicted_count,
float * __restrict__ anomaly_out, // (1,) or (t_slot,)
unsigned int t_slot,
unsigned int n_cols
) {
const unsigned int tid = threadIdx.x;
__shared__ unsigned int n_active_s;
if (tid == 0) n_active_s = 0u;
__syncthreads();
unsigned int local = 0u;
for (unsigned int i = tid; i < n_cols; i += blockDim.x) {
if (sp_active_mask[i]) local += 1u;
}
// Warp reduce.
for (int off = 16; off > 0; off >>= 1) {
local += __shfl_down_sync(0xffffffffu, local, off);
}
if ((tid & 31u) == 0) {
atomicAdd(&n_active_s, local);
}
__syncthreads();
if (tid == 0) {
unsigned int total = n_active_s;
unsigned int bad = unpredicted_count[0];
float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
anomaly_out[t_slot] = anom;
}
}
|