| // TM anomaly kernel. | |
| // | |
| // Computes: | |
| // n_active = sum of sp_active_mask | |
| // anomaly = unpredicted_count / n_active (if n_active > 0) | |
| // = 0 (else) | |
| // | |
| // Launch: single block, 256 threads. | |
| extern "C" __global__ | |
| void tm_anomaly( | |
| const unsigned char * __restrict__ sp_active_mask, | |
| const unsigned int * __restrict__ unpredicted_count, | |
| float * __restrict__ anomaly_out, // (1,) or (t_slot,) | |
| unsigned int t_slot, | |
| unsigned int n_cols | |
| ) { | |
| const unsigned int tid = threadIdx.x; | |
| __shared__ unsigned int n_active_s; | |
| if (tid == 0) n_active_s = 0u; | |
| __syncthreads(); | |
| unsigned int local = 0u; | |
| for (unsigned int i = tid; i < n_cols; i += blockDim.x) { | |
| if (sp_active_mask[i]) local += 1u; | |
| } | |
| // Warp reduce. | |
| for (int off = 16; off > 0; off >>= 1) { | |
| local += __shfl_down_sync(0xffffffffu, local, off); | |
| } | |
| if ((tid & 31u) == 0) { | |
| atomicAdd(&n_active_s, local); | |
| } | |
| __syncthreads(); | |
| if (tid == 0) { | |
| unsigned int total = n_active_s; | |
| unsigned int bad = unpredicted_count[0]; | |
| float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f; | |
| anomaly_out[t_slot] = anom; | |
| } | |
| } | |