icarus112's picture
Upload folder using huggingface_hub
1c59946 verified
// TM anomaly kernel.
//
// Computes:
//   n_active = number of non-zero bytes in sp_active_mask[0 .. n_cols)
//   anomaly  = unpredicted_count[0] / n_active   if n_active > 0
//            = 0                                 otherwise
// and stores the result in anomaly_out[t_slot].
//
// Parameters:
//   sp_active_mask    n_cols bytes; every non-zero byte counts as one active entry.
//   unpredicted_count only element [0] is read (the numerator).
//   anomaly_out       element [t_slot] is written, so the buffer must hold at
//                     least t_slot + 1 floats.
//   t_slot            index of the output element to write.
//   n_cols            number of entries in sp_active_mask (0 is allowed).
//
// Launch: a single 1-D block (256 threads is the intended configuration).
// Only threadIdx.x / blockDim.x are used, so additional blocks would merely
// repeat the same computation and store the same value. Any blockDim.x >= 1
// is handled, including sizes that are not a multiple of the warp size.
// Shared memory: one statically allocated unsigned int.
// Uses __shfl_down_sync (the *_sync warp shuffles, CUDA 9+).
extern "C" __global__
void tm_anomaly(
    const unsigned char * __restrict__ sp_active_mask,
    const unsigned int  * __restrict__ unpredicted_count,
    float               * __restrict__ anomaly_out,   // element [t_slot] is written
    unsigned int t_slot,
    unsigned int n_cols
) {
    const unsigned int tid  = threadIdx.x;
    const unsigned int lane = tid & 31u;

    // Block-wide accumulator for the number of active entries.
    __shared__ unsigned int n_active_s;
    if (tid == 0u) n_active_s = 0u;
    __syncthreads();

    // Each thread counts the non-zero mask bytes at indices tid, tid + blockDim.x, ...
    unsigned int local = 0u;
    for (unsigned int i = tid; i < n_cols; i += blockDim.x) {
        if (sp_active_mask[i]) local += 1u;
    }

    // Number of lanes that really exist in this thread's warp. The last warp
    // of the block is partial when blockDim.x is not a multiple of 32; naming
    // non-existent lanes in the shuffle mask and summing what they "return"
    // would fold undefined values into the count.
    const unsigned int warp_base = tid - lane;
    const unsigned int remaining = blockDim.x - warp_base;   // >= 1 because tid < blockDim.x
    const unsigned int lanes     = (remaining < 32u) ? remaining : 32u;
    // Avoid the undefined 1u << 32 for a full warp.
    const unsigned int mask      = (lanes == 32u) ? 0xffffffffu : ((1u << lanes) - 1u);

    // Warp reduce: after the loop, lane 0 holds the sum over all live lanes.
    for (unsigned int off = 16u; off > 0u; off >>= 1) {
        const unsigned int other = __shfl_down_sync(mask, local, off);
        // Ignore values whose source lane lies beyond the last live lane.
        if (lane + off < lanes) local += other;
    }

    // One shared-memory atomic per warp.
    if (lane == 0u) {
        atomicAdd(&n_active_s, local);
    }
    __syncthreads();

    if (tid == 0u) {
        const unsigned int total = n_active_s;
        const unsigned int bad   = unpredicted_count[0];
        // Guard the division: no active entries yields an anomaly of 0.
        const float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
        anomaly_out[t_slot] = anom;
    }
}