// TM anomaly kernel.
//
// Computes:
//   n_active = sum of sp_active_mask
//   anomaly  = unpredicted_count / n_active   (if n_active > 0)
//            = 0                              (else)
//
// Launch: single block, 256 threads.

extern "C" __global__
void tm_anomaly(
    const unsigned char * __restrict__ sp_active_mask,
    const unsigned int  * __restrict__ unpredicted_count,
    float               * __restrict__ anomaly_out,       // (1,) or (t_slot,)
    unsigned int          t_slot,
    unsigned int          n_cols
) {
    const unsigned int tid = threadIdx.x;
    __shared__ unsigned int n_active_s;

    if (tid == 0) n_active_s = 0u;
    __syncthreads();

    unsigned int local = 0u;
    for (unsigned int i = tid; i < n_cols; i += blockDim.x) {
        if (sp_active_mask[i]) local += 1u;
    }
    // Warp reduce.
    for (int off = 16; off > 0; off >>= 1) {
        local += __shfl_down_sync(0xffffffffu, local, off);
    }
    if ((tid & 31u) == 0) {
        atomicAdd(&n_active_s, local);
    }
    __syncthreads();

    if (tid == 0) {
        unsigned int total = n_active_s;
        unsigned int bad = unpredicted_count[0];
        float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
        anomaly_out[t_slot] = anom;
    }
}