#include <cuda_runtime.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// on stderr if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        const cudaError_t cudaCheckErr_ = (call);                             \
        if (cudaCheckErr_ != cudaSuccess) {                                   \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudaGetErrorString(cudaCheckErr_)            \
                      << std::endl;                                           \
            std::exit(1);                                                     \
        }                                                                     \
    } while (0)

// Accumulates the sum of squared differences between `predictions` and
// `targets` into `*sum`.
//
// Launch layout: 1D grid of 1D blocks (only the .x dimensions are used). Any
// grid size is valid because each thread walks the input with a grid-stride
// loop; any block size is valid because partially filled trailing warps are
// handled explicitly. No shared memory is required.
//
// Preconditions: `*sum` must be zeroed by the caller before the launch; the
// kernel only adds to it. `predictions` and `targets` must each hold at least
// `numElements` floats in device-accessible memory.
//
// Requires the *_sync warp shuffle intrinsics (CUDA 9+).
__global__ void mseKernel(const float* predictions, const float* targets,
                          size_t numElements, float* sum) {
    // Widen before multiplying so the flat index cannot overflow 32 bits.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    float partial = 0.0f;
    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < numElements; idx += stride) {
        const float diff = predictions[idx] - targets[idx];
        partial += diff * diff;
    }

    // Work out which lanes of this warp actually exist. The last warp of a
    // block is only partially populated when blockDim.x is not a multiple
    // of 32, and naming non-existent lanes in the mask is undefined.
    const unsigned lane = threadIdx.x & 31u;
    const unsigned warpBase = threadIdx.x - lane;
    const unsigned remaining = blockDim.x - warpBase;
    const unsigned lanesInWarp = remaining < 32u ? remaining : 32u;
    const unsigned mask =
        (lanesInWarp == 32u) ? 0xffffffffu : ((1u << lanesInWarp) - 1u);

    // Tree reduction within the warp. Values shuffled in from lanes outside
    // the populated range are undefined, so they are discarded.
    for (unsigned offset = 16u; offset > 0u; offset >>= 1) {
        const float other = __shfl_down_sync(mask, partial, offset);
        if (lane + offset < lanesInWarp) {
            partial += other;
        }
    }

    // One atomic per warp instead of one per element keeps contention on the
    // single accumulator low.
    if (lane == 0u) {
        atomicAdd(sum, partial);
    }
}

// Reads exactly `size` floats of raw binary data from `filename` into `data`.
// Prints a message to stderr and exits with status 1 if the file cannot be
// opened or contains fewer than `size * sizeof(float)` bytes.
void read_binary(const std::string& filename, float* data, size_t size) {
    std::ifstream in(filename, std::ios::binary);
    if (!in) {
        std::cerr << "Cannot open file: " << filename << std::endl;
        exit(1);
    }
    const std::streamsize bytes =
        static_cast<std::streamsize>(size * sizeof(float));
    in.read(reinterpret_cast<char*>(data), bytes);
    // A truncated file would otherwise leave part of `data` uninitialized.
    if (in.gcount() != bytes) {
        std::cerr << "Short read from file: " << filename << " (expected "
                  << bytes << " bytes, got " << in.gcount() << ")"
                  << std::endl;
        exit(1);
    }
}

// Returns true when `a` and `b` differ by strictly less than `tol`
// (absolute tolerance). Returns false if either value is NaN.
bool compare_scalar(float a, float b, float tol = 1e-1f) {
    return std::fabs(a - b) < tol;
}

// Runs the MSE kernel on five test cases read from data/mse_*_<k>.bin and
// compares each result with the stored reference value. Prints "F" and stops
// at the first mismatch; prints "T" if every case matches.
int main() {
    const std::vector<size_t> sizes = {1 << 10, 1 << 12, 1 << 14, 1 << 16,
                                       1 << 18};

    // Allocate device buffers once, sized for the largest case, and reuse
    // them for every test instead of allocating and freeing per iteration.
    const size_t maxN = *std::max_element(sizes.begin(), sizes.end());
    float* d_preds = nullptr;
    float* d_targets = nullptr;
    float* d_sum = nullptr;
    CUDA_CHECK(cudaMalloc(&d_preds, maxN * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_targets, maxN * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_sum, sizeof(float)));

    bool all_passed = true;

    for (size_t t = 0; t < sizes.size(); ++t) {
        const size_t N = sizes[t];
        const size_t input_size = N * sizeof(float);

        // Test-case files are numbered starting from 1.
        const std::string suffix = std::to_string(t + 1) + ".bin";
        const std::string pred_file = "data/mse_preds_" + suffix;
        const std::string target_file = "data/mse_targets_" + suffix;
        const std::string ref_file = "data/mse_ref_" + suffix;

        std::vector<float> h_preds(N);
        std::vector<float> h_targets(N);
        float h_mse_ref = 0.0f;

        read_binary(pred_file, h_preds.data(), N);
        read_binary(target_file, h_targets.data(), N);
        read_binary(ref_file, &h_mse_ref, 1);

        CUDA_CHECK(cudaMemcpy(d_preds, h_preds.data(), input_size,
                              cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_targets, h_targets.data(), input_size,
                              cudaMemcpyHostToDevice));
        // The kernel only accumulates, so the sum must start at zero.
        CUDA_CHECK(cudaMemset(d_sum, 0, sizeof(float)));

        const int threads = 256;
        const int blocks = static_cast<int>((N + threads - 1) / threads);
        mseKernel<<<blocks, threads>>>(d_preds, d_targets, N, d_sum);
        CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

        // The blocking copy on the default stream waits for the kernel and
        // reports any error raised while it executed.
        float h_sum = 0.0f;
        CUDA_CHECK(cudaMemcpy(&h_sum, d_sum, sizeof(float),
                              cudaMemcpyDeviceToHost));

        const float mse = h_sum / static_cast<float>(N);

        if (!compare_scalar(mse, h_mse_ref)) {
            std::cout << "F" << std::endl;
            all_passed = false;
            break;
        }
    }

    CUDA_CHECK(cudaFree(d_preds));
    CUDA_CHECK(cudaFree(d_targets));
    CUDA_CHECK(cudaFree(d_sum));

    if (all_passed) std::cout << "T" << std::endl;

    return 0;
}