#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <cuda_runtime.h>
// Accumulates the sum of squared differences between predictions and targets
// into *sum (caller must zero *sum before launch; divide by numElements on the
// host to obtain the MSE).
//
// Launch: 1-D grid, any grid size (grid-stride loop); blockDim.x must be a
// multiple of 32 so every warp is full for the shuffle reduction below.
// No shared memory required.
__global__ void mseKernel(const float* predictions, const float* targets, size_t numElements, float* sum) {
    // size_t arithmetic throughout: blockIdx.x * blockDim.x in 32-bit ints
    // silently overflows for very large grids.
    size_t stride = (size_t)gridDim.x * blockDim.x;
    float partial = 0.0f;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < numElements; i += stride) {
        float diff = predictions[i] - targets[i];
        partial = fmaf(diff, diff, partial);  // fused multiply-add: one rounding step
    }
    // Warp-level tree reduction. All 32 lanes reach here (the loop above has no
    // early return; out-of-range lanes simply carry partial == 0), so the full
    // mask is valid given the blockDim precondition.
    for (int offset = 16; offset > 0; offset >>= 1)
        partial += __shfl_down_sync(0xffffffffu, partial, offset);
    // One atomic per warp instead of one per thread: avoids serializing every
    // warp on a single contended address.
    if ((threadIdx.x & 31u) == 0u)
        atomicAdd(sum, partial);
}
// Reads exactly `size` floats from a binary file into `data`.
// Exits the process with status 1 if the file cannot be opened or does not
// contain enough bytes (a silent short read would leave the tail of `data`
// uninitialized and corrupt the comparison downstream).
void read_binary(const std::string& filename, float* data, size_t size) {
    std::ifstream in(filename, std::ios::binary);
    if (!in) {
        std::cerr << "Cannot open file: " << filename << std::endl;
        exit(1);
    }
    const std::streamsize bytes = static_cast<std::streamsize>(size * sizeof(float));
    in.read(reinterpret_cast<char*>(data), bytes);
    if (in.gcount() != bytes) {
        std::cerr << "Short read from file: " << filename << std::endl;
        exit(1);
    }
    // ifstream closes itself on destruction; no explicit close() needed.
}
| // test | |
// Returns true when a and b agree within an absolute tolerance tol
// (strictly less than). Returns false if either value is NaN.
bool compare_scalar(float a, float b, float tol = 1e-1f) {
    float delta = a - b;
    if (delta < 0.0f) {
        delta = -delta;
    }
    return delta < tol;
}
// Aborts with a diagnostic if a CUDA runtime call fails. Kernel launches do
// not return errors directly, so cudaGetLastError() is checked separately
// after each launch.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__        \
                      << ": " << cudaGetErrorString(err_) << std::endl;      \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

// Runs the MSE kernel on several input sizes read from data/mse_*_K.bin files
// and compares against the precomputed reference scalar in each
// data/mse_ref_K.bin. Prints "F" and stops on the first mismatch; prints "T"
// if every size passes.
int main() {
    std::vector<size_t> sizes = {1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18};
    bool all_passed = true;
    // size_t index: avoids the signed/unsigned comparison against sizes.size().
    for (size_t t = 0; t < sizes.size(); ++t) {
        size_t N = sizes[t];
        size_t input_size = N * sizeof(float);
        // Test fixture files are 1-indexed.
        std::string pred_file = "data/mse_preds_" + std::to_string(t + 1) + ".bin";
        std::string target_file = "data/mse_targets_" + std::to_string(t + 1) + ".bin";
        std::string ref_file = "data/mse_ref_" + std::to_string(t + 1) + ".bin";

        // vector instead of raw malloc/free: no allocation-failure UB, no
        // duplicated cleanup on the early-exit path.
        std::vector<float> h_preds(N);
        std::vector<float> h_targets(N);
        float h_mse_ref = 0.0f;
        read_binary(pred_file, h_preds.data(), N);
        read_binary(target_file, h_targets.data(), N);
        read_binary(ref_file, &h_mse_ref, 1);

        float *d_preds, *d_targets, *d_sum;
        CUDA_CHECK(cudaMalloc(&d_preds, input_size));
        CUDA_CHECK(cudaMalloc(&d_targets, input_size));
        CUDA_CHECK(cudaMalloc(&d_sum, sizeof(float)));
        CUDA_CHECK(cudaMemcpy(d_preds, h_preds.data(), input_size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_targets, h_targets.data(), input_size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemset(d_sum, 0, sizeof(float)));

        int threads = 256;
        int blocks = static_cast<int>((N + threads - 1) / threads);  // ceil-div
        mseKernel<<<blocks, threads>>>(d_preds, d_targets, N, d_sum);
        CUDA_CHECK(cudaGetLastError());  // catches bad launch configuration

        // Blocking D2H copy also synchronizes with the kernel above.
        float h_sum = 0.0f;
        CUDA_CHECK(cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost));
        float mse = h_sum / static_cast<float>(N);

        CUDA_CHECK(cudaFree(d_preds));
        CUDA_CHECK(cudaFree(d_targets));
        CUDA_CHECK(cudaFree(d_sum));

        if (!compare_scalar(mse, h_mse_ref)) {
            std::cout << "F" << std::endl;
            all_passed = false;
            break;
        }
    }
    if (all_passed) std::cout << "T" << std::endl;
    return 0;
}