// wentaochen's picture
// v1 init
// 949310d
#include <iostream>
#include <cuda_runtime.h>
#include <fstream>
#include <cmath>
#include <vector>
#include <string>
// Accumulates the sum of squared differences between `predictions` and
// `targets` into `*sum`.
//
// Parameters:
//   predictions, targets - device arrays holding at least `numElements` floats.
//   numElements          - number of element pairs to process.
//   sum                  - device accumulator; the caller must zero it before
//                          the launch. The kernel only adds to it.
//
// Launch requirements:
//   - 1-D grid and 1-D block (only the .x components are used).
//   - Any grid size is valid: each thread walks the input with a grid-wide
//     stride, so the grid does not have to cover every element.
//   - Uses 4 KB of static shared memory (one float per possible thread; 1024
//     is the maximum number of threads per block).
//
// The caller divides the accumulated sum by `numElements` to obtain the MSE.
__global__ void mseKernel(const float* predictions, const float* targets, size_t numElements, float* sum) {
    __shared__ float partial[1024];

    const unsigned int tid = threadIdx.x;

    // Widen to size_t BEFORE multiplying: blockIdx.x * blockDim.x is otherwise
    // evaluated in 32-bit arithmetic and silently wraps for very large inputs.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + tid;

    // Each thread keeps its contribution in a register instead of issuing one
    // global atomic per element.
    float local = 0.0f;
    for (size_t idx = first; idx < numElements; idx += stride) {
        const float diff = predictions[idx] - targets[idx];
        local += diff * diff;
    }

    partial[tid] = local;
    __syncthreads();

    // Tree reduction that also works when blockDim.x is not a power of two:
    // the upper part [half, active) is folded onto the lower part each round.
    // Writes go to indices < half and reads come from indices >= half, so a
    // single barrier per round is sufficient. The loop bounds are uniform
    // across the block, so every thread reaches every __syncthreads().
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            partial[tid] += partial[tid + half];
        }
        __syncthreads();
        active = half;
    }

    // One atomic per block instead of one per element.
    if (tid == 0) {
        atomicAdd(sum, partial[0]);
    }
}
// Reads exactly `size` floats of raw binary data from `filename` into `data`.
//
// Parameters:
//   filename - path of the file to read.
//   data     - destination buffer with room for at least `size` floats.
//   size     - number of floats (not bytes) to read.
//
// On failure to open the file, or if the file holds fewer than `size` floats,
// an error is written to stderr and the process exits with status 1. A short
// read would otherwise leave part of `data` uninitialised and the caller would
// silently compare against garbage.
void read_binary(const std::string& filename, float* data, size_t size) {
    std::ifstream in(filename, std::ios::binary);
    if (!in) {
        std::cerr << "Cannot open file: " << filename << std::endl;
        exit(1);
    }

    const std::streamsize wanted = static_cast<std::streamsize>(size * sizeof(float));
    in.read(reinterpret_cast<char*>(data), wanted);

    // gcount() reports how many bytes the last unformatted read delivered.
    if (in.gcount() != wanted) {
        std::cerr << "Short read from file: " << filename
                  << " (expected " << wanted << " bytes, got " << in.gcount() << ")" << std::endl;
        exit(1);
    }
    // The stream closes itself when `in` goes out of scope.
}
// Test helper: reports whether two scalars agree within an absolute tolerance.
//
// Parameters:
//   a, b - values to compare.
//   tol  - absolute tolerance (default 1e-1f); the comparison is strict.
//
// Returns true when |a - b| is strictly less than `tol`, false otherwise
// (including when the difference is NaN).
bool compare_scalar(float a, float b, float tol = 1e-1f) {
    const float gap = std::fabs(a - b);
    if (gap < tol) {
        return true;
    }
    return false;
}
// Test driver: for each input size, loads predictions, targets and the
// reference MSE from data/mse_*_<k>.bin, computes the MSE on the GPU with
// mseKernel, and compares it against the reference using compare_scalar.
//
// Output protocol (stdout): "T" if every case matches, "F" at the first
// mismatch (remaining cases are skipped). The exit status is 0 in both cases.
// Any CUDA runtime failure is reported on stderr and terminates the process
// with status 1, the same way read_binary handles an unreadable input file.
int main() {
    // Checks the result of a CUDA runtime call; `what` names the call for the
    // error message. Without this, a failed allocation, copy or launch would
    // go unnoticed and the comparison below would run on garbage.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
            exit(1);
        }
    };

    const std::vector<size_t> sizes = {1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18};
    bool all_passed = true;

    for (size_t t = 0; t < sizes.size(); ++t) {
        const size_t N = sizes[t];
        const size_t input_size = N * sizeof(float);

        // Test data files are numbered from 1.
        const std::string case_id = std::to_string(t + 1);
        const std::string pred_file = "data/mse_preds_" + case_id + ".bin";
        const std::string target_file = "data/mse_targets_" + case_id + ".bin";
        const std::string ref_file = "data/mse_ref_" + case_id + ".bin";

        // std::vector owns the host buffers, so they are released on every
        // path out of this iteration.
        std::vector<float> h_preds(N);
        std::vector<float> h_targets(N);
        float h_mse_ref = 0.0f;
        read_binary(pred_file, h_preds.data(), N);
        read_binary(target_file, h_targets.data(), N);
        read_binary(ref_file, &h_mse_ref, 1);

        float* d_preds = nullptr;
        float* d_targets = nullptr;
        float* d_sum = nullptr;
        check(cudaMalloc(&d_preds, input_size), "cudaMalloc d_preds");
        check(cudaMalloc(&d_targets, input_size), "cudaMalloc d_targets");
        check(cudaMalloc(&d_sum, sizeof(float)), "cudaMalloc d_sum");

        check(cudaMemcpy(d_preds, h_preds.data(), input_size, cudaMemcpyHostToDevice), "cudaMemcpy d_preds");
        check(cudaMemcpy(d_targets, h_targets.data(), input_size, cudaMemcpyHostToDevice), "cudaMemcpy d_targets");
        // The kernel only adds to the accumulator, so it must start at zero.
        check(cudaMemset(d_sum, 0, sizeof(float)), "cudaMemset d_sum");

        // One thread per element, rounded up to whole blocks.
        const int threads = 256;
        const int blocks = static_cast<int>((N + threads - 1) / threads);
        mseKernel<<<blocks, threads>>>(d_preds, d_targets, N, d_sum);
        // A kernel launch returns no status; configuration errors are only
        // visible through cudaGetLastError().
        check(cudaGetLastError(), "mseKernel launch");

        // Blocking copy: waits for the kernel and surfaces execution errors.
        float h_sum = 0.0f;
        check(cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy h_sum");

        check(cudaFree(d_preds), "cudaFree d_preds");
        check(cudaFree(d_targets), "cudaFree d_targets");
        check(cudaFree(d_sum), "cudaFree d_sum");

        const float mse = h_sum / static_cast<float>(N);
        if (!compare_scalar(mse, h_mse_ref)) {
            std::cout << "F" << std::endl;
            all_passed = false;
            break;
        }
    }

    if (all_passed) std::cout << "T" << std::endl;
    return 0;
}