// Spaces: KernelPilot / KernelPilot-Optimization (Sleeping)
// File size: 2,757 Bytes
// 949310d

#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <cuda_runtime.h>

/**
 * Adds the sum of squared differences between `predictions` and `targets`
 * to `*sum`.
 *
 * Each thread walks the input with a grid-stride loop and keeps a private
 * partial sum. The block's partials are combined in a single shared-memory
 * float, and one thread per block issues one atomicAdd to `*sum`. This keeps
 * the number of global atomics at one per block instead of one per element.
 *
 * Launch layout: indexing uses only the x dimension of the grid and block.
 * Any grid size is valid; elements not covered by the first pass are picked
 * up by later stride iterations.
 * Shared memory: one statically allocated float per block.
 * Precondition: the caller must initialise `*sum` before the launch (the
 * kernel only adds to it) and divide by the element count afterwards to
 * obtain the mean.
 *
 * @param predictions  device-accessible array of `numElements` floats
 * @param targets      device-accessible array of `numElements` floats
 * @param numElements  number of elements to process
 * @param sum          device-accessible accumulator that receives the total
 */
__global__ void mseKernel(const float* __restrict__ predictions, const float* __restrict__ targets, size_t numElements, float* sum) {
    __shared__ float blockSum;

    // Exactly one thread per block initialises and publishes the block total.
    const bool isLeader = (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
    if (isLeader) {
        blockSum = 0.0f;
    }
    __syncthreads();

    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit unsigned arithmetic and can wrap for very large inputs.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    float partial = 0.0f;
    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < numElements;
         idx += stride) {
        const float diff = predictions[idx] - targets[idx];
        partial += diff * diff;
    }

    // No thread returns early, so every thread in the block reaches both
    // barriers; that is what makes the __syncthreads() calls safe.
    atomicAdd(&blockSum, partial);
    __syncthreads();

    if (isLeader) {
        atomicAdd(sum, blockSum);
    }
}

/**
 * Reads exactly `size` floats of raw binary data from `filename` into `data`.
 *
 * The file is opened in binary mode and its leading `size * sizeof(float)`
 * bytes are copied verbatim into `data`; no byte-order conversion is done.
 *
 * On failure the function prints a message to stderr and terminates the
 * process with exit code 1. Failure means either the file cannot be opened
 * or it holds fewer bytes than requested. A short read is treated as fatal
 * because it would otherwise leave part of `data` uninitialised.
 *
 * @param filename  path of the file to read
 * @param data      destination buffer with room for at least `size` floats
 * @param size      number of floats to read
 */
void read_binary(const std::string& filename, float* data, size_t size) {
    std::ifstream in(filename, std::ios::binary);
    if (!in) {
        std::cerr << "Cannot open file: " << filename << std::endl;
        std::exit(1);
    }

    const std::streamsize wanted = static_cast<std::streamsize>(size * sizeof(float));
    in.read(reinterpret_cast<char*>(data), wanted);

    // gcount() reports how many bytes the last read actually delivered.
    if (in.gcount() != wanted) {
        std::cerr << "Short read from file: " << filename
                  << " (expected " << wanted << " bytes, got " << in.gcount() << ")"
                  << std::endl;
        std::exit(1);
    }
    // The stream closes itself when it goes out of scope.
}

// Absolute-tolerance comparison of two floats.
// Returns true only when |a - b| is strictly smaller than tol
// (tol defaults to 0.1); otherwise returns false.
bool compare_scalar(float a, float b, float tol = 1e-1f) {
    const float gap = std::fabs(a - b);
    if (gap < tol) {
        return true;
    }
    return false;
}

/**
 * Test driver for mseKernel.
 *
 * For each problem size the driver loads predictions, targets and a
 * reference MSE value from data/mse_{preds,targets,ref}_<k>.bin, where k is
 * the 1-based test index. It runs the kernel on the GPU, divides the
 * accumulated sum by the element count, and compares the result with the
 * reference via compare_scalar.
 *
 * Output protocol on stdout: "F" is printed for the first failing case and
 * the run stops; "T" is printed if every case passes. A CUDA API failure is
 * reported on stderr and counted as a failing case. The process exit code
 * is 0 in both outcomes.
 */
int main() {
    const std::vector<size_t> sizes = {1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18};
    bool all_passed = true;

    // Reports a failed CUDA call on stderr; returns whether the call succeeded.
    auto cuda_ok = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // The loop stops after the first failing case.
    for (size_t t = 0; t < sizes.size() && all_passed; ++t) {
        const size_t N = sizes[t];
        const size_t input_size = N * sizeof(float);

        // Data files are numbered from 1.
        const std::string suffix = std::to_string(t + 1) + ".bin";
        const std::string pred_file = "data/mse_preds_" + suffix;
        const std::string target_file = "data/mse_targets_" + suffix;
        const std::string ref_file = "data/mse_ref_" + suffix;

        // Vectors own the host buffers, so they are released on every path.
        std::vector<float> h_preds(N);
        std::vector<float> h_targets(N);
        float h_mse_ref = 0.0f;

        read_binary(pred_file, h_preds.data(), N);
        read_binary(target_file, h_targets.data(), N);
        read_binary(ref_file, &h_mse_ref, 1);

        float* d_preds = nullptr;
        float* d_targets = nullptr;
        float* d_sum = nullptr;
        float h_sum = 0.0f;

        // Short-circuit evaluation skips the remaining steps after a failure.
        bool ok =
            cuda_ok(cudaMalloc(&d_preds, input_size), "cudaMalloc(d_preds)") &&
            cuda_ok(cudaMalloc(&d_targets, input_size), "cudaMalloc(d_targets)") &&
            cuda_ok(cudaMalloc(&d_sum, sizeof(float)), "cudaMalloc(d_sum)") &&
            cuda_ok(cudaMemcpy(d_preds, h_preds.data(), input_size, cudaMemcpyHostToDevice),
                    "cudaMemcpy(d_preds)") &&
            cuda_ok(cudaMemcpy(d_targets, h_targets.data(), input_size, cudaMemcpyHostToDevice),
                    "cudaMemcpy(d_targets)") &&
            // The kernel only adds to the accumulator, so it must start at zero.
            cuda_ok(cudaMemset(d_sum, 0, sizeof(float)), "cudaMemset(d_sum)");

        if (ok) {
            const int threads = 256;
            const int blocks = static_cast<int>((N + threads - 1) / threads);  // ceil-div
            mseKernel<<<blocks, threads>>>(d_preds, d_targets, N, d_sum);

            // Launch errors surface through cudaGetLastError(); the blocking
            // copy below then reports any failure during kernel execution.
            ok = cuda_ok(cudaGetLastError(), "mseKernel launch") &&
                 cuda_ok(cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost),
                         "cudaMemcpy(h_sum)");
        }

        // Single cleanup path; cudaFree accepts null pointers.
        cudaFree(d_preds);
        cudaFree(d_targets);
        cudaFree(d_sum);

        const float mse = h_sum / static_cast<float>(N);
        if (!ok || !compare_scalar(mse, h_mse_ref)) {
            std::cout << "F" << std::endl;
            all_passed = false;
        }
    }

    if (all_passed) std::cout << "T" << std::endl;
    return 0;
}