| #include <stdio.h>
|
| #include <stdlib.h>
|
| #include <time.h>
|
| #include <assert.h>
|
| #include <cupti.h>
|
| #include "../../../common/cupti_add.h"
|
| #include "../../../common/cpu_timestamps.h"
|
|
|
| #include <cooperative_groups.h>
|
| #include <cooperative_groups/memcpy_async.h>
|
|
|
| using namespace nvcuda::experimental;
|
|
|
| #define PREFETCH_COUNT 2
|
|
|
| #ifdef RD_WG_SIZE_0_0
|
| #define BLOCK_SIZE RD_WG_SIZE_0_0
|
| #elif defined(RD_WG_SIZE_0)
|
| #define BLOCK_SIZE RD_WG_SIZE_0
|
| #elif defined(RD_WG_SIZE)
|
| #define BLOCK_SIZE RD_WG_SIZE
|
| #else
|
| #define BLOCK_SIZE 16
|
| #endif
|
|
|
| #define STR_SIZE 256
|
|
|
|
|
| #define MAX_PD (3.0e6)
|
|
|
| #define PRECISION 0.001
|
| #define SPEC_HEAT_SI 1.75e6
|
| #define K_SI 100
|
|
|
| #define FACTOR_CHIP 0.5
|
|
|
|
|
| float t_chip = 0.0005;
|
| float chip_height = 0.016;
|
| float chip_width = 0.016;
|
|
|
| float amb_temp = 80.0;
|
|
|
| void run(int argc, char **argv);
|
|
|
|
|
| #define pin_stats_reset() startCycle()
|
| #define pin_stats_pause(cycles) stopCycle(cycles)
|
| #define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
|
|
|
| void fatal(char *s)
|
| {
|
| fprintf(stderr, "error: %s\n", s);
|
| }
|
|
|
| void writeoutput(float *vect, int grid_rows, int grid_cols, char *file)
|
| {
|
|
|
| int i, j, index = 0;
|
| FILE *fp;
|
| char str[STR_SIZE];
|
|
|
| if ((fp = fopen(file, "w")) == 0)
|
| printf("The file was not opened\n");
|
|
|
| for (i = 0; i < grid_rows; i++)
|
| for (j = 0; j < grid_cols; j++)
|
| {
|
|
|
| sprintf(str, "%d\t%g\n", index, vect[i * grid_cols + j]);
|
| fputs(str, fp);
|
| index++;
|
| }
|
|
|
| fclose(fp);
|
| }
|
|
|
| void readinput(float *vect, int grid_rows, int grid_cols, char *file)
|
| {
|
|
|
| int i, j;
|
| FILE *fp;
|
| char str[STR_SIZE];
|
| float val;
|
|
|
| if ((fp = fopen(file, "r")) == 0)
|
| printf("The file was not opened\n");
|
|
|
| for (i = 0; i <= grid_rows - 1; i++)
|
| for (j = 0; j <= grid_cols - 1; j++)
|
| {
|
| fgets(str, STR_SIZE, fp);
|
| if (feof(fp))
|
| fatal("not enough lines in file");
|
|
|
| if ((sscanf(str, "%f", &val) != 1))
|
| fatal("invalid file format");
|
| vect[i * grid_cols + j] = val;
|
| }
|
|
|
| fclose(fp);
|
| }
|
|
|
| #define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
|
| #define CLAMP_RANGE(x, min, max) x = (x < (min)) ? min : ((x > (max)) ? max : x)
|
| #define MIN(a, b) ((a) <= (b) ? (a) : (b))
|
|
|
| __global__ void calculate_temp(int iteration,
|
| float *power,
|
| float *temp_src,
|
| float *temp_dst,
|
| int grid_cols,
|
| int grid_rows,
|
| int border_cols,
|
| int border_rows,
|
| float Cap,
|
| float Rx,
|
| float Ry,
|
| float Rz,
|
| float step,
|
| int batch_size)
|
| {
|
| cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
|
| pipeline pipe;
|
| __shared__ float temp_on_cuda[PREFETCH_COUNT][BLOCK_SIZE][BLOCK_SIZE];
|
| __shared__ float power_on_cuda[PREFETCH_COUNT][BLOCK_SIZE][BLOCK_SIZE];
|
| __shared__ float temp_t[BLOCK_SIZE][BLOCK_SIZE];
|
|
|
| float amb_temp = 80.0;
|
| float step_div_Cap;
|
| float Rx_1, Ry_1, Rz_1;
|
|
|
|
|
|
|
|
|
| int tx = threadIdx.x;
|
| int ty = threadIdx.y;
|
|
|
| step_div_Cap = step / Cap;
|
|
|
| Rx_1 = 1 / Rx;
|
| Ry_1 = 1 / Ry;
|
| Rz_1 = 1 / Rz;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| int small_block_rows = BLOCK_SIZE - iteration * 2;
|
| int small_block_cols = BLOCK_SIZE - iteration * 2;
|
|
|
|
|
|
|
|
|
| int tile_dim_x = gridDim.x * batch_size;
|
|
|
| int total_tiles = tile_dim_x * tile_dim_x;
|
| int tiles_this_block = batch_size * batch_size;
|
| int tiles_this_block_x = batch_size;
|
|
|
| int base_tile = (blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
|
| int fetch = base_tile;
|
| int end_tile = fetch + tiles_this_block;
|
|
|
| for (int compute = fetch; compute < end_tile; compute++)
|
| {
|
| for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
|
| {
|
|
|
| int offset = fetch - base_tile;
|
| int block_id = fetch / tiles_this_block;
|
| int bx = block_id % gridDim.x * tiles_this_block_x + offset % tiles_this_block_x;
|
| int by = block_id / gridDim.x * tiles_this_block_x + offset / tiles_this_block_x;
|
|
|
|
|
|
|
| int blkY = small_block_rows * by - border_rows;
|
| int blkX = small_block_cols * bx - border_cols;
|
| int blkYmax = blkY + BLOCK_SIZE - 1;
|
| int blkXmax = blkX + BLOCK_SIZE - 1;
|
|
|
|
|
| int yidx = blkY + ty;
|
| int xidx = blkX + tx;
|
|
|
|
|
| int loadYidx = yidx, loadXidx = xidx;
|
| int index = grid_cols * loadYidx + loadXidx;
|
|
|
| if (IN_RANGE(loadYidx, 0, grid_rows - 1) && IN_RANGE(loadXidx, 0, grid_cols - 1))
|
| {
|
| memcpy_async(temp_on_cuda[fetch % PREFETCH_COUNT][ty][tx], temp_src[index], pipe);
|
| memcpy_async(power_on_cuda[fetch % PREFETCH_COUNT][ty][tx], power[index], pipe);
|
| }
|
| pipe.commit();
|
| }
|
| if (fetch == end_tile)
|
| {
|
| for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
|
| {
|
| pipe.commit();
|
| }
|
| ++fetch;
|
| }
|
| pipe.wait_prior<PREFETCH_COUNT - 1>();
|
| block.sync();
|
|
|
|
|
| int offset = compute - base_tile;
|
| int block_id = compute / tiles_this_block;
|
| int bx = block_id % gridDim.x * tiles_this_block_x + offset % tiles_this_block_x;
|
| int by = block_id / gridDim.x * tiles_this_block_x + offset / tiles_this_block_x;
|
|
|
|
|
|
|
| int blkY = small_block_rows * by - border_rows;
|
| int blkX = small_block_cols * bx - border_cols;
|
| int blkYmax = blkY + BLOCK_SIZE - 1;
|
| int blkXmax = blkX + BLOCK_SIZE - 1;
|
|
|
|
|
| int yidx = blkY + ty;
|
| int xidx = blkX + tx;
|
|
|
|
|
| int loadYidx = yidx, loadXidx = xidx;
|
| int index = grid_cols * loadYidx + loadXidx;
|
|
|
|
|
|
|
|
|
| int validYmin = (blkY < 0) ? -blkY : 0;
|
| int validYmax = (blkYmax > grid_rows - 1) ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1) : BLOCK_SIZE - 1;
|
| int validXmin = (blkX < 0) ? -blkX : 0;
|
| int validXmax = (blkXmax > grid_cols - 1) ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1) : BLOCK_SIZE - 1;
|
|
|
| int N = ty - 1;
|
| int S = ty + 1;
|
| int W = tx - 1;
|
| int E = tx + 1;
|
|
|
| N = (N < validYmin) ? validYmin : N;
|
| S = (S > validYmax) ? validYmax : S;
|
| W = (W < validXmin) ? validXmin : W;
|
| E = (E > validXmax) ? validXmax : E;
|
|
|
| bool computed;
|
| for (int i = 0; i < iteration; i++)
|
| {
|
| computed = false;
|
| if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
|
| IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
|
| IN_RANGE(tx, validXmin, validXmax) &&
|
| IN_RANGE(ty, validYmin, validYmax))
|
| {
|
| computed = true;
|
| temp_t[ty][tx] = temp_on_cuda[compute % PREFETCH_COUNT][ty][tx] + step_div_Cap * (power_on_cuda[compute % PREFETCH_COUNT][ty][tx] +
|
| (temp_on_cuda[compute % PREFETCH_COUNT][S][tx] + temp_on_cuda[compute % PREFETCH_COUNT][N][tx] - 2.0 * temp_on_cuda[compute % PREFETCH_COUNT][ty][tx]) * Ry_1 +
|
| (temp_on_cuda[compute % PREFETCH_COUNT][ty][E] + temp_on_cuda[compute % PREFETCH_COUNT][ty][W] - 2.0 * temp_on_cuda[compute % PREFETCH_COUNT][ty][tx]) * Rx_1 +
|
| (amb_temp - temp_on_cuda[compute % PREFETCH_COUNT][ty][tx]) * Rz_1);
|
| }
|
| block.sync();
|
| if (i == iteration - 1)
|
| break;
|
| if (computed)
|
| temp_on_cuda[compute % PREFETCH_COUNT][ty][tx] = temp_t[ty][tx];
|
| block.sync();
|
| }
|
|
|
|
|
|
|
|
|
| if (computed)
|
| {
|
| temp_dst[index] = temp_t[ty][tx];
|
| }
|
| }
|
| }
|
|
|
| |
| |
|
|
|
|
| int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col, int row,
|
| int total_iterations, int num_iterations, int blockCols, int blockRows, int borderCols, int borderRows, int batch_size)
|
| {
|
| dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
|
| dim3 dimGrid(blockCols, blockRows);
|
|
|
| float grid_height = chip_height / row;
|
| float grid_width = chip_width / col;
|
|
|
| float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
|
| float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
|
| float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
|
| float Rz = t_chip / (K_SI * grid_height * grid_width);
|
|
|
| float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
|
| float step = PRECISION / max_slope;
|
| float t;
|
|
|
| int src = 1, dst = 0;
|
|
|
| for (t = 0; t < total_iterations; t += num_iterations)
|
| {
|
| int temp = src;
|
| src = dst;
|
| dst = temp;
|
| calculate_temp<<<dimGrid, dimBlock>>>(MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src], MatrixTemp[dst],
|
| col, row, borderCols, borderRows, Cap, Rx, Ry, Rz, step, batch_size);
|
| }
|
| return dst;
|
| }
|
|
|
| void usage(int argc, char **argv)
|
| {
|
| fprintf(stderr, "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> <temp_file> <power_file> <output_file>\n", argv[0]);
|
| fprintf(stderr, "\t<grid_rows/grid_cols> - number of rows/cols in the grid (positive integer)\n");
|
| fprintf(stderr, "\t<pyramid_height> - pyramid heigh(positive integer)\n");
|
| fprintf(stderr, "\t<sim_time> - number of iterations\n");
|
| fprintf(stderr, "\t<temp_file> - name of the file containing the initial temperature values of each cell\n");
|
| fprintf(stderr, "\t<power_file> - name of the file containing the dissipated power values of each cell\n");
|
| fprintf(stderr, "\t<output_file> - name of the output file\n");
|
| fprintf(stderr, "\t<batch_size> - batch_size * batch_size per block\n");
|
| exit(1);
|
| }
|
|
|
| int main(int argc, char *argv[])
|
| {
|
| uint64_t start_tsc = rdtsc();
|
| uint64_t start_tsp = rdtsp();
|
| printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
|
| printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
|
|
|
| run(argc, argv);
|
|
|
| return EXIT_SUCCESS;
|
| }
|
|
|
| void run(int argc, char **argv)
|
| {
|
| int size;
|
| int grid_rows, grid_cols;
|
| float *FilesavingTemp, *FilesavingPower, *MatrixOut;
|
| char *tfile, *pfile, *ofile;
|
|
|
| int total_iterations = 60;
|
| int pyramid_height = 1;
|
|
|
| if (argc != 8)
|
| usage(argc, argv);
|
| if ((grid_rows = atoi(argv[1])) <= 0 ||
|
| (grid_cols = atoi(argv[1])) <= 0 ||
|
| (pyramid_height = atoi(argv[2])) <= 0 ||
|
| (total_iterations = atoi(argv[3])) <= 0)
|
| usage(argc, argv);
|
|
|
| tfile = argv[4];
|
| pfile = argv[5];
|
| ofile = argv[6];
|
|
|
| int batch_size = atoi(argv[7]);
|
|
|
| size = grid_rows * grid_cols;
|
|
|
|
|
| #define EXPAND_RATE 2
|
| int borderCols = (pyramid_height)*EXPAND_RATE / 2;
|
| int borderRows = (pyramid_height)*EXPAND_RATE / 2;
|
| int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
|
| int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
|
|
|
|
|
|
|
| int blockCols = (grid_cols + smallBlockCol * batch_size - 1) / (smallBlockCol * batch_size);
|
| int blockRows = (grid_rows + smallBlockRow * batch_size - 1) / (smallBlockRow * batch_size);
|
|
|
|
|
|
|
| FilesavingTemp = (float *)malloc(size * sizeof(float));
|
| FilesavingPower = (float *)malloc(size * sizeof(float));
|
| MatrixOut = (float *)calloc(size, sizeof(float));
|
|
|
| if (!FilesavingPower || !FilesavingTemp || !MatrixOut)
|
| fatal("unable to allocate memory");
|
|
|
| printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, %d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
|
| pyramid_height, grid_cols, grid_rows, borderCols, borderRows, blockCols, blockRows, smallBlockCol, smallBlockRow);
|
|
|
| readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
|
| readinput(FilesavingPower, grid_rows, grid_cols, pfile);
|
|
|
| GPU_argv_init();
|
|
|
| initTrace();
|
| startCPU();
|
| float *MatrixTemp[2], *MatrixPower;
|
| cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size);
|
| cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size);
|
| cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size, cudaMemcpyHostToDevice);
|
|
|
| cudaMalloc((void **)&MatrixPower, sizeof(float) * size);
|
| cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size, cudaMemcpyHostToDevice);
|
|
|
| int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
|
| total_iterations, pyramid_height, blockCols, blockRows, borderCols, borderRows, batch_size);
|
|
|
| cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size, cudaMemcpyDeviceToHost);
|
|
|
| cudaFree(MatrixPower);
|
| cudaFree(MatrixTemp[0]);
|
| cudaFree(MatrixTemp[1]);
|
|
|
| endCPU();
|
| finiTrace();
|
|
|
| writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
|
| free(MatrixOut);
|
| }
|
|
|