| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <stdint.h> |
| #include <string.h> |
| #include <time.h> |
| #include <math.h> |
|
|
| #define BLOCK_X 16 |
| #define BLOCK_Y 16 |
| #define TILE_C 64 |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| __global__ void kronecker_slab_tiled( |
| const double *__restrict__ ct, |
| const double *__restrict__ z_inv, |
| int P, int C, int j, |
| unsigned long long *__restrict__ nz_count, |
| unsigned long long *__restrict__ max_abs) |
| { |
| int i = blockIdx.x * BLOCK_X + threadIdx.x; |
| int dk = blockIdx.y * BLOCK_Y + threadIdx.y; |
| int k = j + dk; |
|
|
| if (i > j || k >= P) return; |
|
|
| |
| __shared__ double s_zi[TILE_C]; |
| __shared__ double s_row_j[TILE_C]; |
|
|
| double sum = 0.0; |
| double comp = 0.0; |
|
|
| for (int c0 = 0; c0 < C; c0 += TILE_C) { |
| int tile_len = (c0 + TILE_C <= C) ? TILE_C : (C - c0); |
|
|
| |
| int lid = threadIdx.y * BLOCK_X + threadIdx.x; |
| int nthreads = BLOCK_X * BLOCK_Y; |
| for (int t = lid; t < tile_len; t += nthreads) { |
| s_zi[t] = z_inv[c0 + t]; |
| s_row_j[t] = ct[(int64_t)j * C + c0 + t]; |
| } |
| __syncthreads(); |
|
|
| for (int t = 0; t < tile_len; t++) { |
| double val = s_zi[t] |
| * ct[(int64_t)i * C + c0 + t] |
| * s_row_j[t] |
| * ct[(int64_t)k * C + c0 + t]; |
| |
| double y = val - comp; |
| double t2 = sum + y; |
| comp = (t2 - sum) - y; |
| sum = t2; |
| } |
| __syncthreads(); |
| } |
|
|
| int64_t g = llround(sum); |
| if (g != 0) { |
| atomicAdd(nz_count, 1ULL); |
| unsigned long long av = (unsigned long long)(g > 0 ? g : -g); |
| atomicMax(max_abs, av); |
| } |
| } |
|
|
|
|
| int main(int argc, char **argv) { |
| if (argc < 2) { |
| fprintf(stderr, "Usage: %s <n> [gpu_id]\n", argv[0]); |
| return 1; |
| } |
| int n = atoi(argv[1]); |
| int gpu = argc > 2 ? atoi(argv[2]) : 0; |
| cudaSetDevice(gpu); |
|
|
| |
| char path[512]; |
| snprintf(path, 512, "scripts/experiments/kronecker-coefficients/results/char_table_n%d.dbin", n); |
| FILE *fc = fopen(path, "rb"); |
| if (!fc) { |
| fprintf(stderr, "Cannot open %s — run convert_char_table.py first\n", path); |
| return 1; |
| } |
| fseek(fc, 0, SEEK_END); long ct_sz = ftell(fc); fseek(fc, 0, SEEK_SET); |
|
|
| snprintf(path, 512, "scripts/experiments/kronecker-coefficients/results/z_inv_n%d.bin", n); |
| FILE *fz = fopen(path, "rb"); |
| fseek(fz, 0, SEEK_END); int C = ftell(fz) / sizeof(double); fseek(fz, 0, SEEK_SET); |
| int P = ct_sz / (C * sizeof(double)); |
|
|
| printf("========================================\n"); |
| printf("Kronecker S_%d (optimized GPU)\n", n); |
| printf("P=%d partitions, C=%d classes\n", P, C); |
| printf("Character table: %.2f GB\n", ct_sz / 1e9); |
| printf("Triples (i<=j<=k): %lld\n", (long long)P * (P + 1) * (P + 2) / 6); |
| printf("========================================\n\n"); |
| fflush(stdout); |
|
|
| double *h_ct = (double *)malloc(ct_sz); |
| double *h_z = (double *)malloc(C * sizeof(double)); |
| fread(h_ct, 1, ct_sz, fc); fclose(fc); |
| fread(h_z, sizeof(double), C, fz); fclose(fz); |
|
|
| |
| double *d_ct, *d_z; |
| unsigned long long *d_nz, *d_mx; |
|
|
| cudaMalloc(&d_ct, ct_sz); |
| cudaMalloc(&d_z, C * sizeof(double)); |
| cudaMalloc(&d_nz, sizeof(unsigned long long)); |
| cudaMalloc(&d_mx, sizeof(unsigned long long)); |
| cudaMemcpy(d_ct, h_ct, ct_sz, cudaMemcpyHostToDevice); |
| cudaMemcpy(d_z, h_z, C * sizeof(double), cudaMemcpyHostToDevice); |
|
|
| printf("GPU memory: %.1f GB char table (no slab buffer needed)\n", ct_sz / 1e9); |
| fflush(stdout); |
|
|
| struct timespec t0, t1; |
| clock_gettime(CLOCK_MONOTONIC, &t0); |
|
|
| unsigned long long zero = 0; |
| cudaMemcpy(d_nz, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice); |
| cudaMemcpy(d_mx, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice); |
|
|
| for (int j = 0; j < P; j++) { |
| int num_i = j + 1; |
| int num_k = P - j; |
|
|
| dim3 block(BLOCK_X, BLOCK_Y); |
| dim3 grid((num_i + BLOCK_X - 1) / BLOCK_X, |
| (num_k + BLOCK_Y - 1) / BLOCK_Y); |
|
|
| kronecker_slab_tiled<<<grid, block>>>( |
| d_ct, d_z, P, C, j, d_nz, d_mx); |
|
|
| if (j % 500 == 0 || j == P - 1) { |
| cudaDeviceSynchronize(); |
| unsigned long long snap_nz, snap_mx; |
| cudaMemcpy(&snap_nz, d_nz, sizeof(unsigned long long), cudaMemcpyDeviceToHost); |
| cudaMemcpy(&snap_mx, d_mx, sizeof(unsigned long long), cudaMemcpyDeviceToHost); |
|
|
| clock_gettime(CLOCK_MONOTONIC, &t1); |
| double el = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9; |
| double eta = j > 0 ? el * (P - j) / j : 0; |
| printf(" j=%d/%d (%.1f%%) nz=%llu max=%llu %.0fs ETA %.0fs\n", |
| j, P, 100.0 * j / P, snap_nz, snap_mx, el, eta); |
| fflush(stdout); |
|
|
| |
| char ckpt[512]; |
| snprintf(ckpt, 512, |
| "scripts/experiments/kronecker-coefficients/results/checkpoint_n%d.txt", n); |
| FILE *fck = fopen(ckpt, "w"); |
| if (fck) { |
| fprintf(fck, "n=%d\nP=%d\nslab=%d/%d\nnonzero=%llu\nmax=%llu\nelapsed=%.1f\n", |
| n, P, j + 1, P, snap_nz, snap_mx, el); |
| fclose(fck); |
| } |
| } |
| } |
|
|
| cudaDeviceSynchronize(); |
| unsigned long long final_nz, final_mx; |
| cudaMemcpy(&final_nz, d_nz, sizeof(unsigned long long), cudaMemcpyDeviceToHost); |
| cudaMemcpy(&final_mx, d_mx, sizeof(unsigned long long), cudaMemcpyDeviceToHost); |
|
|
| clock_gettime(CLOCK_MONOTONIC, &t1); |
| double total_time = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9; |
|
|
| printf("\n========================================\n"); |
| printf("RESULTS\n"); |
| printf("========================================\n"); |
| printf("S_%d Kronecker coefficients (full)\n", n); |
| printf("Partitions: %d, Classes: %d\n", P, C); |
| printf("Triples (i<=j<=k): %lld\n", (long long)P * (P + 1) * (P + 2) / 6); |
| printf("Nonzero: %llu\n", final_nz); |
| printf("Max |g|: %llu\n", final_mx); |
| printf("Time: %.1fs\n", total_time); |
| printf("========================================\n"); |
|
|
| char ckpt[512]; |
| snprintf(ckpt, 512, "scripts/experiments/kronecker-coefficients/results/checkpoint_n%d.txt", n); |
| remove(ckpt); |
|
|
| free(h_ct); free(h_z); |
| cudaFree(d_ct); cudaFree(d_z); |
| cudaFree(d_nz); cudaFree(d_mx); |
| return 0; |
| } |
|
|