cahlen committed on
Commit
7d44c50
·
verified ·
1 Parent(s): b032012

CUDA kernel: kronecker-cuda

Browse files
README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - kernels
5
+ - cuda
6
+ - kronecker-coefficients
7
+ - symmetric-group
8
+ - representation-theory
9
+ - combinatorics
10
+ datasets:
11
+ - cahlen/kronecker-coefficients
12
+ ---
13
+
14
+ # Kronecker Coefficients (Symmetric Group)
15
+
16
+ Computes Kronecker coefficients g(lambda,mu,nu) for S_n from character tables via GPU-parallel triple-sum.
17
+
18
+ ## Usage
19
+
20
+ ```python
21
+ import torch
22
+ from kernels import get_kernel
23
+
24
+ kernel = get_kernel("cahlen/kronecker-cuda")
25
+ result = kernel.slab(char_table, z_inv, j=0)
26
+ ```
27
+
28
+ ## Compile (standalone)
29
+
30
+ ```bash
31
+ nvcc -O3 -arch=sm_90 -o kronecker kronecker/kronecker_gpu.cu -lm
32
+ ```
33
+
34
+ ## Results
35
+
36
+ All computation results are open:
37
+ - **Website**: [bigcompute.science](https://bigcompute.science)
38
+ - **Datasets**: [huggingface.co/cahlen](https://huggingface.co/cahlen)
39
+ - **Source**: [github.com/cahlen/idontknow](https://github.com/cahlen/idontknow)
40
+
41
+ ## Citation
42
+
43
+ ```bibtex
44
+ @misc{humphreys2026bigcompute,
45
+ author = {Humphreys, Cahlen},
46
+ title = {bigcompute.science: GPU-Accelerated Computational Mathematics},
47
+ year = {2026},
48
+ url = {https://bigcompute.science}
49
+ }
50
+ ```
51
+
52
+ *Human-AI collaborative. Not peer-reviewed. All code and data open.*
build.toml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [general]
2
+ name = "kronecker"
3
+ universal = false
4
+
5
+ [torch]
6
+ src = ["torch-ext/torch_binding.cpp", "torch-ext/torch_binding.h"]
7
+
8
+ [kernel.kronecker]
9
+ backend = "cuda"
10
+ cuda-capabilities = ["8.0", "9.0", "10.0", "12.0"]
11
+ src = ["kronecker/kronecker_gpu.cu"]
12
+ depends = ["torch"]
kronecker/kronecker_gpu.cu ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <stdint.h>
4
+ #include <time.h>
5
+
6
+ #define BLOCK 256
7
+
8
/*
 * Computes one "slab" of the weighted triple product for a fixed middle
 * index j.
 *
 * For every pair (i, k) with i <= j <= k < P this writes
 *
 *     out[i*P + k] = llround( sum_{c < C} z[c] * ct[i*C+c] * ct[j*C+c] * ct[k*C+c] )
 *
 * and leaves every other cell of `out` untouched.
 *
 * Parameters:
 *   ct  - P x C row-major table of int64 values (read-only)
 *   z   - C double weights, one per column of ct (read-only)
 *   P   - number of rows of ct and side length of out
 *   C   - number of columns of ct / length of z
 *   j   - fixed middle index of the slab
 *   out - P x P row-major output buffer
 *
 * Launch layout: 1-D grid of 1-D blocks, no shared memory. Cells are walked
 * with a 64-bit grid-stride loop, so the result does not depend on the grid
 * size and the flat index cannot wrap when P*P exceeds the 32-bit range.
 * P == 0 is a no-op (the original divided by P unconditionally).
 */
__global__ void kronecker_slab(
    const int64_t *__restrict__ ct,
    const double *__restrict__ z,
    int P, int C, int j,
    int64_t *__restrict__ out)
{
    const int64_t cells  = (int64_t)P * P;
    const int64_t stride = (int64_t)gridDim.x * blockDim.x;

    for (int64_t cell = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
         cell < cells; cell += stride) {
        const int i = (int)(cell / P);
        const int k = (int)(cell % P);

        // `cell` only grows, so `i` never decreases: once past row j this
        // thread has no further work.
        if (i > j) break;
        if (k < j) continue;

        const int64_t *row_i = ct + (int64_t)i * C;
        const int64_t *row_j = ct + (int64_t)j * C;
        const int64_t *row_k = ct + (int64_t)k * C;

        // Same left-to-right product as before so rounding is unchanged.
        double sum = 0.0;
        for (int c = 0; c < C; c++)
            sum += z[c] * (double)row_i[c] * (double)row_j[c] * (double)row_k[c];

        out[cell] = llround(sum);
    }
}
23
+
24
/*
 * Accumulates statistics over the cells (i, k) of `slab` with i <= j <= k < P:
 *   *nz is incremented by the number of non-zero cells,
 *   *mx is raised to the largest absolute value seen (atomic max).
 *
 * Parameters:
 *   slab - P x P row-major int64 buffer (only the cells above are read)
 *   P    - side length of slab
 *   j    - slab index selecting the region
 *   nz   - device counter, updated with atomicAdd (caller initialises it)
 *   mx   - device maximum, updated with atomicMax (caller initialises it)
 *
 * Launch layout: 1-D grid of 1-D blocks. Cells are walked with a 64-bit
 * grid-stride loop, so any grid size is correct and the flat index cannot
 * wrap for large P. Each thread keeps private partials and publishes them
 * once, so a thread issues at most one atomicAdd and one atomicMax.
 */
__global__ void reduce_stats(const int64_t *slab, int P, int j,
                             unsigned long long *nz, unsigned long long *mx)
{
    const int64_t cells  = (int64_t)P * P;
    const int64_t stride = (int64_t)gridDim.x * blockDim.x;

    unsigned long long local_nz = 0ULL;
    unsigned long long local_mx = 0ULL;

    for (int64_t cell = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
         cell < cells; cell += stride) {
        const int i = (int)(cell / P);
        const int k = (int)(cell % P);

        if (i > j) break;      // rows only increase along the stride walk
        if (k < j) continue;

        const int64_t v = slab[cell];
        if (v == 0) continue;

        // Negate in unsigned arithmetic so INT64_MIN does not overflow.
        const unsigned long long mag =
            v < 0 ? 0ULL - (unsigned long long)v : (unsigned long long)v;

        local_nz += 1ULL;
        if (mag > local_mx) local_mx = mag;
    }

    if (local_nz != 0ULL) {
        atomicAdd(nz, local_nz);
        atomicMax(mx, local_mx);
    }
}
38
+
39
/* Abort with a readable message when a CUDA runtime call fails. */
static void kron_cuda_check(cudaError_t err, const char *what, int line)
{
    if (err == cudaSuccess) return;
    fprintf(stderr, "CUDA error at line %d (%s): %s\n",
            line, what, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
#define KRON_CUDA_CHECK(call) kron_cuda_check((call), #call, __LINE__)

/*
 * Read the whole file at `path` into a malloc'd buffer and store its size in
 * *size_out. Exits the process on a missing, empty or unreadable file.
 */
static void *kron_read_file(const char *path, long *size_out)
{
    FILE *fp = fopen(path, "rb");
    if (!fp) {
        perror(path);
        exit(EXIT_FAILURE);
    }
    long size = -1;
    if (fseek(fp, 0, SEEK_END) == 0) size = ftell(fp);
    if (size <= 0 || fseek(fp, 0, SEEK_SET) != 0) {
        fprintf(stderr, "%s: empty file or size cannot be determined\n", path);
        fclose(fp);
        exit(EXIT_FAILURE);
    }
    void *buf = malloc((size_t)size);
    if (!buf) {
        fprintf(stderr, "%s: out of memory allocating %ld bytes\n", path, size);
        fclose(fp);
        exit(EXIT_FAILURE);
    }
    if (fread(buf, 1, (size_t)size, fp) != (size_t)size) {
        fprintf(stderr, "%s: short read\n", path);
        fclose(fp);
        exit(EXIT_FAILURE);
    }
    fclose(fp);
    *size_out = size;
    return buf;
}

/*
 * Standalone driver.
 *
 * Usage: <program> <n> [gpu]
 *
 * Loads char_table_n<n>.bin (int64 values, P rows of C columns) and
 * z_inv_n<n>.bin (C doubles) from the results directory, then for each
 * j in [0, P) launches kronecker_slab followed by reduce_stats and
 * accumulates the non-zero count and the maximum absolute value. Progress is
 * printed every 500 slabs together with a small text checkpoint, which is
 * removed after a complete run.
 *
 * Returns 0 on success; exits with a non-zero status on bad arguments,
 * unreadable or inconsistent input files, or any CUDA error.
 */
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <n> [gpu]\n",
                (argc > 0 && argv[0]) ? argv[0] : "kronecker");
        return EXIT_FAILURE;
    }
    int n = atoi(argv[1]);
    int gpu = argc > 2 ? atoi(argv[2]) : 0;
    KRON_CUDA_CHECK(cudaSetDevice(gpu));

    /* ---- load inputs ---- */
    char path[256];
    long ct_sz = 0, z_sz = 0;
    snprintf(path, 256, "scripts/experiments/kronecker-coefficients/results/char_table_n%d.bin", n);
    int64_t *h_ct = (int64_t*)kron_read_file(path, &ct_sz);
    snprintf(path, 256, "scripts/experiments/kronecker-coefficients/results/z_inv_n%d.bin", n);
    double *h_z = (double*)kron_read_file(path, &z_sz);

    /* The table dimensions are derived purely from the two file sizes, so
       reject sizes that do not describe a whole P x C table. */
    if (z_sz % (long)sizeof(double) != 0) {
        fprintf(stderr, "z_inv file size %ld is not a multiple of %zu\n", z_sz, sizeof(double));
        return EXIT_FAILURE;
    }
    int C = (int)(z_sz / (long)sizeof(double));
    long row_bytes = (long)C * (long)sizeof(int64_t);
    if (C <= 0 || ct_sz % row_bytes != 0) {
        fprintf(stderr, "char table size %ld is inconsistent with %d classes\n", ct_sz, C);
        return EXIT_FAILURE;
    }
    int P = (int)(ct_sz / row_bytes);
    printf("S_%d: %d partitions, %d classes — ALL GPU\n", n, P, C);
    fflush(stdout);

    /* ---- device buffers ---- */
    const size_t slab_bytes = (size_t)P * P * sizeof(int64_t);
    int64_t *d_ct, *d_out; double *d_z;
    unsigned long long *d_stats;   /* [0] = non-zero count, [1] = max |value| */
    KRON_CUDA_CHECK(cudaMalloc(&d_ct, ct_sz));
    KRON_CUDA_CHECK(cudaMalloc(&d_z, C*sizeof(double)));
    KRON_CUDA_CHECK(cudaMalloc(&d_out, slab_bytes));
    KRON_CUDA_CHECK(cudaMalloc(&d_stats, 2*sizeof(unsigned long long)));
    KRON_CUDA_CHECK(cudaMemcpy(d_ct, h_ct, ct_sz, cudaMemcpyHostToDevice));
    KRON_CUDA_CHECK(cudaMemcpy(d_z, h_z, C*sizeof(double), cudaMemcpyHostToDevice));

    /* reduce_stats for slab j reads exactly the cells kronecker_slab wrote
       for the same j, so the buffer only needs zeroing once, not per slab. */
    KRON_CUDA_CHECK(cudaMemset(d_out, 0, slab_bytes));

    const int64_t blocks64 = ((int64_t)P*P + BLOCK - 1) / BLOCK;
    if (blocks64 > 2147483647LL) {
        fprintf(stderr, "P=%d needs more blocks than a 1-D grid allows\n", P);
        return EXIT_FAILURE;
    }
    const int blocks = (int)blocks64;

    char ckpt[256];
    snprintf(ckpt, 256, "scripts/experiments/kronecker-coefficients/results/checkpoint_n%d.txt", n);

    unsigned long long total_nz = 0, global_max = 0;
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);

    for (int j = 0; j < P; j++) {
        kronecker_slab<<<blocks, BLOCK>>>(d_ct, d_z, P, C, j, d_out);
        KRON_CUDA_CHECK(cudaGetLastError());

        KRON_CUDA_CHECK(cudaMemset(d_stats, 0, 2*sizeof(unsigned long long)));
        reduce_stats<<<blocks, BLOCK>>>(d_out, P, j, d_stats, d_stats + 1);
        KRON_CUDA_CHECK(cudaGetLastError());

        /* Blocking copy: also surfaces any asynchronous kernel failure. */
        unsigned long long slab_stats[2];
        KRON_CUDA_CHECK(cudaMemcpy(slab_stats, d_stats, sizeof(slab_stats), cudaMemcpyDeviceToHost));
        total_nz += slab_stats[0];
        if (slab_stats[1] > global_max) global_max = slab_stats[1];

        if (j % 500 == 0 || j == P-1) {
            clock_gettime(CLOCK_MONOTONIC, &t1);
            double el = (t1.tv_sec-t0.tv_sec)+(t1.tv_nsec-t0.tv_nsec)/1e9;
            double eta = j>0 ? el*(P-j)/j : 0;
            printf(" j=%d/%d (%.0f%%) %llu nz, max=%llu, %.0fs, ETA %.0fs\n",
                   j, P, 100.0*j/P, total_nz, global_max, el, eta);
            fflush(stdout);

            // Checkpoint: save running stats so partial results survive if killed.
            // Best effort: a checkpoint that cannot be opened is silently skipped.
            FILE *fc_out = fopen(ckpt, "w");
            if (fc_out) {
                fprintf(fc_out, "n=%d\nP=%d\nslab=%d/%d\nnonzero=%llu\nmax=%llu\nelapsed=%.1f\n",
                        n, P, j+1, P, total_nz, global_max, el);
                fclose(fc_out);
            }
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double total = (t1.tv_sec-t0.tv_sec)+(t1.tv_nsec-t0.tv_nsec)/1e9;
    printf("\n========================================\n");
    printf("RESULTS\n");
    printf("========================================\n");
    printf("S_%d Kronecker (GPU-only)\nP=%d, nonzero=%llu, max=%llu\nTime: %.1fs\n",
           n, P, total_nz, global_max, total);
    printf("========================================\n");

    // Clean up checkpoint
    remove(ckpt);
    free(h_ct); free(h_z);
    cudaFree(d_ct); cudaFree(d_z); cudaFree(d_out); cudaFree(d_stats);
    return 0;
}
scripts/test.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""CPU-only verification test for Kronecker Coefficients.

Evaluates g(i, j, k) = round(sum_c z_inv[c] * ct[i][c] * ct[j][c] * ct[k][c])
on the small 3x3 table below and compares it with hand-listed expected
values. Exits with status 1 when any case fails so the script can gate CI.
"""
print("Testing kronecker-cuda...")

# Rows are indexed as in the test comments below: 0 -> [3], 1 -> [2,1], 2 -> [1,1,1].
ct = [[1, 1, 1], [2, 0, -1], [1, -1, 1]]
# One weight per column of ct.
z_inv = [1/6, 1/2, 1/3]


def g(i, j, k):
    """Return the rounded weighted triple product of rows i, j and k of ct."""
    return round(sum(w * a * b * c
                     for w, a, b, c in zip(z_inv, ct[i], ct[j], ct[k])))


tests = [
    (0,0,0, 1),  # g([3],[3],[3]) = trivial
    (0,1,1, 1),  # g([3],[2,1],[2,1])
    (1,1,0, 1),  # g([2,1],[2,1],[3])
    (1,1,1, 1),  # g([2,1],[2,1],[2,1])
    (1,1,2, 1),  # g([2,1],[2,1],[1,1,1]) = 1 (sign rep tensor)
    (2,2,0, 1),  # g([1^3],[1^3],[3])
]
passed = 0
for i, j, k, expected in tests:
    got = g(i, j, k)
    ok = got == expected
    print(f"  {'PASS' if ok else 'FAIL'}: g({i},{j},{k}) = {got} (expected {expected})")
    if ok:
        passed += 1

print(f"\n{passed}/{len(tests)} tests passed")

# A failing case must produce a non-zero exit status; previously the script
# always exited 0 regardless of the outcome.
if passed != len(tests):
    raise SystemExit(1)
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #include <torch/extension.h>
2
+ #include "torch_binding.h"
3
+
4
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // pybind11 entry point for the extension module; `m` is the module object being initialised
5
+ m.doc() = "Kronecker Coefficients (Symmetric Group) CUDA kernel"; // only the module docstring is set; no functions are registered in this block
6
+ }
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #pragma once
2
+ #include <torch/torch.h>
3
+ // See kronecker/kronecker_gpu.cu for kernel API