CUDA kernel: kronecker-cuda

Browse files

Files changed (6) hide show

README.md +52 -0
build.toml +12 -0
kronecker/kronecker_gpu.cu +117 -0
scripts/test.py +24 -0
torch-ext/torch_binding.cpp +6 -0
torch-ext/torch_binding.h +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,52 @@

+---
+license: mit
+tags:
+  - kernels
+  - cuda
+  - kronecker-coefficients
+  - symmetric-group
+  - representation-theory
+  - combinatorics
+datasets:
+  - cahlen/kronecker-coefficients
+---
+# Kronecker Coefficients (Symmetric Group)
+Computes Kronecker coefficients g(lambda,mu,nu) for S_n from character tables via GPU-parallel triple-sum.
+## Usage
+```python
+import torch
+from kernels import get_kernel
+kernel = get_kernel("cahlen/kronecker-cuda")
+result = kernel.slab(char_table, z_inv, j=0)
+```
+## Compile (standalone)
+```bash
+nvcc -O3 -arch=sm_90 -o kronecker_gpu kronecker/kronecker_gpu.cu -lm
+```
+## Results
+All computation results are open:
+- **Website**: [bigcompute.science](https://bigcompute.science)
+- **Datasets**: [huggingface.co/cahlen](https://huggingface.co/cahlen)
+- **Source**: [github.com/cahlen/idontknow](https://github.com/cahlen/idontknow)
+## Citation
+```bibtex
+@misc{humphreys2026bigcompute,
+  author = {Humphreys, Cahlen},
+  title = {bigcompute.science: GPU-Accelerated Computational Mathematics},
+  year = {2026},
+  url = {https://bigcompute.science}
+}
+```
+*Human-AI collaborative. Not peer-reviewed. All code and data open.*

build.toml ADDED Viewed

	@@ -0,0 +1,12 @@

+[general]
+name = "kronecker"
+universal = false
+[torch]
+src = ["torch-ext/torch_binding.cpp", "torch-ext/torch_binding.h"]
+[kernel.kronecker]
+backend = "cuda"
+cuda-capabilities = ["8.0", "9.0", "10.0", "12.0"]
+src = ["kronecker/kronecker_gpu.cu"]
+depends = ["torch"]

kronecker/kronecker_gpu.cu ADDED Viewed

	@@ -0,0 +1,117 @@

+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <time.h>
+#define BLOCK 256
+/*
+ * Compute one slab (fixed middle index j) of the weighted triple sum
+ *
+ *     out[i*P + k] = llround( sum_c z[c] * ct[i*C+c] * ct[j*C+c] * ct[k*C+c] )
+ *
+ * for every pair (i, k) with i <= j <= k < P. Entries outside that range
+ * are left untouched.
+ *
+ * Launch layout: 1-D grid, one thread per (i, k) pair of the P x P slab,
+ * i.e. at least P*P threads in total. No shared memory is used.
+ *
+ *   ct  - read as P rows of C values (row-major, ct[row*C + c])
+ *   z   - C per-column weights
+ *   P   - number of rows of ct / side length of the output slab
+ *   C   - number of columns of ct
+ *   j   - fixed middle index of this slab
+ *   out - P x P output slab, written at out[i*P + k]
+ */
+__global__ void kronecker_slab(
+    const int64_t *__restrict__ ct,
+    const double  *__restrict__ z,
+    int P, int C, int j,
+    int64_t *__restrict__ out)
+{
+    // The flat index must be 64-bit: the launch spans P*P threads, and the
+    // 32-bit product blockIdx.x * blockDim.x wraps once P*P reaches 2^31,
+    // which would silently skip the affected entries.
+    int64_t tid = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i = tid / P;
+    int64_t k = tid % P;
+    // Only i <= j <= k is computed; i >= P also discards the grid tail.
+    if (i > j || k < j || i >= P) return;
+    const int64_t *row_i = ct + i * C;
+    const int64_t *row_j = ct + (int64_t)j * C;
+    const int64_t *row_k = ct + k * C;
+    double sum = 0.0;
+    for (int c = 0; c < C; c++)
+        sum += z[c] * (double)row_i[c] * (double)row_j[c] * (double)row_k[c];
+    // The accumulation is done in double; round to the nearest integer.
+    out[i * P + k] = llround(sum);
+}
+/*
+ * Accumulate statistics over the entries of one slab with i <= j <= k < P:
+ * the number of non-zero entries is added to *nz, and *mx is raised to the
+ * largest absolute value seen.
+ *
+ * Both counters are updated with atomics and are never reset here, so the
+ * caller must initialise *nz and *mx before the launch.
+ *
+ * Launch layout: 1-D grid, one thread per (i, k) pair, at least P*P threads.
+ */
+__global__ void reduce_stats(const int64_t *slab, int P, int j,
+                             unsigned long long *nz, unsigned long long *mx)
+{
+    // 64-bit flat index: the 32-bit product blockIdx.x * blockDim.x wraps
+    // once P*P reaches 2^31, which would drop entries from the statistics.
+    int64_t tid = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i = tid / P;
+    int64_t k = tid % P;
+    if (i > j || k < j || i >= P) return;
+    int64_t v = slab[i * P + k];
+    if (v != 0) {
+        atomicAdd(nz, 1ULL);
+        // Take the magnitude in unsigned arithmetic so that INT64_MIN does
+        // not trigger signed-overflow undefined behaviour.
+        unsigned long long av = v > 0 ? (unsigned long long)v
+                                      : 0ULL - (unsigned long long)v;
+        atomicMax(mx, av);
+    }
+}
+/* Exit with a readable message if a CUDA runtime call did not succeed. */
+static void kron_cuda_check(cudaError_t err, const char *what) {
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+/* Read an entire binary file into a malloc'd buffer; exits on any failure. */
+static void *kron_read_file(const char *path, long *size_out) {
+    FILE *f = fopen(path, "rb");
+    if (!f) { perror(path); exit(EXIT_FAILURE); }
+    if (fseek(f, 0, SEEK_END) != 0) { perror(path); exit(EXIT_FAILURE); }
+    long sz = ftell(f);
+    if (sz <= 0 || fseek(f, 0, SEEK_SET) != 0) {
+        fprintf(stderr, "%s: empty or unreadable file\n", path);
+        exit(EXIT_FAILURE);
+    }
+    void *buf = malloc((size_t)sz);
+    if (!buf) {
+        fprintf(stderr, "%s: out of memory (%ld bytes)\n", path, sz);
+        exit(EXIT_FAILURE);
+    }
+    if (fread(buf, 1, (size_t)sz, f) != (size_t)sz) {
+        fprintf(stderr, "%s: short read\n", path);
+        exit(EXIT_FAILURE);
+    }
+    fclose(f);
+    *size_out = sz;
+    return buf;
+}
+/*
+ * Standalone driver.
+ *
+ * Usage: <program> <n> [gpu]
+ *
+ * Loads char_table_n<n>.bin (int64 values) and z_inv_n<n>.bin (doubles) from
+ * the results directory, derives C from the z_inv file size and P from the
+ * character-table file size, then for every j in [0, P) launches
+ * kronecker_slab followed by reduce_stats and accumulates the non-zero count
+ * and the maximum absolute value across all slabs. Progress is printed every
+ * 500 slabs and mirrored to a checkpoint text file, which is removed after a
+ * successful run.
+ */
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <n> [gpu]\n", argc > 0 ? argv[0] : "kronecker");
+        return EXIT_FAILURE;
+    }
+    int n = atoi(argv[1]);
+    int gpu = argc > 2 ? atoi(argv[2]) : 0;
+    kron_cuda_check(cudaSetDevice(gpu), "cudaSetDevice");
+    char path[256];
+    snprintf(path, 256, "scripts/experiments/kronecker-coefficients/results/char_table_n%d.bin", n);
+    long ct_sz = 0;
+    int64_t *h_ct = (int64_t*)kron_read_file(path, &ct_sz);
+    snprintf(path, 256, "scripts/experiments/kronecker-coefficients/results/z_inv_n%d.bin", n);
+    long z_sz = 0;
+    double *h_z = (double*)kron_read_file(path, &z_sz);
+    int C = (int)(z_sz / (long)sizeof(double));
+    if (C <= 0) {
+        fprintf(stderr, "%s: holds no complete double\n", path);
+        return EXIT_FAILURE;
+    }
+    int P = (int)((size_t)ct_sz / ((size_t)C * sizeof(int64_t)));
+    if (P <= 0) {
+        fprintf(stderr, "character table is smaller than one row of %d classes\n", C);
+        return EXIT_FAILURE;
+    }
+    printf("S_%d: %d partitions, %d classes — ALL GPU\n", n, P, C);
+    fflush(stdout);
+    // The grid's x dimension must fit in a positive 32-bit int.
+    int64_t blocks64 = ((int64_t)P*P + BLOCK - 1) / BLOCK;
+    if (blocks64 > 2147483647LL) {
+        fprintf(stderr, "P=%d needs %lld blocks, which exceeds the 1-D grid limit\n",
+                P, (long long)blocks64);
+        return EXIT_FAILURE;
+    }
+    int blocks = (int)blocks64;
+    size_t slab_bytes = (size_t)P * P * sizeof(int64_t);
+    int64_t *d_ct, *d_out; double *d_z;
+    unsigned long long *d_nz, *d_mx;
+    kron_cuda_check(cudaMalloc(&d_ct, ct_sz), "cudaMalloc d_ct");
+    kron_cuda_check(cudaMalloc(&d_z, C*sizeof(double)), "cudaMalloc d_z");
+    kron_cuda_check(cudaMalloc(&d_out, slab_bytes), "cudaMalloc d_out");
+    kron_cuda_check(cudaMalloc(&d_nz, sizeof(unsigned long long)), "cudaMalloc d_nz");
+    kron_cuda_check(cudaMalloc(&d_mx, sizeof(unsigned long long)), "cudaMalloc d_mx");
+    kron_cuda_check(cudaMemcpy(d_ct, h_ct, ct_sz, cudaMemcpyHostToDevice), "copy char table");
+    kron_cuda_check(cudaMemcpy(d_z, h_z, C*sizeof(double), cudaMemcpyHostToDevice), "copy z_inv");
+    // The checkpoint path only depends on n, so build it once.
+    char ckpt[256];
+    snprintf(ckpt, 256, "scripts/experiments/kronecker-coefficients/results/checkpoint_n%d.txt", n);
+    unsigned long long total_nz = 0, global_max = 0;
+    struct timespec t0, t1;
+    clock_gettime(CLOCK_MONOTONIC, &t0);
+    for (int j = 0; j < P; j++) {
+        kron_cuda_check(cudaMemset(d_out, 0, slab_bytes), "cudaMemset d_out");
+        kronecker_slab<<<blocks, BLOCK>>>(d_ct, d_z, P, C, j, d_out);
+        kron_cuda_check(cudaGetLastError(), "launch kronecker_slab");
+        // reduce_stats accumulates with atomics, so reset both counters per slab.
+        unsigned long long zero = 0;
+        kron_cuda_check(cudaMemcpy(d_nz, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice), "reset nz");
+        kron_cuda_check(cudaMemcpy(d_mx, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice), "reset mx");
+        reduce_stats<<<blocks, BLOCK>>>(d_out, P, j, d_nz, d_mx);
+        kron_cuda_check(cudaGetLastError(), "launch reduce_stats");
+        unsigned long long slab_nz, slab_mx;
+        // These blocking copies also surface any asynchronous kernel fault.
+        kron_cuda_check(cudaMemcpy(&slab_nz, d_nz, sizeof(unsigned long long), cudaMemcpyDeviceToHost), "read nz");
+        kron_cuda_check(cudaMemcpy(&slab_mx, d_mx, sizeof(unsigned long long), cudaMemcpyDeviceToHost), "read mx");
+        total_nz += slab_nz;
+        if (slab_mx > global_max) global_max = slab_mx;
+        if (j % 500 == 0 || j == P-1) {
+            clock_gettime(CLOCK_MONOTONIC, &t1);
+            double el = (t1.tv_sec-t0.tv_sec)+(t1.tv_nsec-t0.tv_nsec)/1e9;
+            double eta = j>0 ? el*(P-j)/j : 0;
+            printf("  j=%d/%d (%.0f%%) %llu nz, max=%llu, %.0fs, ETA %.0fs\n",
+                   j, P, 100.0*j/P, total_nz, global_max, el, eta);
+            fflush(stdout);
+            // Checkpoint: save running stats so partial results survive if killed
+            FILE *fc_out = fopen(ckpt, "w");
+            if (fc_out) {
+                fprintf(fc_out, "n=%d\nP=%d\nslab=%d/%d\nnonzero=%llu\nmax=%llu\nelapsed=%.1f\n",
+                        n, P, j+1, P, total_nz, global_max, el);
+                fclose(fc_out);
+            }
+        }
+    }
+    clock_gettime(CLOCK_MONOTONIC, &t1);
+    double total = (t1.tv_sec-t0.tv_sec)+(t1.tv_nsec-t0.tv_nsec)/1e9;
+    printf("\n========================================\n");
+    printf("RESULTS\n");
+    printf("========================================\n");
+    printf("S_%d Kronecker (GPU-only)\nP=%d, nonzero=%llu, max=%llu\nTime: %.1fs\n",
+           n, P, total_nz, global_max, total);
+    printf("========================================\n");
+    // Clean up checkpoint
+    remove(ckpt);
+    free(h_ct); free(h_z);
+    cudaFree(d_ct); cudaFree(d_z); cudaFree(d_out); cudaFree(d_nz); cudaFree(d_mx);
+    return 0;
+}

scripts/test.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""CPU-only verification test for Kronecker Coefficients"""
+print("Testing kronecker-cuda...")
+ct = [[1,1,1],[2,0,-1],[1,-1,1]]
+z_inv = [1/6, 1/2, 1/3]
+def g(i, j, k):
+    return round(sum(z_inv[c] * ct[i][c] * ct[j][c] * ct[k][c] for c in range(3)))
+tests = [
+    (0,0,0, 1),  # g([3],[3],[3]) = trivial
+    (0,1,1, 1),  # g([3],[2,1],[2,1])
+    (1,1,0, 1),  # g([2,1],[2,1],[3])
+    (1,1,1, 1),  # g([2,1],[2,1],[2,1])
+    (1,1,2, 1),  # g([2,1],[2,1],[1,1,1]) = 1 (sign rep tensor)
+    (2,2,0, 1),  # g([1^3],[1^3],[3])
+]
+passed = 0
+for i,j,k,expected in tests:
+    got = g(i,j,k)
+    ok = got == expected
+    print(f"  {'PASS' if ok else 'FAIL'}: g({i},{j},{k}) = {got} (expected {expected})")
+    if ok: passed += 1
+print(f"\n{passed}/{len(tests)} tests passed")

torch-ext/torch_binding.cpp ADDED Viewed

	@@ -0,0 +1,6 @@

+#include <torch/extension.h>
+#include "torch_binding.h"
+// Python module definition for the extension. Only the module docstring is
+// set here; no functions are registered on `m` in this block.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.doc() = "Kronecker Coefficients (Symmetric Group) CUDA kernel";
+}

torch-ext/torch_binding.h ADDED Viewed

	@@ -0,0 +1,3 @@

+#pragma once
+#include <torch/torch.h>
+// See kronecker/kronecker_gpu.cu for kernel API