Instructions to use cahlen/kronecker-cuda with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Kernels
How to use cahlen/kronecker-cuda with Kernels:
# !pip install kernels
from kernels import get_kernel
kernel = get_kernel("cahlen/kronecker-cuda")
- Notebooks
- Google Colab
- Kaggle
CUDA kernel: kronecker-cuda
Browse files- README.md +52 -0
- build.toml +12 -0
- kronecker/kronecker_gpu.cu +117 -0
- scripts/test.py +24 -0
- torch-ext/torch_binding.cpp +6 -0
- torch-ext/torch_binding.h +3 -0
README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- kernels
|
| 5 |
+
- cuda
|
| 6 |
+
- kronecker-coefficients
|
| 7 |
+
- symmetric-group
|
| 8 |
+
- representation-theory
|
| 9 |
+
- combinatorics
|
| 10 |
+
datasets:
|
| 11 |
+
- cahlen/kronecker-coefficients
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Kronecker Coefficients (Symmetric Group)
|
| 15 |
+
|
| 16 |
+
Computes Kronecker coefficients g(lambda,mu,nu) for S_n from character tables via GPU-parallel triple-sum.
|
| 17 |
+
|
| 18 |
+
## Usage
|
| 19 |
+
|
| 20 |
+
```python
|
| 21 |
+
import torch
|
| 22 |
+
from kernels import get_kernel
|
| 23 |
+
|
| 24 |
+
kernel = get_kernel("cahlen/kronecker-cuda")
|
| 25 |
+
result = kernel.slab(char_table, z_inv, j=0)
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Compile (standalone)
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
nvcc -O3 -arch=sm_90 -o kronecker kronecker/kronecker_gpu.cu -lm
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Results
|
| 35 |
+
|
| 36 |
+
All computation results are open:
|
| 37 |
+
- **Website**: [bigcompute.science](https://bigcompute.science)
|
| 38 |
+
- **Datasets**: [huggingface.co/cahlen](https://huggingface.co/cahlen)
|
| 39 |
+
- **Source**: [github.com/cahlen/idontknow](https://github.com/cahlen/idontknow)
|
| 40 |
+
|
| 41 |
+
## Citation
|
| 42 |
+
|
| 43 |
+
```bibtex
|
| 44 |
+
@misc{humphreys2026bigcompute,
|
| 45 |
+
author = {Humphreys, Cahlen},
|
| 46 |
+
title = {bigcompute.science: GPU-Accelerated Computational Mathematics},
|
| 47 |
+
year = {2026},
|
| 48 |
+
url = {https://bigcompute.science}
|
| 49 |
+
}
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
*Human-AI collaborative. Not peer-reviewed. All code and data open.*
|
build.toml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[general]
|
| 2 |
+
name = "kronecker"
|
| 3 |
+
universal = false
|
| 4 |
+
|
| 5 |
+
[torch]
|
| 6 |
+
src = ["torch-ext/torch_binding.cpp", "torch-ext/torch_binding.h"]
|
| 7 |
+
|
| 8 |
+
[kernel.kronecker]
|
| 9 |
+
backend = "cuda"
|
| 10 |
+
cuda-capabilities = ["8.0", "9.0", "10.0", "12.0"]
|
| 11 |
+
src = ["kronecker/kronecker_gpu.cu"]
|
| 12 |
+
depends = ["torch"]
|
kronecker/kronecker_gpu.cu
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <stdio.h>
|
| 2 |
+
#include <stdlib.h>
|
| 3 |
+
#include <stdint.h>
|
| 4 |
+
#include <time.h>
|
| 5 |
+
|
| 6 |
+
#define BLOCK 256
|
| 7 |
+
|
| 8 |
+
// Computes one "slab" (fixed middle index j) of the coefficient table:
//
//   out[i*P + k] = llround( sum_{c < C} z[c] * ct[i*C + c] * ct[j*C + c] * ct[k*C + c] )
//
// for every pair (i, k) with i <= j <= k < P. Cells outside that range are
// left untouched.
//
// Parameters:
//   ct   P x C table of int64 values, row-major (read as ct[row*C + c]).
//        The host driver loads it from the char_table_n*.bin file.
//   z    C double weights, one per column (loaded from z_inv_n*.bin).
//   P,C  table dimensions.
//   j    row index held fixed for this slab.
//   out  P x P int64 buffer, row-major.
//
// Launch layout: 1-D grid with one thread per (i, k) cell, so the launch
// must supply at least P*P threads; surplus threads return immediately.
// No shared memory is used.
__global__ void kronecker_slab(
    const int64_t *__restrict__ ct,
    const double *__restrict__ z,
    int P, int C, int j,
    int64_t *__restrict__ out)
{
    // Build the flat index in 64-bit arithmetic: blockIdx.x * blockDim.x is
    // otherwise a 32-bit product, and an int result wraps once the grid
    // covers 2^31 or more cells, sending threads to the wrong (i, k).
    const int64_t tid = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const int64_t i = tid / P;
    const int64_t k = tid % P;
    if (i > j || k < j || i >= P) return;

    const int64_t *row_i = ct + i * C;
    const int64_t *row_j = ct + (int64_t)j * C;
    const int64_t *row_k = ct + k * C;

    // Accumulate in double, in the same left-to-right order as the original
    // expression, then round to the nearest integer.
    double sum = 0.0;
    for (int c = 0; c < C; c++)
        sum += z[c] * (double)row_i[c] * (double)row_j[c] * (double)row_k[c];

    out[i * P + k] = llround(sum);
}
|
| 23 |
+
|
| 24 |
+
// Accumulates statistics over the cells of one slab, i.e. the (i, k) pairs
// with i <= j <= k < P of the row-major P x P buffer `slab`:
//   *nz  is incremented once for every non-zero cell,
//   *mx  is raised to the largest absolute value seen.
// Both counters are updated with atomics and are NOT reset here; the caller
// zeroes them before the launch.
//
// Launch layout: 1-D grid with one thread per (i, k) cell (at least P*P
// threads); surplus threads return immediately.
__global__ void reduce_stats(const int64_t *slab, int P, int j,
                             unsigned long long *nz, unsigned long long *mx)
{
    // 64-bit flat index so the decomposition into (i, k) stays correct when
    // the grid covers 2^31 or more cells.
    const int64_t tid = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const int64_t i = tid / P;
    const int64_t k = tid % P;
    if (i > j || k < j || i >= P) return;

    const int64_t v = slab[i * P + k];
    if (v != 0) {
        atomicAdd(nz, 1ULL);
        // Take the magnitude in unsigned arithmetic: negating INT64_MIN as a
        // signed value would overflow.
        const unsigned long long av =
            v < 0 ? 0ULL - (unsigned long long)v : (unsigned long long)v;
        atomicMax(mx, av);
    }
}
|
| 38 |
+
|
| 39 |
+
// Aborts with a diagnostic when a CUDA runtime call (or kernel launch) failed.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Opens `path` for binary reading and stores its size in bytes in *size.
// Exits with a diagnostic if the file cannot be opened or measured.
static FILE *open_input(const char *path, long *size)
{
    FILE *f = fopen(path, "rb");
    if (!f) {
        perror(path);
        exit(EXIT_FAILURE);
    }
    if (fseek(f, 0, SEEK_END) != 0 || (*size = ftell(f)) < 0 || fseek(f, 0, SEEK_SET) != 0) {
        perror(path);
        exit(EXIT_FAILURE);
    }
    return f;
}

// Reads exactly `bytes` bytes from `f` into `dst`, then closes `f`.
// Exits with a diagnostic on a short read.
static void read_all(void *dst, size_t bytes, FILE *f, const char *label)
{
    if (fread(dst, 1, bytes, f) != bytes) {
        fprintf(stderr, "Short read while loading %s\n", label);
        exit(EXIT_FAILURE);
    }
    fclose(f);
}

// Host driver.
//
// Usage: kronecker <n> [gpu]
//
// Loads the table (char_table_n<n>.bin, int64) and the per-column weights
// (z_inv_n<n>.bin, double) from the results directory, then for every slab
// index j launches kronecker_slab followed by reduce_stats and accumulates
// the number of non-zero coefficients and the largest absolute value.
// Progress is printed every 500 slabs, and the running totals are written to
// a checkpoint text file that is removed on successful completion.
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <n> [gpu]\n", argc > 0 ? argv[0] : "kronecker");
        return EXIT_FAILURE;
    }
    int n = atoi(argv[1]);
    int gpu = argc > 2 ? atoi(argv[2]) : 0;
    check_cuda(cudaSetDevice(gpu), "cudaSetDevice");

    // ---- Load inputs -------------------------------------------------------
    char path[256];
    long ct_sz = 0, z_sz = 0;
    snprintf(path, sizeof path, "scripts/experiments/kronecker-coefficients/results/char_table_n%d.bin", n);
    FILE *fc = open_input(path, &ct_sz);
    snprintf(path, sizeof path, "scripts/experiments/kronecker-coefficients/results/z_inv_n%d.bin", n);
    FILE *fz = open_input(path, &z_sz);

    // C = number of columns, derived from the weight file; P = number of rows,
    // derived from the table size. Reject sizes that would make either zero
    // (previously a division by zero) or leave a partial row.
    long n_cols = z_sz / (long)sizeof(double);
    if (n_cols <= 0 || n_cols > 2147483647L || z_sz % (long)sizeof(double) != 0) {
        fprintf(stderr, "z_inv file for n=%d is empty or malformed (%ld bytes)\n", n, z_sz);
        return EXIT_FAILURE;
    }
    int C = (int)n_cols;
    long row_bytes = (long)C * (long)sizeof(int64_t);
    long n_rows = ct_sz / row_bytes;
    if (n_rows <= 0 || n_rows > 2147483647L || ct_sz % row_bytes != 0) {
        fprintf(stderr, "char_table file for n=%d is empty or inconsistent with %d classes (%ld bytes)\n",
                n, C, ct_sz);
        return EXIT_FAILURE;
    }
    int P = (int)n_rows;

    int64_t *h_ct = (int64_t*)malloc((size_t)ct_sz);
    double *h_z = (double*)malloc((size_t)C * sizeof(double));
    if (!h_ct || !h_z) {
        fprintf(stderr, "Out of host memory\n");
        return EXIT_FAILURE;
    }
    read_all(h_ct, (size_t)ct_sz, fc, "character table");
    read_all(h_z, (size_t)C * sizeof(double), fz, "z_inv");
    printf("S_%d: %d partitions, %d classes — ALL GPU\n", n, P, C);
    fflush(stdout);

    // ---- Device buffers ----------------------------------------------------
    const int64_t cells = (int64_t)P * P;
    const int64_t n_blocks = (cells + BLOCK - 1) / BLOCK;
    if (n_blocks > 2147483647LL) {
        fprintf(stderr, "P=%d needs too many blocks for a 1-D grid\n", P);
        return EXIT_FAILURE;
    }
    const int blocks = (int)n_blocks;
    const size_t out_bytes = (size_t)cells * sizeof(int64_t);

    int64_t *d_ct, *d_out; double *d_z;
    unsigned long long *d_nz, *d_mx;
    check_cuda(cudaMalloc(&d_ct, (size_t)ct_sz), "cudaMalloc(d_ct)");
    check_cuda(cudaMalloc(&d_z, (size_t)C * sizeof(double)), "cudaMalloc(d_z)");
    check_cuda(cudaMalloc(&d_out, out_bytes), "cudaMalloc(d_out)");
    check_cuda(cudaMalloc(&d_nz, sizeof(unsigned long long)), "cudaMalloc(d_nz)");
    check_cuda(cudaMalloc(&d_mx, sizeof(unsigned long long)), "cudaMalloc(d_mx)");
    check_cuda(cudaMemcpy(d_ct, h_ct, (size_t)ct_sz, cudaMemcpyHostToDevice), "cudaMemcpy(d_ct)");
    check_cuda(cudaMemcpy(d_z, h_z, (size_t)C * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy(d_z)");

    // Zero the output once. Every cell reduce_stats reads for slab j
    // (i <= j <= k) is rewritten by kronecker_slab for that same j, so
    // clearing the whole P x P buffer again on every iteration is unnecessary.
    check_cuda(cudaMemset(d_out, 0, out_bytes), "cudaMemset(d_out)");

    char ckpt[256];
    snprintf(ckpt, sizeof ckpt, "scripts/experiments/kronecker-coefficients/results/checkpoint_n%d.txt", n);

    // ---- Slab loop ---------------------------------------------------------
    unsigned long long total_nz = 0, global_max = 0;
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);

    for (int j = 0; j < P; j++) {
        kronecker_slab<<<blocks, BLOCK>>>(d_ct, d_z, P, C, j, d_out);
        check_cuda(cudaGetLastError(), "kronecker_slab launch");

        check_cuda(cudaMemset(d_nz, 0, sizeof(unsigned long long)), "cudaMemset(d_nz)");
        check_cuda(cudaMemset(d_mx, 0, sizeof(unsigned long long)), "cudaMemset(d_mx)");
        reduce_stats<<<blocks, BLOCK>>>(d_out, P, j, d_nz, d_mx);
        check_cuda(cudaGetLastError(), "reduce_stats launch");

        // The blocking copies wait for both kernels and surface any fault
        // raised while they were executing.
        unsigned long long slab_nz = 0, slab_mx = 0;
        check_cuda(cudaMemcpy(&slab_nz, d_nz, sizeof(unsigned long long), cudaMemcpyDeviceToHost),
                   "cudaMemcpy(slab_nz)");
        check_cuda(cudaMemcpy(&slab_mx, d_mx, sizeof(unsigned long long), cudaMemcpyDeviceToHost),
                   "cudaMemcpy(slab_mx)");
        total_nz += slab_nz;
        if (slab_mx > global_max) global_max = slab_mx;

        if (j % 500 == 0 || j == P-1) {
            clock_gettime(CLOCK_MONOTONIC, &t1);
            double el = (t1.tv_sec-t0.tv_sec)+(t1.tv_nsec-t0.tv_nsec)/1e9;
            double eta = j>0 ? el*(P-j)/j : 0;
            printf("  j=%d/%d (%.0f%%) %llu nz, max=%llu, %.0fs, ETA %.0fs\n",
                   j, P, 100.0*j/P, total_nz, global_max, el, eta);
            fflush(stdout);

            // Checkpoint: save running stats so partial results survive if killed.
            // Best effort: a checkpoint that cannot be written is skipped.
            FILE *fc_out = fopen(ckpt, "w");
            if (fc_out) {
                fprintf(fc_out, "n=%d\nP=%d\nslab=%d/%d\nnonzero=%llu\nmax=%llu\nelapsed=%.1f\n",
                        n, P, j+1, P, total_nz, global_max, el);
                fclose(fc_out);
            }
        }
    }

    // ---- Report ------------------------------------------------------------
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double total = (t1.tv_sec-t0.tv_sec)+(t1.tv_nsec-t0.tv_nsec)/1e9;
    printf("\n========================================\n");
    printf("RESULTS\n");
    printf("========================================\n");
    printf("S_%d Kronecker (GPU-only)\nP=%d, nonzero=%llu, max=%llu\nTime: %.1fs\n",
           n, P, total_nz, global_max, total);
    printf("========================================\n");

    // Clean up checkpoint
    remove(ckpt);
    free(h_ct); free(h_z);
    cudaFree(d_ct); cudaFree(d_z); cudaFree(d_out); cudaFree(d_nz); cudaFree(d_mx);
    return 0;
}
|
scripts/test.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CPU-only verification test for Kronecker Coefficients"""
|
| 2 |
+
print("Testing kronecker-cuda...")
|
| 3 |
+
# Character table of S_3. Rows are the partitions [3], [2,1], [1,1,1]
# (indices 0, 1, 2, as labelled in the test cases below); each column is one
# conjugacy class.
ct = [[1,1,1],[2,0,-1],[1,-1,1]]
# Per-class weights: the reciprocals 1/6, 1/2, 1/3 paired column-by-column with ct.
z_inv = [1/6, 1/2, 1/3]


def g(i, j, k):
    """Return the Kronecker coefficient for rows i, j, k of the table.

    Computed as round(sum_c z_inv[c] * ct[i][c] * ct[j][c] * ct[k][c]),
    the same triple-product sum the CUDA kernel evaluates per cell.
    """
    return round(sum(z_inv[c] * ct[i][c] * ct[j][c] * ct[k][c] for c in range(len(z_inv))))


# (i, j, k, expected) cases.
tests = [
    (0,0,0, 1),  # g([3],[3],[3]) = trivial
    (0,1,1, 1),  # g([3],[2,1],[2,1])
    (1,1,0, 1),  # g([2,1],[2,1],[3])
    (1,1,1, 1),  # g([2,1],[2,1],[2,1])
    (1,1,2, 1),  # g([2,1],[2,1],[1,1,1]) = 1 (sign rep tensor)
    (2,2,0, 1),  # g([1^3],[1^3],[3])
    # Vanishing coefficients, so an implementation that always returns 1 fails.
    (0,0,1, 0),  # g([3],[3],[2,1])
    (0,1,2, 0),  # g([3],[2,1],[1,1,1])
]

passed = 0
for i, j, k, expected in tests:
    got = g(i, j, k)
    ok = got == expected
    print(f"  {'PASS' if ok else 'FAIL'}: g({i},{j},{k}) = {got} (expected {expected})")
    if ok:
        passed += 1

print(f"\n{passed}/{len(tests)} tests passed")

# Signal failure through the exit status so CI and callers can detect it.
if passed != len(tests):
    raise SystemExit(1)
|
torch-ext/torch_binding.cpp
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <torch/extension.h>
|
| 2 |
+
#include "torch_binding.h"
|
| 3 |
+
|
| 4 |
+
// Python extension module definition. As written it only sets the module
// docstring: no functions are registered on `m`, so nothing from
// kronecker/kronecker_gpu.cu is callable through this binding.
// NOTE(review): the README usage example calls a `slab(...)` function —
// confirm whether a binding for it is meant to be added here.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
| 5 |
+
m.doc() = "Kronecker Coefficients (Symmetric Group) CUDA kernel";
|
| 6 |
+
}
|
torch-ext/torch_binding.h
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include <torch/torch.h>
|
| 3 |
+
// See kronecker/kronecker_gpu.cu for kernel API
|