Add torch library bindings (guard main, add torch wrapper functions)

Browse files

Files changed (3) hide show

kronecker/kronecker_gpu.cu +39 -0
torch-ext/torch_binding.cpp +1 -0
torch-ext/torch_binding.h +2 -1

kronecker/kronecker_gpu.cu CHANGED Viewed

@@ -1,3 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
@@ -36,6 +40,8 @@ __global__ void reduce_stats(const int64_t *slab, int P, int j,
     }
 }
 int main(int argc, char **argv) {
     int n = atoi(argv[1]);
     int gpu = argc > 2 ? atoi(argv[2]) : 0;
@@ -115,3 +121,36 @@ int main(int argc, char **argv) {
     free(h_ct); free(h_z);
     cudaFree(d_ct); cudaFree(d_z); cudaFree(d_out); cudaFree(d_nz); cudaFree(d_mx);
 }

+#ifdef TORCH_EXTENSION_NAME
+#include <torch/torch.h>
+#endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
     }
 }
+#ifndef TORCH_EXTENSION_NAME
 int main(int argc, char **argv) {
     int n = atoi(argv[1]);
     int gpu = argc > 2 ? atoi(argv[2]) : 0;
     free(h_ct); free(h_z);
     cudaFree(d_ct); cudaFree(d_z); cudaFree(d_out); cudaFree(d_nz); cudaFree(d_mx);
 }
+#endif
+#ifdef TORCH_EXTENSION_NAME
+// Torch entry point: for every slab index j in [0, P) launches kronecker_slab
+// to fill a P x P int64 scratch buffer, then reduce_stats to fold that slab
+// into two device-side counters, and accumulates the counters on the host.
+//
+// Parameters:
+//   ct    - 2-D int64 CUDA tensor of shape (P, C).
+//   z_inv - float64 CUDA tensor on the same device as ct.
+//           NOTE(review): its expected length is not visible here (presumably
+//           C, one entry per column of ct) -- confirm against kronecker_slab.
+//
+// Returns two 1-element int64 CPU tensors: {sum over slabs of the first
+// counter, max over slabs of the second counter} (presumably the nonzero
+// count and the maximum coefficient -- confirm against reduce_stats).
+//
+// Raises (via TORCH_CHECK) on wrong dtype/device/rank, on sizes that do not
+// fit the kernels' int parameters, and on any CUDA launch/execution error.
+//
+// NOTE(review): the kernels are launched on the default stream of the
+// *current* device; callers using a non-default torch stream or a tensor on
+// a non-current device should confirm this is acceptable.
+std::vector<torch::Tensor> compute_kronecker(torch::Tensor ct, torch::Tensor z_inv) {
+    TORCH_CHECK(ct.is_cuda() && ct.dtype() == torch::kInt64, "ct must be int64 CUDA tensor");
+    TORCH_CHECK(ct.dim() == 2, "ct must be 2-D (P x C)");
+    TORCH_CHECK(z_inv.is_cuda() && z_inv.dtype() == torch::kFloat64, "z_inv must be float64 CUDA");
+    TORCH_CHECK(z_inv.device() == ct.device(), "ct and z_inv must be on the same CUDA device");
+
+    // The kernels take int sizes; reject anything that would be truncated.
+    TORCH_CHECK(ct.size(0) <= INT32_MAX && ct.size(1) <= INT32_MAX,
+                "ct dimensions must fit in a 32-bit int");
+    const int P = static_cast<int>(ct.size(0));
+    const int C = static_cast<int>(ct.size(1));
+
+    // One thread per slab cell; do the arithmetic in 64 bits so P * P cannot
+    // silently wrap before it is validated.
+    const int64_t cells = static_cast<int64_t>(P) * P;
+    TORCH_CHECK(cells <= INT32_MAX - 255, "P * P is too large for the kernel launch configuration");
+    const int nblocks = static_cast<int>((cells + 255) / 256);
+
+    // Raw pointers ignore strides, so the kernels must see dense storage.
+    ct = ct.contiguous();
+    z_inv = z_inv.contiguous();
+
+    const auto opts = torch::dtype(torch::kInt64).device(ct.device());
+    auto out = torch::zeros({P, P}, opts);
+    auto nz_dev = torch::zeros({1}, opts);
+    auto mx_dev = torch::zeros({1}, opts);
+
+    // Pointers are loop-invariant: the buffers are reused for every slab.
+    int64_t *ct_ptr = ct.data_ptr<int64_t>();
+    double *z_ptr = z_inv.data_ptr<double>();
+    int64_t *out_ptr = out.data_ptr<int64_t>();
+    unsigned long long *nz_ptr = reinterpret_cast<unsigned long long *>(nz_dev.data_ptr<int64_t>());
+    unsigned long long *mx_ptr = reinterpret_cast<unsigned long long *>(mx_dev.data_ptr<int64_t>());
+
+    int64_t total_nz = 0, global_max = 0;
+    for (int j = 0; j < P; j++) {
+        // Scratch slab and counters are per-slab, so clear them each round.
+        out.zero_(); nz_dev.zero_(); mx_dev.zero_();
+
+        kronecker_slab<<<nblocks, 256>>>(ct_ptr, z_ptr, P, C, j, out_ptr);
+        cudaError_t err = cudaGetLastError();
+        TORCH_CHECK(err == cudaSuccess, "kronecker_slab launch failed: ", cudaGetErrorString(err));
+
+        // Same stream as the previous launch, so no sync is needed in between.
+        reduce_stats<<<nblocks, 256>>>(out_ptr, P, j, nz_ptr, mx_ptr);
+        err = cudaGetLastError();
+        TORCH_CHECK(err == cudaSuccess, "reduce_stats launch failed: ", cudaGetErrorString(err));
+
+        // Surface asynchronous execution faults before reading the counters.
+        err = cudaDeviceSynchronize();
+        TORCH_CHECK(err == cudaSuccess, "kronecker kernels failed: ", cudaGetErrorString(err));
+
+        total_nz += nz_dev.item<int64_t>();
+        const int64_t sm = mx_dev.item<int64_t>();
+        if (sm > global_max) global_max = sm;
+    }
+    return {torch::tensor({total_nz}, torch::kInt64), torch::tensor({global_max}, torch::kInt64)};
+}
+#endif

torch-ext/torch_binding.cpp CHANGED Viewed

@@ -3,4 +3,5 @@
 // Defines the Python extension module whose name is supplied by the
 // TORCH_EXTENSION_NAME macro; at this revision it only sets the docstring.
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.doc() = "Kronecker Coefficients (Symmetric Group) CUDA kernel";
 }

 // Defines the Python extension module whose name is supplied by the
 // TORCH_EXTENSION_NAME macro, sets its docstring, and registers
 // compute_kronecker as a Python-callable taking keyword arguments
 // `ct` and `z_inv`.
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.doc() = "Kronecker Coefficients (Symmetric Group) CUDA kernel";
+  m.def("compute_kronecker", &compute_kronecker, py::arg("ct"), py::arg("z_inv"));
 }

torch-ext/torch_binding.h CHANGED Viewed

@@ -1,3 +1,4 @@
 #pragma once
 #include <torch/torch.h>
-// See kronecker/kronecker_gpu.cu for kernel API

 #pragma once
 #include <torch/torch.h>
+std::vector<torch::Tensor> compute_kronecker(torch::Tensor ct, torch::Tensor z_inv);