Initial upload: torch-compatible CUDA kernel with pybind11 bindings and CPU tests
Browse files- README.md +58 -0
- build.toml +12 -0
- scripts/test.py +212 -0
- src/zaremba_density.cu +210 -0
- torch-ext/torch_binding.cpp +25 -0
- torch-ext/torch_binding.h +5 -0
README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- kernels
|
| 5 |
+
- cuda
|
| 6 |
+
- number-theory
|
| 7 |
+
- continued-fractions
|
| 8 |
+
- zaremba-conjecture
|
| 9 |
+
datasets:
|
| 10 |
+
- cahlen/zaremba-density
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Zaremba Density CUDA Kernel
|
| 14 |
+
|
| 15 |
+
GPU-accelerated computation of Zaremba density: for a digit set A and bound N, counts how many denominators d <= N have a continued fraction representation with all partial quotients in A.
|
| 16 |
+
|
| 17 |
+
This kernel was used to produce the results in [cahlen/zaremba-density](https://huggingface.co/datasets/cahlen/zaremba-density), computing densities for all 1,023 subsets of {1,...,10} at various scales up to 10^12.
|
| 18 |
+
|
| 19 |
+
## Algorithm
|
| 20 |
+
|
| 21 |
+
1. **CPU prefix generation**: Enumerate CF prefixes to a fixed depth, sorted by denominator descending for load balancing
|
| 22 |
+
2. **GPU persistent threads**: Each thread atomically claims prefixes and recursively extends them, marking denominators in a bitset
|
| 23 |
+
3. **CPU shallow marking**: Mark denominators from short CFs that fall below the prefix depth
|
| 24 |
+
4. **GPU popcount**: Count set bits in the bitset
|
| 25 |
+
|
| 26 |
+
The persistent-thread work-stealing design ensures good GPU utilization even when prefix subtrees vary widely in size.
|
| 27 |
+
|
| 28 |
+
## API
|
| 29 |
+
|
| 30 |
+
```python
|
| 31 |
+
import torch
|
| 32 |
+
# After building with the kernels build system:
|
| 33 |
+
from zaremba_density import count_representable
|
| 34 |
+
|
| 35 |
+
digits = torch.tensor([1, 2, 3], dtype=torch.int32)
|
| 36 |
+
result = count_representable(100, digits)
|
| 37 |
+
print(f"Representable: {result.item()} / 100")
|
| 38 |
+
# Expected: 98 (two denominators in 1..100 have no expansion with quotients in {1,2,3}; note d=1 itself is always representable via the empty CF)
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
### `count_representable(max_d: int, digits: Tensor[int32]) -> Tensor[int64]`
|
| 42 |
+
|
| 43 |
+
- **max_d**: Upper bound on denominators to check (inclusive)
|
| 44 |
+
- **digits**: 1-D tensor of allowed partial quotient digits
|
| 45 |
+
- **returns**: 1-element int64 tensor with the count of representable denominators
|
| 46 |
+
|
| 47 |
+
## Known Values
|
| 48 |
+
|
| 49 |
+
| Digit set | N | Count | Density |
|
| 50 |
+
|-----------|---|-------|---------|
|
| 51 |
+
| {1} | 10 | 5 | 50% |
|
| 52 |
+
| {1,2} | 20 | 16 | 80% |
|
| 53 |
+
| {1,2,3} | 100 | 98 | 98% |
|
| 54 |
+
| {1,2,3,4,5} | 10^6 | 999,987 | 99.9987% |
|
| 55 |
+
|
| 56 |
+
## Hardware
|
| 57 |
+
|
| 58 |
+
Developed and tested on RTX 5090 (32GB) and 8xB200 cluster (~1.4TB VRAM). For N > 10^9, a GPU with >= 16GB VRAM is recommended due to bitset size.
|
build.toml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[general]
|
| 2 |
+
name = "zaremba_density"
|
| 3 |
+
universal = false
|
| 4 |
+
|
| 5 |
+
[torch]
|
| 6 |
+
src = ["torch-ext/torch_binding.cpp", "torch-ext/torch_binding.h"]
|
| 7 |
+
|
| 8 |
+
[kernel.zaremba_density]
|
| 9 |
+
backend = "cuda"
|
| 10 |
+
cuda-capabilities = ["8.0", "9.0", "10.0", "12.0"]
|
| 11 |
+
src = ["src/zaremba_density.cu"]
|
| 12 |
+
depends = ["torch"]
|
scripts/test.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
CPU-only test for zaremba_density kernel logic.
|
| 4 |
+
|
| 5 |
+
Verifies the continued fraction denominator enumeration algorithm
|
| 6 |
+
against known values without requiring a GPU.
|
| 7 |
+
|
| 8 |
+
The algorithm enumerates all denominators d <= N such that there exists
|
| 9 |
+
a fraction a/d (gcd(a,d)=1) whose CF expansion [0; a_1, ..., a_k]
|
| 10 |
+
uses only partial quotients a_i from the digit set A.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import sys
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def count_representable_cpu(max_d: int, digits: list[int]) -> tuple[int, set[int]]:
    """
    CPU reference implementation of the Zaremba density algorithm.

    Enumerates all CF denominators <= max_d with partial quotients in `digits`.
    Uses iterative DFS on the CF tree, matching the GPU kernel logic.

    The CF tree starts from [0; a] for each a in digits, giving convergent 1/a.
    Each node (q_prev, q) extends to (q, a*q + q_prev) for each digit a.

    Args:
        max_d: Inclusive upper bound on denominators to enumerate.
        digits: Allowed partial quotients; every entry must be >= 1.

    Returns:
        (count, set_of_representable_d).

    Raises:
        ValueError: If any digit is < 1.  A digit of 0 makes the recurrence
            q_new = a*q_curr + q_prev stop growing (it just swaps the pair),
            so the DFS would loop forever; negative digits are meaningless
            for a CF expansion.
    """
    if any(a < 1 for a in digits):
        raise ValueError(f"all digits must be positive integers, got {digits}")

    # d=1 is always representable (empty CF) -- but only counts if it fits.
    representable = {1} if max_d >= 1 else set()

    # DFS stack of (q_prev, q_curr) denominator pairs.
    # Initial: CF [0; a] has q_prev=1, q=a.
    stack = [(1, a) for a in digits if a <= max_d]

    while stack:
        q_prev, q_curr = stack.pop()
        representable.add(q_curr)
        for a in digits:
            q_new = a * q_curr + q_prev
            # Denominators grow strictly (a >= 1), so this prune bounds the DFS.
            if q_new <= max_d:
                stack.append((q_curr, q_new))

    # Every pushed denominator was already <= max_d, so no final filter needed.
    return len(representable), representable
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_digits_1_n10():
    """A={1}, N=10: representable denominators are exactly the Fibonacci numbers.

    [0;1]=1/1, [0;1,1]=1/2, [0;1,1,1]=2/3, [0;1,1,1,1]=3/5, [0;1,1,1,1,1]=5/8
    Denominators: {1, 2, 3, 5, 8} = 5 values.
    """
    count, reprs = count_representable_cpu(10, [1])
    # Compare directly against the hand-computed Fibonacci set.
    assert reprs == {1, 2, 3, 5, 8}, f"A={{1}}, N=10: expected Fibonacci {{1,2,3,5,8}}, got {sorted(reprs)}"
    assert count == 5, f"A={{1}}, N=10: expected 5, got {count}"
    print(f"PASS: A={{1}}, N=10 -> {count} representable = {sorted(reprs)} (Fibonacci numbers)")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def test_digits_1_n100():
    """A={1}, N=100: should be exactly the Fibonacci numbers <= 100."""
    count, reprs = count_representable_cpu(100, [1])
    # Build the expected set: Fibonacci numbers 1, 2, 3, 5, 8, ..., 89.
    fib = set()
    prev, curr = 1, 1
    while prev <= 100:
        fib.add(prev)
        prev, curr = curr, prev + curr
    assert reprs == fib, f"A={{1}}, N=100: expected Fibonacci, got {sorted(reprs)}"
    print(f"PASS: A={{1}}, N=100 -> {count} representable (Fibonacci: {sorted(reprs)})")
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def test_digits_12_n20():
    """A={1,2}, N=20: verify exact set of representable denominators."""
    count, reprs = count_representable_cpu(20, [1, 2])
    exceptions = sorted(d for d in range(1, 21) if d not in reprs)
    print(f" A={{1,2}}, N=20: {count}/20 representable")
    print(f" Representable: {sorted(reprs)}")
    print(f" Exceptions: {exceptions}")
    # Sanity checks on the shape of the result.
    assert 1 in reprs, "d=1 should always be representable"
    assert count > 10, f"A={{1,2}} should cover most of 1..20, got only {count}"
    # A={1} is a sub-alphabet of A={1,2}, so every Fibonacci denominator must appear.
    for fib in (1, 2, 3, 5, 8, 13):
        assert fib in reprs, f"Fibonacci {fib} should be representable with A={{1,2}}"
    print(f"PASS: A={{1,2}}, N=20 -> {count} representable, {len(exceptions)} exceptions = {exceptions}")
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def test_digits_123_n100():
    """A={1,2,3}, N=100: verify representable count."""
    count, reprs = count_representable_cpu(100, [1, 2, 3])
    exceptions = sorted(d for d in range(1, 101) if d not in reprs)
    print(f" A={{1,2,3}}, N=100: {count}/100 representable")
    print(f" Exceptions: {exceptions}")
    assert count >= 90, f"A={{1,2,3}} should cover >= 90% at N=100, got {count}"
    # Enlarging the alphabet can never lose denominators.
    _, reprs_12 = count_representable_cpu(100, [1, 2])
    assert reprs_12.issubset(reprs), "A={1,2} representable should be subset of A={1,2,3}"
    print(f"PASS: A={{1,2,3}}, N=100 -> {count} representable, {len(exceptions)} exceptions")
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def test_digits_1_n1():
    """Edge case: N=1, any digits containing 1."""
    # Only d=1 fits under the bound, and it is always representable (empty CF).
    count, reprs = count_representable_cpu(1, [1])
    assert count == 1, f"A={{1}}, N=1: expected 1, got {count}"
    assert reprs == {1}, f"A={{1}}, N=1: should be {{1}}, got {reprs}"
    print(f"PASS: A={{1}}, N=1 -> {count} representable")
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_digits_2_n10():
    """A={2}, N=10: only CFs with all partial quotients = 2."""
    count, reprs = count_representable_cpu(10, [2])
    # Denominator chain: [0;2] -> 2, [0;2,2] -> 5; next is 2*5+2 = 12 > 10.
    # Together with the trivial d=1 that gives exactly {1, 2, 5}.
    for d in (1, 2, 5):
        assert d in reprs
    print(f" A={{2}}, N=10: representable = {sorted(reprs)}")
    assert count == 3, f"A={{2}}, N=10: expected 3, got {count}"
    print(f"PASS: A={{2}}, N=10 -> {count} representable = {sorted(reprs)}")
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def test_digits_12345_small():
    """A={1,2,3,4,5} at small N -- should cover almost everything."""
    # With five allowed quotients the CF tree is dense; only a couple of
    # denominators below 50 should be missed.
    count, _ = count_representable_cpu(50, [1, 2, 3, 4, 5])
    print(f" A={{1,2,3,4,5}}, N=50 -> {count}/50 representable")
    assert count >= 48, f"A={{1,2,3,4,5}}, N=50: expected >= 48, got {count}"
    print(f"PASS: A={{1,2,3,4,5}}, N=50 -> {count} representable (>= 48)")
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def test_monotonicity():
    """Adding digits can only increase or maintain the count."""
    # Counts for nested alphabets {1}, {1,2}, ..., {1..5} at N=100.
    c1, c12, c123, c1234, c12345 = (
        count_representable_cpu(100, list(range(1, k + 1)))[0] for k in range(1, 6)
    )
    assert c1 <= c12 <= c123 <= c1234 <= c12345, \
        f"Monotonicity failed: {c1}, {c12}, {c123}, {c1234}, {c12345}"
    print(f"PASS: Monotonicity: {c1} <= {c12} <= {c123} <= {c1234} <= {c12345}")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def test_cf_recurrence():
    """Verify the CF recurrence q_{n+1} = a_{n+1} * q_n + q_{n-1} directly."""
    # Walk [0; 3, 1, 4, 1, 5] through the recurrence, collecting denominators:
    # 3 -> 1*3+1=4 -> 4*4+3=19 -> 1*19+4=23 -> 5*23+19=134
    expected_denoms = [3]
    prev, curr = 1, 3
    for digit in [1, 4, 1, 5]:
        prev, curr = curr, digit * curr + prev
        expected_denoms.append(curr)
    assert expected_denoms == [3, 4, 19, 23, 134], f"CF recurrence failed: {expected_denoms}"
    print(f"PASS: CF recurrence [3,1,4,1,5] -> denominators {expected_denoms}")
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def test_subset_inclusion():
    """If A is a subset of B, then representable(A) is a subset of representable(B)."""
    # Compute the representable sets once, keyed by alphabet.
    sets = {
        tuple(alpha): count_representable_cpu(200, alpha)[1]
        for alpha in ([1], [1, 2], [1, 2, 3], [4, 5], [1, 2, 3, 4, 5])
    }

    assert sets[(1,)].issubset(sets[(1, 2)]), "A={1} not subset of A={1,2}"
    assert sets[(1, 2)].issubset(sets[(1, 2, 3)]), "A={1,2} not subset of A={1,2,3}"
    assert sets[(1, 2, 3)].issubset(sets[(1, 2, 3, 4, 5)]), "A={1,2,3} not subset of A={1,2,3,4,5}"
    assert sets[(4, 5)].issubset(sets[(1, 2, 3, 4, 5)]), "A={4,5} not subset of A={1,2,3,4,5}"
    print(f"PASS: Subset inclusion verified for nested digit sets")
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("Zaremba Density -- CPU Reference Tests")
    print(banner)
    print()

    tests = [
        test_digits_1_n10,
        test_digits_1_n100,
        test_digits_12_n20,
        test_digits_123_n100,
        test_digits_1_n1,
        test_digits_2_n10,
        test_digits_12345_small,
        test_monotonicity,
        test_cf_recurrence,
        test_subset_inclusion,
    ]

    passed = failed = 0
    for test in tests:
        try:
            test()
        except AssertionError as e:
            print(f"FAIL: {test.__name__}: {e}")
            failed += 1
        except Exception as e:
            print(f"ERROR: {test.__name__}: {e}")
            failed += 1
        else:
            passed += 1
        print()

    print(banner)
    print(f"Results: {passed} passed, {failed} failed")
    print(banner)
    # Non-zero exit code on any failure so CI can gate on this script.
    sys.exit(0 if failed == 0 else 1)
|
src/zaremba_density.cu
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Zaremba Density CUDA Kernel — Torch-compatible version
|
| 3 |
+
*
|
| 4 |
+
* Enumerates all continued fraction denominators <= N with partial quotients
|
| 5 |
+
* from a given digit set A. Uses a persistent-thread work-stealing design
|
| 6 |
+
* with a bitset to track representable denominators.
|
| 7 |
+
*
|
| 8 |
+
* Original: zaremba_density_gpu.cu (standalone CLI)
|
| 9 |
+
* This version: torch C++ extension wrapper around the same kernel logic.
|
| 10 |
+
*/
|
| 11 |
+
|
| 12 |
+
#include <cstdint>
|
| 13 |
+
#include <cstdio>
|
| 14 |
+
#include <cstring>
|
| 15 |
+
|
| 16 |
+
#include <cuda_runtime.h>
|
| 17 |
+
|
| 18 |
+
#define MAX_DIGITS 10
|
| 19 |
+
#define MAX_DEPTH 200
|
| 20 |
+
|
| 21 |
+
// Mark a denominator in the bitset
|
| 22 |
+
// Mark a denominator in the bitset.
// Denominator d maps to bit index d (byte d >> 3, bit d & 7); out-of-range
// values (d < 1 or d > max_d) are silently ignored.
// CUDA has no byte-wide atomicOr, so the update is widened to an atomic OR
// on the enclosing 4-byte-aligned word, shifting the byte into position.
// NOTE(review): the word addressing assumes little-endian byte layout on the
// device, which holds for all current NVIDIA GPUs.
__device__ void mark(uint64_t d, uint8_t *bitset, uint64_t max_d) {
    if (d < 1 || d > max_d) return;
    uint64_t byte_idx = d >> 3;
    uint8_t bit = 1 << (d & 7);
    // byte_idx & ~3 = word-aligned base; 8 * (byte_idx & 3) = bit offset of
    // this byte within the 32-bit word.
    atomicOr((unsigned int*)&bitset[byte_idx & ~3],
             (unsigned int)bit << (8 * (byte_idx & 3)));
}
|
| 29 |
+
|
| 30 |
+
// Persistent-thread kernel: each thread self-schedules prefixes via atomic counter
|
| 31 |
+
// Persistent-thread kernel: each thread self-schedules prefixes via atomic counter.
//
// Each prefix is a CF node stored as 4 uint64 words (p_prev, p, q_prev, q);
// a thread claims one via atomicAdd on *progress, marks its denominator, and
// then runs an explicit-stack DFS extending the CF by every digit until the
// denominator exceeds max_d.  Work-stealing via the shared counter keeps
// threads busy even though subtree sizes vary wildly.
__global__ void enumerate_persistent(
    uint64_t *prefixes, int num_prefixes,
    int *digits, int num_digits,
    uint8_t *bitset, uint64_t max_d,
    int *progress)
{
    // Per-thread DFS stack in local memory; depth MAX_DEPTH (200) is far
    // beyond any reachable CF depth for 64-bit denominators (the slowest
    // growth, all-1s digits, is Fibonacci, which overflows ~depth 90).
    struct { uint64_t p_prev, p, q_prev, q; } stack[MAX_DEPTH];

    while (true) {
        // Claim the next unprocessed prefix; exit when the queue is drained.
        int my_prefix = atomicAdd(progress, 1);
        if (my_prefix >= num_prefixes) return;

        uint64_t pp0 = prefixes[my_prefix * 4 + 0];
        uint64_t p0 = prefixes[my_prefix * 4 + 1];
        uint64_t qp0 = prefixes[my_prefix * 4 + 2];
        uint64_t q0 = prefixes[my_prefix * 4 + 3];

        // The prefix's own denominator is representable.
        mark(q0, bitset, max_d);

        // Seed the stack with the prefix's in-range children.
        int sp = 0;
        for (int i = num_digits - 1; i >= 0; i--) {
            uint64_t a = digits[i];
            uint64_t q_new = a * q0 + qp0;
            // NOTE(review): sp >= MAX_DEPTH silently drops children; safe
            // only because MAX_DEPTH exceeds the reachable depth (see above).
            if (q_new > max_d || sp >= MAX_DEPTH) continue;
            stack[sp].p_prev = p0; stack[sp].p = a * p0 + pp0;
            stack[sp].q_prev = q0; stack[sp].q = q_new;
            sp++;
        }

        // Iterative DFS: pop a node, mark its denominator, push children
        // whose denominator a*q + q_prev still fits under max_d.
        while (sp > 0) {
            sp--;
            uint64_t pp = stack[sp].p_prev, p = stack[sp].p;
            uint64_t qp = stack[sp].q_prev, q = stack[sp].q;
            mark(q, bitset, max_d);
            for (int i = num_digits - 1; i >= 0; i--) {
                uint64_t a = digits[i];
                uint64_t q_new = a * q + qp;
                if (q_new > max_d || sp >= MAX_DEPTH) continue;
                stack[sp].p_prev = p; stack[sp].p = a * p + pp;
                stack[sp].q_prev = q; stack[sp].q = q_new;
                sp++;
            }
        }
    }
}
|
| 76 |
+
|
| 77 |
+
// Count set bits in the bitset
|
| 78 |
+
// Count set bits in the bitset.
// One thread per byte; per-byte popcounts are accumulated into *count with a
// 64-bit atomicAdd (skipped when the byte is empty to cut atomic traffic).
// The bitset spans bits 1..max_d, so max_byte = floor(max_d/8) + 1 bytes are
// live; in the final byte only bits 0..(max_d % 8) are meaningful and the
// rest are masked off before counting.
__global__ void count_marked(uint8_t *bitset, uint64_t max_d, uint64_t *count) {
    uint64_t tid = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x;
    uint64_t max_byte = (max_d + 8) / 8;
    if (tid >= max_byte) return;
    uint8_t b = bitset[tid];
    int bits = __popc((unsigned int)b);
    if (tid == max_byte - 1) {
        int valid_bits = (max_d % 8) + 1;
        bits = __popc((unsigned int)(b & ((1 << valid_bits) - 1)));
    }
    if (bits > 0) atomicAdd((unsigned long long*)count, (unsigned long long)bits);
}
|
| 90 |
+
|
| 91 |
+
// C++ host function called from torch binding
|
| 92 |
+
// C++ host function called from torch binding.
//
// Pipeline (see README): (1) CPU DFS generates CF prefixes at a fixed depth,
// (2) GPU persistent-thread kernel extends each prefix subtree into a bitset,
// (3) CPU re-walks the shallow part of the tree (depth < PREFIX_DEPTH) to
// mark denominators the prefixes skip over, (4) GPU popcount of the bitset.
//
// NOTE(review): no cudaMalloc/cudaMemcpy/launch error checking anywhere; an
// allocation failure (e.g. bitset for very large max_d) would propagate as
// silent garbage.  Also, prefixes are emitted in DFS order, not sorted by
// denominator descending as the README describes -- confirm which is intended.
extern "C" int64_t zaremba_count_representable(int64_t max_d, int *h_digits, int num_digits) {
    // Generate prefixes on CPU.  Deeper prefixes for large N give the GPU
    // more, smaller work items for better load balance.
    int PREFIX_DEPTH = 8;
    if (max_d >= 1000000000LL) PREFIX_DEPTH = 15;

    // NOTE(review): 20M prefixes * 4 * 8B = 640 MB host allocation regardless
    // of problem size; prefixes beyond the cap are silently dropped below,
    // which would undercount -- confirm the cap is never hit at target scales.
    int max_prefixes = 20000000;
    uint64_t *h_prefixes = new uint64_t[max_prefixes * 4];
    int np = 0;

    // CF node: numerator pair (pp, p), denominator pair (qp, q), tree depth.
    struct PfxEntry { uint64_t pp, p, qp, q; int depth; };
    PfxEntry *stk = new PfxEntry[max_prefixes];
    int ssp = 0;
    // Roots: [0; a] has convergent 1/a, i.e. (pp,p,qp,q) = (0,1,1,a).
    for (int i = 0; i < num_digits; i++) {
        stk[ssp] = {0, 1, 1, (uint64_t)h_digits[i], 1};
        ssp++;
    }
    // DFS: nodes at PREFIX_DEPTH become GPU work items; shallower nodes are
    // expanded further on the CPU.
    while (ssp > 0) {
        ssp--;
        uint64_t pp = stk[ssp].pp, p = stk[ssp].p;
        uint64_t qp = stk[ssp].qp, q = stk[ssp].q;
        int dep = stk[ssp].depth;
        if (q > (uint64_t)max_d) continue;
        if (dep >= PREFIX_DEPTH) {
            if (np < max_prefixes) {
                h_prefixes[np*4+0] = pp; h_prefixes[np*4+1] = p;
                h_prefixes[np*4+2] = qp; h_prefixes[np*4+3] = q;
                np++;
            }
        } else {
            for (int i = num_digits - 1; i >= 0; i--) {
                uint64_t qn = (uint64_t)h_digits[i] * q + qp;
                if (qn > (uint64_t)max_d || ssp >= max_prefixes - 1) continue;
                stk[ssp] = {p, (uint64_t)h_digits[i] * p + pp, q, qn, dep + 1};
                ssp++;
            }
        }
    }
    delete[] stk;

    // GPU allocation: one bit per denominator 1..max_d (bit index d).
    uint64_t bitset_bytes = ((uint64_t)max_d + 8) / 8;
    uint8_t *d_bs;
    cudaMalloc(&d_bs, bitset_bytes);
    cudaMemset(d_bs, 0, bitset_bytes);

    int *d_digits;
    cudaMalloc(&d_digits, num_digits * sizeof(int));
    cudaMemcpy(d_digits, h_digits, num_digits * sizeof(int), cudaMemcpyHostToDevice);

    uint64_t *d_prefixes;
    cudaMalloc(&d_prefixes, (uint64_t)np * 4 * sizeof(uint64_t));
    cudaMemcpy(d_prefixes, h_prefixes, (uint64_t)np * 4 * sizeof(uint64_t), cudaMemcpyHostToDevice);

    // Shared work-stealing counter consumed by the persistent threads.
    int *d_progress;
    cudaMalloc(&d_progress, sizeof(int));
    cudaMemset(d_progress, 0, sizeof(int));

    // Launch.  Grid size is capped; the kernel is persistent, so fewer
    // threads than prefixes is fine -- threads loop until the counter drains.
    int block_size = 256;
    int grid_size = (np + block_size - 1) / block_size;
    if (grid_size > 65535) grid_size = 65535;

    enumerate_persistent<<<grid_size, block_size>>>(
        d_prefixes, np, d_digits, num_digits, d_bs, (uint64_t)max_d, d_progress);
    cudaDeviceSynchronize();

    // Mark shallow denominators on CPU: the GPU only saw nodes at depth >=
    // PREFIX_DEPTH, so denominators of shorter CFs are filled in here.
    uint8_t *h_bs = new uint8_t[bitset_bytes];
    cudaMemcpy(h_bs, d_bs, bitset_bytes, cudaMemcpyDeviceToHost);
    h_bs[0] |= (1 << 1); // d=1
    // d=1 is always representable (empty CF) but never appears as a CF
    // denominator in the tree walk, so it is set explicitly.

    // Re-run the same shallow DFS, marking every denominator up to
    // PREFIX_DEPTH (inclusive) directly in the host copy of the bitset.
    // NOTE(review): fixed 500000-entry stack with silent drop on overflow
    // (csp >= 499999) -- bounded in practice by the shallow depth, but worth
    // an explicit check.
    PfxEntry *cstk = new PfxEntry[500000];
    int csp = 0;
    for (int i = 0; i < num_digits; i++) {
        cstk[csp] = {0, 1, 1, (uint64_t)h_digits[i], 1};
        csp++;
    }
    while (csp > 0) {
        csp--;
        uint64_t q = cstk[csp].q;
        int dep = cstk[csp].depth;
        if (q > (uint64_t)max_d) continue;
        h_bs[q >> 3] |= (1 << (q & 7));
        if (dep >= PREFIX_DEPTH) continue;
        uint64_t pp = cstk[csp].pp, p = cstk[csp].p, qp = cstk[csp].qp;
        for (int i = 0; i < num_digits; i++) {
            uint64_t qn = (uint64_t)h_digits[i] * q + qp;
            if (qn > (uint64_t)max_d || csp >= 499999) continue;
            cstk[csp] = {p, (uint64_t)h_digits[i] * p + pp, q, qn, dep + 1};
            csp++;
        }
    }
    delete[] cstk;
    // Push the merged (GPU + shallow CPU) bitset back for the count pass.
    cudaMemcpy(d_bs, h_bs, bitset_bytes, cudaMemcpyHostToDevice);

    // Count on GPU: one thread per bitset byte.
    uint64_t *d_count;
    cudaMalloc(&d_count, sizeof(uint64_t));
    cudaMemset(d_count, 0, sizeof(uint64_t));
    {
        uint64_t max_byte = ((uint64_t)max_d + 8) / 8;
        int gd = (int)((max_byte + 255) / 256);
        count_marked<<<gd, 256>>>(d_bs, (uint64_t)max_d, d_count);
        cudaDeviceSynchronize();
    }
    int64_t covered = 0;
    cudaMemcpy(&covered, d_count, sizeof(uint64_t), cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d_count);
    cudaFree(d_bs);
    cudaFree(d_digits);
    cudaFree(d_prefixes);
    cudaFree(d_progress);
    delete[] h_prefixes;
    delete[] h_bs;

    return covered;
}
|
torch-ext/torch_binding.cpp
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <limits>

#include <torch/extension.h>

#include "torch_binding.h"
|
| 3 |
+
|
| 4 |
+
// Defined in the CUDA kernel
|
| 5 |
+
extern "C" int64_t zaremba_count_representable(int64_t max_d, int *h_digits, int num_digits);
|
| 6 |
+
|
| 7 |
+
// Count denominators <= max_d whose continued fraction uses only quotients
// from `digits`.  Validates the input tensor, then delegates to the CUDA
// host routine.  Returns a 1-element int64 CPU tensor holding the count.
torch::Tensor count_representable(int64_t max_d, torch::Tensor digits) {
    TORCH_CHECK(digits.dtype() == torch::kInt32, "digits must be int32");
    TORCH_CHECK(digits.dim() == 1, "digits must be 1-D");
    TORCH_CHECK(digits.is_cpu(), "digits must be on CPU");
    TORCH_CHECK(max_d > 0, "max_d must be positive");

    // data_ptr() is read as a dense array by the kernel, so a strided view
    // (e.g. t[::2]) would be read incorrectly -- normalize first.
    torch::Tensor dense = digits.contiguous();

    int64_t num_digits = dense.size(0);
    TORCH_CHECK(num_digits > 0, "digits must be non-empty");
    TORCH_CHECK(num_digits <= std::numeric_limits<int>::max(),
                "too many digits");

    int *h_digits = dense.data_ptr<int>();
    // Non-positive partial quotients are meaningless for a CF expansion and
    // would stall the host-side enumeration; reject them up front.
    for (int64_t i = 0; i < num_digits; ++i) {
        TORCH_CHECK(h_digits[i] >= 1,
                    "digits must be positive, got ", h_digits[i]);
    }

    int64_t count = zaremba_count_representable(
        max_d, h_digits, static_cast<int>(num_digits));

    // 1-element CPU tensor so the result composes with other torch code.
    return torch::tensor({count}, torch::dtype(torch::kInt64));
}
|
| 21 |
+
|
| 22 |
+
// Python module entry point: exposes count_representable(max_d, digits) via
// pybind11.  TORCH_EXTENSION_NAME is injected by the torch extension build.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("count_representable", &count_representable,
          "Count denominators <= max_d representable with given CF digit set");
}
|
torch-ext/torch_binding.h
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <torch/types.h>
|
| 4 |
+
|
| 5 |
+
torch::Tensor count_representable(int64_t max_d, torch::Tensor digits);
|