class-numbers-cuda / class_numbers /class_numbers_v2.cu
cahlen's picture
Add torch library bindings (guard main, add torch wrapper functions)
a51a509 verified
#ifdef TORCH_EXTENSION_NAME
#include <torch/torch.h>
#endif
/*
* Class Numbers of Real Quadratic Fields — v2 Multi-GPU
*
* Computes h(d) for all fundamental discriminants d in [D_lo, D_hi]
* using: h(d) = round(sqrt(d) * L(1, chi_d) / (2 * R(d)))
*
* Key improvements over v1:
* - Integer-only CF for regulator (no FP64 overflow)
* - Euler product with 9592 primes to 10^5 (was 1229 to 10^4)
* - CPU segmented sieve for fundamental discriminants
* - Multi-GPU via pthreads (one thread per GPU)
* - Incremental log accumulation for regulator
* - Cohen-Lenstra statistics collection
*
* Compile: nvcc -O3 -arch=sm_100a -o class_v2 \
* scripts/experiments/class-numbers/class_numbers_v2.cu -lpthread -lm
*
* Run: ./class_v2 <start> <end>
* e.g. ./class_v2 5 1000000000 (validate against known tables)
* ./class_v2 100000000000 10000000000000 (new computation)
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <string.h>
#include <time.h>
#ifndef TORCH_EXTENSION_NAME
#include <pthread.h>
#endif
typedef unsigned long long uint64;
typedef long long int64;
#define BLOCK_SIZE 256
#define MAX_CF_STEPS 2000000 // cap for CF period (covers 99.9% of d < 10^13)
#define CHUNK_SIZE 10000000 // 10M raw d per chunk
// =====================================================
// Primes in constant memory (up to 100003 = 9592 primes)
// =====================================================
#define NUM_PRIMES 9592
__constant__ int d_primes[NUM_PRIMES];
// =====================================================
// Kronecker symbol (d/p) — modular exponentiation
// =====================================================
__device__ int kronecker(int64 d, int p) {
if (p == 2) {
int dm8 = ((int)(d % 8) + 8) % 8;
if (dm8 == 1 || dm8 == 7) return 1;
if (dm8 == 3 || dm8 == 5) return -1;
return 0;
}
// Euler's criterion: d^((p-1)/2) mod p
int64 a = ((d % p) + p) % p;
if (a == 0) return 0;
int64 result = 1;
int64 exp = (p - 1) / 2;
int64 base = a;
while (exp > 0) {
if (exp & 1) result = (result * base) % p;
base = (base * base) % p;
exp >>= 1;
}
return (result == 1) ? 1 : -1;
}
// =====================================================
// Combined kernel: regulator + L-function + class number
// =====================================================
__global__ void compute_class_numbers(
uint64 *discriminants, // fundamental discriminants
uint32_t count,
int *class_numbers_out,
double *regulators_out, // optional: NULL to skip output
// Statistics (atomics)
uint64 *h1_count, // count of h(d) = 1
uint64 *h_histogram, // h_histogram[h] for h < 1024
uint64 *total_processed,
uint64 *div3_count, // count of 3 | h(d)
uint64 *div5_count,
uint64 *div7_count)
{
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= count) return;
uint64 d = discriminants[idx];
if (d < 5) return;
// ===== PHASE 1: Regulator (validated: matches PARI/GP on 1000 discriminants) =====
// For d ≡ 0 mod 4 (d=4m): CF of √m, stop at first D==1
// For d ≡ 1 mod 4: CF of (1+√d)/2, stop when P=1,Q=2
double regulator = 0.0;
double log_P_prev, log_P_curr, log_Q_prev, log_Q_curr;
if (d % 4 == 0) {
// d = 4m: CF of √m
uint64 m_val = d / 4;
uint64 a0 = (uint64)sqrt((double)m_val);
while (a0 * a0 > m_val) a0--;
while ((a0+1)*(a0+1) <= m_val) a0++;
if (a0 * a0 == m_val) return;
int64 mm = 0, D = 1, a = (int64)a0;
log_P_prev = 0.0;
log_P_curr = log((double)a0);
log_Q_prev = -1e30;
log_Q_curr = 0.0;
for (int step = 0; step < MAX_CF_STEPS; step++) {
mm = D * a - mm;
D = ((int64)m_val - mm * mm) / D;
if (D == 0) break;
a = ((int64)a0 + mm) / D;
// Check D==1 BEFORE updating convergents (critical!)
if (D == 1) {
double diff = log_Q_curr + 0.5 * log((double)m_val) - log_P_curr;
regulator = log_P_curr + log(1.0 + exp(diff));
break;
}
// Update log convergents
double rp = exp(log_P_prev - log_P_curr);
log_P_prev = log_P_curr;
log_P_curr = log_P_curr + log((double)a + rp);
double rq = (log_Q_prev > -1e20) ? exp(log_Q_prev - log_Q_curr) : 0.0;
log_Q_prev = log_Q_curr;
log_Q_curr = log_Q_curr + log((double)a + rq);
}
} else {
// d ≡ 1 mod 4: CF of (1+√d)/2 with reduced-state cycle detection
uint64 isqrt_d = (uint64)sqrt((double)d);
while (isqrt_d * isqrt_d > d) isqrt_d--;
while ((isqrt_d+1)*(isqrt_d+1) <= d) isqrt_d++;
int64 P = 1, Q = 2;
int64 a = (P + (int64)isqrt_d) / Q;
log_P_prev = 0.0;
log_P_curr = log((double)(a > 0 ? a : 1));
log_Q_prev = -1e30;
log_Q_curr = 0.0;
// Cycle detection via reduced states
int64 first_P = -1, first_Q = -1;
double log_eps0 = 0.0;
for (int step = 0; step < MAX_CF_STEPS; step++) {
int64 P_new = a * Q - P;
int64 Q_new = ((int64)d - P_new * P_new) / Q;
if (Q_new == 0) break;
int64 a_new = (P_new + (int64)isqrt_d) / Q_new;
P = P_new; Q = Q_new; a = a_new;
// Update log convergents
double rp = exp(log_P_prev - log_P_curr);
log_P_prev = log_P_curr;
log_P_curr = log_P_curr + log((double)a + rp);
double rq = (log_Q_prev > -1e20) ? exp(log_Q_prev - log_Q_curr) : 0.0;
log_Q_prev = log_Q_curr;
log_Q_curr = log_Q_curr + log((double)a + rq);
// Check if reduced: 0 < P <= isqrt_d, P > isqrt_d - Q, Q > 0
int is_reduced = (Q > 0 && P > 0 && P <= (int64)isqrt_d && P > (int64)isqrt_d - Q);
if (!is_reduced) continue;
// Compute log(ε) = log((2p - q + q√d) / 2)
double ratio_qp = exp(log_Q_curr - log_P_curr);
double log_2pmq = log_P_curr + log(2.0 - ratio_qp);
double diff = log_Q_curr + 0.5 * log((double)d) - log_2pmq;
double log_eps = log_2pmq + log(1.0 + exp(diff)) - log(2.0);
if (first_P < 0) {
// First reduced state: save it
first_P = P; first_Q = Q;
log_eps0 = log_eps;
} else if (P == first_P && Q == first_Q) {
// Cycle detected! R = log(ε_now) - log(ε_first)
regulator = log_eps - log_eps0;
break;
}
}
}
if (regulator < 0.01) regulator = 0.01;
// ===== PHASE 2: L(1, chi_d) via Euler product =====
double L1 = 1.0;
for (int i = 0; i < NUM_PRIMES; i++) {
int p = d_primes[i];
int chi = kronecker((int64)d, p);
if (chi != 0) {
L1 *= 1.0 / (1.0 - (double)chi / p);
}
// If chi = 0, the factor is 1/(1-0) = 1, no change
}
// ===== PHASE 3: Assemble class number =====
double h_approx = sqrt((double)d) * L1 / (2.0 * regulator);
int h = (int)round(h_approx);
if (h < 1) h = 1;
class_numbers_out[idx] = h;
if (regulators_out) regulators_out[idx] = regulator;
// ===== PHASE 4: Statistics =====
atomicAdd(total_processed, 1ULL);
if (h == 1) atomicAdd(h1_count, 1ULL);
if (h < 1024) atomicAdd(&h_histogram[h], 1ULL);
if (h % 3 == 0) atomicAdd(div3_count, 1ULL);
if (h % 5 == 0) atomicAdd(div5_count, 1ULL);
if (h % 7 == 0) atomicAdd(div7_count, 1ULL);
}
// =====================================================
// GPU: Squarefree sieve + fundamental discriminant extraction
// =====================================================
__global__ void gpu_sieve_squarefree(
uint8_t *sieve, uint64 lo, uint64 len,
const int *primes, int num_primes)
{
uint64 pos = (uint64)blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= len) return;
uint64 d = lo + pos;
for (int i = 0; i < num_primes; i++) {
int p = primes[i];
uint64 p2 = (uint64)p * p;
if (p2 > d) break;
if (d % p2 == 0) { sieve[pos] = 0; return; }
}
}
__global__ void gpu_extract_fundamental(
const uint8_t *sieve, uint64 lo, uint64 len,
uint64 *output, uint32_t *count, uint32_t max_out)
{
uint64 pos = (uint64)blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= len) return;
uint64 d = lo + pos;
if (d < 5) return;
int is_fund = 0;
if (d % 4 == 1 && sieve[pos]) {
is_fund = 1;
} else if (d % 4 == 0) {
uint64 m = d / 4;
if ((m % 4 == 2 || m % 4 == 3)) {
if (m >= lo && m < lo + len && sieve[m - lo]) is_fund = 1;
else if (m < lo) {
// Trial division for m outside sieve range
int sqf = 1;
for (uint64 p = 2; p * p <= m && sqf; p++)
if (m % (p*p) == 0) sqf = 0;
if (sqf) is_fund = 1;
}
}
}
if (is_fund) {
uint32_t idx = atomicAdd(count, 1);
if (idx < max_out) output[idx] = d;
}
}
// =====================================================
// Generate prime table
// =====================================================
#ifndef TORCH_EXTENSION_NAME
int generate_primes(int *primes, int max_prime) {
char *sieve = (char*)calloc(max_prime + 1, 1);
memset(sieve, 1, max_prime + 1);
sieve[0] = sieve[1] = 0;
for (int i = 2; i * i <= max_prime; i++)
if (sieve[i])
for (int j = i*i; j <= max_prime; j += i)
sieve[j] = 0;
int count = 0;
for (int i = 2; i <= max_prime && count < NUM_PRIMES; i++)
if (sieve[i]) primes[count++] = i;
free(sieve);
return count;
}
// =====================================================
// GPU worker thread
// =====================================================
typedef struct {
int gpu_id;
uint64 d_start, d_end;
char output_path[256]; // binary output file path
// Results
uint64 total_processed;
uint64 h1_count;
uint64 div3, div5, div7;
uint64 h_hist[1024];
} GPUWork;
void *gpu_worker(void *arg) {
GPUWork *work = (GPUWork*)arg;
cudaSetDevice(work->gpu_id);
// Allocate GPU buffers
uint64 *d_discriminants;
int *d_class_numbers;
uint64 *d_h1, *d_total, *d_div3, *d_div5, *d_div7, *d_hist;
uint32_t max_per_chunk = CHUNK_SIZE; // max fundamental discriminants per chunk
cudaMalloc(&d_discriminants, max_per_chunk * sizeof(uint64));
cudaMalloc(&d_class_numbers, max_per_chunk * sizeof(int));
cudaMalloc(&d_h1, sizeof(uint64));
cudaMalloc(&d_total, sizeof(uint64));
cudaMalloc(&d_div3, sizeof(uint64));
cudaMalloc(&d_div5, sizeof(uint64));
cudaMalloc(&d_div7, sizeof(uint64));
cudaMalloc(&d_hist, 1024 * sizeof(uint64));
cudaMemset(d_h1, 0, sizeof(uint64));
cudaMemset(d_total, 0, sizeof(uint64));
cudaMemset(d_div3, 0, sizeof(uint64));
cudaMemset(d_div5, 0, sizeof(uint64));
cudaMemset(d_div7, 0, sizeof(uint64));
cudaMemset(d_hist, 0, 1024 * sizeof(uint64));
// GPU sieve buffers
uint64 chunk_raw = CHUNK_SIZE * 3;
uint8_t *d_sieve;
uint32_t *d_sieve_count;
int *d_sieve_primes;
cudaMalloc(&d_sieve, chunk_raw);
cudaMalloc(&d_sieve_count, sizeof(uint32_t));
// Generate sieve primes on CPU (up to sqrt of max d)
uint64 sqrt_max = (uint64)sqrt((double)work->d_end) + 2;
int *h_sieve_primes = (int*)malloc(sqrt_max * sizeof(int));
int n_sieve_primes = 0;
{
char *isp = (char*)calloc(sqrt_max + 1, 1);
for (uint64 i = 2; i <= sqrt_max; i++) isp[i] = 1;
for (uint64 i = 2; i * i <= sqrt_max; i++)
if (isp[i]) for (uint64 j = i*i; j <= sqrt_max; j += i) isp[j] = 0;
for (uint64 i = 2; i <= sqrt_max; i++)
if (isp[i]) h_sieve_primes[n_sieve_primes++] = (int)i;
free(isp);
}
cudaMalloc(&d_sieve_primes, n_sieve_primes * sizeof(int));
cudaMemcpy(d_sieve_primes, h_sieve_primes, n_sieve_primes * sizeof(int), cudaMemcpyHostToDevice);
free(h_sieve_primes);
uint64 chunks_done = 0;
for (uint64 d_lo = work->d_start; d_lo < work->d_end; d_lo += chunk_raw) {
uint64 d_hi = d_lo + chunk_raw;
if (d_hi > work->d_end) d_hi = work->d_end;
uint64 len = d_hi - d_lo;
// GPU Sieve: squarefree + fundamental discriminant extraction
cudaMemset(d_sieve, 1, len);
cudaMemset(d_sieve_count, 0, sizeof(uint32_t));
uint64 sieve_blocks = (len + BLOCK_SIZE - 1) / BLOCK_SIZE;
gpu_sieve_squarefree<<<sieve_blocks, BLOCK_SIZE>>>(
d_sieve, d_lo, len, d_sieve_primes, n_sieve_primes);
gpu_extract_fundamental<<<sieve_blocks, BLOCK_SIZE>>>(
d_sieve, d_lo, len, d_discriminants, d_sieve_count, max_per_chunk);
uint32_t count;
cudaMemcpy(&count, d_sieve_count, sizeof(uint32_t), cudaMemcpyDeviceToHost);
if (count == 0) continue;
if (count > max_per_chunk) count = max_per_chunk;
// Launch kernel
int blocks = (count + BLOCK_SIZE - 1) / BLOCK_SIZE;
compute_class_numbers<<<blocks, BLOCK_SIZE>>>(
d_discriminants, count, d_class_numbers, NULL,
d_h1, d_hist, d_total, d_div3, d_div5, d_div7);
cudaDeviceSynchronize();
// Write raw (d, h) pairs to binary file
if (work->output_path[0]) {
uint64 *h_disc = (uint64*)malloc(count * sizeof(uint64));
int *h_cls = (int*)malloc(count * sizeof(int));
cudaMemcpy(h_disc, d_discriminants, count * sizeof(uint64), cudaMemcpyDeviceToHost);
cudaMemcpy(h_cls, d_class_numbers, count * sizeof(int), cudaMemcpyDeviceToHost);
FILE *fout = fopen(work->output_path, "ab"); // append binary
if (fout) {
for (uint32_t i = 0; i < count; i++) {
if (h_cls[i] > 0) { // skip invalid
fwrite(&h_disc[i], sizeof(uint64), 1, fout);
fwrite(&h_cls[i], sizeof(int), 1, fout);
}
}
fclose(fout);
}
free(h_disc); free(h_cls);
}
chunks_done++;
if (chunks_done % 20 == 0) {
uint64 total;
cudaMemcpy(&total, d_total, sizeof(uint64), cudaMemcpyDeviceToHost);
double pct = 100.0 * (d_lo - work->d_start) / (double)(work->d_end - work->d_start);
printf("[GPU %d] %.1f%% | %llu discriminants | d ~ %.2e\n",
work->gpu_id, pct, total, (double)d_lo);
fflush(stdout);
}
}
// Collect results
cudaDeviceSynchronize();
cudaMemcpy(&work->total_processed, d_total, sizeof(uint64), cudaMemcpyDeviceToHost);
cudaMemcpy(&work->h1_count, d_h1, sizeof(uint64), cudaMemcpyDeviceToHost);
cudaMemcpy(&work->div3, d_div3, sizeof(uint64), cudaMemcpyDeviceToHost);
cudaMemcpy(&work->div5, d_div5, sizeof(uint64), cudaMemcpyDeviceToHost);
cudaMemcpy(&work->div7, d_div7, sizeof(uint64), cudaMemcpyDeviceToHost);
cudaMemcpy(work->h_hist, d_hist, 1024 * sizeof(uint64), cudaMemcpyDeviceToHost);
cudaFree(d_discriminants); cudaFree(d_class_numbers);
cudaFree(d_h1); cudaFree(d_total); cudaFree(d_div3); cudaFree(d_div5); cudaFree(d_div7);
cudaFree(d_hist);
cudaFree(d_sieve); cudaFree(d_sieve_count); cudaFree(d_sieve_primes);
printf("[GPU %d] done: %llu discriminants\n", work->gpu_id, work->total_processed);
return NULL;
}
// =====================================================
// Main
// =====================================================
int main(int argc, char **argv) {
uint64 D_start = argc > 1 ? strtoull(argv[1], NULL, 10) : 5;
uint64 D_end = argc > 2 ? strtoull(argv[2], NULL, 10) : 1000000;
printf("========================================\n");
printf("Class Numbers of Real Quadratic Fields v2\n");
printf("Range: [%llu, %llu)\n", D_start, D_end);
printf("========================================\n\n");
// Generate primes
int h_primes[NUM_PRIMES];
int nprimes = generate_primes(h_primes, 100003);
printf("Primes: %d (up to %d)\n", nprimes, h_primes[nprimes-1]);
int num_gpus;
cudaGetDeviceCount(&num_gpus);
printf("GPUs: %d\n\n", num_gpus);
// Upload primes to all GPUs
for (int g = 0; g < num_gpus; g++) {
cudaSetDevice(g);
cudaMemcpyToSymbol(d_primes, h_primes, nprimes * sizeof(int));
}
struct timespec t0, t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
// Launch workers
uint64 range = D_end - D_start;
uint64 per_gpu = (range + num_gpus - 1) / num_gpus;
pthread_t threads[8];
GPUWork works[8];
for (int g = 0; g < num_gpus; g++) {
works[g].gpu_id = g;
works[g].d_start = D_start + g * per_gpu;
works[g].d_end = D_start + (g + 1) * per_gpu;
if (works[g].d_end > D_end) works[g].d_end = D_end;
memset(works[g].h_hist, 0, sizeof(works[g].h_hist));
snprintf(works[g].output_path, 256,
"/home/amsysistestdrive2026/idontknow/data/class-numbers/raw_gpu%d_%llu_%llu.bin",
g, works[g].d_start, works[g].d_end);
pthread_create(&threads[g], NULL, gpu_worker, &works[g]);
}
// Collect
uint64 grand_total = 0, grand_h1 = 0;
uint64 grand_div3 = 0, grand_div5 = 0, grand_div7 = 0;
uint64 grand_hist[1024] = {0};
for (int g = 0; g < num_gpus; g++) {
pthread_join(threads[g], NULL);
grand_total += works[g].total_processed;
grand_h1 += works[g].h1_count;
grand_div3 += works[g].div3;
grand_div5 += works[g].div5;
grand_div7 += works[g].div7;
for (int h = 0; h < 1024; h++)
grand_hist[h] += works[g].h_hist[h];
}
clock_gettime(CLOCK_MONOTONIC, &t1);
double elapsed = (t1.tv_sec-t0.tv_sec) + (t1.tv_nsec-t0.tv_nsec)/1e9;
printf("\n========================================\n");
printf("RESULTS\n");
printf("========================================\n");
printf("Range: [%llu, %llu)\n", D_start, D_end);
printf("Fundamental discriminants: %llu\n", grand_total);
printf("Time: %.1fs (%.0f disc/sec)\n", elapsed, grand_total / elapsed);
printf("\nCohen-Lenstra statistics:\n");
printf(" h(d) = 1: %llu (%.4f%%)\n", grand_h1, 100.0 * grand_h1 / grand_total);
printf(" C-L predicted h=1: ~75.446%%\n");
printf(" 3 | h(d): %llu (%.4f%%)\n", grand_div3, 100.0 * grand_div3 / grand_total);
printf(" 5 | h(d): %llu (%.4f%%)\n", grand_div5, 100.0 * grand_div5 / grand_total);
printf(" 7 | h(d): %llu (%.4f%%)\n", grand_div7, 100.0 * grand_div7 / grand_total);
printf("\nClass number distribution (first 20):\n");
for (int h = 1; h <= 20; h++)
printf(" h=%2d: %llu (%.3f%%)\n", h, grand_hist[h], 100.0 * grand_hist[h] / grand_total);
printf("\n========================================\n");
return 0;
}
#endif
#ifdef TORCH_EXTENSION_NAME
static void setup_primes_for_torch() {
static bool initialized = false;
if (initialized) return;
int h_primes[NUM_PRIMES];
int count = 0;
bool sieve[100004];
memset(sieve, true, sizeof(sieve));
for (int i = 2; i * i <= 100003; i++)
if (sieve[i]) for (int j = i*i; j <= 100003; j += i) sieve[j] = false;
for (int i = 2; i <= 100003 && count < NUM_PRIMES; i++)
if (sieve[i]) h_primes[count++] = i;
cudaMemcpyToSymbol(d_primes, h_primes, count * sizeof(int));
initialized = true;
}
std::vector<torch::Tensor> compute_class_numbers_torch(torch::Tensor discriminants) {
TORCH_CHECK(discriminants.is_cuda(), "discriminants must be CUDA tensor");
TORCH_CHECK(discriminants.dtype() == torch::kInt64, "discriminants must be int64");
TORCH_CHECK(discriminants.dim() == 1, "discriminants must be 1-D");
setup_primes_for_torch();
int n = discriminants.size(0);
auto class_nums = torch::zeros({n}, torch::dtype(torch::kInt32).device(discriminants.device()));
auto regulators = torch::zeros({n}, torch::dtype(torch::kFloat64).device(discriminants.device()));
auto h1_count = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
auto histogram = torch::zeros({1024}, torch::dtype(torch::kInt64).device(discriminants.device()));
auto total_proc = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
auto div3 = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
auto div5 = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
auto div7 = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
int blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
compute_class_numbers<<<blocks, BLOCK_SIZE>>>(
(uint64*)discriminants.data_ptr<int64_t>(), (uint32_t)n,
class_nums.data_ptr<int>(), regulators.data_ptr<double>(),
(uint64*)h1_count.data_ptr<int64_t>(),
(uint64*)histogram.data_ptr<int64_t>(),
(uint64*)total_proc.data_ptr<int64_t>(),
(uint64*)div3.data_ptr<int64_t>(),
(uint64*)div5.data_ptr<int64_t>(),
(uint64*)div7.data_ptr<int64_t>());
cudaDeviceSynchronize();
return {class_nums, regulators};
}
#endif