bigcompute-cuda-kernels / class-numbers /class_number_rqf.cu
cahlen's picture
Upload 51 CUDA kernels for computational mathematics research
b5117ef verified
/*
* CUDA-accelerated class number computation for real quadratic fields
*
* For each fundamental discriminant d > 0, compute the class number h(d)
* of the real quadratic field Q(sqrt(d)).
*
* Method: Baby-step Giant-step (BSGS) in the infrastructure of the
* real quadratic field. For each d, we compute the regulator R(d) and
* class number h(d) using the analytic class number formula:
* h(d) * R(d) = sqrt(d) * L(1, χ_d) / 2
* where L(1, χ_d) is the Dirichlet L-function at s=1.
*
* Current frontier: Jacobson et al. computed h(d) for d up to ~10^11.
* Our target: extend to d up to 10^13, a ~100x improvement.
* This directly tests the Cohen-Lenstra heuristics for class group distribution.
*
* Each CUDA thread handles one discriminant d.
*
* Compile: nvcc -O3 -arch=sm_100a -o class_number_rqf scripts/experiments/class-numbers/class_number_rqf.cu -lm
* Run: ./class_number_rqf <start_d> <end_d>
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <time.h>
#define THREADS_PER_BLOCK 256
// Check if d is a fundamental discriminant
// d is fundamental if: d ≡ 1 (mod 4) and d is squarefree,
// or d = 4m where m ≡ 2,3 (mod 4) and m is squarefree
__device__ bool is_fundamental_discriminant(uint64_t d) {
if (d <= 1) return false;
// Check d mod 4
uint64_t d_mod4 = d % 4;
if (d_mod4 == 1) {
// d must be squarefree
for (uint64_t p = 2; p * p <= d; p++) {
if (d % (p * p) == 0) return false;
}
return true;
} else if (d_mod4 == 0) {
uint64_t m = d / 4;
uint64_t m_mod4 = m % 4;
if (m_mod4 != 2 && m_mod4 != 3) return false;
for (uint64_t p = 2; p * p <= m; p++) {
if (m % (p * p) == 0) return false;
}
return true;
}
return false;
}
// Kronecker symbol (d/n) — needed for L-function computation
__device__ int kronecker_symbol(int64_t d, uint64_t n) {
if (n == 0) return (d == 1 || d == -1) ? 1 : 0;
if (n == 1) return 1;
// Handle n = 2
int result = 1;
while (n % 2 == 0) {
n /= 2;
int d_mod8 = ((d % 8) + 8) % 8;
if (d_mod8 == 3 || d_mod8 == 5) result = -result;
}
if (n == 1) return result;
// Quadratic reciprocity (Jacobi symbol from here)
int64_t a = d % (int64_t)n;
if (a < 0) a += n;
uint64_t b = n;
while (a != 0) {
while (a % 2 == 0) {
a /= 2;
if (b % 8 == 3 || b % 8 == 5) result = -result;
}
// Swap
int64_t temp = a;
a = b;
b = temp;
if (a % 4 == 3 && b % 4 == 3) result = -result;
a = a % b;
}
return (b == 1) ? result : 0;
}
// Approximate L(1, χ_d) using partial sum of Dirichlet series
// L(1, χ_d) = Σ_{n=1}^{∞} (d/n)/n
// We sum up to N terms. For fundamental d, convergence is slow
// but we can accelerate with the Euler product or partial summation.
__device__ double approx_L1(int64_t d, int N) {
double sum = 0.0;
for (int n = 1; n <= N; n++) {
int chi = kronecker_symbol(d, n);
sum += (double)chi / (double)n;
}
return sum;
}
// Compute class number via analytic formula:
// h(d) = round(sqrt(d) * L(1, χ_d) / (2 * R(d)))
// For the simplified version, we use:
// h(d) * R(d) = sqrt(d) * L(1, χ_d) / 2
//
// Computing R(d) requires the continued fraction of sqrt(d).
// The period length gives us the fundamental unit, from which R = log(ε).
// Continued fraction of sqrt(d): sqrt(d) = [a0; a1, a2, ..., a_{p-1}, 2*a0]
// where the sequence a1,...,a_{p-1},2*a0 repeats
__device__ double compute_regulator(uint64_t d) {
uint64_t a0 = (uint64_t)sqrt((double)d);
if (a0 * a0 == d) return 0.0; // perfect square, not a field
// Compute CF expansion of sqrt(d) until we find the period
uint64_t m = 0, dd = 1, a = a0;
double log_epsilon = 0.0;
// Track convergents P/Q
// ε = P + Q*sqrt(d) where (P, Q) comes from the period
double P_prev = 1, P_curr = a0;
double Q_prev = 0, Q_curr = 1;
for (int i = 0; i < 10000; i++) {
m = dd * a - m;
dd = (d - m * m) / dd;
if (dd == 0) break;
a = (a0 + m) / dd;
double P_next = a * P_curr + P_prev;
double Q_next = a * Q_curr + Q_prev;
P_prev = P_curr; P_curr = P_next;
Q_prev = Q_curr; Q_curr = Q_next;
// Period ends when a = 2*a0
if (a == 2 * a0) {
// Fundamental unit ε = P_curr + Q_curr * sqrt(d)
log_epsilon = log(P_curr + Q_curr * sqrt((double)d));
break;
}
}
return log_epsilon;
}
__global__ void compute_class_numbers(uint64_t start_d, uint64_t count,
uint64_t *class_numbers_out,
uint64_t *h1_count, uint64_t *total_count,
uint32_t *max_h, uint64_t *max_h_d) {
uint64_t idx = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= count) return;
uint64_t d = start_d + idx;
if (!is_fundamental_discriminant(d)) return;
atomicAdd((unsigned long long*)total_count, 1ULL);
double R = compute_regulator(d);
if (R <= 0.0) return;
// L(1, χ_d) approximation — use more terms for larger d
int L_terms = (int)(sqrt((double)d) * 2);
if (L_terms > 100000) L_terms = 100000;
if (L_terms < 1000) L_terms = 1000;
double L1 = approx_L1((int64_t)d, L_terms);
// h(d) = round(sqrt(d) * L1 / (2 * R))
double h_approx = sqrt((double)d) * L1 / (2.0 * R);
uint64_t h = (uint64_t)(h_approx + 0.5);
if (h == 0) h = 1;
if (class_numbers_out != NULL) {
class_numbers_out[idx] = h;
}
if (h == 1) {
atomicAdd((unsigned long long*)h1_count, 1ULL);
}
if (h > *max_h) {
atomicMax(max_h, (uint32_t)h);
*max_h_d = d;
}
}
int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "Usage: %s <start_d> <end_d>\n", argv[0]);
return 1;
}
uint64_t start_d = (uint64_t)atoll(argv[1]);
uint64_t end_d = (uint64_t)atoll(argv[2]);
uint64_t count = end_d - start_d + 1;
printf("Real Quadratic Field Class Numbers\n");
printf("Discriminant range: d = %lu to %lu\n", start_d, end_d);
printf("Testing Cohen-Lenstra heuristics\n\n");
int device_count;
cudaGetDeviceCount(&device_count);
printf("GPUs available: %d\n\n", device_count);
uint64_t *d_h1_count, *d_total;
uint32_t *d_max_h;
uint64_t *d_max_h_d;
cudaMalloc(&d_h1_count, sizeof(uint64_t));
cudaMalloc(&d_total, sizeof(uint64_t));
cudaMalloc(&d_max_h, sizeof(uint32_t));
cudaMalloc(&d_max_h_d, sizeof(uint64_t));
cudaMemset(d_h1_count, 0, sizeof(uint64_t));
cudaMemset(d_total, 0, sizeof(uint64_t));
cudaMemset(d_max_h, 0, sizeof(uint32_t));
uint64_t chunk_size = 10000000;
struct timespec t_start, t_end;
clock_gettime(CLOCK_MONOTONIC, &t_start);
for (uint64_t offset = 0; offset < count; offset += chunk_size) {
uint64_t chunk = chunk_size;
if (offset + chunk > count) chunk = count - offset;
int gpu = (offset / chunk_size) % device_count;
cudaSetDevice(gpu);
int blocks = (chunk + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
compute_class_numbers<<<blocks, THREADS_PER_BLOCK>>>(
start_d + offset, chunk, NULL,
d_h1_count, d_total, d_max_h, d_max_h_d
);
cudaDeviceSynchronize();
clock_gettime(CLOCK_MONOTONIC, &t_end);
double elapsed = (t_end.tv_sec - t_start.tv_sec) +
(t_end.tv_nsec - t_start.tv_nsec) / 1e9;
double progress = (double)(offset + chunk) / count * 100;
uint64_t h_total;
cudaMemcpy(&h_total, d_total, sizeof(uint64_t), cudaMemcpyDeviceToHost);
printf("[GPU %d] d=%lu..%lu (%.1f%%, %lu fund. disc. so far, %.1fs)\n",
gpu, start_d + offset, start_d + offset + chunk,
progress, h_total, elapsed);
fflush(stdout);
}
uint64_t h_h1_count, h_total;
uint32_t h_max_h;
uint64_t h_max_h_d;
cudaMemcpy(&h_h1_count, d_h1_count, sizeof(uint64_t), cudaMemcpyDeviceToHost);
cudaMemcpy(&h_total, d_total, sizeof(uint64_t), cudaMemcpyDeviceToHost);
cudaMemcpy(&h_max_h, d_max_h, sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(&h_max_h_d, d_max_h_d, sizeof(uint64_t), cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_MONOTONIC, &t_end);
double total_elapsed = (t_end.tv_sec - t_start.tv_sec) +
(t_end.tv_nsec - t_start.tv_nsec) / 1e9;
double h1_ratio = (double)h_h1_count / h_total;
// Cohen-Lenstra predicts h=1 occurs with probability ~75.446% for real quadratic fields
double cl_prediction = 0.75446;
printf("\n========================================\n");
printf("Real Quadratic Class Numbers: d = %lu to %lu\n", start_d, end_d);
printf("Fundamental discriminants found: %lu\n", h_total);
printf("Class number h=1: %lu (%.4f%%)\n", h_h1_count, 100.0 * h1_ratio);
printf("Cohen-Lenstra prediction for h=1: %.4f%%\n", 100.0 * cl_prediction);
printf("Ratio (observed/predicted): %.6f\n", h1_ratio / cl_prediction);
printf("Largest class number: h=%u (d=%lu)\n", h_max_h, h_max_h_d);
printf("Time: %.1fs\n", total_elapsed);
printf("========================================\n");
cudaFree(d_h1_count); cudaFree(d_total);
cudaFree(d_max_h); cudaFree(d_max_h_d);
return 0;
}