| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <stdint.h> |
| #include <math.h> |
| #include <time.h> |
|
|
| #define THREADS_PER_BLOCK 256 |
|
|
| |
| |
| |
| __device__ bool is_fundamental_discriminant(uint64_t d) { |
| if (d <= 1) return false; |
|
|
| |
| uint64_t d_mod4 = d % 4; |
|
|
| if (d_mod4 == 1) { |
| |
| for (uint64_t p = 2; p * p <= d; p++) { |
| if (d % (p * p) == 0) return false; |
| } |
| return true; |
| } else if (d_mod4 == 0) { |
| uint64_t m = d / 4; |
| uint64_t m_mod4 = m % 4; |
| if (m_mod4 != 2 && m_mod4 != 3) return false; |
| for (uint64_t p = 2; p * p <= m; p++) { |
| if (m % (p * p) == 0) return false; |
| } |
| return true; |
| } |
| return false; |
| } |
|
|
| |
| __device__ int kronecker_symbol(int64_t d, uint64_t n) { |
| if (n == 0) return (d == 1 || d == -1) ? 1 : 0; |
| if (n == 1) return 1; |
|
|
| |
| int result = 1; |
| while (n % 2 == 0) { |
| n /= 2; |
| int d_mod8 = ((d % 8) + 8) % 8; |
| if (d_mod8 == 3 || d_mod8 == 5) result = -result; |
| } |
| if (n == 1) return result; |
|
|
| |
| int64_t a = d % (int64_t)n; |
| if (a < 0) a += n; |
| uint64_t b = n; |
|
|
| while (a != 0) { |
| while (a % 2 == 0) { |
| a /= 2; |
| if (b % 8 == 3 || b % 8 == 5) result = -result; |
| } |
| |
| int64_t temp = a; |
| a = b; |
| b = temp; |
| if (a % 4 == 3 && b % 4 == 3) result = -result; |
| a = a % b; |
| } |
|
|
| return (b == 1) ? result : 0; |
| } |
|
|
| |
| |
| |
| |
| __device__ double approx_L1(int64_t d, int N) { |
| double sum = 0.0; |
| for (int n = 1; n <= N; n++) { |
| int chi = kronecker_symbol(d, n); |
| sum += (double)chi / (double)n; |
| } |
| return sum; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| __device__ double compute_regulator(uint64_t d) { |
| uint64_t a0 = (uint64_t)sqrt((double)d); |
| if (a0 * a0 == d) return 0.0; |
|
|
| |
| uint64_t m = 0, dd = 1, a = a0; |
| double log_epsilon = 0.0; |
|
|
| |
| |
| double P_prev = 1, P_curr = a0; |
| double Q_prev = 0, Q_curr = 1; |
|
|
| for (int i = 0; i < 10000; i++) { |
| m = dd * a - m; |
| dd = (d - m * m) / dd; |
| if (dd == 0) break; |
| a = (a0 + m) / dd; |
|
|
| double P_next = a * P_curr + P_prev; |
| double Q_next = a * Q_curr + Q_prev; |
| P_prev = P_curr; P_curr = P_next; |
| Q_prev = Q_curr; Q_curr = Q_next; |
|
|
| |
| if (a == 2 * a0) { |
| |
| log_epsilon = log(P_curr + Q_curr * sqrt((double)d)); |
| break; |
| } |
| } |
|
|
| return log_epsilon; |
| } |
|
|
| __global__ void compute_class_numbers(uint64_t start_d, uint64_t count, |
| uint64_t *class_numbers_out, |
| uint64_t *h1_count, uint64_t *total_count, |
| uint32_t *max_h, uint64_t *max_h_d) { |
| uint64_t idx = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; |
| if (idx >= count) return; |
|
|
| uint64_t d = start_d + idx; |
| if (!is_fundamental_discriminant(d)) return; |
|
|
| atomicAdd((unsigned long long*)total_count, 1ULL); |
|
|
| double R = compute_regulator(d); |
| if (R <= 0.0) return; |
|
|
| |
| int L_terms = (int)(sqrt((double)d) * 2); |
| if (L_terms > 100000) L_terms = 100000; |
| if (L_terms < 1000) L_terms = 1000; |
| double L1 = approx_L1((int64_t)d, L_terms); |
|
|
| |
| double h_approx = sqrt((double)d) * L1 / (2.0 * R); |
| uint64_t h = (uint64_t)(h_approx + 0.5); |
| if (h == 0) h = 1; |
|
|
| if (class_numbers_out != NULL) { |
| class_numbers_out[idx] = h; |
| } |
|
|
| if (h == 1) { |
| atomicAdd((unsigned long long*)h1_count, 1ULL); |
| } |
|
|
| if (h > *max_h) { |
| atomicMax(max_h, (uint32_t)h); |
| *max_h_d = d; |
| } |
| } |
|
|
| int main(int argc, char **argv) { |
| if (argc < 3) { |
| fprintf(stderr, "Usage: %s <start_d> <end_d>\n", argv[0]); |
| return 1; |
| } |
|
|
| uint64_t start_d = (uint64_t)atoll(argv[1]); |
| uint64_t end_d = (uint64_t)atoll(argv[2]); |
| uint64_t count = end_d - start_d + 1; |
|
|
| printf("Real Quadratic Field Class Numbers\n"); |
| printf("Discriminant range: d = %lu to %lu\n", start_d, end_d); |
| printf("Testing Cohen-Lenstra heuristics\n\n"); |
|
|
| int device_count; |
| cudaGetDeviceCount(&device_count); |
| printf("GPUs available: %d\n\n", device_count); |
|
|
| uint64_t *d_h1_count, *d_total; |
| uint32_t *d_max_h; |
| uint64_t *d_max_h_d; |
|
|
| cudaMalloc(&d_h1_count, sizeof(uint64_t)); |
| cudaMalloc(&d_total, sizeof(uint64_t)); |
| cudaMalloc(&d_max_h, sizeof(uint32_t)); |
| cudaMalloc(&d_max_h_d, sizeof(uint64_t)); |
| cudaMemset(d_h1_count, 0, sizeof(uint64_t)); |
| cudaMemset(d_total, 0, sizeof(uint64_t)); |
| cudaMemset(d_max_h, 0, sizeof(uint32_t)); |
|
|
| uint64_t chunk_size = 10000000; |
| struct timespec t_start, t_end; |
| clock_gettime(CLOCK_MONOTONIC, &t_start); |
|
|
| for (uint64_t offset = 0; offset < count; offset += chunk_size) { |
| uint64_t chunk = chunk_size; |
| if (offset + chunk > count) chunk = count - offset; |
|
|
| int gpu = (offset / chunk_size) % device_count; |
| cudaSetDevice(gpu); |
|
|
| int blocks = (chunk + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; |
| compute_class_numbers<<<blocks, THREADS_PER_BLOCK>>>( |
| start_d + offset, chunk, NULL, |
| d_h1_count, d_total, d_max_h, d_max_h_d |
| ); |
| cudaDeviceSynchronize(); |
|
|
| clock_gettime(CLOCK_MONOTONIC, &t_end); |
| double elapsed = (t_end.tv_sec - t_start.tv_sec) + |
| (t_end.tv_nsec - t_start.tv_nsec) / 1e9; |
| double progress = (double)(offset + chunk) / count * 100; |
|
|
| uint64_t h_total; |
| cudaMemcpy(&h_total, d_total, sizeof(uint64_t), cudaMemcpyDeviceToHost); |
|
|
| printf("[GPU %d] d=%lu..%lu (%.1f%%, %lu fund. disc. so far, %.1fs)\n", |
| gpu, start_d + offset, start_d + offset + chunk, |
| progress, h_total, elapsed); |
| fflush(stdout); |
| } |
|
|
| uint64_t h_h1_count, h_total; |
| uint32_t h_max_h; |
| uint64_t h_max_h_d; |
| cudaMemcpy(&h_h1_count, d_h1_count, sizeof(uint64_t), cudaMemcpyDeviceToHost); |
| cudaMemcpy(&h_total, d_total, sizeof(uint64_t), cudaMemcpyDeviceToHost); |
| cudaMemcpy(&h_max_h, d_max_h, sizeof(uint32_t), cudaMemcpyDeviceToHost); |
| cudaMemcpy(&h_max_h_d, d_max_h_d, sizeof(uint64_t), cudaMemcpyDeviceToHost); |
|
|
| clock_gettime(CLOCK_MONOTONIC, &t_end); |
| double total_elapsed = (t_end.tv_sec - t_start.tv_sec) + |
| (t_end.tv_nsec - t_start.tv_nsec) / 1e9; |
|
|
| double h1_ratio = (double)h_h1_count / h_total; |
| |
| double cl_prediction = 0.75446; |
|
|
| printf("\n========================================\n"); |
| printf("Real Quadratic Class Numbers: d = %lu to %lu\n", start_d, end_d); |
| printf("Fundamental discriminants found: %lu\n", h_total); |
| printf("Class number h=1: %lu (%.4f%%)\n", h_h1_count, 100.0 * h1_ratio); |
| printf("Cohen-Lenstra prediction for h=1: %.4f%%\n", 100.0 * cl_prediction); |
| printf("Ratio (observed/predicted): %.6f\n", h1_ratio / cl_prediction); |
| printf("Largest class number: h=%u (d=%lu)\n", h_max_h, h_max_h_d); |
| printf("Time: %.1fs\n", total_elapsed); |
| printf("========================================\n"); |
|
|
| cudaFree(d_h1_count); cudaFree(d_total); |
| cudaFree(d_max_h); cudaFree(d_max_h_d); |
| return 0; |
| } |
|
|