| #ifdef TORCH_EXTENSION_NAME |
| #include <torch/torch.h> |
| #endif |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <stdint.h> |
| #include <math.h> |
| #include <string.h> |
| #include <time.h> |
| #ifndef TORCH_EXTENSION_NAME |
| #include <pthread.h> |
| #endif |
|
|
// 64-bit integer shorthands used throughout the kernels.
typedef unsigned long long uint64;
typedef long long int64;


// Threads per CUDA block for every kernel launch in this file.
#define BLOCK_SIZE 256
// Safety cap on continued-fraction iterations when computing the regulator.
#define MAX_CF_STEPS 2000000
// Maximum number of discriminants batched per GPU chunk.
#define CHUNK_SIZE 10000000


// Primes up to 100003 (pi(100003) = 9592), staged in constant memory for the
// truncated Euler-product evaluation of L(1, chi_d) in compute_class_numbers.
#define NUM_PRIMES 9592
__constant__ int d_primes[NUM_PRIMES];
|
|
| |
| |
| |
// Kronecker symbol (d | p) for prime p.
//   p == 2: decided by d mod 8 (+1 for 1,7; -1 for 3,5; 0 when d is even).
//   odd p : Legendre symbol via Euler's criterion, d^((p-1)/2) mod p,
//           evaluated with binary exponentiation (p < 2^17, so no overflow).
__device__ int kronecker(int64 d, int p) {
    if (p == 2) {
        switch (((int)(d % 8) + 8) % 8) {
            case 1: case 7: return 1;
            case 3: case 5: return -1;
            default:        return 0;
        }
    }

    int64 base = ((d % p) + p) % p;
    if (base == 0) return 0;

    int64 acc = 1;
    for (int64 e = (p - 1) / 2; e > 0; e >>= 1) {
        if (e & 1) acc = (acc * base) % p;
        base = (base * base) % p;
    }
    return (acc == 1) ? 1 : -1;
}
|
|
| |
| |
| |
// One thread per discriminant: computes the class number h(d) of the real
// quadratic field Q(sqrt(d)) for each fundamental discriminant d via the
// analytic class number formula  h = sqrt(d) * L(1, chi_d) / (2 * R).
//
// The regulator R = log(fundamental unit) is recovered from the period of the
// continued-fraction expansion of sqrt(d/4) (d ≡ 0 mod 4) or of the form with
// P=1, Q=2 (d ≡ 1 mod 4); convergents are tracked in log space so they never
// overflow. L(1, chi_d) is approximated by a truncated Euler product over the
// d_primes table. NOTE(review): both R (capped at MAX_CF_STEPS) and L1 are
// approximations, so h = round(estimate) — confirm accuracy is acceptable for
// the target range of d.
//
// Side outputs via global atomics: total count, count of h == 1, histogram of
// h < 1024, and counts of h divisible by 3, 5, 7.
//
// Threads with d < 5, or whose d/4 is a perfect square, return early WITHOUT
// writing class_numbers_out[idx]; callers should zero-initialize the outputs.
__global__ void compute_class_numbers(
    uint64 *discriminants,
    uint32_t count,
    int *class_numbers_out,
    double *regulators_out,   // may be NULL; regulators are then discarded
    uint64 *h1_count,
    uint64 *h_histogram,      // 1024 bins indexed by h
    uint64 *total_processed,
    uint64 *div3_count,
    uint64 *div5_count,
    uint64 *div7_count)
{
    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= count) return;

    uint64 d = discriminants[idx];
    if (d < 5) return;  // smallest real quadratic discriminant is 5

    // Convergent numerators/denominators are kept as logarithms throughout.
    double regulator = 0.0;
    double log_P_prev, log_P_curr, log_Q_prev, log_Q_curr;

    if (d % 4 == 0) {
        // d = 4m: expand sqrt(m). a0 = floor(sqrt(m)), with FP rounding fixed
        // up by the two correction loops.
        uint64 m_val = d / 4;
        uint64 a0 = (uint64)sqrt((double)m_val);
        while (a0 * a0 > m_val) a0--;
        while ((a0+1)*(a0+1) <= m_val) a0++;
        if (a0 * a0 == m_val) return;  // perfect square: no quadratic field

        // Standard CF recurrence for sqrt(m): state (mm, D), quotient a.
        int64 mm = 0, D = 1, a = (int64)a0;
        log_P_prev = 0.0;
        log_P_curr = log((double)a0);
        log_Q_prev = -1e30;  // sentinel: "previous Q" not yet defined
        log_Q_curr = 0.0;

        for (int step = 0; step < MAX_CF_STEPS; step++) {
            mm = D * a - mm;
            D = ((int64)m_val - mm * mm) / D;
            if (D == 0) break;  // defensive; cannot occur for non-squares
            a = ((int64)a0 + mm) / D;

            // D == 1 marks the end of the period; the fundamental solution is
            // P + Q*sqrt(m), so R = log(P + Q*sqrt(m)), evaluated stably as
            // log_P + log1p-style correction.
            if (D == 1) {
                double diff = log_Q_curr + 0.5 * log((double)m_val) - log_P_curr;
                regulator = log_P_curr + log(1.0 + exp(diff));
                break;
            }

            // Convergent update P_k = a*P_{k-1} + P_{k-2} in log space
            // (rp = P_{k-2}/P_{k-1}); same for Q.
            double rp = exp(log_P_prev - log_P_curr);
            log_P_prev = log_P_curr;
            log_P_curr = log_P_curr + log((double)a + rp);
            double rq = (log_Q_prev > -1e20) ? exp(log_Q_prev - log_Q_curr) : 0.0;
            log_Q_prev = log_Q_curr;
            log_Q_curr = log_Q_curr + log((double)a + rq);
        }
    } else {
        // d ≡ 1 (mod 4): CF of the quadratic irrational with P=1, Q=2.
        // isqrt_d = floor(sqrt(d)), corrected for FP rounding.
        uint64 isqrt_d = (uint64)sqrt((double)d);
        while (isqrt_d * isqrt_d > d) isqrt_d--;
        while ((isqrt_d+1)*(isqrt_d+1) <= d) isqrt_d++;

        int64 P = 1, Q = 2;
        int64 a = (P + (int64)isqrt_d) / Q;
        log_P_prev = 0.0;
        log_P_curr = log((double)(a > 0 ? a : 1));
        log_Q_prev = -1e30;
        log_Q_curr = 0.0;

        // Period detection: remember the first reduced form (P, Q) seen and
        // stop when it recurs; R is the log-unit gained over one full period.
        int64 first_P = -1, first_Q = -1;
        double log_eps0 = 0.0;

        for (int step = 0; step < MAX_CF_STEPS; step++) {
            int64 P_new = a * Q - P;
            int64 Q_new = ((int64)d - P_new * P_new) / Q;
            if (Q_new == 0) break;  // defensive; d is not a perfect square
            int64 a_new = (P_new + (int64)isqrt_d) / Q_new;
            P = P_new; Q = Q_new; a = a_new;

            // Same log-space convergent recurrence as in the even branch.
            double rp = exp(log_P_prev - log_P_curr);
            log_P_prev = log_P_curr;
            log_P_curr = log_P_curr + log((double)a + rp);
            double rq = (log_Q_prev > -1e20) ? exp(log_Q_prev - log_Q_curr) : 0.0;
            log_Q_prev = log_Q_curr;
            log_Q_curr = log_Q_curr + log((double)a + rq);

            // Only reduced forms (0 < P <= isqrt_d < P + Q) recur periodically.
            int is_reduced = (Q > 0 && P > 0 && P <= (int64)isqrt_d && P > (int64)isqrt_d - Q);
            if (!is_reduced) continue;

            // log_eps = log((2P - Q + Q*sqrt(d)) / 2), the unit size at this
            // step, assembled from the log-space convergents.
            double ratio_qp = exp(log_Q_curr - log_P_curr);
            double log_2pmq = log_P_curr + log(2.0 - ratio_qp);
            double diff = log_Q_curr + 0.5 * log((double)d) - log_2pmq;
            double log_eps = log_2pmq + log(1.0 + exp(diff)) - log(2.0);

            if (first_P < 0) {
                // First reduced form seen: anchor of the period.
                first_P = P; first_Q = Q;
                log_eps0 = log_eps;
            } else if (P == first_P && Q == first_Q) {
                // Period closed: regulator = log-growth over one period.
                regulator = log_eps - log_eps0;
                break;
            }
        }
    }

    // Floor the regulator so the division below can never blow up (also
    // covers the pathological case where no period was found in time).
    if (regulator < 0.01) regulator = 0.01;

    // Truncated Euler product for L(1, chi_d) over the constant-memory primes.
    double L1 = 1.0;
    for (int i = 0; i < NUM_PRIMES; i++) {
        int p = d_primes[i];
        int chi = kronecker((int64)d, p);
        if (chi != 0) {
            L1 *= 1.0 / (1.0 - (double)chi / p);
        }
    }

    // Analytic class number formula; h is at least 1 by definition.
    double h_approx = sqrt((double)d) * L1 / (2.0 * regulator);
    int h = (int)round(h_approx);
    if (h < 1) h = 1;

    class_numbers_out[idx] = h;
    if (regulators_out) regulators_out[idx] = regulator;

    // Global statistics (contended single-address atomics; acceptable here
    // since each thread does far more work than one atomic costs).
    atomicAdd(total_processed, 1ULL);
    if (h == 1) atomicAdd(h1_count, 1ULL);
    if (h < 1024) atomicAdd(&h_histogram[h], 1ULL);
    if (h % 3 == 0) atomicAdd(div3_count, 1ULL);
    if (h % 5 == 0) atomicAdd(div5_count, 1ULL);
    if (h % 7 == 0) atomicAdd(div7_count, 1ULL);
}
|
|
| |
| |
| |
// Clears sieve[pos] for every d = lo + pos that is divisible by p^2 for some
// prime p in primes[0..num_primes). One thread per candidate; the caller must
// pre-fill sieve with 1s. primes[] must be sorted ascending (the early break
// relies on it).
__global__ void gpu_sieve_squarefree(
    uint8_t *sieve, uint64 lo, uint64 len,
    const int *primes, int num_primes)
{
    uint64 offset = (uint64)blockIdx.x * blockDim.x + threadIdx.x;
    if (offset >= len) return;

    uint64 candidate = lo + offset;
    for (int j = 0; j < num_primes; j++) {
        uint64 sq = (uint64)primes[j] * primes[j];
        if (sq > candidate) break;          // all later prime squares are larger
        if (candidate % sq == 0) {          // square factor -> not squarefree
            sieve[offset] = 0;
            return;
        }
    }
}
|
|
// Compacts the fundamental discriminants of [lo, lo+len) into output[] using
// the squarefree sieve computed over the same window. d > 0 is fundamental iff
//   d ≡ 1 (mod 4) and d is squarefree, or
//   d = 4m with m ≡ 2 or 3 (mod 4) and m squarefree.
// *count is bumped atomically for every hit; writes beyond max_out are
// dropped (the host clamps the returned count afterwards).
__global__ void gpu_extract_fundamental(
    const uint8_t *sieve, uint64 lo, uint64 len,
    uint64 *output, uint32_t *count, uint32_t max_out)
{
    uint64 offset = (uint64)blockIdx.x * blockDim.x + threadIdx.x;
    if (offset >= len) return;

    uint64 d = lo + offset;
    if (d < 5) return;

    int fundamental = 0;
    switch (d % 4) {
    case 1:
        // Odd case: d itself must be squarefree.
        fundamental = sieve[offset] ? 1 : 0;
        break;
    case 0: {
        uint64 m = d / 4;
        uint64 r = m % 4;
        if (r == 2 || r == 3) {
            if (m >= lo && m < lo + len) {
                // m lies inside the sieve window: reuse its flag.
                fundamental = sieve[m - lo] ? 1 : 0;
            } else if (m < lo) {
                // m fell below the window: test squarefreeness by trial division.
                fundamental = 1;
                for (uint64 q = 2; q * q <= m; q++) {
                    if (m % (q * q) == 0) { fundamental = 0; break; }
                }
            }
        }
        break;
    }
    default:
        break;  // d ≡ 2, 3 (mod 4) are never fundamental
    }

    if (fundamental) {
        uint32_t slot = atomicAdd(count, 1);
        if (slot < max_out) output[slot] = d;
    }
}
|
|
| |
| |
| |
| #ifndef TORCH_EXTENSION_NAME |
|
|
// Fills primes[] with all primes up to max_prime (capped at NUM_PRIMES
// entries) via a sieve of Eratosthenes.
// Returns the number of primes stored, or 0 if the sieve buffer could not be
// allocated (the original dereferenced a potentially-NULL pointer; it also
// zeroed the buffer with calloc only to overwrite it with memset).
int generate_primes(int *primes, int max_prime) {
    char *sieve = (char*)malloc((size_t)max_prime + 1);
    if (!sieve) return 0;
    memset(sieve, 1, (size_t)max_prime + 1);
    sieve[0] = sieve[1] = 0;
    // Cast avoids i*i overflow if this is ever called with a large bound.
    for (int i = 2; (long long)i * i <= max_prime; i++)
        if (sieve[i])
            for (long long j = (long long)i * i; j <= max_prime; j += i)
                sieve[j] = 0;
    int count = 0;
    for (int i = 2; i <= max_prime && count < NUM_PRIMES; i++)
        if (sieve[i]) primes[count++] = i;
    free(sieve);
    return count;
}
|
|
| |
| |
| |
// Per-GPU work descriptor: the discriminant sub-range assigned to one GPU
// plus the statistics its worker thread reports back when it finishes.
typedef struct {
    int gpu_id;                 // CUDA device ordinal this worker binds to
    uint64 d_start, d_end;      // half-open discriminant range [d_start, d_end)
    char output_path[256];      // binary output file; "" disables file output

    // Filled in by gpu_worker() just before the thread returns:
    uint64 total_processed;     // fundamental discriminants processed
    uint64 h1_count;            // how many had class number h == 1
    uint64 div3, div5, div7;    // counts with 3 | h, 5 | h, 7 | h
    uint64 h_hist[1024];        // histogram of class numbers h < 1024
} GPUWork;
|
|
// Worker thread body (one per GPU): sieves its [d_start, d_end) range in
// chunks, extracts the fundamental discriminants on-device, runs the
// class-number kernel over them, optionally appends (d, h) records to
// work->output_path, and finally copies the aggregate counters into *work.
// NOTE(review): no CUDA API return code is checked anywhere in this function;
// an allocation or launch failure would silently corrupt the results.
void *gpu_worker(void *arg) {
    GPUWork *work = (GPUWork*)arg;
    cudaSetDevice(work->gpu_id);

    // Device buffers: discriminant batch, per-d results, global counters.
    uint64 *d_discriminants;
    int *d_class_numbers;
    uint64 *d_h1, *d_total, *d_div3, *d_div5, *d_div7, *d_hist;

    uint32_t max_per_chunk = CHUNK_SIZE;
    cudaMalloc(&d_discriminants, max_per_chunk * sizeof(uint64));
    cudaMalloc(&d_class_numbers, max_per_chunk * sizeof(int));
    cudaMalloc(&d_h1, sizeof(uint64));
    cudaMalloc(&d_total, sizeof(uint64));
    cudaMalloc(&d_div3, sizeof(uint64));
    cudaMalloc(&d_div5, sizeof(uint64));
    cudaMalloc(&d_div7, sizeof(uint64));
    cudaMalloc(&d_hist, 1024 * sizeof(uint64));

    // Counters accumulate across all chunks, so zero them exactly once.
    cudaMemset(d_h1, 0, sizeof(uint64));
    cudaMemset(d_total, 0, sizeof(uint64));
    cudaMemset(d_div3, 0, sizeof(uint64));
    cudaMemset(d_div5, 0, sizeof(uint64));
    cudaMemset(d_div7, 0, sizeof(uint64));
    cudaMemset(d_hist, 0, 1024 * sizeof(uint64));

    // Raw sieve window is 3x the batch capacity — presumably sized so the
    // surviving fundamental discriminants fit max_per_chunk (the clamp after
    // extraction guards the case where they do not).
    uint64 chunk_raw = CHUNK_SIZE * 3;
    uint8_t *d_sieve;
    uint32_t *d_sieve_count;
    int *d_sieve_primes;
    cudaMalloc(&d_sieve, chunk_raw);
    cudaMalloc(&d_sieve_count, sizeof(uint32_t));

    // Host-side sieve of all primes up to sqrt(d_end); uploaded once for the
    // on-device squarefree sieve.
    uint64 sqrt_max = (uint64)sqrt((double)work->d_end) + 2;
    int *h_sieve_primes = (int*)malloc(sqrt_max * sizeof(int));
    int n_sieve_primes = 0;
    {
        char *isp = (char*)calloc(sqrt_max + 1, 1);
        for (uint64 i = 2; i <= sqrt_max; i++) isp[i] = 1;
        for (uint64 i = 2; i * i <= sqrt_max; i++)
            if (isp[i]) for (uint64 j = i*i; j <= sqrt_max; j += i) isp[j] = 0;
        for (uint64 i = 2; i <= sqrt_max; i++)
            if (isp[i]) h_sieve_primes[n_sieve_primes++] = (int)i;
        free(isp);
    }
    cudaMalloc(&d_sieve_primes, n_sieve_primes * sizeof(int));
    cudaMemcpy(d_sieve_primes, h_sieve_primes, n_sieve_primes * sizeof(int), cudaMemcpyHostToDevice);
    free(h_sieve_primes);

    uint64 chunks_done = 0;

    for (uint64 d_lo = work->d_start; d_lo < work->d_end; d_lo += chunk_raw) {
        uint64 d_hi = d_lo + chunk_raw;
        if (d_hi > work->d_end) d_hi = work->d_end;
        uint64 len = d_hi - d_lo;

        // Pass 1: clear non-squarefree entries; pass 2: compact fundamental
        // discriminants into d_discriminants via an atomic counter.
        cudaMemset(d_sieve, 1, len);
        cudaMemset(d_sieve_count, 0, sizeof(uint32_t));
        uint64 sieve_blocks = (len + BLOCK_SIZE - 1) / BLOCK_SIZE;
        gpu_sieve_squarefree<<<sieve_blocks, BLOCK_SIZE>>>(
            d_sieve, d_lo, len, d_sieve_primes, n_sieve_primes);
        gpu_extract_fundamental<<<sieve_blocks, BLOCK_SIZE>>>(
            d_sieve, d_lo, len, d_discriminants, d_sieve_count, max_per_chunk);
        // Blocking copy also synchronizes with the two kernels above.
        uint32_t count;
        cudaMemcpy(&count, d_sieve_count, sizeof(uint32_t), cudaMemcpyDeviceToHost);
        if (count == 0) continue;
        // The atomic counter can exceed the buffer; extras were dropped on-device.
        if (count > max_per_chunk) count = max_per_chunk;

        // Main computation: one thread per fundamental discriminant.
        // regulators_out = NULL (not needed on this path).
        int blocks = (count + BLOCK_SIZE - 1) / BLOCK_SIZE;
        compute_class_numbers<<<blocks, BLOCK_SIZE>>>(
            d_discriminants, count, d_class_numbers, NULL,
            d_h1, d_hist, d_total, d_div3, d_div5, d_div7);
        cudaDeviceSynchronize();

        // Optionally append raw (uint64 d, int h) records to the per-GPU file.
        if (work->output_path[0]) {
            uint64 *h_disc = (uint64*)malloc(count * sizeof(uint64));
            int *h_cls = (int*)malloc(count * sizeof(int));
            cudaMemcpy(h_disc, d_discriminants, count * sizeof(uint64), cudaMemcpyDeviceToHost);
            cudaMemcpy(h_cls, d_class_numbers, count * sizeof(int), cudaMemcpyDeviceToHost);

            FILE *fout = fopen(work->output_path, "ab");
            if (fout) {
                for (uint32_t i = 0; i < count; i++) {
                    // h == 0 means the kernel never wrote this slot (early
                    // return); skip such entries.
                    if (h_cls[i] > 0) {
                        fwrite(&h_disc[i], sizeof(uint64), 1, fout);
                        fwrite(&h_cls[i], sizeof(int), 1, fout);
                    }
                }
                fclose(fout);
            }
            free(h_disc); free(h_cls);
        }

        // Lightweight progress report every 20 chunks.
        chunks_done++;
        if (chunks_done % 20 == 0) {
            uint64 total;
            cudaMemcpy(&total, d_total, sizeof(uint64), cudaMemcpyDeviceToHost);
            double pct = 100.0 * (d_lo - work->d_start) / (double)(work->d_end - work->d_start);
            printf("[GPU %d] %.1f%% | %llu discriminants | d ~ %.2e\n",
                work->gpu_id, pct, total, (double)d_lo);
            fflush(stdout);
        }
    }

    // Drain the device, then pull final statistics into the work struct so
    // main() can merge them after pthread_join.
    cudaDeviceSynchronize();
    cudaMemcpy(&work->total_processed, d_total, sizeof(uint64), cudaMemcpyDeviceToHost);
    cudaMemcpy(&work->h1_count, d_h1, sizeof(uint64), cudaMemcpyDeviceToHost);
    cudaMemcpy(&work->div3, d_div3, sizeof(uint64), cudaMemcpyDeviceToHost);
    cudaMemcpy(&work->div5, d_div5, sizeof(uint64), cudaMemcpyDeviceToHost);
    cudaMemcpy(&work->div7, d_div7, sizeof(uint64), cudaMemcpyDeviceToHost);
    cudaMemcpy(work->h_hist, d_hist, 1024 * sizeof(uint64), cudaMemcpyDeviceToHost);

    cudaFree(d_discriminants); cudaFree(d_class_numbers);
    cudaFree(d_h1); cudaFree(d_total); cudaFree(d_div3); cudaFree(d_div5); cudaFree(d_div7);
    cudaFree(d_hist);
    cudaFree(d_sieve); cudaFree(d_sieve_count); cudaFree(d_sieve_primes);

    printf("[GPU %d] done: %llu discriminants\n", work->gpu_id, work->total_processed);
    return NULL;
}
|
|
| |
| |
| |
// Entry point: parses the [D_start, D_end) range from argv, splits it across
// up to 8 GPUs (one pthread per device), then merges and prints the
// Cohen-Lenstra statistics gathered by the workers.
int main(int argc, char **argv) {
    uint64 D_start = argc > 1 ? strtoull(argv[1], NULL, 10) : 5;
    uint64 D_end = argc > 2 ? strtoull(argv[2], NULL, 10) : 1000000;

    printf("========================================\n");
    printf("Class Numbers of Real Quadratic Fields v2\n");
    printf("Range: [%llu, %llu)\n", D_start, D_end);
    printf("========================================\n\n");

    // Primes for the truncated Euler product, uploaded to each GPU below.
    int h_primes[NUM_PRIMES];
    int nprimes = generate_primes(h_primes, 100003);
    // BUG FIX: guard h_primes[nprimes-1] against a failed/empty generation.
    if (nprimes < 1) {
        fprintf(stderr, "prime generation failed\n");
        return 1;
    }
    printf("Primes: %d (up to %d)\n", nprimes, h_primes[nprimes-1]);

    int num_gpus = 0;
    cudaGetDeviceCount(&num_gpus);
    // BUG FIX: the original divided by num_gpus and indexed threads[8]/works[8]
    // with an unchecked, unclamped device count (division by zero with no
    // GPUs; stack buffer overflow with more than 8).
    if (num_gpus < 1) {
        fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }
    if (num_gpus > 8) num_gpus = 8;  // threads[]/works[] hold at most 8
    printf("GPUs: %d\n\n", num_gpus);

    // Upload the prime table into every device's constant memory.
    for (int g = 0; g < num_gpus; g++) {
        cudaSetDevice(g);
        cudaMemcpyToSymbol(d_primes, h_primes, nprimes * sizeof(int));
    }

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);

    // Split [D_start, D_end) evenly across the devices (last gets the tail).
    uint64 range = D_end - D_start;
    uint64 per_gpu = (range + num_gpus - 1) / num_gpus;

    pthread_t threads[8];
    GPUWork works[8];
    for (int g = 0; g < num_gpus; g++) {
        works[g].gpu_id = g;
        works[g].d_start = D_start + g * per_gpu;
        works[g].d_end = D_start + (g + 1) * per_gpu;
        if (works[g].d_end > D_end) works[g].d_end = D_end;
        memset(works[g].h_hist, 0, sizeof(works[g].h_hist));
        snprintf(works[g].output_path, 256,
            "/home/amsysistestdrive2026/idontknow/data/class-numbers/raw_gpu%d_%llu_%llu.bin",
            g, works[g].d_start, works[g].d_end);
        pthread_create(&threads[g], NULL, gpu_worker, &works[g]);
    }

    // Merge per-GPU statistics as the workers finish.
    uint64 grand_total = 0, grand_h1 = 0;
    uint64 grand_div3 = 0, grand_div5 = 0, grand_div7 = 0;
    uint64 grand_hist[1024] = {0};

    for (int g = 0; g < num_gpus; g++) {
        pthread_join(threads[g], NULL);
        grand_total += works[g].total_processed;
        grand_h1 += works[g].h1_count;
        grand_div3 += works[g].div3;
        grand_div5 += works[g].div5;
        grand_div7 += works[g].div7;
        for (int h = 0; h < 1024; h++)
            grand_hist[h] += works[g].h_hist[h];
    }

    clock_gettime(CLOCK_MONOTONIC, &t1);
    double elapsed = (t1.tv_sec-t0.tv_sec) + (t1.tv_nsec-t0.tv_nsec)/1e9;

    // BUG FIX: guard the percentage/rate divisions against a zero total
    // (empty range) or a zero elapsed time.
    double denom = grand_total > 0 ? (double)grand_total : 1.0;
    double rate = elapsed > 0.0 ? grand_total / elapsed : 0.0;

    printf("\n========================================\n");
    printf("RESULTS\n");
    printf("========================================\n");
    printf("Range: [%llu, %llu)\n", D_start, D_end);
    printf("Fundamental discriminants: %llu\n", grand_total);
    printf("Time: %.1fs (%.0f disc/sec)\n", elapsed, rate);
    printf("\nCohen-Lenstra statistics:\n");
    printf("  h(d) = 1: %llu (%.4f%%)\n", grand_h1, 100.0 * grand_h1 / denom);
    printf("  C-L predicted h=1: ~75.446%%\n");
    printf("  3 | h(d): %llu (%.4f%%)\n", grand_div3, 100.0 * grand_div3 / denom);
    printf("  5 | h(d): %llu (%.4f%%)\n", grand_div5, 100.0 * grand_div5 / denom);
    printf("  7 | h(d): %llu (%.4f%%)\n", grand_div7, 100.0 * grand_div7 / denom);

    printf("\nClass number distribution (first 20):\n");
    for (int h = 1; h <= 20; h++)
        printf("  h=%2d: %llu (%.3f%%)\n", h, grand_hist[h], 100.0 * grand_hist[h] / denom);

    printf("\n========================================\n");
    return 0;
}
| #endif |
|
|
|
|
| #ifdef TORCH_EXTENSION_NAME |
// One-time upload of the primes up to 100003 (at most NUM_PRIMES of them)
// into the d_primes constant-memory table used by compute_class_numbers.
// Subsequent calls are no-ops.
static void setup_primes_for_torch() {
    static bool initialized = false;
    if (initialized) return;

    // Sieve of Eratosthenes over [0, 100003].
    bool is_prime[100004];
    memset(is_prime, true, sizeof(is_prime));
    for (int i = 2; i * i <= 100003; i++) {
        if (!is_prime[i]) continue;
        for (int j = i * i; j <= 100003; j += i) is_prime[j] = false;
    }

    int h_primes[NUM_PRIMES];
    int count = 0;
    for (int i = 2; i <= 100003 && count < NUM_PRIMES; i++) {
        if (is_prime[i]) h_primes[count++] = i;
    }

    cudaMemcpyToSymbol(d_primes, h_primes, count * sizeof(int));
    initialized = true;
}
|
|
// Torch extension entry point: computes class numbers and regulators for a
// 1-D, contiguous, int64 CUDA tensor of fundamental discriminants.
// Returns {class_numbers (int32), regulators (float64)}, both zero-filled for
// entries the kernel skips (d < 5, perfect-square d/4).
std::vector<torch::Tensor> compute_class_numbers_torch(torch::Tensor discriminants) {
    TORCH_CHECK(discriminants.is_cuda(), "discriminants must be CUDA tensor");
    TORCH_CHECK(discriminants.dtype() == torch::kInt64, "discriminants must be int64");
    TORCH_CHECK(discriminants.dim() == 1, "discriminants must be 1-D");
    // BUG FIX: data_ptr() is reinterpreted as a dense uint64 array by the
    // kernel, so a non-contiguous view would silently produce wrong results.
    TORCH_CHECK(discriminants.is_contiguous(), "discriminants must be contiguous");

    setup_primes_for_torch();

    int64_t n = discriminants.size(0);
    TORCH_CHECK(n <= (int64_t)UINT32_MAX, "too many discriminants for one launch");

    auto opts_i32 = torch::dtype(torch::kInt32).device(discriminants.device());
    auto opts_i64 = torch::dtype(torch::kInt64).device(discriminants.device());
    auto opts_f64 = torch::dtype(torch::kFloat64).device(discriminants.device());

    auto class_nums = torch::zeros({n}, opts_i32);
    auto regulators = torch::zeros({n}, opts_f64);
    auto h1_count   = torch::zeros({1}, opts_i64);
    auto histogram  = torch::zeros({1024}, opts_i64);
    auto total_proc = torch::zeros({1}, opts_i64);
    auto div3 = torch::zeros({1}, opts_i64);
    auto div5 = torch::zeros({1}, opts_i64);
    auto div7 = torch::zeros({1}, opts_i64);

    // BUG FIX: skip the launch entirely for n == 0 (a 0-block grid is an
    // invalid launch configuration), and surface launch/execution errors
    // instead of ignoring them.
    if (n > 0) {
        int blocks = (int)((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
        compute_class_numbers<<<blocks, BLOCK_SIZE>>>(
            (uint64*)discriminants.data_ptr<int64_t>(), (uint32_t)n,
            class_nums.data_ptr<int>(), regulators.data_ptr<double>(),
            (uint64*)h1_count.data_ptr<int64_t>(),
            (uint64*)histogram.data_ptr<int64_t>(),
            (uint64*)total_proc.data_ptr<int64_t>(),
            (uint64*)div3.data_ptr<int64_t>(),
            (uint64*)div5.data_ptr<int64_t>(),
            (uint64*)div7.data_ptr<int64_t>());
        cudaError_t err = cudaGetLastError();
        TORCH_CHECK(err == cudaSuccess,
                    "compute_class_numbers launch failed: ", cudaGetErrorString(err));
        err = cudaDeviceSynchronize();
        TORCH_CHECK(err == cudaSuccess,
                    "compute_class_numbers execution failed: ", cudaGetErrorString(err));
    }
    return {class_nums, regulators};
}
| #endif |
|
|