Add torch library bindings (guard main, add torch wrapper functions)

a51a509 verified 5 days ago

21.6 kB

	#ifdef TORCH_EXTENSION_NAME
	#include <torch/torch.h>
	#endif

	/*
	* Class Numbers of Real Quadratic Fields — v2 Multi-GPU
	*
	* Computes h(d) for all fundamental discriminants d in [D_lo, D_hi]
	* using: h(d) = round(sqrt(d) * L(1, chi_d) / (2 * R(d)))
	*
	* Key improvements over v1:
	* - Integer-only CF for regulator (no FP64 overflow)
	* - Euler product with 9592 primes to 10^5 (was 1229 to 10^4)
	* - CPU segmented sieve for fundamental discriminants
	* - Multi-GPU via pthreads (one thread per GPU)
	* - Incremental log accumulation for regulator
	* - Cohen-Lenstra statistics collection
	*
	* Compile: nvcc -O3 -arch=sm_100a -o class_v2 \
	* scripts/experiments/class-numbers/class_numbers_v2.cu -lpthread -lm
	*
	* Run: ./class_v2 <start> <end>
	* e.g. ./class_v2 5 1000000000 (validate against known tables)
	* ./class_v2 100000000000 10000000000000 (new computation)
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <math.h>
	#include <string.h>
	#include <time.h>
	#ifndef TORCH_EXTENSION_NAME
	#include <pthread.h>
	#endif

	typedef unsigned long long uint64;
	typedef long long int64;

	#define BLOCK_SIZE 256
	#define MAX_CF_STEPS 2000000 // cap for CF period (covers 99.9% of d < 10^13)
	#define CHUNK_SIZE 10000000 // 10M raw d per chunk

	// =====================================================
	// Primes in constant memory (up to 100003 = 9592 primes)
	// =====================================================
	#define NUM_PRIMES 9592
	__constant__ int d_primes[NUM_PRIMES];

	// =====================================================
	// Kronecker symbol (d/p) — modular exponentiation
	// =====================================================
	__device__ int kronecker(int64 d, int p) {
	if (p == 2) {
	int dm8 = ((int)(d % 8) + 8) % 8;
	if (dm8 == 1 \|\| dm8 == 7) return 1;
	if (dm8 == 3 \|\| dm8 == 5) return -1;
	return 0;
	}
	// Euler's criterion: d^((p-1)/2) mod p
	int64 a = ((d % p) + p) % p;
	if (a == 0) return 0;
	int64 result = 1;
	int64 exp = (p - 1) / 2;
	int64 base = a;
	while (exp > 0) {
	if (exp & 1) result = (result * base) % p;
	base = (base * base) % p;
	exp >>= 1;
	}
	return (result == 1) ? 1 : -1;
	}

	// =====================================================
	// Combined kernel: regulator + L-function + class number
	// =====================================================
	__global__ void compute_class_numbers(
	uint64 *discriminants, // fundamental discriminants
	uint32_t count,
	int *class_numbers_out,
	double *regulators_out, // optional: NULL to skip output
	// Statistics (atomics)
	uint64 *h1_count, // count of h(d) = 1
	uint64 *h_histogram, // h_histogram[h] for h < 1024
	uint64 *total_processed,
	uint64 *div3_count, // count of 3 \| h(d)
	uint64 *div5_count,
	uint64 *div7_count)
	{
	uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= count) return;

	uint64 d = discriminants[idx];
	if (d < 5) return;

	// ===== PHASE 1: Regulator (validated: matches PARI/GP on 1000 discriminants) =====
	// For d ≡ 0 mod 4 (d=4m): CF of √m, stop at first D==1
	// For d ≡ 1 mod 4: CF of (1+√d)/2, stop when P=1,Q=2

	double regulator = 0.0;
	double log_P_prev, log_P_curr, log_Q_prev, log_Q_curr;

	if (d % 4 == 0) {
	// d = 4m: CF of √m
	uint64 m_val = d / 4;
	uint64 a0 = (uint64)sqrt((double)m_val);
	while (a0 * a0 > m_val) a0--;
	while ((a0+1)*(a0+1) <= m_val) a0++;
	if (a0 * a0 == m_val) return;

	int64 mm = 0, D = 1, a = (int64)a0;
	log_P_prev = 0.0;
	log_P_curr = log((double)a0);
	log_Q_prev = -1e30;
	log_Q_curr = 0.0;

	for (int step = 0; step < MAX_CF_STEPS; step++) {
	mm = D * a - mm;
	D = ((int64)m_val - mm * mm) / D;
	if (D == 0) break;
	a = ((int64)a0 + mm) / D;

	// Check D==1 BEFORE updating convergents (critical!)
	if (D == 1) {
	double diff = log_Q_curr + 0.5 * log((double)m_val) - log_P_curr;
	regulator = log_P_curr + log(1.0 + exp(diff));
	break;
	}

	// Update log convergents
	double rp = exp(log_P_prev - log_P_curr);
	log_P_prev = log_P_curr;
	log_P_curr = log_P_curr + log((double)a + rp);
	double rq = (log_Q_prev > -1e20) ? exp(log_Q_prev - log_Q_curr) : 0.0;
	log_Q_prev = log_Q_curr;
	log_Q_curr = log_Q_curr + log((double)a + rq);
	}
	} else {
	// d ≡ 1 mod 4: CF of (1+√d)/2 with reduced-state cycle detection
	uint64 isqrt_d = (uint64)sqrt((double)d);
	while (isqrt_d * isqrt_d > d) isqrt_d--;
	while ((isqrt_d+1)*(isqrt_d+1) <= d) isqrt_d++;

	int64 P = 1, Q = 2;
	int64 a = (P + (int64)isqrt_d) / Q;
	log_P_prev = 0.0;
	log_P_curr = log((double)(a > 0 ? a : 1));
	log_Q_prev = -1e30;
	log_Q_curr = 0.0;

	// Cycle detection via reduced states
	int64 first_P = -1, first_Q = -1;
	double log_eps0 = 0.0;

	for (int step = 0; step < MAX_CF_STEPS; step++) {
	int64 P_new = a * Q - P;
	int64 Q_new = ((int64)d - P_new * P_new) / Q;
	if (Q_new == 0) break;
	int64 a_new = (P_new + (int64)isqrt_d) / Q_new;
	P = P_new; Q = Q_new; a = a_new;

	// Update log convergents
	double rp = exp(log_P_prev - log_P_curr);
	log_P_prev = log_P_curr;
	log_P_curr = log_P_curr + log((double)a + rp);
	double rq = (log_Q_prev > -1e20) ? exp(log_Q_prev - log_Q_curr) : 0.0;
	log_Q_prev = log_Q_curr;
	log_Q_curr = log_Q_curr + log((double)a + rq);

	// Check if reduced: 0 < P <= isqrt_d, P > isqrt_d - Q, Q > 0
	int is_reduced = (Q > 0 && P > 0 && P <= (int64)isqrt_d && P > (int64)isqrt_d - Q);
	if (!is_reduced) continue;

	// Compute log(ε) = log((2p - q + q√d) / 2)
	double ratio_qp = exp(log_Q_curr - log_P_curr);
	double log_2pmq = log_P_curr + log(2.0 - ratio_qp);
	double diff = log_Q_curr + 0.5 * log((double)d) - log_2pmq;
	double log_eps = log_2pmq + log(1.0 + exp(diff)) - log(2.0);

	if (first_P < 0) {
	// First reduced state: save it
	first_P = P; first_Q = Q;
	log_eps0 = log_eps;
	} else if (P == first_P && Q == first_Q) {
	// Cycle detected! R = log(ε_now) - log(ε_first)
	regulator = log_eps - log_eps0;
	break;
	}
	}
	}

	if (regulator < 0.01) regulator = 0.01;

	// ===== PHASE 2: L(1, chi_d) via Euler product =====
	double L1 = 1.0;
	for (int i = 0; i < NUM_PRIMES; i++) {
	int p = d_primes[i];
	int chi = kronecker((int64)d, p);
	if (chi != 0) {
	L1 *= 1.0 / (1.0 - (double)chi / p);
	}
	// If chi = 0, the factor is 1/(1-0) = 1, no change
	}

	// ===== PHASE 3: Assemble class number =====
	double h_approx = sqrt((double)d) * L1 / (2.0 * regulator);
	int h = (int)round(h_approx);
	if (h < 1) h = 1;

	class_numbers_out[idx] = h;
	if (regulators_out) regulators_out[idx] = regulator;

	// ===== PHASE 4: Statistics =====
	atomicAdd(total_processed, 1ULL);
	if (h == 1) atomicAdd(h1_count, 1ULL);
	if (h < 1024) atomicAdd(&h_histogram[h], 1ULL);
	if (h % 3 == 0) atomicAdd(div3_count, 1ULL);
	if (h % 5 == 0) atomicAdd(div5_count, 1ULL);
	if (h % 7 == 0) atomicAdd(div7_count, 1ULL);
	}

	// =====================================================
	// GPU: Squarefree sieve + fundamental discriminant extraction
	// =====================================================
	__global__ void gpu_sieve_squarefree(
	uint8_t *sieve, uint64 lo, uint64 len,
	const int *primes, int num_primes)
	{
	uint64 pos = (uint64)blockIdx.x * blockDim.x + threadIdx.x;
	if (pos >= len) return;
	uint64 d = lo + pos;
	for (int i = 0; i < num_primes; i++) {
	int p = primes[i];
	uint64 p2 = (uint64)p * p;
	if (p2 > d) break;
	if (d % p2 == 0) { sieve[pos] = 0; return; }
	}
	}

	__global__ void gpu_extract_fundamental(
	const uint8_t *sieve, uint64 lo, uint64 len,
	uint64 output, uint32_t count, uint32_t max_out)
	{
	uint64 pos = (uint64)blockIdx.x * blockDim.x + threadIdx.x;
	if (pos >= len) return;
	uint64 d = lo + pos;
	if (d < 5) return;
	int is_fund = 0;
	if (d % 4 == 1 && sieve[pos]) {
	is_fund = 1;
	} else if (d % 4 == 0) {
	uint64 m = d / 4;
	if ((m % 4 == 2 \|\| m % 4 == 3)) {
	if (m >= lo && m < lo + len && sieve[m - lo]) is_fund = 1;
	else if (m < lo) {
	// Trial division for m outside sieve range
	int sqf = 1;
	for (uint64 p = 2; p * p <= m && sqf; p++)
	if (m % (p*p) == 0) sqf = 0;
	if (sqf) is_fund = 1;
	}
	}
	}
	if (is_fund) {
	uint32_t idx = atomicAdd(count, 1);
	if (idx < max_out) output[idx] = d;
	}
	}

	// =====================================================
	// Generate prime table
	// =====================================================
	#ifndef TORCH_EXTENSION_NAME

	int generate_primes(int *primes, int max_prime) {
	char sieve = (char)calloc(max_prime + 1, 1);
	memset(sieve, 1, max_prime + 1);
	sieve[0] = sieve[1] = 0;
	for (int i = 2; i * i <= max_prime; i++)
	if (sieve[i])
	for (int j = i*i; j <= max_prime; j += i)
	sieve[j] = 0;
	int count = 0;
	for (int i = 2; i <= max_prime && count < NUM_PRIMES; i++)
	if (sieve[i]) primes[count++] = i;
	free(sieve);
	return count;
	}

	// =====================================================
	// GPU worker thread
	// =====================================================
	typedef struct {
	int gpu_id;
	uint64 d_start, d_end;
	char output_path[256]; // binary output file path
	// Results
	uint64 total_processed;
	uint64 h1_count;
	uint64 div3, div5, div7;
	uint64 h_hist[1024];
	} GPUWork;

	void gpu_worker(void arg) {
	GPUWork work = (GPUWork)arg;
	cudaSetDevice(work->gpu_id);

	// Allocate GPU buffers
	uint64 *d_discriminants;
	int *d_class_numbers;
	uint64 d_h1, d_total, d_div3, d_div5, d_div7, d_hist;

	uint32_t max_per_chunk = CHUNK_SIZE; // max fundamental discriminants per chunk
	cudaMalloc(&d_discriminants, max_per_chunk * sizeof(uint64));
	cudaMalloc(&d_class_numbers, max_per_chunk * sizeof(int));
	cudaMalloc(&d_h1, sizeof(uint64));
	cudaMalloc(&d_total, sizeof(uint64));
	cudaMalloc(&d_div3, sizeof(uint64));
	cudaMalloc(&d_div5, sizeof(uint64));
	cudaMalloc(&d_div7, sizeof(uint64));
	cudaMalloc(&d_hist, 1024 * sizeof(uint64));

	cudaMemset(d_h1, 0, sizeof(uint64));
	cudaMemset(d_total, 0, sizeof(uint64));
	cudaMemset(d_div3, 0, sizeof(uint64));
	cudaMemset(d_div5, 0, sizeof(uint64));
	cudaMemset(d_div7, 0, sizeof(uint64));
	cudaMemset(d_hist, 0, 1024 * sizeof(uint64));

	// GPU sieve buffers
	uint64 chunk_raw = CHUNK_SIZE * 3;
	uint8_t *d_sieve;
	uint32_t *d_sieve_count;
	int *d_sieve_primes;
	cudaMalloc(&d_sieve, chunk_raw);
	cudaMalloc(&d_sieve_count, sizeof(uint32_t));

	// Generate sieve primes on CPU (up to sqrt of max d)
	uint64 sqrt_max = (uint64)sqrt((double)work->d_end) + 2;
	int h_sieve_primes = (int)malloc(sqrt_max * sizeof(int));
	int n_sieve_primes = 0;
	{
	char isp = (char)calloc(sqrt_max + 1, 1);
	for (uint64 i = 2; i <= sqrt_max; i++) isp[i] = 1;
	for (uint64 i = 2; i * i <= sqrt_max; i++)
	if (isp[i]) for (uint64 j = i*i; j <= sqrt_max; j += i) isp[j] = 0;
	for (uint64 i = 2; i <= sqrt_max; i++)
	if (isp[i]) h_sieve_primes[n_sieve_primes++] = (int)i;
	free(isp);
	}
	cudaMalloc(&d_sieve_primes, n_sieve_primes * sizeof(int));
	cudaMemcpy(d_sieve_primes, h_sieve_primes, n_sieve_primes * sizeof(int), cudaMemcpyHostToDevice);
	free(h_sieve_primes);

	uint64 chunks_done = 0;

	for (uint64 d_lo = work->d_start; d_lo < work->d_end; d_lo += chunk_raw) {
	uint64 d_hi = d_lo + chunk_raw;
	if (d_hi > work->d_end) d_hi = work->d_end;
	uint64 len = d_hi - d_lo;

	// GPU Sieve: squarefree + fundamental discriminant extraction
	cudaMemset(d_sieve, 1, len);
	cudaMemset(d_sieve_count, 0, sizeof(uint32_t));
	uint64 sieve_blocks = (len + BLOCK_SIZE - 1) / BLOCK_SIZE;
	gpu_sieve_squarefree<<<sieve_blocks, BLOCK_SIZE>>>(
	d_sieve, d_lo, len, d_sieve_primes, n_sieve_primes);
	gpu_extract_fundamental<<<sieve_blocks, BLOCK_SIZE>>>(
	d_sieve, d_lo, len, d_discriminants, d_sieve_count, max_per_chunk);
	uint32_t count;
	cudaMemcpy(&count, d_sieve_count, sizeof(uint32_t), cudaMemcpyDeviceToHost);
	if (count == 0) continue;
	if (count > max_per_chunk) count = max_per_chunk;

	// Launch kernel
	int blocks = (count + BLOCK_SIZE - 1) / BLOCK_SIZE;
	compute_class_numbers<<<blocks, BLOCK_SIZE>>>(
	d_discriminants, count, d_class_numbers, NULL,
	d_h1, d_hist, d_total, d_div3, d_div5, d_div7);
	cudaDeviceSynchronize();

	// Write raw (d, h) pairs to binary file
	if (work->output_path[0]) {
	uint64 h_disc = (uint64)malloc(count * sizeof(uint64));
	int h_cls = (int)malloc(count * sizeof(int));
	cudaMemcpy(h_disc, d_discriminants, count * sizeof(uint64), cudaMemcpyDeviceToHost);
	cudaMemcpy(h_cls, d_class_numbers, count * sizeof(int), cudaMemcpyDeviceToHost);

	FILE *fout = fopen(work->output_path, "ab"); // append binary
	if (fout) {
	for (uint32_t i = 0; i < count; i++) {
	if (h_cls[i] > 0) { // skip invalid
	fwrite(&h_disc[i], sizeof(uint64), 1, fout);
	fwrite(&h_cls[i], sizeof(int), 1, fout);
	}
	}
	fclose(fout);
	}
	free(h_disc); free(h_cls);
	}

	chunks_done++;
	if (chunks_done % 20 == 0) {
	uint64 total;
	cudaMemcpy(&total, d_total, sizeof(uint64), cudaMemcpyDeviceToHost);
	double pct = 100.0 * (d_lo - work->d_start) / (double)(work->d_end - work->d_start);
	printf("[GPU %d] %.1f%% \| %llu discriminants \| d ~ %.2e\n",
	work->gpu_id, pct, total, (double)d_lo);
	fflush(stdout);
	}
	}

	// Collect results
	cudaDeviceSynchronize();
	cudaMemcpy(&work->total_processed, d_total, sizeof(uint64), cudaMemcpyDeviceToHost);
	cudaMemcpy(&work->h1_count, d_h1, sizeof(uint64), cudaMemcpyDeviceToHost);
	cudaMemcpy(&work->div3, d_div3, sizeof(uint64), cudaMemcpyDeviceToHost);
	cudaMemcpy(&work->div5, d_div5, sizeof(uint64), cudaMemcpyDeviceToHost);
	cudaMemcpy(&work->div7, d_div7, sizeof(uint64), cudaMemcpyDeviceToHost);
	cudaMemcpy(work->h_hist, d_hist, 1024 * sizeof(uint64), cudaMemcpyDeviceToHost);

	cudaFree(d_discriminants); cudaFree(d_class_numbers);
	cudaFree(d_h1); cudaFree(d_total); cudaFree(d_div3); cudaFree(d_div5); cudaFree(d_div7);
	cudaFree(d_hist);
	cudaFree(d_sieve); cudaFree(d_sieve_count); cudaFree(d_sieve_primes);

	printf("[GPU %d] done: %llu discriminants\n", work->gpu_id, work->total_processed);
	return NULL;
	}

	// =====================================================
	// Main
	// =====================================================
	int main(int argc, char **argv) {
	uint64 D_start = argc > 1 ? strtoull(argv[1], NULL, 10) : 5;
	uint64 D_end = argc > 2 ? strtoull(argv[2], NULL, 10) : 1000000;

	printf("========================================\n");
	printf("Class Numbers of Real Quadratic Fields v2\n");
	printf("Range: [%llu, %llu)\n", D_start, D_end);
	printf("========================================\n\n");

	// Generate primes
	int h_primes[NUM_PRIMES];
	int nprimes = generate_primes(h_primes, 100003);
	printf("Primes: %d (up to %d)\n", nprimes, h_primes[nprimes-1]);

	int num_gpus;
	cudaGetDeviceCount(&num_gpus);
	printf("GPUs: %d\n\n", num_gpus);

	// Upload primes to all GPUs
	for (int g = 0; g < num_gpus; g++) {
	cudaSetDevice(g);
	cudaMemcpyToSymbol(d_primes, h_primes, nprimes * sizeof(int));
	}

	struct timespec t0, t1;
	clock_gettime(CLOCK_MONOTONIC, &t0);

	// Launch workers
	uint64 range = D_end - D_start;
	uint64 per_gpu = (range + num_gpus - 1) / num_gpus;

	pthread_t threads[8];
	GPUWork works[8];
	for (int g = 0; g < num_gpus; g++) {
	works[g].gpu_id = g;
	works[g].d_start = D_start + g * per_gpu;
	works[g].d_end = D_start + (g + 1) * per_gpu;
	if (works[g].d_end > D_end) works[g].d_end = D_end;
	memset(works[g].h_hist, 0, sizeof(works[g].h_hist));
	snprintf(works[g].output_path, 256,
	"/home/amsysistestdrive2026/idontknow/data/class-numbers/raw_gpu%d_%llu_%llu.bin",
	g, works[g].d_start, works[g].d_end);
	pthread_create(&threads[g], NULL, gpu_worker, &works[g]);
	}

	// Collect
	uint64 grand_total = 0, grand_h1 = 0;
	uint64 grand_div3 = 0, grand_div5 = 0, grand_div7 = 0;
	uint64 grand_hist[1024] = {0};

	for (int g = 0; g < num_gpus; g++) {
	pthread_join(threads[g], NULL);
	grand_total += works[g].total_processed;
	grand_h1 += works[g].h1_count;
	grand_div3 += works[g].div3;
	grand_div5 += works[g].div5;
	grand_div7 += works[g].div7;
	for (int h = 0; h < 1024; h++)
	grand_hist[h] += works[g].h_hist[h];
	}

	clock_gettime(CLOCK_MONOTONIC, &t1);
	double elapsed = (t1.tv_sec-t0.tv_sec) + (t1.tv_nsec-t0.tv_nsec)/1e9;

	printf("\n========================================\n");
	printf("RESULTS\n");
	printf("========================================\n");
	printf("Range: [%llu, %llu)\n", D_start, D_end);
	printf("Fundamental discriminants: %llu\n", grand_total);
	printf("Time: %.1fs (%.0f disc/sec)\n", elapsed, grand_total / elapsed);
	printf("\nCohen-Lenstra statistics:\n");
	printf(" h(d) = 1: %llu (%.4f%%)\n", grand_h1, 100.0 * grand_h1 / grand_total);
	printf(" C-L predicted h=1: ~75.446%%\n");
	printf(" 3 \| h(d): %llu (%.4f%%)\n", grand_div3, 100.0 * grand_div3 / grand_total);
	printf(" 5 \| h(d): %llu (%.4f%%)\n", grand_div5, 100.0 * grand_div5 / grand_total);
	printf(" 7 \| h(d): %llu (%.4f%%)\n", grand_div7, 100.0 * grand_div7 / grand_total);

	printf("\nClass number distribution (first 20):\n");
	for (int h = 1; h <= 20; h++)
	printf(" h=%2d: %llu (%.3f%%)\n", h, grand_hist[h], 100.0 * grand_hist[h] / grand_total);

	printf("\n========================================\n");
	return 0;
	}
	#endif


	#ifdef TORCH_EXTENSION_NAME
	static void setup_primes_for_torch() {
	static bool initialized = false;
	if (initialized) return;
	int h_primes[NUM_PRIMES];
	int count = 0;
	bool sieve[100004];
	memset(sieve, true, sizeof(sieve));
	for (int i = 2; i * i <= 100003; i++)
	if (sieve[i]) for (int j = i*i; j <= 100003; j += i) sieve[j] = false;
	for (int i = 2; i <= 100003 && count < NUM_PRIMES; i++)
	if (sieve[i]) h_primes[count++] = i;
	cudaMemcpyToSymbol(d_primes, h_primes, count * sizeof(int));
	initialized = true;
	}

	std::vector<torch::Tensor> compute_class_numbers_torch(torch::Tensor discriminants) {
	TORCH_CHECK(discriminants.is_cuda(), "discriminants must be CUDA tensor");
	TORCH_CHECK(discriminants.dtype() == torch::kInt64, "discriminants must be int64");
	TORCH_CHECK(discriminants.dim() == 1, "discriminants must be 1-D");

	setup_primes_for_torch();

	int n = discriminants.size(0);
	auto class_nums = torch::zeros({n}, torch::dtype(torch::kInt32).device(discriminants.device()));
	auto regulators = torch::zeros({n}, torch::dtype(torch::kFloat64).device(discriminants.device()));
	auto h1_count = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
	auto histogram = torch::zeros({1024}, torch::dtype(torch::kInt64).device(discriminants.device()));
	auto total_proc = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
	auto div3 = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
	auto div5 = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));
	auto div7 = torch::zeros({1}, torch::dtype(torch::kInt64).device(discriminants.device()));

	int blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
	compute_class_numbers<<<blocks, BLOCK_SIZE>>>(
	(uint64*)discriminants.data_ptr<int64_t>(), (uint32_t)n,
	class_nums.data_ptr<int>(), regulators.data_ptr<double>(),
	(uint64*)h1_count.data_ptr<int64_t>(),
	(uint64*)histogram.data_ptr<int64_t>(),
	(uint64*)total_proc.data_ptr<int64_t>(),
	(uint64*)div3.data_ptr<int64_t>(),
	(uint64*)div5.data_ptr<int64_t>(),
	(uint64*)div7.data_ptr<int64_t>());
	cudaDeviceSynchronize();
	return {class_nums, regulators};
	}
	#endif