Add build variant torch211-cxx11-cu130-x86_64-linux for kernels compatibility
Browse files- .gitattributes +1 -0
- build/torch211-cxx11-cu130-x86_64-linux/__init__.py +18 -0
- build/torch211-cxx11-cu130-x86_64-linux/__pycache__/__init__.cpython-311.pyc +0 -0
- build/torch211-cxx11-cu130-x86_64-linux/_flint_hills_cuda.so +3 -0
- build/torch211-cxx11-cu130-x86_64-linux/flint_hills/__init__.py +15 -0
- build/torch211-cxx11-cu130-x86_64-linux/metadata.json +13 -0
- flint_hills/qd_real.h +254 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
build/torch211-cxx11-cu130-x86_64-linux/_flint_hills_cuda.so filter=lfs diff=lfs merge=lfs -text
|
build/torch211-cxx11-cu130-x86_64-linux/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib.util
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
def _load_extension():
    """Load the compiled ``_flint_hills_cuda`` extension located next to this file.

    The module is registered in ``sys.modules`` under the name
    ``_flint_hills_cuda`` before it is executed, so a later
    ``import _flint_hills_cuda`` resolves to the same module object.

    Returns:
        The initialised extension module.

    Raises:
        ImportError: If the shared object is missing or no loader can be
            created for it. Exceptions raised while the extension
            initialises propagate unchanged.
    """
    module_name = "_flint_hills_cuda"
    so_path = Path(__file__).parent / f"{module_name}.so"
    if not so_path.is_file():
        raise ImportError(f"Cannot find {so_path}")

    spec = importlib.util.spec_from_file_location(module_name, so_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot find {so_path}")

    mod = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind: a later import of
        # the same name would otherwise silently pick up the broken object.
        sys.modules.pop(module_name, None)
        raise
    return mod
|
| 14 |
+
|
| 15 |
+
# Load the compiled extension at import time. As a side effect,
# _load_extension() registers it in sys.modules as "_flint_hills_cuda".
_ext = _load_extension()
|
| 16 |
+
|
| 17 |
+
# Re-export all public symbols from the extension
|
| 18 |
+
# The bare module name resolves through the sys.modules entry that
# _load_extension() created, not through a search of sys.path.
from _flint_hills_cuda import *
|
build/torch211-cxx11-cu130-x86_64-linux/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1.14 kB). View file
|
|
|
build/torch211-cxx11-cu130-x86_64-linux/_flint_hills_cuda.so
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d44a06dd1689a8a7489eae018f150bbace26194fc793ec89e7ac337aeba3c98
|
| 3 |
+
size 648992
|
build/torch211-cxx11-cu130-x86_64-linux/flint_hills/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib.util, ctypes, sys
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
def _import_from_path(file_path: Path):
    """Import the file at *file_path* as a module with a path-derived name.

    The module name is the hex-encoded hash of the absolute path (folded
    into an unsigned ``size_t``), so the module is registered in
    ``sys.modules`` under a name that depends only on where the file lives.

    Args:
        file_path: Location of the Python file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for the file.
            Exceptions raised while the file executes propagate unchanged.
    """
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module can be found by name while
    # its own top-level code runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Drop the partially executed module so a failed import leaves no
        # stale entry in sys.modules.
        sys.modules.pop(module_name, None)
        raise
    return module
|
| 14 |
+
|
| 15 |
+
# Execute the __init__.py one directory above this package and copy that
# module's entire namespace (vars() includes its dunder attributes) into
# this module's globals.
globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
|
build/torch211-cxx11-cu130-x86_64-linux/metadata.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": 1,
|
| 3 |
+
"python-depends": [],
|
| 4 |
+
"backend": {
|
| 5 |
+
"type": "cuda",
|
| 6 |
+
"archs": [
|
| 7 |
+
"8.0",
|
| 8 |
+
"9.0",
|
| 9 |
+
"10.0",
|
| 10 |
+
"12.0"
|
| 11 |
+
]
|
| 12 |
+
}
|
| 13 |
+
}
|
flint_hills/qd_real.h
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef QD_REAL_H
|
| 2 |
+
#define QD_REAL_H
|
| 3 |
+
|
| 4 |
+
#include <math.h>
|
| 5 |
+
|
| 6 |
+
/* ================================================================
|
| 7 |
+
* Quad-double arithmetic for CUDA
|
| 8 |
+
*
|
| 9 |
+
* A qd_real is an unevaluated sum of 4 doubles: x = x[0]+x[1]+x[2]+x[3]
|
| 10 |
+
* with |x[1]| <= eps*|x[0]|, |x[2]| <= eps*|x[1]|, etc.
|
| 11 |
+
* This gives ~212 bits (~62 decimal digits) of precision.
|
| 12 |
+
*
|
| 13 |
+
* Based on: Hida, Li, Bailey (2001)
|
| 14 |
+
* "Library for Double-Double and Quad-Double Arithmetic"
|
| 15 |
+
* ================================================================ */
|
| 16 |
+
|
| 17 |
+
/* Quad-double value: four doubles whose unevaluated sum
 * x[0] + x[1] + x[2] + x[3] is the represented number. */
typedef struct { double x[4]; } qd_real;
|
| 18 |
+
|
| 19 |
+
/* ---- Two-Sum and Two-Prod primitives ---- */
|
| 20 |
+
|
| 21 |
+
/* Branch-free two-sum: stores the rounded sum a + b in *s and the
 * rounding error of that addition in *e, so that *s + *e reproduces
 * a + b (barring overflow). No ordering of |a| and |b| is required. */
__host__ __device__ inline void two_sum(double a, double b, double *s, double *e) {
    const double sum = a + b;
    const double b_seen = sum - a;      /* the part of b that made it into sum */
    const double a_seen = sum - b_seen; /* the part of a that made it into sum */
    *s = sum;
    *e = (a - a_seen) + (b - b_seen);
}
|
| 26 |
+
|
| 27 |
+
/* Two-product: stores the rounded product a * b in *p and, via a fused
 * multiply-add, the rounding error of that product in *e. */
__host__ __device__ inline void two_prod(double a, double b, double *p, double *e) {
    const double product = a * b;
    *p = product;
    /* fma evaluates a*b - product with a single rounding. */
    *e = fma(a, b, -product);
}
|
| 31 |
+
|
| 32 |
+
/* ---- Double-double addition: (a0+a1) + (b0+b1) = (s0+s1) ---- */
|
| 33 |
+
|
| 34 |
+
/* Double-double addition: (a0 + a1) + (b0 + b1) -> (*s0 + *s1).
 * The leading parts are added with two_sum; the trailing parts are added
 * with an ordinary rounded addition and folded into the error term before
 * a final two_sum splits the result into high and low parts. */
__host__ __device__ inline void dd_add(double a0, double a1, double b0, double b1,
                                       double *s0, double *s1) {
    double hi, lo, err;

    two_sum(a0, b0, &hi, &lo);
    lo = lo + (a1 + b1);
    two_sum(hi, lo, s0, &err);
    *s1 = err;
}
|
| 42 |
+
|
| 43 |
+
/* ---- qd_real constructors ---- */
|
| 44 |
+
|
| 45 |
+
/* Build a qd_real whose leading component is a and whose three trailing
 * components are zero. */
__host__ __device__ inline qd_real qd_from_double(double a) {
    qd_real out;
    out.x[0] = a;
    for (int i = 1; i < 4; ++i) {
        out.x[i] = 0.0;
    }
    return out;
}
|
| 48 |
+
|
| 49 |
+
/* Convert a 64-bit integer to a qd_real without losing low-order bits.
 *
 * A single (double)n cast rounds once |n| exceeds 2^53. Here n is split as
 * n = hi_part * 2^32 + lo_part (C integer division truncates toward zero),
 * so both pieces convert to double exactly; two_sum then yields the rounded
 * value in x[0] and the remainder in x[1]. For |n| <= 2^53 the result is
 * the same as qd_from_double((double)n).
 */
__host__ __device__ inline qd_real qd_from_int(long long n) {
    const long long two32 = 4294967296LL; /* 2^32 */
    const double hi = (double)(n / two32) * 4294967296.0;
    const double lo = (double)(n % two32);

    qd_real r = qd_from_double(0.0);
    two_sum(hi, lo, &r.x[0], &r.x[1]);
    return r;
}
|
| 52 |
+
|
| 53 |
+
/* ---- Renormalize: ensure non-overlapping property ---- */
|
| 54 |
+
|
| 55 |
+
/* Fold five terms c0..c4 into a four-component qd_real.
 *
 * Pass 1 accumulates from c4 up to c0 with two_sum; the top-level sum
 * becomes x[0] and each step's rounding error is kept. Pass 2 combines
 * those errors, again with two_sum, to produce x[1] and x[2]; x[3] is the
 * plain rounded sum of whatever remains.
 */
__host__ __device__ inline qd_real qd_renorm(double c0, double c1, double c2,
                                             double c3, double c4) {
    qd_real out;
    double sum34, sum2, sum1;
    double err0, err1, err2, err3;
    double mid, mid_err, carry1, carry2;

    /* Pass 1: bottom-up accumulation. */
    two_sum(c3, c4, &sum34, &err3);
    two_sum(c2, sum34, &sum2, &err2);
    two_sum(c1, sum2, &sum1, &err1);
    two_sum(c0, sum1, &out.x[0], &err0);

    /* Pass 2: redistribute the collected errors. */
    two_sum(err1, err2, &mid, &mid_err);
    two_sum(err0, mid, &out.x[1], &carry1);

    two_sum(carry1, mid_err, &out.x[2], &carry2);
    out.x[3] = carry2 + err3;

    return out;
}
|
| 73 |
+
|
| 74 |
+
/* ---- Addition ---- */
|
| 75 |
+
|
| 76 |
+
/* Quad-double addition.
 *
 * Components with the same index are added pairwise with two_sum and the
 * carry from the previous index is folded in with a second two_sum. The
 * five resulting terms are handed to qd_renorm.
 */
__host__ __device__ inline qd_real qd_add(qd_real a, qd_real b) {
    double limb[4];
    double carry;

    two_sum(a.x[0], b.x[0], &limb[0], &carry);

    for (int i = 1; i < 4; ++i) {
        double pair_sum, pair_err, fold_err;

        two_sum(a.x[i], b.x[i], &pair_sum, &pair_err);
        two_sum(carry, pair_sum, &limb[i], &fold_err);
        /* Plain rounded addition: its own rounding error is not tracked. */
        carry = fold_err + pair_err;
    }

    return qd_renorm(limb[0], limb[1], limb[2], limb[3], carry);
}
|
| 96 |
+
|
| 97 |
+
/* Negate a qd_real by flipping the sign of each of its four components. */
__host__ __device__ inline qd_real qd_neg(qd_real a) {
    qd_real out;
    for (int i = 0; i < 4; ++i) {
        out.x[i] = -a.x[i];
    }
    return out;
}
|
| 103 |
+
|
| 104 |
+
/* Quad-double subtraction, computed as a + (-b). */
__host__ __device__ inline qd_real qd_sub(qd_real a, qd_real b) {
    const qd_real minus_b = qd_neg(b);
    return qd_add(a, minus_b);
}
|
| 107 |
+
|
| 108 |
+
/* ---- Multiplication ---- */
|
| 109 |
+
|
| 110 |
+
/* Quad-double multiplication.
 *
 * Partial products a.x[i] * b.x[j] with i + j <= 2 are formed with two_prod
 * (product plus rounding error). The i + j == 3 products, and the rounding
 * errors of every partial product except a.x[0] * b.x[0], are added with
 * ordinary double arithmetic into the lowest term. The four terms are then
 * passed to qd_renorm.
 */
__host__ __device__ inline qd_real qd_mul(qd_real a, qd_real b) {
    double p00, e00;   /* a0 * b0 */
    double p01, e01;   /* a0 * b1 */
    double p10, e10;   /* a1 * b0 */
    double p02, e02;   /* a0 * b2 */
    double p11, e11;   /* a1 * b1 */
    double p20, e20;   /* a2 * b0 */

    two_prod(a.x[0], b.x[0], &p00, &e00);
    two_prod(a.x[0], b.x[1], &p01, &e01);
    two_prod(a.x[1], b.x[0], &p10, &e10);
    two_prod(a.x[0], b.x[2], &p02, &e02);
    two_prod(a.x[1], b.x[1], &p11, &e11);
    two_prod(a.x[2], b.x[0], &p20, &e20);

    /* Second term: e00 plus the two first-order cross products. */
    double cross_sum, cross_err;
    two_sum(p01, p10, &cross_sum, &cross_err);

    double limb1, limb1_err;
    two_sum(e00, cross_sum, &limb1, &limb1_err);

    double limb2 = limb1_err + cross_err;

    /* Third term: the three second-order products plus what is left above. */
    double part, part_err;
    two_sum(p02, p11, &part, &part_err);

    double order2_sum, order2_err;
    two_sum(part, p20, &order2_sum, &order2_err);

    double tail = part_err;
    tail += order2_err;

    double fold_err;
    two_sum(limb2, order2_sum, &limb2, &fold_err);
    tail += fold_err;

    /* Fourth term: everything that remains, in plain double arithmetic.
     * Kept as one expression so the evaluation order is unchanged. */
    double limb3 = tail + e01 + e10 + e02 + e11 + e20
                 + a.x[0]*b.x[3] + a.x[1]*b.x[2]
                 + a.x[2]*b.x[1] + a.x[3]*b.x[0];

    return qd_renorm(p00, limb1, limb2, limb3, 0.0);
}
|
| 142 |
+
|
| 143 |
+
/* ---- Division: a / b using long division ---- */
|
| 144 |
+
|
| 145 |
+
/* Quad-double division a / b by long division.
 *
 * Each quotient digit is the leading component of the running remainder
 * divided by b.x[0]; after each of the first three digits the remainder
 * is reduced by digit * b. The four digits are combined by qd_renorm.
 */
__host__ __device__ inline qd_real qd_div(qd_real a, qd_real b) {
    double digit[4];
    qd_real rem = a;

    for (int i = 0; i < 4; ++i) {
        digit[i] = rem.x[0] / b.x[0];
        if (i < 3) {
            rem = qd_sub(rem, qd_mul(qd_from_double(digit[i]), b));
        }
    }

    return qd_renorm(digit[0], digit[1], digit[2], digit[3], 0.0);
}
|
| 160 |
+
|
| 161 |
+
/* ---- Comparison ---- */
|
| 162 |
+
|
| 163 |
+
/* Return non-zero when a > b, comparing components lexicographically:
 * the first index at which the two values differ decides the result. */
__host__ __device__ inline int qd_gt(qd_real a, qd_real b) {
    for (int i = 0; i < 3; ++i) {
        if (a.x[i] != b.x[i]) {
            return a.x[i] > b.x[i];
        }
    }
    return a.x[3] > b.x[3];
}
|
| 169 |
+
|
| 170 |
+
/* Return non-zero when the leading component of a is negative. */
__host__ __device__ inline int qd_lt_zero(qd_real a) {
    const double lead = a.x[0];
    return lead < 0.0;
}
|
| 171 |
+
|
| 172 |
+
/* Convert to double by adding the two leading components. */
__host__ __device__ inline double qd_to_double(qd_real a) {
    const double hi = a.x[0];
    const double lo = a.x[1];
    return hi + lo;
}
|
| 173 |
+
|
| 174 |
+
/* ---- Absolute value ---- */
|
| 175 |
+
|
| 176 |
+
/* Absolute value: negate a when qd_lt_zero reports it as negative. */
__host__ __device__ inline qd_real qd_abs(qd_real a) {
    if (qd_lt_zero(a)) {
        return qd_neg(a);
    }
    return a;
}
|
| 179 |
+
|
| 180 |
+
/* ---- Constants ---- */
|
| 181 |
+
|
| 182 |
+
/* π to ~62 decimal digits as a quad-double.
|
| 183 |
+
* The four components below are the double-precision decomposition of:
|
| 184 |
+
* 3.14159265358979323846264338327950288419716939937510...
|
| 185 |
+
*/
|
| 186 |
+
/* Return pi as a qd_real, leading component first. */
__host__ __device__ inline qd_real qd_pi() {
    const qd_real pi_qd = {{
         3.141592653589793116e+00,
         1.224646799147353207e-16,
        -2.994769809718339666e-33,
         1.112454220863365282e-49
    }};
    return pi_qd;
}
|
| 194 |
+
|
| 195 |
+
/* 2π */
|
| 196 |
+
/* Return 2*pi as a qd_real, leading component first. */
__host__ __device__ inline qd_real qd_two_pi() {
    const qd_real two_pi_qd = {{
         6.283185307179586232e+00,
         2.449293598294706414e-16,
        -5.989539619436679332e-33,
         2.224908441726730563e-49
    }};
    return two_pi_qd;
}
|
| 204 |
+
|
| 205 |
+
/* ---- Multiply qd by integer ---- */
|
| 206 |
+
|
| 207 |
+
/* Multiply a qd_real by a 64-bit integer.
 *
 * The integer is converted through qd_from_int, the file's single
 * int -> qd_real conversion, rather than repeating the cast here, so both
 * paths always agree on how n is represented.
 */
__host__ __device__ inline qd_real qd_mul_int(qd_real a, long long n) {
    return qd_mul(a, qd_from_int(n));
}
|
| 210 |
+
|
| 211 |
+
/* ---- sin via argument reduction + Taylor series ---- */
|
| 212 |
+
|
| 213 |
+
/* sin(a) for a qd_real, by reduction modulo 2*pi followed by the Taylor
 * series sin(r) = r - r^3/3! + r^5/5! - ...
 *
 * The reduced argument r satisfies roughly |r| <= pi. At |r| = pi the 25th
 * series term is still about 1e-41, so the term cap is set to 40 to let the
 * series reach the 1e-60 relative cut-off over the whole reduced range.
 * Small |r| still exits early through the cut-off test.
 *
 * A non-finite input yields NaN components.
 */
__host__ __device__ inline qd_real qd_sin(qd_real a) {
    const int max_terms = 40;

    const qd_real two_pi = qd_two_pi();
    const qd_real pi = qd_pi();

    /* k = round(a / (2*pi)), estimated from the leading components only. */
    const double k_d = round(a.x[0] / two_pi.x[0]);

    /* r = a - k * 2*pi. k_d is already an integer-valued double, so it is
     * used directly; converting it to long long first would be undefined
     * for values outside the long long range (huge, infinite or NaN). */
    qd_real r = qd_sub(a, qd_mul(two_pi, qd_from_double(k_d)));

    /* One corrective step in each direction: if r > pi subtract 2*pi,
     * if r < -pi add 2*pi. */
    if (qd_gt(r, pi)) r = qd_sub(r, two_pi);
    if (qd_lt_zero(qd_add(r, pi))) r = qd_add(r, two_pi);

    /* Taylor series. Each term is the previous one multiplied by
     * -r^2 / ((2i)(2i + 1)). */
    const qd_real r2 = qd_mul(r, r);
    qd_real term = r;
    qd_real sum = r;

    for (int i = 1; i <= max_terms; i++) {
        const double denom = -(2.0*i) * (2.0*i + 1.0);
        term = qd_mul(term, r2);
        term = qd_div(term, qd_from_double(denom));
        sum = qd_add(sum, term);

        /* Stop once the newest term is negligible relative to the sum. */
        if (fabs(term.x[0]) < 1e-60 * fabs(sum.x[0])) break;
    }

    return sum;
}
|
| 253 |
+
|
| 254 |
+
#endif /* QD_REAL_H */
|