cahlen committed on
Commit
2807b6c
·
verified ·
1 Parent(s): 42ef9b5

Add build variant torch211-cxx11-cu130-x86_64-linux for kernels compatibility

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ build/torch211-cxx11-cu130-x86_64-linux/_flint_hills_cuda.so filter=lfs diff=lfs merge=lfs -text
build/torch211-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util
2
+ import sys
3
+ from pathlib import Path
4
+
5
def _load_extension():
    """Load the compiled ``_flint_hills_cuda`` extension located next to this file.

    The module is registered in ``sys.modules`` under the name
    ``_flint_hills_cuda`` so that a later ``import _flint_hills_cuda`` (or a
    star-import from it) resolves to the same object.

    Returns:
        The loaded extension module.

    Raises:
        ImportError: if the shared object is missing or no loader can be
            built for it. Errors raised while the extension initialises are
            propagated unchanged.
    """
    module_name = "_flint_hills_cuda"
    so_path = Path(__file__).parent / f"{module_name}.so"

    # spec_from_file_location() does not check that the file exists, so test
    # explicitly to produce the intended, readable error message.
    if not so_path.is_file():
        raise ImportError(f"Cannot find {so_path}")

    spec = importlib.util.spec_from_file_location(module_name, so_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot find {so_path}")

    mod = importlib.util.module_from_spec(spec)

    # Mirror the regular import machinery: register before executing, and
    # roll the registration back if execution fails so that no broken module
    # stays importable.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return mod
14
+
15
# Load the native module once, at import time, and keep a private handle to it.
_ext = _load_extension()

# Re-export the extension's public symbols from this module. The selection
# follows star-import rules: use ``__all__`` when the extension defines it,
# otherwise every attribute whose name does not start with an underscore.
_exported = getattr(_ext, "__all__", None)
if _exported is None:
    _exported = [_name for _name in vars(_ext) if not _name.startswith("_")]
globals().update({_name: getattr(_ext, _name) for _name in _exported})
del _exported
build/torch211-cxx11-cu130-x86_64-linux/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.14 kB). View file
 
build/torch211-cxx11-cu130-x86_64-linux/_flint_hills_cuda.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d44a06dd1689a8a7489eae018f150bbace26194fc793ec89e7ac337aeba3c98
3
+ size 648992
build/torch211-cxx11-cu130-x86_64-linux/flint_hills/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util, ctypes, sys
2
+ from pathlib import Path
3
+
4
def _import_from_path(file_path):
    """Import the Python file at *file_path* as a module named after its path hash.

    The module name is the hexadecimal hash of the absolute path, so distinct
    files get distinct ``sys.modules`` entries.

    Args:
        file_path: ``pathlib.Path`` of the source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: if no import spec or loader can be created for the file.
            Exceptions raised while the file executes are propagated
            unchanged.
    """
    # hash() may be negative; c_size_t reinterprets it as an unsigned value so
    # the hex string never contains a minus sign.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")

    module = importlib.util.module_from_spec(spec)

    # Register before executing (as the import system does), but undo the
    # registration on failure so a half-initialised module is never left
    # behind in sys.modules.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
14
+
15
# Execute the build variant's top-level __init__.py (one directory up) and
# copy its entire module namespace into this package.
_variant_init = Path(__file__).parent.parent / "__init__.py"
_variant_module = _import_from_path(_variant_init)
globals().update(_variant_module.__dict__)
del _variant_init, _variant_module
build/torch211-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "python-depends": [],
4
+ "backend": {
5
+ "type": "cuda",
6
+ "archs": [
7
+ "8.0",
8
+ "9.0",
9
+ "10.0",
10
+ "12.0"
11
+ ]
12
+ }
13
+ }
flint_hills/qd_real.h ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef QD_REAL_H
2
+ #define QD_REAL_H
3
+
4
+ #include <math.h>
5
+
6
+ /* ================================================================
7
+ * Quad-double arithmetic for CUDA
8
+ *
9
+ * A qd_real is an unevaluated sum of 4 doubles: x = x[0]+x[1]+x[2]+x[3]
10
+ * with |x[1]| <= eps*|x[0]|, |x[2]| <= eps*|x[1]|, etc.
11
+ * This gives ~212 bits (~62 decimal digits) of precision.
12
+ *
13
+ * Based on: Hida, Li, Bailey (2001)
14
+ * "Library for Double-Double and Quad-Double Arithmetic"
15
+ * ================================================================ */
16
+
17
/* Quad-double value: the unevaluated sum x[0] + x[1] + x[2] + x[3]. */
typedef struct {
    double x[4];
} qd_real;
18
+
19
+ /* ---- Two-Sum and Two-Prod primitives ---- */
20
+
21
/* Error-free addition: writes the rounded sum a + b to *s and the rounding
 * error of that sum to *e. */
__host__ __device__ inline void two_sum(double a, double b, double *s, double *e) {
    const double total = a + b;
    *s = total;
    /* The part of b that actually made it into the rounded total. */
    const double b_virtual = total - a;
    *e = (a - (total - b_virtual)) + (b - b_virtual);
}
26
+
27
/* Error-free multiplication: writes the rounded product a * b to *p and, via
 * a fused multiply-add, the residual a * b - *p to *e. */
__host__ __device__ inline void two_prod(double a, double b, double *p, double *e) {
    const double product = a * b;
    *p = product;
    *e = fma(a, b, -product);
}
31
+
32
+ /* ---- Double-double addition: (a0+a1) + (b0+b1) = (s0+s1) ---- */
33
+
34
/* Double-double addition: (a0 + a1) + (b0 + b1) -> (*s0 + *s1).
 * The low words are added in plain double arithmetic and folded into the
 * error of the high-word sum before the final two_sum. */
__host__ __device__ inline void dd_add(double a0, double a1, double b0, double b1,
                                       double *s0, double *s1) {
    double hi, lo;

    two_sum(a0, b0, &hi, &lo);
    lo += a1 + b1;
    two_sum(hi, lo, s0, s1);
}
42
+
43
+ /* ---- qd_real constructors ---- */
44
+
45
/* Build a qd_real whose leading limb is a and whose other limbs are zero. */
__host__ __device__ inline qd_real qd_from_double(double a) {
    qd_real out;
    out.x[0] = a;
    for (int i = 1; i < 4; i++) {
        out.x[i] = 0.0;
    }
    return out;
}
48
+
49
/* Convert a 64-bit integer to a qd_real without loss.
 *
 * A single (double) cast rounds any |n| > 2^53. Instead n is split into a
 * multiple of 2^32 and a remainder; each part converts to double exactly,
 * and two_sum turns the pair into a leading limb plus its exact error.
 * For |n| <= 2^53 the result is the same as the plain cast (low limbs 0). */
__host__ __device__ inline qd_real qd_from_int(long long n) {
    const long long split = 4294967296LL;                 /* 2^32 */
    /* Quotient and remainder share n's sign (truncating division), and
     * neither operation can overflow, even for LLONG_MIN. */
    const double hi = (double)(n / split) * 4294967296.0; /* exact */
    const double lo = (double)(n % split);                /* exact */

    qd_real r;
    two_sum(hi, lo, &r.x[0], &r.x[1]);
    r.x[2] = 0.0;
    r.x[3] = 0.0;
    return r;
}
52
+
53
+ /* ---- Renormalize: ensure non-overlapping property ---- */
54
+
55
/* Fold five doubles (c0 .. c4) into a four-limb qd_real using a fixed
 * sequence of two_sum passes; intended to restore the non-overlapping
 * ordering of the limbs. */
__host__ __device__ inline qd_real qd_renorm(double c0, double c1, double c2,
                                             double c3, double c4) {
    double acc, err0, err1, err2, err3;
    qd_real out;

    /* Pass 1: sum from the last input upwards; the total becomes limb 0. */
    two_sum(c3, c4, &acc, &err3);
    two_sum(c2, acc, &acc, &err2);
    two_sum(c1, acc, &acc, &err1);
    two_sum(c0, acc, &out.x[0], &err0);

    /* Pass 2: combine the pass-1 errors into limb 1. */
    two_sum(err1, err2, &acc, &err1);
    two_sum(err0, acc, &out.x[1], &err0);

    /* Pass 3: what is left forms limb 2; limb 3 is a plain double sum. */
    two_sum(err0, err1, &out.x[2], &err0);
    out.x[3] = err0 + err3;

    return out;
}
73
+
74
+ /* ---- Addition ---- */
75
+
76
/* Quad-double addition: limbs of equal index are added with two_sum, the
 * rounding error of each level is carried into the next one, and the five
 * resulting partial sums are renormalized. */
__host__ __device__ inline qd_real qd_add(qd_real a, qd_real b) {
    double c0, c1, c2, c3, c4;
    double pair_sum, pair_err;   /* a.x[i] + b.x[i] and its error        */
    double carry, carry_err;     /* error carried down from the level above */

    /* Level 0 */
    two_sum(a.x[0], b.x[0], &c0, &carry);

    /* Level 1 */
    two_sum(a.x[1], b.x[1], &pair_sum, &pair_err);
    two_sum(carry, pair_sum, &c1, &carry_err);
    carry = carry_err + pair_err;

    /* Level 2 */
    two_sum(a.x[2], b.x[2], &pair_sum, &pair_err);
    two_sum(carry, pair_sum, &c2, &carry_err);
    carry = carry_err + pair_err;

    /* Level 3; the leftover error becomes the fifth term. */
    two_sum(a.x[3], b.x[3], &pair_sum, &pair_err);
    two_sum(carry, pair_sum, &c3, &carry_err);
    c4 = carry_err + pair_err;

    return qd_renorm(c0, c1, c2, c3, c4);
}
96
+
97
/* Negation: flips the sign of every limb. */
__host__ __device__ inline qd_real qd_neg(qd_real a) {
    qd_real out;
    for (int i = 0; i < 4; i++) {
        out.x[i] = -a.x[i];
    }
    return out;
}
103
+
104
/* Subtraction, implemented as a + (-b). */
__host__ __device__ inline qd_real qd_sub(qd_real a, qd_real b) {
    qd_real minus_b = qd_neg(b);
    return qd_add(a, minus_b);
}
107
+
108
+ /* ---- Multiplication ---- */
109
+
110
/* Quad-double multiplication.
 *
 * The six limb products a.x[i] * b.x[j] with i + j <= 2 are formed exactly
 * with two_prod (hiIJ = rounded product, loIJ = its error). The four
 * products with i + j == 3, and every lo term except lo00, are only summed
 * in plain double arithmetic into the last partial sum. */
__host__ __device__ inline qd_real qd_mul(qd_real a, qd_real b) {
    double hi00, hi01, hi10, hi02, hi11, hi20;
    double lo00, lo01, lo10, lo02, lo11, lo20;
    double s, e;

    two_prod(a.x[0], b.x[0], &hi00, &lo00);
    two_prod(a.x[0], b.x[1], &hi01, &lo01);
    two_prod(a.x[1], b.x[0], &hi10, &lo10);
    two_prod(a.x[0], b.x[2], &hi02, &lo02);
    two_prod(a.x[1], b.x[1], &hi11, &lo11);
    two_prod(a.x[2], b.x[0], &hi20, &lo20);

    /* Second partial sum: lo00 + hi01 + hi10. */
    double cross_sum, cross_err;
    two_sum(hi01, hi10, &cross_sum, &cross_err);
    two_sum(lo00, cross_sum, &s, &e);

    double limb1 = s;
    double limb2 = e + cross_err;

    /* Third partial sum: leftovers from above + hi02 + hi11 + hi20. */
    two_sum(hi02, hi11, &s, &e);
    double tail = e;
    two_sum(s, hi20, &s, &e);
    tail += e;
    two_sum(limb2, s, &limb2, &s);
    tail += s;

    /* Fourth partial sum: everything else, in ordinary double arithmetic. */
    double limb3 = tail + lo01 + lo10 + lo02 + lo11 + lo20
                 + a.x[0]*b.x[3] + a.x[1]*b.x[2]
                 + a.x[2]*b.x[1] + a.x[3]*b.x[0];

    return qd_renorm(hi00, limb1, limb2, limb3, 0.0);
}
142
+
143
+ /* ---- Division: a / b using Newton iteration ---- */
144
+
145
/* Quad-double division a / b by long division: each quotient digit is the
 * leading limb of the running remainder divided by b.x[0]; the remainder is
 * then reduced by digit * b. Four digits are produced and renormalized.
 * No special handling of b == 0 is performed. */
__host__ __device__ inline qd_real qd_div(qd_real a, qd_real b) {
    double q[4];
    qd_real rem = a;

    /* The first three digits each update the remainder. */
    for (int i = 0; i < 3; i++) {
        q[i] = rem.x[0] / b.x[0];
        rem = qd_sub(rem, qd_mul(qd_from_double(q[i]), b));
    }

    /* The last digit is taken from the final remainder without reducing it. */
    q[3] = rem.x[0] / b.x[0];

    return qd_renorm(q[0], q[1], q[2], q[3], 0.0);
}
160
+
161
+ /* ---- Comparison ---- */
162
+
163
/* Returns non-zero when a > b. Limbs are compared in index order and the
 * first pair that differs decides; if the first three pairs are equal the
 * last pair decides. */
__host__ __device__ inline int qd_gt(qd_real a, qd_real b) {
    for (int i = 0; i < 3; i++) {
        if (a.x[i] != b.x[i]) {
            return a.x[i] > b.x[i];
        }
    }
    return a.x[3] > b.x[3];
}
169
+
170
/* Returns 1 when the leading limb of a is negative, otherwise 0. */
__host__ __device__ inline int qd_lt_zero(qd_real a) {
    return (a.x[0] < 0.0) ? 1 : 0;
}
171
+
172
/* Collapse a to one double by adding its two leading limbs. */
__host__ __device__ inline double qd_to_double(qd_real a) {
    const double lead = a.x[0];
    const double correction = a.x[1];
    return lead + correction;
}
173
+
174
+ /* ---- Absolute value ---- */
175
+
176
/* Absolute value: negate a when qd_lt_zero reports it negative. */
__host__ __device__ inline qd_real qd_abs(qd_real a) {
    if (qd_lt_zero(a)) {
        return qd_neg(a);
    }
    return a;
}
179
+
180
+ /* ---- Constants ---- */
181
+
182
+ /* π to ~62 decimal digits as a quad-double.
183
+ * These are the exact double decomposition of:
184
+ * 3.14159265358979323846264338327950288419716939937510...
185
+ */
186
/* Returns pi as a four-limb quad-double constant. */
__host__ __device__ inline qd_real qd_pi() {
    const qd_real pi = {{
        3.141592653589793116e+00,
        1.224646799147353207e-16,
        -2.994769809718339666e-33,
        1.112454220863365282e-49
    }};
    return pi;
}
194
+
195
+ /* 2π */
196
/* Returns 2*pi as a four-limb quad-double constant. */
__host__ __device__ inline qd_real qd_two_pi() {
    const qd_real two_pi = {{
        6.283185307179586232e+00,
        2.449293598294706414e-16,
        -5.989539619436679332e-33,
        2.224908441726730563e-49
    }};
    return two_pi;
}
204
+
205
+ /* ---- Multiply qd by integer ---- */
206
+
207
/* Multiply a by the integer n: n is converted with qd_from_int and the
 * product is formed with the general quad-double multiply. */
__host__ __device__ inline qd_real qd_mul_int(qd_real a, long long n) {
    qd_real factor = qd_from_int(n);
    return qd_mul(a, factor);
}
210
+
211
+ /* ---- sin via argument reduction + Taylor series ---- */
212
+
213
/* sin(a) in quad-double arithmetic.
 *
 * Steps:
 *   1. Non-finite input returns NaN.
 *   2. Subtract the nearest multiple of 2*pi, leaving r in about [-pi, pi].
 *   3. Fold r into [-pi/2, pi/2] using sin(pi - r) = sin(r). Arguments close
 *      to an odd multiple of pi become small, so the series below converges
 *      quickly and does not suffer cancellation between large terms.
 *   4. Sum the Taylor series r - r^3/3! + r^5/5! - ... until the newest term
 *      is negligible relative to the running sum.
 */
__host__ __device__ inline qd_real qd_sin(qd_real a) {
    /* Inf - Inf and NaN - NaN are both NaN; no reduction is meaningful. */
    if (!isfinite(a.x[0])) return qd_from_double(a.x[0] - a.x[0]);

    qd_real two_pi = qd_two_pi();
    qd_real pi = qd_pi();

    /* pi/2: scaling every limb by 0.5 is exact. */
    qd_real half_pi = pi;
    for (int i = 0; i < 4; i++) half_pi.x[i] *= 0.5;

    /* k = round(a / (2*pi)). k_d is already an integral double, so it is
     * used directly; casting it through an integer type would be undefined
     * for very large |a|. */
    double k_d = round(a.x[0] / two_pi.x[0]);

    /* r = a - k * 2*pi */
    qd_real r = qd_sub(a, qd_mul(two_pi, qd_from_double(k_d)));

    /* Correct a possible off-by-one in k: bring r back into [-pi, pi]. */
    if (qd_gt(r, pi)) r = qd_sub(r, two_pi);
    if (qd_lt_zero(qd_add(r, pi))) r = qd_add(r, two_pi);

    /* Fold into [-pi/2, pi/2]:
     *   r >  pi/2  ->  pi - r     (sin(pi - r)  = sin(r))
     *   r < -pi/2  -> -pi - r     (sin(-pi - r) = sin(r))
     */
    if (qd_gt(r, half_pi)) {
        r = qd_sub(pi, r);
    } else if (qd_gt(qd_neg(half_pi), r)) {
        r = qd_sub(qd_neg(pi), r);
    }

    /* Taylor series: term_i = term_{i-1} * r^2 / (-(2i)(2i+1)).
     * With |r| <= pi/2 the terms fall below 1e-60 within 30 iterations. */
    qd_real r2 = qd_mul(r, r);
    qd_real term = r;
    qd_real sum = r;

    for (int i = 1; i <= 30; i++) {
        double denom = -(2.0*i) * (2.0*i + 1.0);
        term = qd_mul(term, r2);
        term = qd_div(term, qd_from_double(denom));
        sum = qd_add(sum, term);

        /* Stop once the term is negligible; "<=" also stops when r == 0. */
        if (fabs(term.x[0]) <= 1e-60 * fabs(sum.x[0])) break;
    }

    return sum;
}
253
+
254
+ #endif /* QD_REAL_H */