CompressedGemma commited on
Commit
4384a45
Β·
verified Β·
1 Parent(s): f32b3c6

Update hexstate_quantize.c

Browse files
Files changed (1) hide show
  1. hexstate_quantize.c +287 -349
hexstate_quantize.c CHANGED
@@ -633,7 +633,6 @@ static int is_attention_tensor(const char *gguf_name)
633
  * conservative too" β€” creating coherent precision allocation.
634
  * ═══════════════════════════════════════════════════════════════════════════ */
635
 
636
-
637
  /* ── Multi-quhit expanded scale table ──
638
  * Search grid: 24Γ—24 = 576 (d, dmin) candidates
639
  * Quhit encoding: bin 24 β†’ 6 for D=6 quhits (BP operates on 6-state marginals)
@@ -1511,6 +1510,16 @@ static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
1511
  #define HEX_POLISH_ULP 4
1512
  #endif
1513
 
 
 
 
 
 
 
 
 
 
 
1514
  /* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
1515
  *
1516
  * The quantity that matters downstream is the layer-output error
@@ -2229,363 +2238,108 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
2229
  err_shaped += hex_spectral_penalty(e_gs, QK4_0);
2230
  int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
2231
 
2232
- for (int j = 0; j < QK4_0 / 2; j++) {
2233
- int q0 = q_final[j];
2234
- int q1 = q_final[j + QK4_0/2];
2235
- output[blk].qs[j] = (uint8_t)(q0 | (q1 << 4));
2236
-
2237
- float deq0 = ((float)q0 - 8.0f) * actual_d;
2238
- float deq1 = ((float)q1 - 8.0f) * actual_d;
2239
- total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1);
2240
- }
2241
- }
2242
-
2243
- *out_total_error = total_err;
2244
- free(greedy_d);
2245
- free(cand_errors);
2246
- free(cand_d16);
2247
- free(best_candidate);
2248
- }
2249
-
2250
- /* ════════════════════════════════════════════════════════════════════════
2251
- * Q8_0 HPC QUANTIZER β€” Shor pipeline at 8 bits
2252
- *
2253
- * Same pipeline as Q4_0: WLS scale + tight candidate grid scored on the
2254
- * extended objective (weighted SSE + DC + vesica/wave), triality-quhit
2255
- * graph with Boltzmann-encoded candidate errors, CZ chain entanglement,
2256
- * Shor Griffiths-Niu sequential measurement for bin consensus, greedy
2257
- * override (HEX_GREEDY_OVERRIDE_RATIO), then per-block ULP polish, the
2258
- * vesica/DC error-shaping descent with an extended-objective guard, and
2259
- * the candidate floor. Intended for embedding / LM-head tensors (tied
2260
- * embeddings especially), where 2-4 bit codes destroy logit precision.
2261
- * At 8 bits the candidate grid is tight (Β±1.5%) β€” the win over naive
2262
- * amax/127 rounding comes from WLS + ULP + spectral selection, not from
2263
- * coarse scale exploration.
2264
- * ════════════════════════════════════════════════════════════════════════ */
2265
-
2266
- #ifndef QK8_0
2267
- #define QK8_0 32
2268
- #endif
2269
- typedef struct { uint16_t d; int8_t qs[QK8_0]; } hex_block_q8_0;
2270
-
2271
- #define Q8_N_CAND 24
2272
- static const float Q8_NEIGHBOR_MULTS[Q8_N_CAND] = {
2273
- 0.9850f, 0.9865f, 0.9880f, 0.9895f, 0.9910f, 0.9925f,
2274
- 0.9940f, 0.9952f, 0.9964f, 0.9976f, 0.9988f, 1.0000f,
2275
- 1.0010f, 1.0020f, 1.0030f, 1.0040f, 1.0052f, 1.0064f,
2276
- 1.0076f, 1.0088f, 1.0100f, 1.0115f, 1.0130f, 1.0150f,
2277
- };
2278
- /* 24 candidates β†’ 6 quhit states (4 per bin), same folding as Q4_0 */
2279
- static const int Q8_CAND_TO_QUHIT[Q8_N_CAND] = {
2280
- 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5
2281
- };
2282
-
2283
- static inline float q8_block_ext_err(const float *bw, const float *iw,
2284
- float d, int8_t *qs_out)
2285
- {
2286
- float e_arr[QK8_0];
2287
- float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f;
2288
- float err = 0.0f;
2289
- for (int j = 0; j < QK8_0; j++) {
2290
- int q = gguf_nearest_int(bw[j] * id);
2291
- if (q < -127) q = -127; if (q > 127) q = 127;
2292
- if (qs_out) qs_out[j] = (int8_t)q;
2293
- float e = bw[j] - (float)q * d;
2294
- e_arr[j] = e;
2295
- float w = iw ? iw[j] : 1.0f;
2296
- err += e * e * w;
2297
- }
2298
- return err + hex_spectral_penalty(e_arr, QK8_0);
2299
- }
2300
-
2301
- static void quantize_tensor_q8_0_hpc(const float *weights, int64_t n_elements,
2302
- hex_block_q8_0 *output,
2303
- float *out_total_error,
2304
- const float *imat_importance, int verbose)
2305
- {
2306
- int64_t n_blocks = n_elements / QK8_0;
2307
- float total_err = 0.0f;
2308
- (void)verbose;
2309
-
2310
- float (*cand_errors)[Q8_N_CAND] = (float (*)[Q8_N_CAND])
2311
- calloc(n_blocks, sizeof(float[Q8_N_CAND]));
2312
- uint16_t (*cand_d16)[Q8_N_CAND] = (uint16_t (*)[Q8_N_CAND])
2313
- calloc(n_blocks, sizeof(uint16_t[Q8_N_CAND]));
2314
- int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
2315
- if (!cand_errors || !cand_d16 || !best_candidate) {
2316
- free(cand_errors); free(cand_d16); free(best_candidate);
2317
- if (out_total_error) *out_total_error = -1.0f;
2318
- return;
2319
- }
2320
-
2321
- /* ── Phase 1+2: WLS-refined scale + tight candidate grid ── */
2322
- #pragma omp parallel for schedule(dynamic, 256)
2323
- for (int64_t blk = 0; blk < n_blocks; blk++) {
2324
- const float *bw = weights + blk * QK8_0;
2325
- const float *iw = imat_importance ? imat_importance + blk * QK8_0 : NULL;
2326
-
2327
- float amax = 0.0f;
2328
- for (int j = 0; j < QK8_0; j++) {
2329
- float av = fabsf(bw[j]);
2330
- if (av > amax) amax = av;
2331
- }
2332
- float wls_d = amax / 127.0f;
2333
-
2334
- /* ggml-style fixed-point WLS with DC rank-1 augmentation */
2335
- for (int it = 0; it < 3 && wls_d > 1e-20f; it++) {
2336
- float inv_d = 1.0f / wls_d;
2337
- float num = 0.0f, den = 0.0f, dcS = 0.0f, dcQ = 0.0f;
2338
- for (int j = 0; j < QK8_0; j++) {
2339
- int q = gguf_nearest_int(bw[j] * inv_d);
2340
- if (q < -127) q = -127; if (q > 127) q = 127;
2341
- float qf = (float)q;
2342
- float w = iw ? iw[j] : 1.0f;
2343
- num += w * bw[j] * qf;
2344
- den += w * qf * qf;
2345
- dcS += bw[j];
2346
- dcQ += qf;
2347
- }
2348
- num += (HEX_DC_LAMBDA / (float)QK8_0) * dcS * dcQ;
2349
- den += (HEX_DC_LAMBDA / (float)QK8_0) * dcQ * dcQ;
2350
- if (den > 1e-15f) {
2351
- float d_new = num / den;
2352
- if (d_new > 1e-20f) wls_d = d_new;
2353
  }
2354
- }
2355
-
2356
- for (int ci = 0; ci < Q8_N_CAND; ci++) {
2357
- float trial_d = wls_d * Q8_NEIGHBOR_MULTS[ci];
2358
- uint16_t d16 = gguf_fp32_to_fp16(trial_d);
2359
- float actual_d = gguf_fp16_to_fp32(d16);
2360
- cand_d16 [blk][ci] = d16;
2361
- cand_errors[blk][ci] = q8_block_ext_err(bw, iw, actual_d, NULL);
2362
- }
2363
- best_candidate[blk] = 11; /* Γ—1.0000 neutral seed */
2364
- }
2365
-
2366
- /* ── Phase 3: Shor graph β€” triality quhits, CZ chain, GN measurement ── */
2367
- int shor_ran = 0;
2368
- if (n_blocks >= 2) {
2369
- int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks;
2370
- int64_t stride = n_blocks / graph_blocks;
2371
-
2372
- HPCGraph *graph = hpc_create(graph_blocks);
2373
- if (graph) {
2374
- shor_ran = 1;
2375
-
2376
- /* Adaptive temperature from the candidate-error landscape */
2377
- float temperature = 1e-10f;
2378
- {
2379
- double err_accum = 0.0;
2380
- int err_count = 0;
2381
- for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
2382
- int64_t blk = gi * stride;
2383
- float max_e = 0.0f;
2384
- for (int c = 0; c < Q8_N_CAND; c++)
2385
- if (cand_errors[blk][c] > max_e)
2386
- max_e = cand_errors[blk][c];
2387
- err_accum += (double)max_e;
2388
- err_count++;
2389
  }
2390
- if (err_count > 0) {
2391
- temperature = (float)(err_accum / err_count) * 0.1f;
2392
- if (temperature < 1e-10f) temperature = 1e-10f;
 
 
2393
  }
2394
  }
2395
 
2396
- /* Boltzmann-encode stride-aggregated candidate errors as
2397
- * quhit amplitudes (24 candidates folded into 6 states) */
2398
- for (int64_t i = 0; i < graph_blocks; i++) {
2399
- float agg_errors[Q8_N_CAND];
2400
- for (int c = 0; c < Q8_N_CAND; c++) agg_errors[c] = 0.0f;
2401
- int64_t blk_start = i * stride;
2402
- int64_t blk_end = blk_start + stride;
2403
- if (blk_end > n_blocks) blk_end = n_blocks;
2404
- for (int64_t b = blk_start; b < blk_end; b++)
2405
- for (int c = 0; c < Q8_N_CAND; c++)
2406
- agg_errors[c] += cand_errors[b][c];
2407
- float min_err = 1e30f;
2408
- for (int c = 0; c < Q8_N_CAND; c++)
2409
- if (agg_errors[c] < min_err) min_err = agg_errors[c];
2410
-
2411
- double amp_re[6] = {0,0,0,0,0,0};
2412
- double amp_norm = 0.0;
2413
- for (int ci = 0; ci < Q8_N_CAND; ci++)
2414
- amp_re[Q8_CAND_TO_QUHIT[ci]] +=
2415
- exp(-(double)(agg_errors[ci] - min_err) /
2416
- (2.0 * (double)temperature));
2417
- for (int v = 0; v < 6; v++) amp_norm += amp_re[v] * amp_re[v];
2418
- if (amp_norm > 1e-30) {
2419
- double inv = 1.0 / sqrt(amp_norm);
2420
- for (int v = 0; v < 6; v++) amp_re[v] *= inv;
2421
- }
2422
- for (int v = 0; v < 6; v++) {
2423
- graph->locals[i].edge_re[v] = amp_re[v];
2424
- graph->locals[i].edge_im[v] = 0.0;
2425
- }
2426
- graph->locals[i].primary = VIEW_EDGE;
2427
- graph->locals[i].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
2428
- graph->locals[i].delta_valid = 0;
2429
- triality_update_mask(&graph->locals[i]);
2430
- }
2431
-
2432
- for (int64_t i = 0; i < graph_blocks - 1; i++)
2433
- hpc_cz(graph, i, i + 1);
2434
-
2435
- double (*marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
2436
- int *measured = (int *)calloc(graph_blocks, sizeof(int));
2437
- if (marg && measured) {
2438
- shor_measure_graph(graph, graph_blocks, marg, measured, 1);
2439
-
2440
- /* Per-block selection: best candidate inside the Shor-
2441
- * measured bin, then greedy override against the global
2442
- * argmin β€” identical Step-F semantics to Q2_K/Q4_0. */
2443
- for (int64_t i = 0; i < graph_blocks; i++) {
2444
- int bin = measured[i];
2445
- if (bin < 0 || bin > 5) {
2446
- double bm = -1.0; bin = 0;
2447
- for (int v = 0; v < 6; v++)
2448
- if (marg[i][v] > bm) { bm = marg[i][v]; bin = v; }
2449
  }
2450
- int64_t blk_start = i * stride;
2451
- int64_t blk_end = blk_start + stride;
2452
- if (blk_end > n_blocks) blk_end = n_blocks;
2453
- for (int64_t b = blk_start; b < blk_end; b++) {
2454
- float bin_best = 1e30f; int bin_cand = -1;
2455
- float g_best = 1e30f; int g_cand = 0;
2456
- for (int c = 0; c < Q8_N_CAND; c++) {
2457
- float e = cand_errors[b][c];
2458
- if (e < g_best) { g_best = e; g_cand = c; }
2459
- if (Q8_CAND_TO_QUHIT[c] == bin && e < bin_best) {
2460
- bin_best = e; bin_cand = c;
2461
- }
2462
- }
2463
- int sel = (bin_cand >= 0) ? bin_cand : g_cand;
2464
- if (g_best < cand_errors[b][sel] * HEX_GREEDY_OVERRIDE_RATIO)
2465
- sel = g_cand;
2466
- best_candidate[b] = sel;
2467
  }
2468
  }
2469
- }
2470
- free(marg); free(measured);
2471
- hpc_destroy(graph);
2472
- }
2473
- }
2474
- if (!shor_ran) {
2475
- for (int64_t blk = 0; blk < n_blocks; blk++) {
2476
- float g_best = cand_errors[blk][0]; int g_cand = 0;
2477
- for (int c = 1; c < Q8_N_CAND; c++)
2478
- if (cand_errors[blk][c] < g_best) {
2479
- g_best = cand_errors[blk][c]; g_cand = c;
2480
- }
2481
- best_candidate[blk] = g_cand;
2482
- }
2483
- }
2484
-
2485
- /* ── Phase 4: ULP polish + vesica/DC shaping guard + floor ── */
2486
- #pragma omp parallel for schedule(dynamic, 256) reduction(+:total_err)
2487
- for (int64_t blk = 0; blk < n_blocks; blk++) {
2488
- const float *bw = weights + blk * QK8_0;
2489
- const float *iw = imat_importance ? imat_importance + blk * QK8_0 : NULL;
2490
- int cidx = best_candidate[blk];
2491
-
2492
- uint16_t best_d16 = cand_d16[blk][cidx];
2493
- float best_err = cand_errors[blk][cidx];
2494
-
2495
- /* Β±8 fp16 ULP joint search on the extended objective */
2496
- for (int du = -8; du <= 8; du++) {
2497
- if (du == 0) continue;
2498
- int c16 = (int)cand_d16[blk][cidx] + du;
2499
- if (c16 <= 0 || c16 > 0x7BFF) continue;
2500
- float td = gguf_fp16_to_fp32((uint16_t)c16);
2501
- float err = q8_block_ext_err(bw, iw, td, NULL);
2502
- if (err < best_err) { best_err = err; best_d16 = (uint16_t)c16; }
2503
- }
2504
-
2505
- /* Candidate floor: final ≀ best raw grid candidate (by construction
2506
- * the ULP search already starts from it, so this is implicit). */
2507
- float d = gguf_fp16_to_fp32(best_d16);
2508
- int8_t qs[QK8_0];
2509
- (void)q8_block_ext_err(bw, iw, d, qs);
2510
-
2511
- /* Vesica/DC greedy shaping with extended-objective guard */
2512
- {
2513
- int8_t qs_shaped[QK8_0];
2514
- memcpy(qs_shaped, qs, QK8_0);
2515
- float e_live[QK8_0], v_live[QK8_0 / 2];
2516
- float vesica_cur = 0.0f, dc_cur = 0.0f;
2517
- for (int k = 0; k < QK8_0; k++)
2518
- e_live[k] = bw[k] - (float)qs_shaped[k] * d;
2519
- for (int p = 0; p < QK8_0 / 2; p++) {
2520
- v_live[p] = e_live[p] + e_live[p + QK8_0 / 2];
2521
- vesica_cur += v_live[p] * v_live[p];
2522
- dc_cur += v_live[p];
2523
- }
2524
- float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
2525
- for (int pass = 0; pass < QK8_0; pass++) {
2526
- int best_k = -1, best_q_alt = 0;
2527
- float best_delta = 0.0f;
2528
- for (int k = 0; k < QK8_0; k++) {
2529
- int q_try = (e_live[k] >= 0.0f) ? qs_shaped[k] + 1
2530
- : qs_shaped[k] - 1;
2531
- if (q_try < -127 || q_try > 127) continue;
2532
- float e_new = bw[k] - (float)q_try * d;
2533
- float de = e_new - e_live[k];
2534
- int pi = (k < QK8_0 / 2) ? k : k - QK8_0 / 2;
2535
- float v_new = v_live[pi] + de;
2536
- float ves_a = vesica_cur - v_live[pi] * v_live[pi]
2537
- + v_new * v_new;
2538
- float dc_a = dc_cur + de;
2539
- float delta = metric_cur - (4.0f * ves_a + dc_a * dc_a);
2540
- if (delta > best_delta) {
2541
- best_delta = delta; best_k = k; best_q_alt = q_try;
2542
  }
2543
  }
2544
- if (best_k < 0) break;
2545
- {
2546
- float e_new = bw[best_k] - (float)best_q_alt * d;
2547
- float de = e_new - e_live[best_k];
2548
- int pi = (best_k < QK8_0 / 2) ? best_k
2549
- : best_k - QK8_0 / 2;
2550
- float v_new = v_live[pi] + de;
2551
- vesica_cur += v_new * v_new - v_live[pi] * v_live[pi];
2552
- dc_cur += de;
2553
- metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
2554
- v_live[pi] = v_new;
2555
- e_live[best_k] = e_new;
2556
- qs_shaped[best_k] = (int8_t)best_q_alt;
2557
- }
2558
- }
2559
- /* Guard on the extended objective vs originals */
2560
- float e_b[QK8_0], e_s[QK8_0];
2561
- float err_b = 0.0f, err_s = 0.0f;
2562
- for (int k = 0; k < QK8_0; k++) {
2563
- float w = iw ? iw[k] : 1.0f;
2564
- e_b[k] = bw[k] - (float)qs[k] * d;
2565
- e_s[k] = bw[k] - (float)qs_shaped[k] * d;
2566
- err_b += e_b[k] * e_b[k] * w;
2567
- err_s += e_s[k] * e_s[k] * w;
2568
  }
2569
- err_b += hex_spectral_penalty(e_b, QK8_0);
2570
- err_s += hex_spectral_penalty(e_s, QK8_0);
2571
- if (err_s < err_b) memcpy(qs, qs_shaped, QK8_0);
2572
  }
2573
 
2574
- output[blk].d = best_d16;
2575
- for (int k = 0; k < QK8_0; k++) {
2576
- output[blk].qs[k] = qs[k];
2577
- float e = bw[k] - (float)qs[k] * d;
2578
- total_err += e * e; /* pure reconstruction SSE report */
 
 
 
2579
  }
2580
  }
2581
 
 
 
2582
  free(cand_errors);
2583
  free(cand_d16);
2584
  free(best_candidate);
2585
- if (out_total_error) *out_total_error = total_err;
2586
  }
2587
 
2588
-
2589
  /* Re-derive the 4-bit sub-scale codes (Ls, Lm) for a candidate (d, dmin)
2590
  * pair from the Phase-1 float scales/mins. Bit-identical to the Phase-2b
2591
  * candidate generation, so stored codes are unnecessary. */
@@ -3934,6 +3688,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3934
  * it converges in 2–3 sweeps. The vesica/DC spectral shaping baked
3935
  * into L survives wherever it is SSE-neutral, and is overridden
3936
  * only where it was costing true reconstruction error. */
 
3937
  {
3938
  uint8_t pl_Ls[16], pl_Lm[16];
3939
  for (int j = 0; j < N_SUB; j++) {
@@ -4215,6 +3970,201 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
4215
  output[blk].dmin = gguf_fp32_to_fp16(mm);
4216
  }
4217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4218
  /* ══ PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ══
4219
  *
4220
  * candidate_errors[blk][c] is the EXACT weighted SSE of a directly
@@ -4959,18 +4909,6 @@ void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
4959
  if (out_error) *out_error = err;
4960
  }
4961
 
4962
- int hexstate_q8_0_block_bytes(void) { return (int)sizeof(hex_block_q8_0); }
4963
- int hexstate_q8_0_block_elements(void) { return QK8_0; }
4964
-
4965
- void hexstate_quantize_tensor_q8_0_hpc(const float *weights, int64_t n_elements,
4966
- void *output, float *out_error,
4967
- const float *imat_importance, int verbose)
4968
- {
4969
- quantize_tensor_q8_0_hpc(weights, n_elements,
4970
- (hex_block_q8_0 *)output, out_error,
4971
- imat_importance, verbose);
4972
- }
4973
-
4974
  #ifndef HEXSTATE_LIBRARY
4975
  /* ═══════════════════════════════════════════════════════════════════════════
4976
  * MAIN
 
633
  * conservative too" β€” creating coherent precision allocation.
634
  * ═══════════════════════════════════════════════════════════════════════════ */
635
 
 
636
  /* ── Multi-quhit expanded scale table ──
637
  * Search grid: 24Γ—24 = 576 (d, dmin) candidates
638
  * Quhit encoding: bin 24 β†’ 6 for D=6 quhits (BP operates on 6-state marginals)
 
1510
  #define HEX_POLISH_ULP 4
1511
  #endif
1512
 
1513
+ /* Number of Digital-Twin basin jumps per block (Phase 4.65). Each round
1514
+ * builds the block's ideal twin from the exact per-sub-block qkx2 optima,
1515
+ * collapses it over a 6Γ—6 (d, dmin) hex lattice, and β€” only on strict
1516
+ * improvement of the true extended objective β€” re-seats the block in the
1517
+ * better basin before the Phase-4.6 polish re-runs. 0 disables the phase
1518
+ * and recovers the previous pipeline bit-exactly. */
1519
+ #ifndef HEX_TWIN_ROUNDS
1520
+ #define HEX_TWIN_ROUNDS 3
1521
+ #endif
1522
+
1523
  /* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
1524
  *
1525
  * The quantity that matters downstream is the layer-output error
 
2238
  err_shaped += hex_spectral_penalty(e_gs, QK4_0);
2239
  int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
2240
 
2241
+ /* ══ Q4_0 PHASE 4.6: Closed-form d refit + ULP polish ══
2242
+ * After the Hadamard descent may have flipped several codes, the
2243
+ * assembly-phase d is no longer optimal for the committed code set.
2244
+ * A one-shot WLS refit followed by a Β±4 ULP micro-search recovers
2245
+ * the FP16 rounding gap. Accept only on strict improvement of the
2246
+ * extended objective β€” monotone-safe, cannot raise RMSE. */
2247
+ {
2248
+ /* ── (1) Closed-form WLS d refit against committed codes ── */
2249
+ float num_r = 0.0f, den_r = 0.0f;
2250
+ float dcS_r = 0.0f, dcQ_r = 0.0f;
2251
+ for (int j = 0; j < QK4_0; j++) {
2252
+ float qc = (float)q_final[j] - 8.0f;
2253
+ float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
2254
+ num_r += w * bw[j] * qc;
2255
+ den_r += w * qc * qc;
2256
+ dcS_r += bw[j];
2257
+ dcQ_r += qc;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2258
  }
2259
+ num_r += (HEX_DC_LAMBDA / (float)QK4_0) * dcS_r * dcQ_r;
2260
+ den_r += (HEX_DC_LAMBDA / (float)QK4_0) * dcQ_r * dcQ_r;
2261
+ if (den_r > 1e-15f) {
2262
+ float d_refit = num_r / den_r;
2263
+ float d_try = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_refit));
2264
+
2265
+ float err_cur_r = 0.0f, err_try_r = 0.0f;
2266
+ float e_cr[QK4_0], e_tr[QK4_0];
2267
+ for (int j = 0; j < QK4_0; j++) {
2268
+ float qc = (float)q_final[j] - 8.0f;
2269
+ float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
2270
+ e_cr[j] = bw[j] - qc * actual_d;
2271
+ e_tr[j] = bw[j] - qc * d_try;
2272
+ err_cur_r += e_cr[j] * e_cr[j] * w;
2273
+ err_try_r += e_tr[j] * e_tr[j] * w;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2274
  }
2275
+ err_cur_r += hex_spectral_penalty(e_cr, QK4_0);
2276
+ err_try_r += hex_spectral_penalty(e_tr, QK4_0);
2277
+ if (err_try_r < err_cur_r) {
2278
+ actual_d = d_try;
2279
+ output[blk].d = gguf_fp32_to_fp16(d_try);
2280
  }
2281
  }
2282
 
2283
+ /* ── (2) Β±4 ULP micro-search on the true objective ── */
2284
+ {
2285
+ uint16_t base_d16_p = gguf_fp32_to_fp16(actual_d);
2286
+ uint16_t best_d16_p = base_d16_p;
2287
+ float best_pol_err = 1e30f;
2288
+
2289
+ for (int delta_p = -4; delta_p <= 4; delta_p++) {
2290
+ int cd16_p = (int)base_d16_p + delta_p;
2291
+ if (cd16_p < 0 || cd16_p > 0x7BFF) continue;
2292
+ float trial_d_p = gguf_fp16_to_fp32((uint16_t)cd16_p);
2293
+ float trial_id_p = (fabsf(trial_d_p) > 1e-15f) ? 1.0f / trial_d_p : 0.0f;
2294
+ float err_p = 0.0f;
2295
+ float e_p[QK4_0];
2296
+ for (int j = 0; j < QK4_0; j++) {
2297
+ /* Re-quantize with trial_d_p to get optimal codes */
2298
+ int q_p = (int)(bw[j] * trial_id_p + 8.5f);
2299
+ if (q_p < 0) q_p = 0; if (q_p > 15) q_p = 15;
2300
+ float deq_p = ((float)q_p - 8.0f) * trial_d_p;
2301
+ float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
2302
+ e_p[j] = bw[j] - deq_p;
2303
+ err_p += e_p[j] * e_p[j] * w;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2304
  }
2305
+ err_p += hex_spectral_penalty(e_p, QK4_0);
2306
+ if (err_p < best_pol_err) {
2307
+ best_pol_err = err_p;
2308
+ best_d16_p = (uint16_t)cd16_p;
 
 
 
 
 
 
 
 
 
 
 
 
 
2309
  }
2310
  }
2311
+ if (best_d16_p != base_d16_p) {
2312
+ actual_d = gguf_fp16_to_fp32(best_d16_p);
2313
+ output[blk].d = best_d16_p;
2314
+ /* Recompute final codes with the polished d */
2315
+ float id_pol = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f;
2316
+ for (int j = 0; j < QK4_0; j++) {
2317
+ int q_pol = (int)(bw[j] * id_pol + 8.5f);
2318
+ if (q_pol < 0) q_pol = 0; if (q_pol > 15) q_pol = 15;
2319
+ q_final[j] = q_pol;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2320
  }
2321
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2322
  }
 
 
 
2323
  }
2324
 
2325
+ for (int j = 0; j < QK4_0 / 2; j++) {
2326
+ int q0 = q_final[j];
2327
+ int q1 = q_final[j + QK4_0/2];
2328
+ output[blk].qs[j] = (uint8_t)(q0 | (q1 << 4));
2329
+
2330
+ float deq0 = ((float)q0 - 8.0f) * actual_d;
2331
+ float deq1 = ((float)q1 - 8.0f) * actual_d;
2332
+ total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1);
2333
  }
2334
  }
2335
 
2336
+ *out_total_error = total_err;
2337
+ free(greedy_d);
2338
  free(cand_errors);
2339
  free(cand_d16);
2340
  free(best_candidate);
 
2341
  }
2342
 
 
2343
  /* Re-derive the 4-bit sub-scale codes (Ls, Lm) for a candidate (d, dmin)
2344
  * pair from the Phase-1 float scales/mins. Bit-identical to the Phase-2b
2345
  * candidate generation, so stored codes are unnecessary. */
 
3688
  * it converges in 2–3 sweeps. The vesica/DC spectral shaping baked
3689
  * into L survives wherever it is SSE-neutral, and is overridden
3690
  * only where it was costing true reconstruction error. */
3691
+ for (int twin_round = 0; ; twin_round++) {
3692
  {
3693
  uint8_t pl_Ls[16], pl_Lm[16];
3694
  for (int j = 0; j < N_SUB; j++) {
 
3970
  output[blk].dmin = gguf_fp32_to_fp16(mm);
3971
  }
3972
 
3973
+ /* ══ PHASE 4.65: DIGITAL TWIN COLLAPSE (ideal-block basin jump) ══
3974
+ *
3975
+ * The polish above is exact coordinate descent, but coordinate
3976
+ * descent only reaches the bottom of the basin it starts in. This
3977
+ * phase builds the block's DIGITAL TWIN: the residual-free encoding
3978
+ * implied by seeds[blk] β€” the exact per-sub-block qkx2 optima, i.e.
3979
+ * the block as it "wants" to be quantised before any super-scale
3980
+ * compromise. A 6Γ—6 hexagonal lattice of (d, dmin) scalings around
3981
+ * that ideal is scored EXACTLY (derived sub-scales, per-weight
3982
+ * optimal q, true extended objective vs the ORIGINAL weights). The
3983
+ * 36 scores are loaded into a 2-quhit twin as Boltzmann amplitudes
3984
+ * sharply peaked at the minimum, CZ-entangled, and collapsed via
3985
+ * shor_measure_graph β€” the same Griffiths–Niu machinery as the
3986
+ * assembly phase. The collapsed cell (guarded by the exact argmin,
3987
+ * so the collapse can never land above the ideal) is committed only
3988
+ * on STRICT improvement of E, after which the Phase-4.6 polish
3989
+ * re-runs inside the new basin. Strict-improvement commits β‡’ the
3990
+ * extended objective is monotone: final RMSE cannot go up, and the
3991
+ * Phase-4.7 candidate floor below still applies unchanged. */
3992
+ int twin_jumped = 0;
3993
+ if (twin_round < HEX_TWIN_ROUNDS) {
3994
+ /* Committed extended error β€” true objective, original weights */
3995
+ float E_cur = 0.0f;
3996
+ {
3997
+ float e_c[QK_K];
3998
+ for (int j = 0; j < N_SUB; j++) {
3999
+ float d_sub = dm * (float)(output[blk].scales[j] & 0xF);
4000
+ float m_sub = mm * (float)(output[blk].scales[j] >> 4);
4001
+ for (int k = 0; k < 16; k++) {
4002
+ int idx = 16 * j + k;
4003
+ float w = (imat_importance) ?
4004
+ imat_importance[blk * QK_K + idx] : 1.0f;
4005
+ e_c[idx] = block_x[idx] -
4006
+ (d_sub * (float)L[idx] - m_sub);
4007
+ E_cur += e_c[idx] * e_c[idx] * w;
4008
+ }
4009
+ }
4010
+ E_cur += hex_spectral_penalty(e_c, QK_K);
4011
+ }
4012
+
4013
+ /* The ideal twin's super-scalars: residual-free encoding of the
4014
+ * exact per-sub optima (4-bit sub-scales span 0..15). */
4015
+ float id_s = 0.0f, id_m = 0.0f;
4016
+ for (int j = 0; j < N_SUB; j++) {
4017
+ if (seeds[blk].scales[j] > id_s) id_s = seeds[blk].scales[j];
4018
+ if (seeds[blk].mins[j] > id_m) id_m = seeds[blk].mins[j];
4019
+ }
4020
+ float d_id = id_s / 15.0f;
4021
+ float m_id = id_m / 15.0f;
4022
+
4023
+ if (d_id > 1e-15f) {
4024
+ /* 6Γ—6 hex lattice of scalings around the ideal twin */
4025
+ static const float hexfac[6] =
4026
+ {0.79f, 0.88f, 0.96f, 1.04f, 1.13f, 1.23f};
4027
+ float cell_E[6][6];
4028
+ float bestE = 1e30f;
4029
+ int ba = 0, bb = 0;
4030
+
4031
+ for (int a = 0; a < 6; a++) {
4032
+ float c_dm = gguf_fp16_to_fp32(
4033
+ gguf_fp32_to_fp16(d_id * hexfac[a]));
4034
+ for (int b = 0; b < 6; b++) {
4035
+ float c_mm = gguf_fp16_to_fp32(
4036
+ gguf_fp32_to_fp16(m_id * hexfac[b]));
4037
+ uint8_t c_Ls[16], c_Lm[16];
4038
+ hex_derive_subscales(seeds[blk].scales,
4039
+ seeds[blk].mins,
4040
+ c_dm, c_mm, c_Ls, c_Lm);
4041
+ float e_t[QK_K];
4042
+ float E = 0.0f;
4043
+ for (int j = 0; j < N_SUB; j++) {
4044
+ float d_sub = c_dm * (float)c_Ls[j];
4045
+ float m_sub = c_mm * (float)c_Lm[j];
4046
+ for (int k = 0; k < 16; k++) {
4047
+ int idx = 16 * j + k;
4048
+ float w = (imat_importance) ?
4049
+ imat_importance[blk * QK_K + idx]
4050
+ : 1.0f;
4051
+ int q = 0;
4052
+ if (d_sub >= 1e-15f) {
4053
+ q = gguf_nearest_int(
4054
+ (block_x[idx] + m_sub) / d_sub);
4055
+ if (q < 0) q = 0; if (q > 3) q = 3;
4056
+ }
4057
+ e_t[idx] = block_x[idx] -
4058
+ (d_sub * (float)q - m_sub);
4059
+ E += e_t[idx] * e_t[idx] * w;
4060
+ }
4061
+ }
4062
+ E += hex_spectral_penalty(e_t, QK_K);
4063
+ cell_E[a][b] = E;
4064
+ if (E < bestE) { bestE = E; ba = a; bb = b; }
4065
+ }
4066
+ }
4067
+
4068
+ /* Load the twin: two quhits over the lattice axes, Boltzmann
4069
+ * amplitudes sharply peaked at the ideal, CZ-entangled,
4070
+ * then collapsed β€” house pattern from the assembly phase. */
4071
+ int _tw_tid = 0;
4072
+ #ifdef _OPENMP
4073
+ _tw_tid = omp_get_thread_num();
4074
+ #endif
4075
+ HPCGraph *tg = _tl_graphs[_tw_tid];
4076
+ hpc_reset_for_subblock(tg, 2);
4077
+ {
4078
+ double axmin[2][6];
4079
+ for (int a = 0; a < 6; a++) {
4080
+ axmin[0][a] = 1e30;
4081
+ axmin[1][a] = 1e30;
4082
+ for (int b = 0; b < 6; b++) {
4083
+ if (cell_E[a][b] < axmin[0][a])
4084
+ axmin[0][a] = cell_E[a][b];
4085
+ if (cell_E[b][a] < axmin[1][a])
4086
+ axmin[1][a] = cell_E[b][a];
4087
+ }
4088
+ }
4089
+ for (int s = 0; s < 2; s++) {
4090
+ double lo = 1e30, hi = -1e30;
4091
+ for (int v = 0; v < 6; v++) {
4092
+ if (axmin[s][v] < lo) lo = axmin[s][v];
4093
+ if (axmin[s][v] > hi) hi = axmin[s][v];
4094
+ }
4095
+ /* Sharp peak: low temperature β‡’ the collapse lands
4096
+ * on the ideal with overwhelming amplitude. */
4097
+ double T = (hi - lo) * 0.05 + 1e-12;
4098
+ double amp_re[6], amp_norm = 0.0;
4099
+ for (int v = 0; v < 6; v++) {
4100
+ amp_re[v] = exp(-(axmin[s][v] - lo) / T);
4101
+ amp_norm += amp_re[v] * amp_re[v];
4102
+ }
4103
+ if (amp_norm > 1e-30) {
4104
+ double inv = 1.0 / sqrt(amp_norm);
4105
+ for (int v = 0; v < 6; v++) amp_re[v] *= inv;
4106
+ }
4107
+ for (int v = 0; v < 6; v++) {
4108
+ tg->locals[s].edge_re[v] = amp_re[v];
4109
+ tg->locals[s].edge_im[v] = 0.0;
4110
+ }
4111
+ tg->locals[s].primary = VIEW_EDGE;
4112
+ tg->locals[s].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL
4113
+ | DIRTY_FOLDED;
4114
+ tg->locals[s].delta_valid = 0;
4115
+ triality_update_mask(&tg->locals[s]);
4116
+ }
4117
+ hpc_cz(tg, 0, 1);
4118
+
4119
+ double tw_marg[2][6];
4120
+ int tw_meas[2];
4121
+ memset(tw_marg, 0, sizeof(tw_marg));
4122
+ memset(tw_meas, 0, sizeof(tw_meas));
4123
+ shor_measure_graph(tg, 2, tw_marg, tw_meas, 1);
4124
+
4125
+ int ca = tw_meas[0], cb = tw_meas[1];
4126
+ /* Collapse guard: the twin may never land above its own
4127
+ * ideal β€” the exact argmin backstops the Born draw. */
4128
+ if (cell_E[ca][cb] > bestE) { ca = ba; cb = bb; }
4129
+
4130
+ if (cell_E[ca][cb] < E_cur) {
4131
+ /* Commit the collapsed cell β€” rebuilt exactly as it
4132
+ * was scored (same pattern as the Phase-4.7 floor). */
4133
+ float c_dm = gguf_fp16_to_fp32(
4134
+ gguf_fp32_to_fp16(d_id * hexfac[ca]));
4135
+ float c_mm = gguf_fp16_to_fp32(
4136
+ gguf_fp32_to_fp16(m_id * hexfac[cb]));
4137
+ uint8_t c_Ls[16], c_Lm[16];
4138
+ hex_derive_subscales(seeds[blk].scales,
4139
+ seeds[blk].mins,
4140
+ c_dm, c_mm, c_Ls, c_Lm);
4141
+ for (int j = 0; j < N_SUB; j++) {
4142
+ float d_sub = c_dm * (float)c_Ls[j];
4143
+ float m_sub = c_mm * (float)c_Lm[j];
4144
+ for (int k = 0; k < 16; k++) {
4145
+ int idx = 16 * j + k;
4146
+ int q = 0;
4147
+ if (d_sub >= 1e-15f) {
4148
+ q = gguf_nearest_int(
4149
+ (block_x[idx] + m_sub) / d_sub);
4150
+ if (q < 0) q = 0; if (q > 3) q = 3;
4151
+ }
4152
+ L[idx] = (uint8_t)q;
4153
+ }
4154
+ output[blk].scales[j] = c_Ls[j] | (c_Lm[j] << 4);
4155
+ }
4156
+ dm = c_dm;
4157
+ mm = c_mm;
4158
+ output[blk].d = gguf_fp32_to_fp16(dm);
4159
+ output[blk].dmin = gguf_fp32_to_fp16(mm);
4160
+ twin_jumped = 1;
4161
+ }
4162
+ }
4163
+ }
4164
+ }
4165
+ if (!twin_jumped) break; /* no better basin β€” twin and block agree */
4166
+ } /* twin_round */
4167
+
4168
  /* ══ PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ══
4169
  *
4170
  * candidate_errors[blk][c] is the EXACT weighted SSE of a directly
 
4909
  if (out_error) *out_error = err;
4910
  }
4911
 
 
 
 
 
 
 
 
 
 
 
 
 
4912
  #ifndef HEXSTATE_LIBRARY
4913
  /* ═══════════════════════════════════════════════════════════════════════════
4914
  * MAIN