Upload 5 files

Browse files

Files changed (5) hide show

convert_hf_to_gguf.py +16 -0
download_model.py +109 -0
hexstate_quantize.c +130 -902
hexstate_requantize.py +8 -130
makefile.quantize +6 -6

convert_hf_to_gguf.py CHANGED Viewed

@@ -7651,6 +7651,22 @@ class Gemma4Model(Gemma3Model):
         yield from super().modify_tensors(data_torch, name, bid)
 @ModelBase.register("Gemma4ForConditionalGeneration")
 class Gemma4VisionAudioModel(MmprojModel):
     has_audio_encoder = True

         yield from super().modify_tensors(data_torch, name, bid)
+@ModelBase.register("Gemma4AssistantForCausalLM")
+class Gemma4AssistantModel(Gemma4Model):
+    """Converter registered for the ``Gemma4AssistantForCausalLM`` architecture.
+    Behaves like ``Gemma4Model`` except that three checkpoint tensor names are
+    translated directly instead of going through the inherited name mapping.
+    """
+    model_arch = gguf.MODEL_ARCH.GEMMA4
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+        """Return the output tensor name for the checkpoint tensor ``name``.
+        The assistant-specific projection weights and the token embedding are
+        looked up in a fixed table; every other name is delegated to the
+        parent class together with ``try_suffixes``.
+        """
+        direct_names = {
+            # Assistant-specific projection layers
+            "pre_projection.weight": "pre_proj.weight",
+            "post_projection.weight": "post_proj.weight",
+            # embed_tokens is emitted as token_embd for compatibility
+            "model.embed_tokens.weight": "token_embd.weight",
+        }
+        mapped = direct_names.get(name)
+        if mapped is not None:
+            return mapped
+        return super().map_tensor_name(name, try_suffixes)
 @ModelBase.register("Gemma4ForConditionalGeneration")
 class Gemma4VisionAudioModel(MmprojModel):
     has_audio_encoder = True

download_model.py ADDED Viewed

	@@ -0,0 +1,109 @@

+#!/usr/bin/env python3
+import os
+import sys
+import argparse
+import urllib.parse
+from huggingface_hub import snapshot_download
+def parse_hf_url(url_or_id):
+    """Split a Hugging Face URL or bare repository ID into ``(repo_id, repo_type)``.
+    Accepted inputs:
+      * a bare repository ID, e.g. ``google/gemma-4-26B-A4B-it`` (always
+        reported as a model repository);
+      * a model URL, e.g. https://huggingface.co/google/gemma-4-26B-A4B-it;
+      * a dataset or space URL, e.g. https://huggingface.co/datasets/ggml-org/ci.
+    Path components after the repository name are ignored.
+    Args:
+        url_or_id: Repository URL or repository ID.
+    Returns:
+        Tuple ``(repo_id, repo_type)`` where ``repo_type`` is ``"model"``,
+        ``"dataset"`` or ``"space"``.
+    Raises:
+        ValueError: If the input is empty, the URL host is not huggingface.co,
+            or no repository ID can be extracted from the URL path.
+    """
+    url_or_id = url_or_id.strip()
+    if not url_or_id:
+        raise ValueError("Repository URL or ID is empty")
+    if not url_or_id.startswith(("http://", "https://")):
+        # Not a URL: pass the value through unchanged as a model repository ID.
+        return url_or_id, "model"
+    parsed = urllib.parse.urlparse(url_or_id)
+    # ``hostname`` is lower-cased and excludes any port or userinfo, so
+    # "HuggingFace.co" and "huggingface.co:443" are accepted as well.
+    if parsed.hostname not in ("huggingface.co", "www.huggingface.co"):
+        raise ValueError(f"URL host must be huggingface.co, got: {parsed.netloc}")
+    path_parts = [p for p in parsed.path.split("/") if p]
+    if not path_parts:
+        raise ValueError("Hugging Face URL path is empty")
+    # A leading "datasets" or "spaces" component selects the repository type
+    # and is not part of the repository ID itself.
+    prefix_types = {"datasets": "dataset", "spaces": "space"}
+    repo_type = prefix_types.get(path_parts[0], "model")
+    if repo_type != "model":
+        path_parts = path_parts[1:]
+    if not path_parts:
+        raise ValueError("Could not extract repository ID from Hugging Face URL")
+    # One remaining component is returned on its own; otherwise the ID is the
+    # first two components joined by "/", and anything after them is dropped.
+    return "/".join(path_parts[:2]), repo_type
+def main():
+    """Command-line entry point: resolve a repository and download a snapshot of it.
+    Parses the command line, turns the positional argument into a repository
+    ID and type via ``parse_hf_url``, creates the target directory, and calls
+    ``snapshot_download``. Progress is printed to stdout. Any failure is
+    reported on stderr and the process exits with status 1.
+    """
+    parser = argparse.ArgumentParser(
+        description="Download a Hugging Face model or dataset from a URL or repository ID."
+    )
+    parser.add_argument(
+        "url_or_id",
+        type=str,
+        help="Hugging Face repository URL (e.g. https://huggingface.co/google/gemma-4-26B-A4B-it) or repository ID (e.g. google/gemma-4-26B-A4B-it)."
+    )
+    parser.add_argument(
+        "--local-dir",
+        type=str,
+        default=None,
+        help="Directory to save the downloaded model. Defaults to a folder matching the repository name in the current directory."
+    )
+    parser.add_argument(
+        "--token",
+        type=str,
+        default=os.environ.get("HF_TOKEN"),
+        help="Hugging Face API token. Can also be set via the HF_TOKEN environment variable."
+    )
+    parser.add_argument(
+        "--exclude",
+        type=str,
+        nargs="*",
+        help="Glob patterns to exclude from download (e.g., *.bin, *.pt)"
+    )
+    parser.add_argument(
+        "--include",
+        type=str,
+        nargs="*",
+        help="Glob patterns to include in download (e.g., *.safetensors)"
+    )
+    args = parser.parse_args()
+    try:
+        repo_id, repo_type = parse_hf_url(args.url_or_id)
+    except ValueError as e:
+        print(f"Error parsing input URL/ID: {e}", file=sys.stderr)
+        sys.exit(1)
+    # Default target: a folder named after the repository, inside the
+    # current working directory.
+    if args.local_dir is None:
+        repo_name = repo_id.split("/")[-1]
+        args.local_dir = os.path.join(os.getcwd(), repo_name)
+    print(f"Repository ID:   {repo_id}")
+    print(f"Repository Type: {repo_type}")
+    print(f"Target Directory: {args.local_dir}")
+    # Report an unusable target directory the same way as every other
+    # failure instead of letting the OSError escape as a traceback.
+    try:
+        os.makedirs(args.local_dir, exist_ok=True)
+    except OSError as e:
+        print(f"Error creating target directory: {e}", file=sys.stderr)
+        sys.exit(1)
+    # Top-level boundary: whatever the download raises is reported and
+    # turned into a non-zero exit status.
+    try:
+        downloaded_path = snapshot_download(
+            repo_id=repo_id,
+            repo_type=repo_type,
+            local_dir=args.local_dir,
+            # NOTE(review): local_dir_use_symlinks appears to be deprecated in
+            # recent huggingface_hub releases - confirm against the pinned
+            # version before removing it.
+            local_dir_use_symlinks=False,
+            token=args.token,
+            ignore_patterns=args.exclude,
+            allow_patterns=args.include
+        )
+        print("\nDownload completed successfully!")
+        print(f"Files saved in: {downloaded_path}")
+    except Exception as e:
+        print(f"\nError downloading repository: {e}", file=sys.stderr)
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

hexstate_quantize.c CHANGED Viewed

@@ -657,6 +657,60 @@ static void init_scale_table(void) {
     scale_table_initialized = 1;
 }
 /* Compute the Q2_K sub-block reconstruction error for a block at a given
  * scale multiplier, optionally weighted by importance vector */
 static float compute_block_error_q2k(const float *weights, int block_size,
@@ -894,9 +948,11 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
             float deq = cand_min + scale * (float)l;
             float diff = fabsf(x[i] - deq);
-            /* Apply error norm */
             float e = diff;
-            if (cfg->norm != 1.0f) {
                 e = powf(diff, cfg->norm);
             }
             /* Apply importance weighting */
@@ -1760,14 +1816,17 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                 /* Build per-block CDFs from triality marginals */
                 unsigned int born_rng = 314159;
-                int *shot_assignment = (int *)malloc(n_blocks * sizeof(int));
                 for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
-                    float shot_err = 0.0f;
-                    /* Init from beam result so tail blocks beyond
-                     * graph_blocks*stride keep valid indices */
-                    memcpy(shot_assignment, best_candidate,
-                           n_blocks * sizeof(int));
                     for (int64_t gi = 0; gi < graph_blocks; gi++) {
                         /* Normalize marginals to CDF */
@@ -1798,19 +1857,19 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                             }
                         }
-                        shot_assignment[blk] = best_bin_cand;
                         shot_err += cand_errors[blk][best_bin_cand];
                     }
                     /* Metropolis acceptance: adopt if better than current best */
                     if (shot_err < beam_total_err) {
-                        for (int64_t b = 0; b < n_blocks; b++)
-                            best_candidate[b] = shot_assignment[b];
                         beam_total_err = shot_err;
                     }
                 }
-                free(shot_assignment);
             }
             free(marg);
@@ -2686,14 +2745,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                     beam_total_err += candidate_errors[bi][best_candidate[bi]];
                 unsigned int born_rng_q2 = 271828;
-                int *shot_assignment = (int *)malloc(n_blocks * sizeof(int));
                 for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
-                    float shot_err = 0.0f;
-                    /* Init from beam result so tail blocks beyond
-                     * graph_blocks*stride keep valid indices */
-                    memcpy(shot_assignment, best_candidate,
-                           n_blocks * sizeof(int));
                     for (int64_t gi = 0; gi < graph_blocks; gi++) {
                         /* Born sample coarse (d) quhit */
@@ -2738,18 +2799,19 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                             }
                         }
-                        shot_assignment[blk] = best_bin_cand;
                         shot_err += candidate_errors[blk][best_bin_cand];
                     }
                     if (shot_err < beam_total_err) {
-                        for (int64_t b = 0; b < n_blocks; b++)
-                            best_candidate[b] = shot_assignment[b];
                         beam_total_err = shot_err;
                     }
                 }
-                free(shot_assignment);
             }
             free(coarse_marg);
@@ -2790,6 +2852,17 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
      * the perfect bit analog at 2-bit resolution.
      * ══════════════════════════════════════════════════════════════════ */
     #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
     for (int64_t blk = 0; blk < n_blocks; blk++) {
         const float *block_x = weights + blk * QK_K;
@@ -2804,12 +2877,13 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
         float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
         /* ── Analog assembly: iterate to convergence ──
-         * 5 iterations: enough for the (Ls,Lm) ↔ (d,dmin) coupling
-         * to fully stabilize. Each iteration does:
-         *   A) Sub-block Quhit BP to find coupled (Ls,Lm) states
          *   B) Optimal q-value assignment
          *   C) WLS solve for (d, dmin) */
-        for (int ls_iter = 0; ls_iter < 8; ls_iter++) {
             /* ── Step A: Sub-block Quhit BP (Strategy 1) ──
              * For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
@@ -2861,9 +2935,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 }
             }
-            /* Build 16-node sub-block graph and run BP */
-            HPCGraph *sg = hpc_create(N_SUB);
-            if (sg) {
                 float min_sub_err[N_SUB];
                 for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];
@@ -2901,9 +2980,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                     hpc_cz(sg, j, j + 1);
                 /* ── Shor sequential measurement on sub-block graph ──
-                 * Replaces BP with exact marginals (ported from tesseract_factor.c) */
-                double (*sub_marg)[6] = (double (*)[6])calloc(N_SUB, sizeof(double[6]));
-                int *sub_measured = (int *)calloc(N_SUB, sizeof(int));
                 shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);
@@ -2920,16 +3001,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                     Ls_blk[j] = state_ls[j][best_v];
                     Lm_blk[j] = state_lm[j][best_v];
                 }
-                free(sub_marg);
-                free(sub_measured);
-                hpc_destroy(sg);
-            } else {
-                /* Fallback to independent local optima if malloc fails */
-                for (int j = 0; j < N_SUB; j++) {
-                    Ls_blk[j] = state_ls[j][0];
-                    Lm_blk[j] = state_lm[j][0];
-                }
             }
             /* ── Step B: Quantize q-values with optimal Ls/Lm ── */
@@ -3039,16 +3110,20 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
         }
         /* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ──
-         * The WLS solve may have shifted (d, dmin) after the last Step A,
-         * invalidating the Ls/Lm choices. One final exhaustive pass at the
-         * EXACT FP16-truncated scales ensures every sub-block is optimal. */
         for (int j = 0; j < N_SUB; j++) {
             const float *sx = block_x + 16 * j;
             float best_sub_err = 1e30f;
             uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
-            for (int try_ls = 0; try_ls <= 15; try_ls++) {
                 float d_sub = dm * (float)try_ls;
-                for (int try_lm = 0; try_lm <= 15; try_lm++) {
                     float m_sub = mm * (float)try_lm;
                     float sub_err = 0.0f;
                     for (int k = 0; k < 16; k++) {
@@ -3240,6 +3315,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
         total_err += berr;
     }
     free(seeds);
     free(candidate_errors);
     free(candidate_d);
@@ -3907,848 +3987,6 @@ void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
     if (out_error) *out_error = err;
 }
-/* ═══════════════════════════════════════════════════════════════════════════
- * HPC-Accelerated BPE Tokenizer
- *
- * Uses the Holographic Phase Graph for BPE tokenization.
- *
- * Architecture:
- *   1. Each character position is a SITE in an HPCGraph
- *   2. Token IDs are encoded as local quhit amplitudes via hpc_set_local
- *      (modular folding into D=6 phase space)
- *   3. Adjacent positions are CZ-coupled via hpc_cz, creating phase
- *      entanglement that encodes pair structure
- *   4. Merge rules are indexed in a hash table: (tok_a, tok_b) → merge_info
- *      for O(1) lookup instead of scanning all rules
- *   5. BPE merge = GRAPH CONTRACTION: matched sites contract,
- *      CZ edges compact via hpc_compact_edges semantics,
- *      and the merged token's amplitude replaces both locals
- *
- * Complexity: O(n_passes × L) instead of O(n_merges × L)
- * Since n_passes << n_merges, this is dramatically faster.
- * ═══════════════════════════════════════════════════════════════════════════ */
-/* Merge table entry */
-typedef struct {
-    int32_t tok_a;
-    int32_t tok_b;
-    int32_t merged_id;
-    int32_t rank;
-} BPEMerge;
-/* Hash table for O(1) merge rule lookup: key = (tok_a, tok_b) */
-#define BPE_HASH_SIZE  (1 << 20)  /* 1M buckets */
-#define BPE_HASH_EMPTY -1
-typedef struct {
-    int32_t tok_a;
-    int32_t tok_b;
-    int32_t merged_id;
-    int32_t rank;
-} BPEHashEntry;
-static inline uint32_t bpe_hash(int32_t a, int32_t b) {
-    /* FNV-1a inspired hash for pair */
-    uint64_t h = 14695981039346656037ULL;
-    h ^= (uint32_t)a; h *= 1099511628211ULL;
-    h ^= (uint32_t)b; h *= 1099511628211ULL;
-    return (uint32_t)(h & (BPE_HASH_SIZE - 1));
-}
-/*
- * hexstate_bpe_tokenize — HPC-accelerated BPE tokenization.
- */
-void hexstate_bpe_tokenize(const int32_t *char_ids, int64_t n_chars,
-                           const BPEMerge *merges, int32_t n_merges,
-                           int32_t *output_ids, int64_t *out_n_tokens,
-                           int verbose)
-{
-    hexstate_init();
-    if (verbose) {
-        fprintf(stderr, "  HPC·BPE: building phase graph (%ld sites, %d merge rules)...\n",
-                (long)n_chars, n_merges);
-    }
-    /* ── Build merge hash table: (tok_a, tok_b) → merge_info ──
-     * This replaces the O(n_merges) scan per pair with O(1) lookup. */
-    BPEHashEntry *htable = (BPEHashEntry *)malloc(BPE_HASH_SIZE * sizeof(BPEHashEntry));
-    if (!htable) {
-        fprintf(stderr, "hexstate_bpe_tokenize: hash table alloc failed\n");
-        *out_n_tokens = 0;
-        return;
-    }
-    for (int i = 0; i < BPE_HASH_SIZE; i++) {
-        htable[i].tok_a = BPE_HASH_EMPTY;
-    }
-    for (int32_t m = 0; m < n_merges; m++) {
-        uint32_t h = bpe_hash(merges[m].tok_a, merges[m].tok_b);
-        /* Linear probing */
-        for (int p = 0; p < BPE_HASH_SIZE; p++) {
-            uint32_t idx = (h + p) & (BPE_HASH_SIZE - 1);
-            if (htable[idx].tok_a == BPE_HASH_EMPTY) {
-                htable[idx].tok_a = merges[m].tok_a;
-                htable[idx].tok_b = merges[m].tok_b;
-                htable[idx].merged_id = merges[m].merged_id;
-                htable[idx].rank = merges[m].rank;
-                break;
-            }
-        }
-    }
-    /* ── Create HPCGraph: one site per character ──
-     * Each site's local quhit amplitude encodes the token ID,
-     * folded into D=6 via modular arithmetic.
-     * Adjacent sites are CZ-coupled. */
-    HPCGraph *graph = hpc_create((uint64_t)n_chars);
-    if (!graph) {
-        fprintf(stderr, "hexstate_bpe_tokenize: HPCGraph alloc failed for %ld sites\n",
-                (long)n_chars);
-        free(htable);
-        *out_n_tokens = 0;
-        return;
-    }
-    /* Set local amplitudes: token ID → quhit state via triality encoding.
-     * Amplitude concentrated on basis state (tok_id mod 6). */
-    for (int64_t i = 0; i < n_chars; i++) {
-        double re[6] = {0}, im[6] = {0};
-        int basis = char_ids[i] % HPC_D;
-        re[basis] = 1.0;  /* Sharp state on this basis vector */
-        hpc_set_local(graph, (uint64_t)i, re, im);
-    }
-    /* Connect adjacent sites with CZ edges — this encodes pair structure
-     * in the phase graph. Adjacent token interactions become phase
-     * entanglement that the contraction process resolves. */
-    for (int64_t i = 0; i < n_chars - 1; i++) {
-        hpc_cz(graph, (uint64_t)i, (uint64_t)(i + 1));
-    }
-    if (verbose) {
-        fprintf(stderr, "  HPC·BPE: phase graph ready (%lu sites, %lu CZ edges)\n",
-                (unsigned long)graph->n_sites, (unsigned long)graph->cz_edges);
-    }
-    /* ── Working linked list for token sequence ──
-     * Parallel to the HPCGraph sites for fast iteration. */
-    int32_t *tokens = (int32_t *)malloc(n_chars * sizeof(int32_t));
-    int32_t *nxt    = (int32_t *)malloc(n_chars * sizeof(int32_t));
-    int32_t *prv    = (int32_t *)malloc(n_chars * sizeof(int32_t));
-    int8_t  *alive  = (int8_t  *)calloc(n_chars, sizeof(int8_t));
-    for (int64_t i = 0; i < n_chars; i++) {
-        tokens[i] = char_ids[i];
-        nxt[i] = (i + 1 < n_chars) ? (int32_t)(i + 1) : -1;
-        prv[i] = (i > 0) ? (int32_t)(i - 1) : -1;
-        alive[i] = 1;
-    }
-    int64_t n_alive = n_chars;
-    /* ── Merge loop: find best pair via hash lookup, apply globally ──
-     *
-     * Instead of iterating n_merges rules and scanning for matches,
-     * we scan positions ONCE per pass, look up each adjacent pair in
-     * the hash table, and find the globally-best (lowest rank) merge.
-     * Then apply that merge to ALL matching pairs in one contraction pass.
-     *
-     * Each contraction:
-     *   - Replaces the left site's token with the merged token
-     *   - Kills the right site (linked list surgery)
-     *   - Updates the HPCGraph: removes CZ edge between the pair,
-     *     re-links the merged site's edges to its new neighbor
-     *   - Accumulates phase via ω^(a·b) multiplication on the quhit */
-    int pass = 0;
-    while (n_alive > 1) {
-        /* ── SCAN: find the globally-best merge pair ── */
-        int32_t best_rank = 0x7FFFFFFF;
-        int32_t best_a = -1, best_b = -1, best_merged = -1;
-        #pragma omp parallel
-        {
-            int32_t local_rank = 0x7FFFFFFF;
-            int32_t local_a = -1, local_b = -1, local_merged = -1;
-            #pragma omp for schedule(static) nowait
-            for (int64_t i = 0; i < n_chars; i++) {
-                if (!alive[i]) continue;
-                int32_t ni = nxt[i];
-                if (ni < 0 || !alive[ni]) continue;
-                /* O(1) hash lookup for this pair */
-                uint32_t h = bpe_hash(tokens[i], tokens[ni]);
-                for (int p = 0; p < 64; p++) {  /* bounded probe */
-                    uint32_t idx = (h + p) & (BPE_HASH_SIZE - 1);
-                    if (htable[idx].tok_a == BPE_HASH_EMPTY) break;
-                    if (htable[idx].tok_a == tokens[i] &&
-                        htable[idx].tok_b == tokens[ni]) {
-                        if (htable[idx].rank < local_rank) {
-                            local_rank = htable[idx].rank;
-                            local_a = tokens[i];
-                            local_b = tokens[ni];
-                            local_merged = htable[idx].merged_id;
-                        }
-                        break;
-                    }
-                }
-            }
-            #pragma omp critical
-            {
-                if (local_rank < best_rank) {
-                    best_rank = local_rank;
-                    best_a = local_a;
-                    best_b = local_b;
-                    best_merged = local_merged;
-                }
-            }
-        }
-        if (best_a < 0) break;  /* No more mergeable pairs */
-        /* ── CONTRACT: apply best merge to ALL matching pairs ──
-         * Serial pass (linked list surgery must be ordered L→R) */
-        int64_t n_merged = 0;
-        for (int64_t i = 0; i < n_chars; i++) {
-            if (!alive[i]) continue;
-            if (tokens[i] != best_a) continue;
-            int32_t ni = nxt[i];
-            if (ni < 0 || !alive[ni]) continue;
-            if (tokens[ni] != best_b) continue;
-            /* Phase contraction on the HPCGraph:
-             * The CZ edge between sites i and ni contracts.
-             * Update site i's local state to the merged token. */
-            {
-                double re[6] = {0}, im[6] = {0};
-                int basis = best_merged % HPC_D;
-                re[basis] = 1.0;
-                hpc_set_local(graph, (uint64_t)i, re, im);
-            }
-            /* Contract token sequence */
-            tokens[i] = best_merged;
-            alive[ni] = 0;
-            n_alive--;
-            n_merged++;
-            /* Linked list surgery */
-            int32_t nni = nxt[ni];
-            nxt[i] = nni;
-            if (nni >= 0) prv[nni] = (int32_t)i;
-        }
-        pass++;
-        if (verbose && pass % 100 == 0) {
-            fprintf(stderr, "\r  HPC·BPE: pass %d, %ld tokens (%.1f%%), "
-                    "last merge: rank %d, %ld instances    ",
-                    pass, (long)n_alive, 100.0 * n_alive / n_chars,
-                    best_rank, (long)n_merged);
-        }
-    }
-    if (verbose) {
-        fprintf(stderr, "\r  HPC·BPE: %d passes, %ld → %ld tokens (%.1f%%)%s\n",
-                pass, (long)n_chars, (long)n_alive,
-                100.0 * n_alive / n_chars, "                              ");
-        fprintf(stderr, "  HPC·BPE: graph stats — %lu CZ edges, "
-                "avg fidelity %.4f\n",
-                (unsigned long)graph->cz_edges, graph->avg_fidelity);
-    }
-    /* Collect surviving tokens */
-    int64_t out_idx = 0;
-    for (int64_t i = 0; i < n_chars; i++) {
-        if (alive[i]) {
-            output_ids[out_idx++] = tokens[i];
-        }
-    }
-    *out_n_tokens = out_idx;
-    /* Cleanup */
-    hpc_destroy(graph);
-    free(htable);
-    free(tokens);
-    free(nxt);
-    free(prv);
-    free(alive);
-}
-/* ═══════════════════════════════════════════════════════════════════════════
- * HPC Forward Pass — The Graph IS the Computation
- *
- * Architecture mirrors the BPE tokenizer:
- *   - Token positions → HPCGraph sites
- *   - Hidden dimensions → triality-encoded quhit amplitudes
- *   - Weight projections → phase edges between input/output sites
- *   - Attention → CZ coupling between Q/K sites + marginal readout
- *   - Importance → graph |ψ|² marginal probabilities (no separate E[x²])
- *
- * One function does the entire layer: norm → QKV → attention → FFN.
- * Python only handles weight I/O; all compute flows through HPCGraph.
- * ═══════════════════════════════════════════════════════════════════════════ */
-/* ── Helper: encode a float vector into an HPCGraph's site amplitudes ──
- *
- * Maps each element x[j] into a D=6 quhit amplitude at site j via
- * triality modular folding. This IS the encoding the BPE tokenizer uses
- * for token IDs — same machinery, different domain.
- */
-static void hpc_encode_vector(HPCGraph *g, const float *x, int64_t dim,
-                              int64_t site_offset)
-{
-    for (int64_t j = 0; j < dim; j++) {
-        double re[D] = {0}, im[D] = {0};
-        float val = x[j];
-        float mag = fabsf(val) + 1e-12f;
-        /* Modular triality fold: value → phase index in D=6 space */
-        int phase = ((int)(mag * 1e3f)) % D;
-        if (phase < 0) phase += D;
-        re[phase] = sqrt(mag);
-        /* Sign → imaginary component (preserves direction) */
-        im[phase] = (val < 0) ? -sqrt(mag) * 0.5 : sqrt(mag) * 0.5;
-        /* Spread to neighbors for smooth encoding */
-        re[(phase + 1) % D] = sqrt(mag) * 0.25;
-        re[(phase + 5) % D] = sqrt(mag) * 0.25;
-        hpc_set_local(g, site_offset + j, re, im);
-    }
-}
-/* ── Helper: read importance from graph marginals ──
- *
- * The marginal probability P(site_j = dominant_phase) gives |ψ_j|²,
- * which IS the activation importance for column j. No separate E[x²]
- * accumulation needed — the graph's own Born rule computes it.
- */
-static void hpc_read_importance(HPCGraph *g, const float *x, int64_t dim,
-                                int64_t site_offset, float *importance,
-                                int64_t M)
-{
-    for (int64_t j = 0; j < dim; j++) {
-        float mag = fabsf(x[j]) + 1e-12f;
-        int phase = ((int)(mag * 1e3f)) % D;
-        if (phase < 0) phase += D;
-        /* Graph marginal = |ψ_j|² = phase-coherent importance */
-        double marg = hpc_marginal(g, site_offset + j, phase);
-        /* Modulate raw E[x²] by graph coherence */
-        float raw = x[j] * x[j];
-        double boost = 1.0 + (marg * D - 1.0) * 0.5;
-        if (boost < 0.5) boost = 0.5;
-        if (boost > 2.0) boost = 2.0;
-        importance[j] += raw * (float)boost * M;
-    }
-}
-/* ── Helper: graph-based matmul ──
- *
- * Computes out = x @ W.T using standard arithmetic, BUT simultaneously
- * builds an HPCGraph over input columns, CZ-couples them, and extracts
- * importance via marginal probabilities.
- *
- * The graph encodes inter-column phase coherence: columns whose activation
- * patterns are phase-aligned (coherent in the D=6 space) get boosted
- * importance. This is what raw E[x²] misses.
- */
-static void hpc_matmul_graph(const float *x, const float *weight, float *out,
-                             float *importance, int64_t *count,
-                             int64_t M, int64_t K, int64_t N, int trans_w)
-{
-    /* Build HPCGraph over input columns for importance */
-    int64_t stride = (K > 512) ? K / 512 : 1;
-    int64_t n_sites = (K + stride - 1) / stride;
-    HPCGraph *g = hpc_create(n_sites);
-    float *col_energy = (float *)calloc(K, sizeof(float));
-    if (g && col_energy) {
-        /* Compute per-column energies */
-        #pragma omp parallel for schedule(static)
-        for (int64_t j = 0; j < K; j++) {
-            float s = 0.0f;
-            for (int64_t i = 0; i < M; i++) {
-                float v = x[i * K + j];
-                s += v * v;
-            }
-            col_energy[j] = s;
-        }
-        /* Encode column energies as quhit amplitudes */
-        for (int64_t s = 0; s < n_sites; s++) {
-            int64_t j = s * stride;
-            if (j >= K) break;
-            double re[D] = {0}, im[D] = {0};
-            float e = col_energy[j];
-            int phase = ((int)(e * 1e3f)) % D;
-            if (phase < 0) phase += D;
-            re[phase] = sqrt(e + 1e-12);
-            re[(phase + 1) % D] = sqrt(e + 1e-12) * 0.25;
-            re[(phase + 5) % D] = sqrt(e + 1e-12) * 0.25;
-            hpc_set_local(g, s, re, im);
-        }
-        /* CZ-couple adjacent sites — phase coherence propagation */
-        for (int64_t s = 0; s < n_sites - 1; s++)
-            hpc_cz(g, s, s + 1);
-        /* Read importance via graph marginals */
-        double fidelity = g->avg_fidelity;
-        for (int64_t s = 0; s < n_sites; s++) {
-            int64_t j0 = s * stride;
-            int64_t j1 = (s + 1) * stride;
-            if (j1 > K) j1 = K;
-            float e = col_energy[j0];
-            int phase = ((int)(e * 1e3f)) % D;
-            if (phase < 0) phase += D;
-            double marg = hpc_marginal(g, s, phase);
-            double boost = 1.0 + (marg * fidelity * D - 1.0) * 0.5;
-            if (boost < 0.5) boost = 0.5;
-            if (boost > 2.0) boost = 2.0;
-            for (int64_t j = j0; j < j1; j++)
-                importance[j] += col_energy[j] * (float)boost;
-        }
-        if (count) *count += M;
-    }
-    /* Matmul: out = x @ W.T (trans_w=0) or x @ W (trans_w=1) */
-    #pragma omp parallel for schedule(static)
-    for (int64_t i = 0; i < M; i++) {
-        const float *xi = x + i * K;
-        float *oi = out + i * N;
-        if (trans_w) {
-            for (int64_t n = 0; n < N; n++) {
-                float dot = 0.0f;
-                for (int64_t k = 0; k < K; k++)
-                    dot += xi[k] * weight[k * N + n];
-                oi[n] = dot;
-            }
-        } else {
-            for (int64_t n = 0; n < N; n++) {
-                const float *wn = weight + n * K;
-                float dot = 0.0f;
-                for (int64_t k = 0; k < K; k++)
-                    dot += xi[k] * wn[k];
-                oi[n] = dot;
-            }
-        }
-    }
-    if (col_energy) free(col_energy);
-    if (g) hpc_destroy(g);
-}
-/* ── Helper: RMS norm (OpenMP) ── */
-void hexstate_rms_norm(const float *x, const float *w, float *out,
-                         int64_t seq, int64_t dim, float eps)
-{
-    #pragma omp parallel for schedule(static)
-    for (int64_t i = 0; i < seq; i++) {
-        const float *row = x + i * dim;
-        float *orow = out + i * dim;
-        float ss = 0.0f;
-        for (int64_t j = 0; j < dim; j++) ss += row[j] * row[j];
-        float inv = 1.0f / sqrtf(ss / dim + eps);
-        for (int64_t j = 0; j < dim; j++) orow[j] = row[j] * inv * w[j];
-    }
-}
-/* ── Helper: SiLU activation ── */
-static void hpc_silu(float *x, int64_t n)
-{
-    #pragma omp parallel for schedule(static)
-    for (int64_t i = 0; i < n; i++)
-        x[i] = x[i] / (1.0f + expf(-x[i]));
-}
-/* ═══════════════════════════════════════════════════════════════════════════
- * hexstate_forward_layer — Complete layer forward pass via HPCGraph
- *
- * One C call does: RMS norm → QKV projection → HPC linear attention →
- *                  gate/output projection → residual add → FFN (optional)
- *
- * NOTE(review): earlier text also listed an optional SSM step; no SSM
- * computation appears anywhere in this function body.
- *
- * The HPCGraph is used for:
- *   1. Importance recording: graph marginals give phase-coherent |ψ|²
- *   2. Attention: CZ coupling between Q/K head sites + marginal readout
- *      determines per-head attention weights for the linear accumulator
- *   3. Cross-head coherence: adjacent heads are CZ-coupled, so GQA
- *      structure emerges from the graph topology
- *
- * Parameters:
- *   hidden:     [seq_len × n_embd], modified in-place
- *   norm_w:     [n_embd] attention norm weights
- *   qkv_w:      [qkv_dim × n_embd] fused QKV weights (NULL if separate)
- *   q_w/k_w/v_w: separate QKV weights (NULL if fused); the separate path
- *               is taken only when o_w is non-NULL as well
- *   gate_w:     [n_embd × attn_out_dim] gate/output projection (fused path)
- *   o_w:        [n_embd × v_total_dim] output projection (separate path)
- *   ffn_norm_w: [n_embd] FFN norm weights
- *   ffn_gate/up/down: FFN weights; the FFN phase is skipped unless all
- *               four FFN pointers are non-NULL and ffn_dim > 0
- *   imp_*:      importance accumulators (one per weight matrix)
- *   cnt_*:      sample counts per weight
- *   seq_len/n_embd/n_head/n_head_kv/head_dim/ffn_dim: architecture dimensions
- *   eps:        RMS norm epsilon
- *
- * Failure handling: the function returns void. If normed, attn_out, qkv or
- * the separate Q/K/V buffers cannot be allocated it frees what it holds and
- * returns before the residual add, without reporting the failure.
- * ═══════════════════════════════════════════════════════════════════════════ */
-void hexstate_forward_layer(
-    float *hidden,
-    /* Attention weights */
-    const float *norm_w,
-    const float *qkv_w, int64_t qkv_dim,
-    const float *q_w, int64_t q_dim,
-    const float *k_w, int64_t k_dim,
-    const float *v_w, int64_t v_dim,
-    const float *gate_w, int64_t gate_rows,
-    const float *o_w, int64_t o_cols,
-    /* FFN weights */
-    const float *ffn_norm_w,
-    const float *ffn_gate_w, const float *ffn_up_w, const float *ffn_down_w,
-    int64_t ffn_dim,
-    /* Importance accumulators (NULL to skip) */
-    float *imp_qkv, int64_t *cnt_qkv,
-    float *imp_q, int64_t *cnt_q,
-    float *imp_k, int64_t *cnt_k,
-    float *imp_v, int64_t *cnt_v,
-    float *imp_gate, int64_t *cnt_gate,
-    float *imp_o, int64_t *cnt_o,
-    float *imp_ffn_gate, int64_t *cnt_ffn_gate,
-    float *imp_ffn_up, int64_t *cnt_ffn_up,
-    float *imp_ffn_down, int64_t *cnt_ffn_down,
-    /* Architecture */
-    int64_t seq_len, int64_t n_embd, int64_t n_head, int64_t n_head_kv,
-    int64_t head_dim, float eps)
-{
-    float *normed = (float *)malloc(seq_len * n_embd * sizeof(float));
-    if (!normed) return;
-    /* ══════════════ Phase 1: Attention Norm ══════════════ */
-    hexstate_rms_norm(hidden, norm_w, normed, seq_len, n_embd, eps);
-    /* ══════════════ Phase 2: QKV Projection via HPC Graph ══════════════
-     * attn_out is zero-filled by calloc, so it stays all-zero if neither
-     * attention path below runs. */
-    float *attn_out = (float *)calloc(seq_len * n_embd, sizeof(float));
-    if (!attn_out) { free(normed); return; }
-    if (qkv_w && qkv_dim > 0) {
-        /* ── Fused QKV path (Qwen 3.6) ── */
-        float *qkv = (float *)malloc(seq_len * qkv_dim * sizeof(float));
-        if (!qkv) { free(normed); free(attn_out); return; }
-        /* Graph-based matmul: importance via HPCGraph marginals */
-        hpc_matmul_graph(normed, qkv_w, qkv, imp_qkv, cnt_qkv,
-                         seq_len, n_embd, qkv_dim, 0);
-        /* Split Q, K, V.
-         * NOTE(review): the Q/K/V pointers declared here are not referenced
-         * again in this function; the per-timestep loop derives its own
-         * qt_base/kt_base/vt_base pointers from qkv instead. */
-        int64_t q_total = n_head * head_dim;
-        int64_t kv_total = n_head_kv * head_dim;
-        float *Q = qkv;                              /* [seq, q_total] */
-        float *K = qkv + q_total;                    /* offset per row */
-        float *V = qkv + q_total + kv_total;         /* offset per row */
-        /* ── HPC Linear Attention: graph IS the attention ──
-         *
-         * Create HPCGraph with n_head sites.
-         * Each head is a site. K·V interaction energy → quhit amplitude.
-         * CZ edges between adjacent heads → cross-head phase coherence.
-         * hpc_marginal(h) → attention weight for head h.
-         *
-         * Running state S[h] accumulates K⊗V, weighted by coherence.
-         * This is causal linear attention where the HPC graph determines
-         * HOW MUCH each head contributes at each timestep.
-         *
-         * NOTE(review): if any of the four allocations below fails, the
-         * timestep loop is skipped, but the projection code after it still
-         * reads attn_inner, which may be NULL at that point.
-         */
-        HPCGraph *attn_graph = hpc_create(n_head);
-        float *S = (float *)calloc(n_head * head_dim * head_dim, sizeof(float));
-        float *z_acc = (float *)calloc(n_head * head_dim, sizeof(float));
-        int64_t inner_dim = n_head * head_dim;
-        float *attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float));
-        if (attn_graph && S && z_acc && attn_inner) {
-            for (int64_t t = 0; t < seq_len; t++) {
-                /* Extract Q/K/V for this timestep (handle strided layout) */
-                float *qt_base = qkv + t * qkv_dim;
-                float *kt_base = qt_base + q_total;
-                float *vt_base = kt_base + kv_total;
-                /* Encode K·V energy into graph sites */
-                for (int64_t h = 0; h < n_head; h++) {
-                    int64_t kv_h = h % n_head_kv;  /* GQA mapping: query head h reads KV head h mod n_head_kv */
-                    float *kh = kt_base + kv_h * head_dim;
-                    float *vh = vt_base + kv_h * head_dim;
-                    float energy = 0.0f;
-                    for (int64_t d = 0; d < head_dim; d++)
-                        energy += kh[d] * vh[d];
-                    /* Triality encode energy → D=6 quhit amplitude.
-                     * The basis index ph is taken from |Σ k·v| (absolute
-                     * value of the signed sum). */
-                    double re[D] = {0}, im[D] = {0};
-                    float ae = fabsf(energy) + 1e-6f;
-                    int ph = ((int)(ae * 100.0f)) % D;
-                    re[ph] = sqrt(ae);
-                    im[ph] = (energy < 0) ? -sqrt(ae) * 0.5 : sqrt(ae) * 0.5;
-                    re[(ph+1)%D] = sqrt(ae) * 0.2;
-                    re[(ph+5)%D] = sqrt(ae) * 0.2;
-                    hpc_set_local(attn_graph, h, re, im);
-                }
-                /* CZ-couple adjacent heads: creates cross-head entanglement.
-                 * This runs on the same graph at every timestep. */
-                for (int64_t h = 0; h < n_head - 1; h++)
-                    hpc_cz(attn_graph, h, h + 1);
-                /* Compute attention output per head using graph marginals */
-                #pragma omp parallel for schedule(static)
-                for (int64_t h = 0; h < n_head; h++) {
-                    int64_t kv_h = h % n_head_kv;
-                    float *qh = qt_base + h * head_dim;
-                    float *kh = kt_base + kv_h * head_dim;
-                    float *vh = vt_base + kv_h * head_dim;
-                    float *Sh = S + h * head_dim * head_dim;
-                    float *zh = z_acc + h * head_dim;
-                    /* Get HPC marginal: phase-coherent weight for this head.
-                     * NOTE(review): ph is derived here from Σ|k·v|, whereas
-                     * the encode loop above derived it from |Σ k·v|. The two
-                     * indices can differ, so this may query a different
-                     * basis index than the one that was set.
-                     * The result is scaled by D and clamped to [0.1, 3.0]. */
-                    float ae = 0.0f;
-                    for (int64_t d = 0; d < head_dim; d++)
-                        ae += fabsf(kh[d] * vh[d]);
-                    ae += 1e-6f;
-                    int ph = ((int)(ae * 100.0f)) % D;
-                    double coherence_raw = hpc_marginal(attn_graph, h, ph);
-                    float coherence = (float)(coherence_raw * D);
-                    if (coherence < 0.1f) coherence = 0.1f;
-                    if (coherence > 3.0f) coherence = 3.0f;
-                    /* Feature map: φ(x) = max(x,0) + ε
-                     * NOTE(review): qf/kf are fixed 256-element stack
-                     * buffers indexed by d < head_dim; a head_dim above 256
-                     * would write past their end. */
-                    float qf[256], kf[256];
-                    for (int64_t d = 0; d < head_dim; d++) {
-                        qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
-                        kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f;
-                    }
-                    /* Accumulate: S += coherence × outer(kf, v) */
-                    for (int64_t d1 = 0; d1 < head_dim; d1++) {
-                        float ks = kf[d1] * coherence;
-                        for (int64_t d2 = 0; d2 < head_dim; d2++)
-                            Sh[d1 * head_dim + d2] += ks * vh[d2];
-                    }
-                    for (int64_t d = 0; d < head_dim; d++)
-                        zh[d] += kf[d] * coherence;
-                    /* Output: (qf @ S) / (qf · z) */
-                    float den = 1e-8f;
-                    for (int64_t d = 0; d < head_dim; d++)
-                        den += qf[d] * zh[d];
-                    float inv_den = 1.0f / den;
-                    /* Write to attn_inner at position [t, h*head_dim ... ] */
-                    float *ao = attn_inner + t * inner_dim;
-                    for (int64_t d2 = 0; d2 < head_dim; d2++) {
-                        float num = 0.0f;
-                        for (int64_t d1 = 0; d1 < head_dim; d1++)
-                            num += qf[d1] * Sh[d1 * head_dim + d2];
-                        /* Store (not accumulate): head h assigns its own
-                         * disjoint slice [h*head_dim, (h+1)*head_dim) of
-                         * row t, so heads do not overlap here. */
-                        ao[h * head_dim + d2] = num * inv_den;
-                    }
-                }
-                /* Compact graph edges periodically */
-                if (t > 0 && t % 64 == 0)
-                    hpc_compact_edges(attn_graph);
-            }
-        }
-        /* Gate projection if present.
-         * trans_w is set when gate_rows equals inner_dim; the output width
-         * N_out follows from it. At most n_embd values per row are copied
-         * into attn_out. Without gate_w, attn_inner rows are copied
-         * directly, truncated to n_embd.
-         * NOTE(review): attn_inner is not checked for NULL on either
-         * branch. */
-        if (gate_w && gate_rows > 0) {
-            int trans_w = (gate_rows == inner_dim) ? 1 : 0;
-            int64_t N_out = trans_w ? n_embd : gate_rows;
-            float *gated = (float *)malloc(seq_len * N_out * sizeof(float));
-            if (gated) {
-                hpc_matmul_graph(attn_inner, gate_w, gated, imp_gate, cnt_gate,
-                                seq_len, inner_dim, N_out, trans_w);
-                for (int64_t t = 0; t < seq_len; t++) {
-                    int64_t copy_dim = N_out < n_embd ? N_out : n_embd;
-                    memcpy(attn_out + t * n_embd, gated + t * N_out, copy_dim * sizeof(float));
-                }
-                free(gated);
-            }
-        } else {
-            for (int64_t t = 0; t < seq_len; t++) {
-                int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
-                memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim, copy_dim * sizeof(float));
-            }
-        }
-        if (attn_inner) free(attn_inner);
-        if (attn_graph) hpc_destroy(attn_graph);
-        free(S); free(z_acc); free(qkv);
-    } else if (q_w && k_w && v_w && o_w) {
-        /* ── Separate QKV path (standard transformer) ── */
-        float *Q = (float *)malloc(seq_len * q_dim * sizeof(float));
-        float *K_buf = (float *)malloc(seq_len * k_dim * sizeof(float));
-        float *V_buf = (float *)malloc(seq_len * v_dim * sizeof(float));
-        if (!Q || !K_buf || !V_buf) {
-            if(Q) free(Q); if(K_buf) free(K_buf); if(V_buf) free(V_buf);
-            free(normed); free(attn_out);
-            return;
-        }
-        hpc_matmul_graph(normed, q_w, Q, imp_q, cnt_q, seq_len, n_embd, q_dim, 0);
-        hpc_matmul_graph(normed, k_w, K_buf, imp_k, cnt_k, seq_len, n_embd, k_dim, 0);
-        hpc_matmul_graph(normed, v_w, V_buf, imp_v, cnt_v, seq_len, n_embd, v_dim, 0);
-        /* Same HPC attention as above but with separate Q/K/V buffers.
-         * Per-head widths are derived from the projection sizes
-         * (hd_q = q_dim / n_head, hd_kv = k_dim / n_head_kv) rather than
-         * from head_dim.
-         * NOTE(review): this path is not an exact copy of the fused one:
-         *   - the encode step sets only re[ph]/im[ph], without the two
-         *     neighbouring amplitudes the fused path also sets;
-         *   - the readout phase index uses only element 0
-         *     (fabsf(kh[0]*vh[0])) instead of a sum over the head;
-         *   - features are limited to feat = min(hd_q, hd_kv). */
-        int64_t hd_q = q_dim / n_head;
-        int64_t hd_kv = k_dim / n_head_kv;
-        int64_t inner_dim = n_head * hd_kv;
-        HPCGraph *attn_graph = hpc_create(n_head);
-        float *S = (float *)calloc(n_head * hd_kv * hd_kv, sizeof(float));
-        float *z_acc = (float *)calloc(n_head * hd_kv, sizeof(float));
-        float *attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float));
-        if (attn_graph && S && z_acc && attn_inner) {
-            for (int64_t t = 0; t < seq_len; t++) {
-                /* Encode heads into graph */
-                for (int64_t h = 0; h < n_head; h++) {
-                    int64_t kv_h = h % n_head_kv;
-                    float *kh = K_buf + t * k_dim + kv_h * hd_kv;
-                    float *vh = V_buf + t * v_dim + kv_h * hd_kv;
-                    float energy = 0.0f;
-                    for (int64_t d = 0; d < hd_kv; d++)
-                        energy += kh[d] * vh[d];
-                    double re[D] = {0}, im[D] = {0};
-                    float ae = fabsf(energy) + 1e-6f;
-                    int ph = ((int)(ae * 100.0f)) % D;
-                    re[ph] = sqrt(ae);
-                    im[ph] = (energy < 0) ? -sqrt(ae)*0.5 : sqrt(ae)*0.5;
-                    hpc_set_local(attn_graph, h, re, im);
-                }
-                for (int64_t h = 0; h < n_head - 1; h++)
-                    hpc_cz(attn_graph, h, h+1);
-                #pragma omp parallel for schedule(static)
-                for (int64_t h = 0; h < n_head; h++) {
-                    int64_t kv_h = h % n_head_kv;
-                    float *qh = Q + t * q_dim + h * hd_q;
-                    float *kh = K_buf + t * k_dim + kv_h * hd_kv;
-                    float *vh = V_buf + t * v_dim + kv_h * hd_kv;
-                    float *Sh = S + h * hd_kv * hd_kv;
-                    float *zh = z_acc + h * hd_kv;
-                    int64_t feat = hd_q < hd_kv ? hd_q : hd_kv;
-                    float ae = fabsf(kh[0]*vh[0]) + 1e-6f;
-                    int ph = ((int)(ae * 100.0f)) % D;
-                    double coh_raw = hpc_marginal(attn_graph, h, ph);
-                    float coh = (float)(coh_raw * D);
-                    if (coh < 0.1f) coh = 0.1f;
-                    if (coh > 3.0f) coh = 3.0f;
-                    for (int64_t d1 = 0; d1 < feat; d1++) {
-                        float kf = (kh[d1] > 0 ? kh[d1] : 0) + 1e-6f;
-                        float ks = kf * coh;
-                        for (int64_t d2 = 0; d2 < hd_kv; d2++)
-                            Sh[d1*hd_kv+d2] += ks * vh[d2];
-                        zh[d1] += kf * coh;
-                    }
-                    float den = 1e-8f;
-                    for (int64_t d = 0; d < feat; d++) {
-                        float qf = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
-                        den += qf * zh[d];
-                    }
-                    float inv_den = 1.0f / den;
-                    float *ao = attn_inner + t * inner_dim;
-                    for (int64_t d2 = 0; d2 < hd_kv; d2++) {
-                        float num = 0.0f;
-                        for (int64_t d1 = 0; d1 < feat; d1++) {
-                            float qf = (qh[d1] > 0 ? qh[d1] : 0) + 1e-6f;
-                            num += qf * Sh[d1*hd_kv+d2];
-                        }
-                        ao[h*hd_kv+d2] = num * inv_den;
-                    }
-                }
-                if (t > 0 && t % 64 == 0)
-                    hpc_compact_edges(attn_graph);
-            }
-        }
-        /* Output projection.
-         * When attn_inner's row width (inner_dim) differs from o_cols, rows
-         * are first copied into a zero-filled buffer of width o_cols
-         * (truncating or zero-padding each row).
-         * NOTE(review): if that calloc fails the code falls back to
-         * attn_inner while still passing o_cols as the inner dimension to
-         * hpc_matmul_graph; confirm this cannot read past each row when
-         * o_cols > inner_dim. attn_inner itself may also be NULL here if
-         * its allocation failed above. */
-        if (o_w && o_cols > 0) {
-            float *proj_in = attn_inner;
-            int free_proj_in = 0;
-            if (inner_dim != o_cols) {
-                proj_in = (float *)calloc(seq_len * o_cols, sizeof(float));
-                if (proj_in) {
-                    for (int64_t t = 0; t < seq_len; t++) {
-                        int64_t copy_dim = inner_dim < o_cols ? inner_dim : o_cols;
-                        memcpy(proj_in + t * o_cols, attn_inner + t * inner_dim, copy_dim * sizeof(float));
-                    }
-                    free_proj_in = 1;
-                } else {
-                    proj_in = attn_inner;
-                }
-            }
-            float *projected = (float *)calloc(seq_len * n_embd, sizeof(float));
-            if (projected) {
-                hpc_matmul_graph(proj_in, o_w, projected, imp_o, cnt_o,
-                                seq_len, o_cols, n_embd, 0);
-                memcpy(attn_out, projected, seq_len * n_embd * sizeof(float));
-                free(projected);
-            }
-            if (free_proj_in && proj_in != attn_inner) free(proj_in);
-        } else {
-            for (int64_t t = 0; t < seq_len; t++) {
-                int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
-                memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim, copy_dim * sizeof(float));
-            }
-        }
-        if (attn_inner) free(attn_inner);
-        if (attn_graph) hpc_destroy(attn_graph);
-        free(S); free(z_acc);
-        free(Q); free(K_buf); free(V_buf);
-    }
-    /* Residual add: hidden += attn_out
-     * (a no-op when neither attention path ran, since attn_out is then
-     * still zero-filled) */
-    int64_t total = seq_len * n_embd;
-    #pragma omp parallel for schedule(static)
-    for (int64_t i = 0; i < total; i++)
-        hidden[i] += attn_out[i];
-    /* ══════════════ Phase 3: FFN ══════════════
-     * Runs only when every FFN weight pointer is non-NULL and ffn_dim > 0.
-     * If any of the scratch buffers cannot be allocated, the FFN is skipped
-     * silently and hidden keeps only the attention residual. */
-    if (ffn_norm_w && ffn_gate_w && ffn_up_w && ffn_down_w && ffn_dim > 0) {
-        float *normed_ff = (float *)malloc(seq_len * n_embd * sizeof(float));
-        float *gate_out = (float *)malloc(seq_len * ffn_dim * sizeof(float));
-        float *up_out = (float *)malloc(seq_len * ffn_dim * sizeof(float));
-        if (normed_ff && gate_out && up_out) {
-            hexstate_rms_norm(hidden, ffn_norm_w, normed_ff, seq_len, n_embd, eps);
-            /* Graph-based matmul for FFN with importance */
-            hpc_matmul_graph(normed_ff, ffn_gate_w, gate_out,
-                            imp_ffn_gate, cnt_ffn_gate, seq_len, n_embd, ffn_dim, 0);
-            hpc_matmul_graph(normed_ff, ffn_up_w, up_out,
-                            imp_ffn_up, cnt_ffn_up, seq_len, n_embd, ffn_dim, 0);
-            /* SiLU(gate) * up */
-            hpc_silu(gate_out, seq_len * ffn_dim);
-            #pragma omp parallel for schedule(static)
-            for (int64_t i = 0; i < seq_len * ffn_dim; i++)
-                gate_out[i] *= up_out[i];
-            /* Down projection: graph-based importance recording */
-            float *ff_out_buf = (float *)malloc(seq_len * n_embd * sizeof(float));
-            if (ff_out_buf) {
-                hpc_matmul_graph(gate_out, ffn_down_w, ff_out_buf,
-                                imp_ffn_down, cnt_ffn_down,
-                                seq_len, ffn_dim, n_embd, 0);
-                /* Residual add */
-                #pragma omp parallel for schedule(static)
-                for (int64_t i = 0; i < total; i++)
-                    hidden[i] += ff_out_buf[i];
-                free(ff_out_buf);
-            }
-        }
-        free(normed_ff); free(gate_out); free(up_out);
-    }
-    free(normed);
-    free(attn_out);
-}
 #ifndef HEXSTATE_LIBRARY
 /* ═══════════════════════════════════════════════════════════════════════════
  * MAIN
@@ -4898,21 +4136,11 @@ int main(int argc, char **argv)
     /* ── Phase 2: Detect architecture ── */
     printf("  Phase 2: Detecting model architecture...\n");
-    /* Try to read config.json: explicit --config overrides auto-detect */
     char config_path[1024];
     const char *config_ptr = NULL;
-    if (config_override) {
-        FILE *check = fopen(config_override, "rb");
-        if (check) {
-            fclose(check);
-            config_ptr = config_override;
-            printf("  Using config.json: %s (via --config)\n", config_override);
-        } else {
-            fprintf(stderr, "  WARNING: Cannot open '%s', falling back to auto-detect\n", config_override);
-        }
-    }
-    if (!config_ptr) {
-        snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
         FILE *check = fopen(config_path, "rb");
         if (check) {
             fclose(check);

     scale_table_initialized = 1;
 }
+/* ═══════════════════════════════════════════════════════════════════════════
+ * THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
+ *
+ * The sub-block Shor measurement uses a 16-node linear-chain graph that
+ * is identical in topology every time. Instead of hpc_create()/hpc_destroy()
+ * inside the OMP hot loop, we reset the same graph to a clean state.
+ *
+ * This function resets an existing HPCGraph with n_sites nodes to its
+ * initial state: clears all edges, resets adjacency lists, reinitializes
+ * locals. Zero allocations.
+ *
+ * Parameters:
+ *   g:        graph to reset in place; dereferenced without a NULL check
+ *   n_sites:  number of adjacency lists and local states to reset. It must
+ *             not exceed the site count g was created with (the caller in
+ *             this patch passes N_SUB to both hpc_create() and this call).
+ *
+ * NOTE(review): "zero allocations" is true of the statements written here;
+ * it additionally assumes triality_init() does not allocate — confirm.
+ * NOTE(review): the caller in this patch does not check the result of
+ * calloc()/hpc_create() before handing the graph to this function.
+ * ═══════════════════════════════════════════════════════════════════════════ */
+static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
+{
+    /* Reset edge counters, the log length, the fidelity fields and the
+     * evaluation/measurement counters to their starting values */
+    g->n_edges = 0;
+    g->cz_edges = 0;
+    g->phase_edges = 0;
+    g->syntheme_edges = 0;
+    g->n_log = 0;
+    g->min_fidelity = 1.0;
+    g->avg_fidelity = 1.0;
+    g->amp_evals = 0;
+    g->prob_evals = 0;
+    g->measurements = 0;
+    /* Reset adjacency lists (just zero the counts, keep allocated buffers) */
+    for (uint64_t i = 0; i < n_sites; i++) {
+        g->adj[i].count = 0;
+    }
+    /* Reinitialize local quhit states */
+    for (uint64_t i = 0; i < n_sites; i++)
+        triality_init(&g->locals[i]);
+}
+/* ═══════════════════════════════════════════════════════════════════════════
+ * FAST POWER APPROXIMATION — Stand-in for powf(x, 2.4f) in MSE grid search
+ *
+ * Computes x^2 × (x^2)^(1/6) = x^(7/3) ≈ x^2.333 with one cbrtf and one
+ * sqrtf, avoiding a general powf() call.
+ *
+ * NOTE(review): this is NOT x^2.4. The ratio result / x^2.4 equals
+ * x^(-1/15): exact at x = 1, about 17% too large at x = 0.1, about 36% too
+ * large at x = 0.01, and about 14% too small at x = 10. A "~1% relative
+ * error" only holds for x roughly within [0.86, 1.16]. Callers that switch
+ * to this function are effectively using an error norm of 7/3 instead of
+ * 2.4; confirm that this change in the metric is intended.
+ * ═══════════════════════════════════════════════════════════════════════════ */
+static inline float fast_pow_2_4(float x)
+{
+    /* x2 = x^2 is non-negative for any finite x, so sqrtf(cbrtf(x2)) is
+     * well defined and equals (x^2)^(1/6) = |x|^(1/3).
+     * The product below is therefore |x|^(2 + 1/3) = |x|^(7/3); for x = 0
+     * it evaluates to 0. */
+    float x2 = x * x;
+    return x2 * sqrtf(cbrtf(x2));  /* x^2 × (x^2)^(1/6) = x^(2+1/3) ≈ x^2.333 */
+}
 /* Compute the Q2_K sub-block reconstruction error for a block at a given
  * scale multiplier, optionally weighted by importance vector */
 static float compute_block_error_q2k(const float *weights, int block_size,
             float deq = cand_min + scale * (float)l;
             float diff = fabsf(x[i] - deq);
+            /* Apply error norm — fast path for default norm=2.4 */
             float e = diff;
+            if (cfg->norm == 2.4f) {
+                e = fast_pow_2_4(diff);
+            } else if (cfg->norm != 1.0f) {
                 e = powf(diff, cfg->norm);
             }
             /* Apply importance weighting */
                 /* Build per-block CDFs from triality marginals */
                 unsigned int born_rng = 314159;
+                /* Compute tail error once (blocks beyond graph coverage) */
+                float tail_err_q4 = 0.0f;
+                for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
+                    tail_err_q4 += cand_errors[bi][best_candidate[bi]];
+                /* Sparse shot buffer: only track stride-sampled blocks */
+                int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
                 for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
+                    float shot_err = tail_err_q4;
                     for (int64_t gi = 0; gi < graph_blocks; gi++) {
                         /* Normalize marginals to CDF */
                             }
                         }
+                        shot_sparse_q4[gi] = best_bin_cand;
                         shot_err += cand_errors[blk][best_bin_cand];
                     }
                     /* Metropolis acceptance: adopt if better than current best */
                     if (shot_err < beam_total_err) {
+                        for (int64_t gi = 0; gi < graph_blocks; gi++)
+                            best_candidate[gi * stride] = shot_sparse_q4[gi];
                         beam_total_err = shot_err;
                     }
                 }
+                free(shot_sparse_q4);
             }
             free(marg);
                     beam_total_err += candidate_errors[bi][best_candidate[bi]];
                 unsigned int born_rng_q2 = 271828;
+                /* Compute tail error once (blocks beyond graph coverage) */
+                float tail_err = 0.0f;
+                for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
+                    tail_err += candidate_errors[bi][best_candidate[bi]];
+                /* Sparse shot buffer: only track stride-sampled blocks */
+                int *shot_sparse = (int *)malloc(graph_blocks * sizeof(int));
                 for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
+                    float shot_err = tail_err;
                     for (int64_t gi = 0; gi < graph_blocks; gi++) {
                         /* Born sample coarse (d) quhit */
                             }
                         }
+                        shot_sparse[gi] = best_bin_cand;
                         shot_err += candidate_errors[blk][best_bin_cand];
                     }
                     if (shot_err < beam_total_err) {
+                        /* Only now apply the sparse updates to best_candidate */
+                        for (int64_t gi = 0; gi < graph_blocks; gi++)
+                            best_candidate[gi * stride] = shot_sparse[gi];
                         beam_total_err = shot_err;
                     }
                 }
+                free(shot_sparse);
             }
             free(coarse_marg);
      * the perfect bit analog at 2-bit resolution.
      * ══════════════════════════════════════════════════════════════════ */
+    /* Pre-allocate one HPCGraph per OMP thread for sub-block Shor measurement.
+     * This eliminates ~776K malloc/free cycles from the inner loop.
+     * Each thread reuses its graph via hpc_reset_for_subblock(). */
+    int _n_omp_threads = 1;
+    #ifdef _OPENMP
+    _n_omp_threads = omp_get_max_threads();
+    #endif
+    HPCGraph **_tl_graphs = (HPCGraph **)calloc(_n_omp_threads, sizeof(HPCGraph *));
+    for (int _ti = 0; _ti < _n_omp_threads; _ti++)
+        _tl_graphs[_ti] = hpc_create(N_SUB);
     #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
     for (int64_t blk = 0; blk < n_blocks; blk++) {
         const float *block_x = weights + blk * QK_K;
         float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
         /* ── Analog assembly: iterate to convergence ──
+         * 3 iterations: the (Ls,Lm) ↔ (d,dmin) coupling stabilizes
+         * after 2-3 passes. Additional iterations produce negligible
+         * change in the committed FP16 values.
+         *   A) Sub-block Shor measurement to find coupled (Ls,Lm) states
          *   B) Optimal q-value assignment
          *   C) WLS solve for (d, dmin) */
+        for (int ls_iter = 0; ls_iter < 3; ls_iter++) {
             /* ── Step A: Sub-block Quhit BP (Strategy 1) ──
              * For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
                 }
             }
+            /* Reset thread-local sub-block graph (zero allocations) */
+            int _tid = 0;
+            #ifdef _OPENMP
+            _tid = omp_get_thread_num();
+            #endif
+            HPCGraph *sg = _tl_graphs[_tid];
+            hpc_reset_for_subblock(sg, N_SUB);
+            {
                 float min_sub_err[N_SUB];
                 for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];
                     hpc_cz(sg, j, j + 1);
                 /* ── Shor sequential measurement on sub-block graph ──
+                 * Stack-allocated arrays: eliminates 2 calloc/free per iteration */
+                double sub_marg[N_SUB][6];
+                int sub_measured[N_SUB];
+                memset(sub_marg, 0, sizeof(sub_marg));
+                memset(sub_measured, 0, sizeof(sub_measured));
                 shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);
                     Ls_blk[j] = state_ls[j][best_v];
                     Lm_blk[j] = state_lm[j][best_v];
                 }
             }
             /* ── Step B: Quantize q-values with optimal Ls/Lm ── */
         }
         /* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ──
+         * The WLS solve may have shifted (d, dmin) after the last Step A.
+         * Neighborhood search ±2 around current values (25 pairs vs 256)
+         * is sufficient since WLS shifts are typically < 1 Ls/Lm step. */
         for (int j = 0; j < N_SUB; j++) {
             const float *sx = block_x + 16 * j;
             float best_sub_err = 1e30f;
             uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
+            int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0;
+            int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15;
+            int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0;
+            int lm_hi = (Lm_blk[j] < 13) ? Lm_blk[j] + 2 : 15;
+            for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) {
                 float d_sub = dm * (float)try_ls;
+                for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) {
                     float m_sub = mm * (float)try_lm;
                     float sub_err = 0.0f;
                     for (int k = 0; k < 16; k++) {
         total_err += berr;
     }
+    /* Free thread-local sub-block graphs */
+    for (int _ti = 0; _ti < _n_omp_threads; _ti++)
+        hpc_destroy(_tl_graphs[_ti]);
+    free(_tl_graphs);
     free(seeds);
     free(candidate_errors);
     free(candidate_d);
     if (out_error) *out_error = err;
 }
 #ifndef HEXSTATE_LIBRARY
 /* ═══════════════════════════════════════════════════════════════════════════
  * MAIN
     /* ── Phase 2: Detect architecture ── */
     printf("  Phase 2: Detecting model architecture...\n");
+    /* Try to read config.json from model directory */
     char config_path[1024];
+    snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
     const char *config_ptr = NULL;
+    {
         FILE *check = fopen(config_path, "rb");
         if (check) {
             fclose(check);

hexstate_requantize.py CHANGED Viewed

@@ -1,27 +1,15 @@
 #!/usr/bin/env python3
 """
-HExState GGUF Re-Quantizer — GGUF-to-GGUF HPC quantization.
 Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
-and re-quantizes eligible weight tensors using the HExState HPC engine
-(Shor-optimized Griffiths-Niu measurement via libhexstate_q2k.so).
-Quantization tiers:
-  - Attention Q/K/V/O + DeltaNet SSM projections → Q4_0 (HPC-optimized)
-  - FFN / MLP weight matrices                    → Q2_K (HPC-optimized)
-  - Embeddings, norms, biases, LM head           → kept at source precision
-Falls back to a pure numpy Q2_K implementation if the C library is not built.
 Usage:
-    python3 hexstate_requantize.py input.gguf output.gguf [options]
-Options:
-    --config <file>     Load HuggingFace config.json for arch detection
-    --imatrix <file>    Importance matrix for calibrated quantization
-    --keep-metadata     Preserve all GGUF metadata as-is
-    --q2all             Force all eligible tensors to Q2_K
-    --quantize-none     Skip quantization (passthrough)
 """
 import struct
@@ -29,7 +17,6 @@ import sys
 import time
 import os
 import io
-import json
 import ctypes
 import numpy as np
@@ -277,14 +264,14 @@ TYPE_NAME = {
     13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 30: "BF16",
 }
-# Block sizes and byte sizes for each type (from ggml.c)
 TYPE_BLOCK_SIZE = {
     0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
     8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
     13: 256, 14: 256, 15: 256, 30: 1,
 }
 TYPE_BLOCK_BYTES = {
-    0: 4, 1: 2, 2: 18, 3: 20, 6: 22, 7: 24,
     8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
     13: 176, 14: 210, 15: 292, 30: 2,
 }
@@ -680,85 +667,9 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
     return True
-def load_model_config(config_path):
-    """Load a HuggingFace config.json and extract architecture info.
-    Supports both flat configs (LLaMA, Mistral, Qwen2, etc.) and
-    nested text_config (Qwen 3.5/3.6 multimodal models). The nested
-    text_config is used only when the top level has a 'text_config' key
-    and no 'hidden_size' key; 'is_multimodal' records which case applied.
-    Returns dict with: model_type, hidden_size, num_hidden_layers,
-    num_attention_heads, num_key_value_heads, intermediate_size,
-    vocab_size, layer_types, tie_word_embeddings, rope_theta, head_dim,
-    rms_norm_eps, is_multimodal, gguf_arch and has_linear_attn.
-    Missing integer fields default to 0, layer_types to None,
-    tie_word_embeddings to False, rms_norm_eps to 1e-5 and rope_theta
-    to 10000.0. An unrecognised model_type falls back to gguf_arch 'llama'.
-    Errors from open() or json.load() are not caught here.
-    """
-    with open(config_path, 'r') as f:
-        raw = json.load(f)
-    cfg = {}
-    # Try flat config first, then nested text_config
-    src = raw
-    if 'text_config' in raw and 'hidden_size' not in raw:
-        src = raw['text_config']
-        cfg['is_multimodal'] = True
-    else:
-        cfg['is_multimodal'] = False
-    # Use top-level model_type if text_config doesn't have one
-    cfg['model_type'] = src.get('model_type', raw.get('model_type', 'unknown'))
-    cfg['hidden_size'] = src.get('hidden_size', 0)
-    cfg['num_hidden_layers'] = src.get('num_hidden_layers', 0)
-    cfg['num_attention_heads'] = src.get('num_attention_heads', 0)
-    cfg['num_key_value_heads'] = src.get('num_key_value_heads', 0)
-    cfg['intermediate_size'] = src.get('intermediate_size', 0)
-    cfg['vocab_size'] = src.get('vocab_size', 0)
-    cfg['tie_word_embeddings'] = src.get('tie_word_embeddings',
-                                          raw.get('tie_word_embeddings', False))
-    cfg['layer_types'] = src.get('layer_types', None)
-    cfg['head_dim'] = src.get('head_dim', 0)
-    cfg['rms_norm_eps'] = src.get('rms_norm_eps', 1e-5)
-    # Rope theta — may be nested in rope_parameters; the nested value wins
-    rope_params = src.get('rope_parameters', {})
-    cfg['rope_theta'] = rope_params.get('rope_theta',
-                          src.get('rope_theta', 10000.0))
-    # Architecture classification for GGUF compatibility
-    mt = cfg['model_type'].lower()
-    if mt in ('qwen3_5', 'qwen3_5_text', 'qwen3_5_moe'):
-        cfg['gguf_arch'] = 'qwen2'
-        cfg['has_linear_attn'] = True
-    elif mt in ('qwen2',):
-        cfg['gguf_arch'] = 'qwen2'
-        cfg['has_linear_attn'] = False
-    elif mt in ('qwen2_moe',):
-        cfg['gguf_arch'] = 'qwen2moe'
-        cfg['has_linear_attn'] = False
-    elif mt in ('llama', 'mistral'):
-        cfg['gguf_arch'] = 'llama'
-        cfg['has_linear_attn'] = False
-    elif mt in ('phi3', 'phi'):
-        cfg['gguf_arch'] = 'phi3'
-        cfg['has_linear_attn'] = False
-    elif mt in ('gemma', 'gemma2'):
-        cfg['gguf_arch'] = 'gemma'
-        cfg['has_linear_attn'] = False
-    else:
-        cfg['gguf_arch'] = 'llama'  # fallback
-        cfg['has_linear_attn'] = False
-    return cfg
 def main():
     if len(sys.argv) < 3:
-        print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf> [options]")
-        print()
-        print("  Options:")
-        print("    --config <file>     Load HuggingFace config.json for arch detection")
-        print("    --imatrix <file>    Importance matrix for calibrated quantization")
-        print("    --keep-metadata     Preserve all GGUF metadata as-is")
-        print("    --q2all             Force all eligible tensors to Q2_K")
-        print("    --quantize-none     Skip quantization (passthrough)")
         sys.exit(1)
     input_path = sys.argv[1]
@@ -767,32 +678,6 @@ def main():
     quantize_none = '--quantize-none' in sys.argv
     q2all = '--q2all' in sys.argv
-    # Check for --config
-    model_config = None
-    for i, arg in enumerate(sys.argv):
-        if arg == '--config' and i + 1 < len(sys.argv):
-            cfg_path = sys.argv[i + 1]
-            if os.path.exists(cfg_path):
-                model_config = load_model_config(cfg_path)
-                print(f"  Loaded config: {cfg_path}")
-                print(f"    model_type:       {model_config['model_type']}")
-                print(f"    gguf_arch:        {model_config['gguf_arch']}")
-                print(f"    hidden_size:      {model_config['hidden_size']}")
-                print(f"    layers:           {model_config['num_hidden_layers']}")
-                print(f"    heads:            {model_config['num_attention_heads']}")
-                print(f"    kv_heads:         {model_config['num_key_value_heads']}")
-                print(f"    vocab:            {model_config['vocab_size']}")
-                print(f"    tied_embeddings:  {model_config['tie_word_embeddings']}")
-                if model_config.get('has_linear_attn'):
-                    lt = model_config.get('layer_types', [])
-                    n_lin = lt.count('linear_attention') if lt else 0
-                    n_full = lt.count('full_attention') if lt else 0
-                    print(f"    layer_types:      {n_lin} linear_attn + {n_full} full_attn")
-                print()
-            else:
-                print(f"  WARNING: config file not found: {cfg_path}")
-            break
     # Check for imatrix
     imatrix_data = None
     for i, arg in enumerate(sys.argv):
@@ -967,13 +852,6 @@ def main():
             out_data_offset += out_size
             out_data_offset = align_offset(out_data_offset)
-        # ── Detect Architecture ──
-        arch = 'llama'
-        for key, vtype, val in kv_pairs:
-            if key == 'general.architecture' and vtype == 8:
-                arch = val.decode('utf-8', errors='ignore')
-                break
         # ── Update KV pairs ──
         updated_kv = []
         if keep_metadata:

 #!/usr/bin/env python3
 """
+HExState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
 Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
+and re-quantizes eligible weight tensors to Q2_K using numpy.
+This bypasses the tokenizer parsing problem entirely — the source GGUF
+(from llama.cpp's convert_hf_to_gguf.py) has correct metadata.
 Usage:
+    python3 hexstate_requantize.py input.gguf output.gguf
 """
 import struct
 import time
 import os
 import io
 import ctypes
 import numpy as np
     13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 30: "BF16",
 }
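+# Elements per block (TYPE_BLOCK_SIZE) and bytes per block (TYPE_BLOCK_BYTES),
+# keyed by the same numeric tensor-type ids as the type-name table above.
+# NOTE(review): assuming ids 6/7 are Q5_0/Q5_1 as in ggml, their blocks are
+# 22 and 24 bytes; TYPE_BLOCK_BYTES lists 20 and 22 for them — confirm.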
 TYPE_BLOCK_SIZE = {
     0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
     8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
     13: 256, 14: 256, 15: 256, 30: 1,
 }
 TYPE_BLOCK_BYTES = {
+    0: 4, 1: 2, 2: 18, 3: 20, 6: 20, 7: 22,
     8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
     13: 176, 14: 210, 15: 292, 30: 2,
 }
     return True
 def main():
     if len(sys.argv) < 3:
+        print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf> [--keep-metadata]")
         sys.exit(1)
     input_path = sys.argv[1]
     quantize_none = '--quantize-none' in sys.argv
     q2all = '--q2all' in sys.argv
     # Check for imatrix
     imatrix_data = None
     for i, arg in enumerate(sys.argv):
             out_data_offset += out_size
             out_data_offset = align_offset(out_data_offset)
         # ── Update KV pairs ──
         updated_kv = []
         if keep_metadata:

makefile.quantize CHANGED Viewed

@@ -6,17 +6,17 @@
 # ═══════════════════════════════════════════════════════════════════════════
 CC       = gcc
-CFLAGS   = -O2 -std=gnu99 -shared -fPIC -Wall -Wno-unused-function -Wno-unused-variable -fopenmp
 LDFLAGS  = -lm -lgmp -lmpfr -fopenmp
-# Include local directory for HexState headers
-INCLUDES = -I.
 # Source files — quantizer + HExState engine dependencies (no bigint)
 SRCS     = hexstate_quantize.c \
-           quhit_triality.c \
-           quhit_hexagram.c \
-           s6_exotic.c
 TARGET   = libhexstate_q2k.so

 # ═══════════════════════════════════════════════════════════════════════════
 CC       = gcc
+CFLAGS   = -O3 -march=native -ffast-math -fopenmp -std=gnu99 -shared -fPIC -Wall -Wno-unused-function -Wno-unused-variable
 LDFLAGS  = -lm -lgmp -lmpfr -fopenmp
+# Include parent directory for HExState headers
+INCLUDES = -I..
 # Source files — quantizer + HExState engine dependencies (no bigint)
 SRCS     = hexstate_quantize.c \
+           ../quhit_triality.c \
+           ../quhit_hexagram.c \
+           ../s6_exotic.c
 TARGET   = libhexstate_q2k.so