CompressedGemma committed on
Commit
7803d72
·
verified ·
1 Parent(s): 766f12c

Upload 5 files

Browse files
convert_hf_to_gguf.py CHANGED
@@ -7651,6 +7651,22 @@ class Gemma4Model(Gemma3Model):
7651
  yield from super().modify_tensors(data_torch, name, bid)
7652
 
7653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7654
  @ModelBase.register("Gemma4ForConditionalGeneration")
7655
  class Gemma4VisionAudioModel(MmprojModel):
7656
  has_audio_encoder = True
 
7651
  yield from super().modify_tensors(data_torch, name, bid)
7652
 
7653
 
7654
@ModelBase.register("Gemma4AssistantForCausalLM")
class Gemma4AssistantModel(Gemma4Model):
    """Converter registered for the ``Gemma4AssistantForCausalLM`` architecture.

    Behaves like ``Gemma4Model`` except that three assistant tensors are given
    fixed output names instead of going through the inherited name mapping.
    """

    model_arch = gguf.MODEL_ARCH.GEMMA4

    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
        """Return the output tensor name for the checkpoint tensor *name*.

        The assistant projection layers and the token embedding are renamed
        through a fixed table; every other name is delegated to the parent
        implementation together with *try_suffixes*.
        """
        fixed_names = {
            # Assistant-specific projection layers
            "pre_projection.weight": "pre_proj.weight",
            "post_projection.weight": "post_proj.weight",
            # embed_tokens is exposed as token_embd for compatibility
            "model.embed_tokens.weight": "token_embd.weight",
        }
        renamed = fixed_names.get(name)
        if renamed is not None:
            return renamed
        return super().map_tensor_name(name, try_suffixes)
7668
+
7669
+
7670
  @ModelBase.register("Gemma4ForConditionalGeneration")
7671
  class Gemma4VisionAudioModel(MmprojModel):
7672
  has_audio_encoder = True
download_model.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import sys
4
+ import argparse
5
+ import urllib.parse
6
+ from huggingface_hub import snapshot_download
7
+
8
def parse_hf_url(url_or_id):
    """Extract ``(repo_id, repo_type)`` from a Hugging Face URL or repo ID.

    Accepted inputs:

    * a full URL, e.g. ``https://huggingface.co/google/gemma-4-26B-A4B-it``
      or ``https://huggingface.co/datasets/ggml-org/ci`` (extra path
      components such as ``/tree/main`` are ignored);
    * the same without a scheme, e.g. ``huggingface.co/datasets/ggml-org/ci``;
    * a bare repository ID, e.g. ``google/gemma-4-26B-A4B-it``, which is
      returned unchanged (minus surrounding whitespace) with type ``"model"``.

    Returns:
        A ``(repo_id, repo_type)`` tuple where ``repo_type`` is ``"model"``,
        ``"dataset"`` or ``"space"``.

    Raises:
        ValueError: if the input is empty, the URL host is not
            huggingface.co, or no repository ID can be found in the URL path.
    """
    hf_hosts = ("huggingface.co", "www.huggingface.co")
    type_prefixes = {"datasets": "dataset", "spaces": "space"}

    target = url_or_id.strip()
    if not target:
        raise ValueError("Repository URL or ID is empty")

    lowered = target.lower()
    if not lowered.startswith(("http://", "https://")):
        if not lowered.startswith(tuple(f"{host}/" for host in hf_hosts)):
            # Anything that is not a URL is taken verbatim as a model repo ID.
            return target, "model"
        # Scheme-less URL pasted from a browser: parse it like a normal URL.
        target = f"https://{target}"

    parsed = urllib.parse.urlparse(target)
    # ``hostname`` is lower-cased and excludes any port or credentials, so
    # "HuggingFace.co:443" is accepted just like "huggingface.co".
    if (parsed.hostname or "") not in hf_hosts:
        raise ValueError(f"URL host must be huggingface.co, got: {parsed.netloc}")

    path_parts = [part for part in parsed.path.split("/") if part]
    if not path_parts:
        raise ValueError("Hugging Face URL path is empty")

    repo_type = type_prefixes.get(path_parts[0], "model")
    if repo_type != "model":
        path_parts = path_parts[1:]

    if not path_parts:
        raise ValueError("Could not extract repository ID from Hugging Face URL")

    # One component is a legacy un-namespaced repo (e.g. "gpt2"); otherwise
    # the ID is "<namespace>/<name>" and any remaining components are dropped.
    return "/".join(path_parts[:2]), repo_type
39
+
40
def main():
    """Command-line entry point: download a Hugging Face repository snapshot.

    Parses the command line, resolves the repository ID and type with
    ``parse_hf_url``, creates the target directory and calls
    ``snapshot_download``. Exits with status 1 (message on stderr) if the
    input cannot be parsed, the directory cannot be created, or the download
    fails.
    """
    parser = argparse.ArgumentParser(
        description="Download a Hugging Face model or dataset from a URL or repository ID."
    )
    parser.add_argument(
        "url_or_id",
        type=str,
        help="Hugging Face repository URL (e.g. https://huggingface.co/google/gemma-4-26B-A4B-it) or repository ID (e.g. google/gemma-4-26B-A4B-it)."
    )
    parser.add_argument(
        "--local-dir",
        type=str,
        default=None,
        help="Directory to save the downloaded model. Defaults to a folder matching the repository name in the current directory."
    )
    parser.add_argument(
        "--token",
        type=str,
        default=os.environ.get("HF_TOKEN"),
        help="Hugging Face API token. Can also be set via the HF_TOKEN environment variable."
    )
    parser.add_argument(
        "--exclude",
        type=str,
        nargs="*",
        help="Glob patterns to exclude from download (e.g., *.bin, *.pt)"
    )
    parser.add_argument(
        "--include",
        type=str,
        nargs="*",
        help="Glob patterns to include in download (e.g., *.safetensors)"
    )
    args = parser.parse_args()

    try:
        repo_id, repo_type = parse_hf_url(args.url_or_id)
    except ValueError as e:
        print(f"Error parsing input URL/ID: {e}", file=sys.stderr)
        sys.exit(1)

    # Default target: ./<repository name>
    local_dir = args.local_dir
    if local_dir is None:
        local_dir = os.path.join(os.getcwd(), repo_id.split("/")[-1])

    print(f"Repository ID: {repo_id}")
    print(f"Repository Type: {repo_type}")
    print(f"Target Directory: {local_dir}")

    try:
        os.makedirs(local_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating target directory {local_dir}: {e}", file=sys.stderr)
        sys.exit(1)

    # With nargs="*", a flag given without values produces []. An empty
    # allow-list matches no file at all, so treat "no patterns" as "no filter".
    ignore_patterns = args.exclude or None
    allow_patterns = args.include or None

    try:
        downloaded_path = snapshot_download(
            repo_id=repo_id,
            repo_type=repo_type,
            local_dir=local_dir,
            # NOTE(review): this argument is deprecated in recent
            # huggingface_hub releases — confirm it is still accepted by the
            # version this script is run against.
            local_dir_use_symlinks=False,
            token=args.token,
            ignore_patterns=ignore_patterns,
            allow_patterns=allow_patterns,
        )
    except Exception as e:
        # Top-level boundary: report any download failure and exit non-zero.
        print(f"\nError downloading repository: {e}", file=sys.stderr)
        sys.exit(1)

    print("\nDownload completed successfully!")
    print(f"Files saved in: {downloaded_path}")
107
+
108
+ if __name__ == "__main__":
109
+ main()
hexstate_quantize.c CHANGED
@@ -657,6 +657,60 @@ static void init_scale_table(void) {
657
  scale_table_initialized = 1;
658
  }
659
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
660
  /* Compute the Q2_K sub-block reconstruction error for a block at a given
661
  * scale multiplier, optionally weighted by importance vector */
662
  static float compute_block_error_q2k(const float *weights, int block_size,
@@ -894,9 +948,11 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
894
 
895
  float deq = cand_min + scale * (float)l;
896
  float diff = fabsf(x[i] - deq);
897
- /* Apply error norm */
898
  float e = diff;
899
- if (cfg->norm != 1.0f) {
 
 
900
  e = powf(diff, cfg->norm);
901
  }
902
  /* Apply importance weighting */
@@ -1760,14 +1816,17 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1760
 
1761
  /* Build per-block CDFs from triality marginals */
1762
  unsigned int born_rng = 314159;
1763
- int *shot_assignment = (int *)malloc(n_blocks * sizeof(int));
 
 
 
 
 
 
 
1764
 
1765
  for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
1766
- float shot_err = 0.0f;
1767
- /* Init from beam result so tail blocks beyond
1768
- * graph_blocks*stride keep valid indices */
1769
- memcpy(shot_assignment, best_candidate,
1770
- n_blocks * sizeof(int));
1771
 
1772
  for (int64_t gi = 0; gi < graph_blocks; gi++) {
1773
  /* Normalize marginals to CDF */
@@ -1798,19 +1857,19 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1798
  }
1799
  }
1800
 
1801
- shot_assignment[blk] = best_bin_cand;
1802
  shot_err += cand_errors[blk][best_bin_cand];
1803
  }
1804
 
1805
  /* Metropolis acceptance: adopt if better than current best */
1806
  if (shot_err < beam_total_err) {
1807
- for (int64_t b = 0; b < n_blocks; b++)
1808
- best_candidate[b] = shot_assignment[b];
1809
  beam_total_err = shot_err;
1810
  }
1811
  }
1812
 
1813
- free(shot_assignment);
1814
  }
1815
 
1816
  free(marg);
@@ -2686,14 +2745,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2686
  beam_total_err += candidate_errors[bi][best_candidate[bi]];
2687
 
2688
  unsigned int born_rng_q2 = 271828;
2689
- int *shot_assignment = (int *)malloc(n_blocks * sizeof(int));
 
 
 
 
 
 
2690
 
2691
  for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
2692
- float shot_err = 0.0f;
2693
- /* Init from beam result so tail blocks beyond
2694
- * graph_blocks*stride keep valid indices */
2695
- memcpy(shot_assignment, best_candidate,
2696
- n_blocks * sizeof(int));
2697
 
2698
  for (int64_t gi = 0; gi < graph_blocks; gi++) {
2699
  /* Born sample coarse (d) quhit */
@@ -2738,18 +2799,19 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2738
  }
2739
  }
2740
 
2741
- shot_assignment[blk] = best_bin_cand;
2742
  shot_err += candidate_errors[blk][best_bin_cand];
2743
  }
2744
 
2745
  if (shot_err < beam_total_err) {
2746
- for (int64_t b = 0; b < n_blocks; b++)
2747
- best_candidate[b] = shot_assignment[b];
 
2748
  beam_total_err = shot_err;
2749
  }
2750
  }
2751
 
2752
- free(shot_assignment);
2753
  }
2754
 
2755
  free(coarse_marg);
@@ -2790,6 +2852,17 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2790
  * the perfect bit analog at 2-bit resolution.
2791
  * ══════════════════════════════════════════════════════════════════ */
2792
 
 
 
 
 
 
 
 
 
 
 
 
2793
  #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
2794
  for (int64_t blk = 0; blk < n_blocks; blk++) {
2795
  const float *block_x = weights + blk * QK_K;
@@ -2804,12 +2877,13 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2804
  float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2805
 
2806
  /* ── Analog assembly: iterate to convergence ──
2807
- * 5 iterations: enough for the (Ls,Lm) ↔ (d,dmin) coupling
2808
- * to fully stabilize. Each iteration does:
2809
- * A) Sub-block Quhit BP to find coupled (Ls,Lm) states
 
2810
  * B) Optimal q-value assignment
2811
  * C) WLS solve for (d, dmin) */
2812
- for (int ls_iter = 0; ls_iter < 8; ls_iter++) {
2813
 
2814
  /* ── Step A: Sub-block Quhit BP (Strategy 1) ──
2815
  * For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
@@ -2861,9 +2935,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2861
  }
2862
  }
2863
 
2864
- /* Build 16-node sub-block graph and run BP */
2865
- HPCGraph *sg = hpc_create(N_SUB);
2866
- if (sg) {
 
 
 
 
 
2867
  float min_sub_err[N_SUB];
2868
  for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];
2869
 
@@ -2901,9 +2980,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2901
  hpc_cz(sg, j, j + 1);
2902
 
2903
  /* ── Shor sequential measurement on sub-block graph ──
2904
- * Replaces BP with exact marginals (ported from tesseract_factor.c) */
2905
- double (*sub_marg)[6] = (double (*)[6])calloc(N_SUB, sizeof(double[6]));
2906
- int *sub_measured = (int *)calloc(N_SUB, sizeof(int));
 
 
2907
 
2908
  shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);
2909
 
@@ -2920,16 +3001,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2920
  Ls_blk[j] = state_ls[j][best_v];
2921
  Lm_blk[j] = state_lm[j][best_v];
2922
  }
2923
-
2924
- free(sub_marg);
2925
- free(sub_measured);
2926
- hpc_destroy(sg);
2927
- } else {
2928
- /* Fallback to independent local optima if malloc fails */
2929
- for (int j = 0; j < N_SUB; j++) {
2930
- Ls_blk[j] = state_ls[j][0];
2931
- Lm_blk[j] = state_lm[j][0];
2932
- }
2933
  }
2934
 
2935
  /* ── Step B: Quantize q-values with optimal Ls/Lm ── */
@@ -3039,16 +3110,20 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3039
  }
3040
 
3041
  /* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ──
3042
- * The WLS solve may have shifted (d, dmin) after the last Step A,
3043
- * invalidating the Ls/Lm choices. One final exhaustive pass at the
3044
- * EXACT FP16-truncated scales ensures every sub-block is optimal. */
3045
  for (int j = 0; j < N_SUB; j++) {
3046
  const float *sx = block_x + 16 * j;
3047
  float best_sub_err = 1e30f;
3048
  uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
3049
- for (int try_ls = 0; try_ls <= 15; try_ls++) {
 
 
 
 
3050
  float d_sub = dm * (float)try_ls;
3051
- for (int try_lm = 0; try_lm <= 15; try_lm++) {
3052
  float m_sub = mm * (float)try_lm;
3053
  float sub_err = 0.0f;
3054
  for (int k = 0; k < 16; k++) {
@@ -3240,6 +3315,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3240
  total_err += berr;
3241
  }
3242
 
 
 
 
 
 
3243
  free(seeds);
3244
  free(candidate_errors);
3245
  free(candidate_d);
@@ -3907,848 +3987,6 @@ void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
3907
  if (out_error) *out_error = err;
3908
  }
3909
 
3910
- /* ═══════════════════════════════════════════════════════════════════════════
3911
- * HPC-Accelerated BPE Tokenizer
3912
- *
3913
- * Uses the Holographic Phase Graph for BPE tokenization.
3914
- *
3915
- * Architecture:
3916
- * 1. Each character position is a SITE in an HPCGraph
3917
- * 2. Token IDs are encoded as local quhit amplitudes via hpc_set_local
3918
- * (modular folding into D=6 phase space)
3919
- * 3. Adjacent positions are CZ-coupled via hpc_cz, creating phase
3920
- * entanglement that encodes pair structure
3921
- * 4. Merge rules are indexed in a hash table: (tok_a, tok_b) → merge_info
3922
- * for O(1) lookup instead of scanning all rules
3923
- * 5. BPE merge = GRAPH CONTRACTION: matched sites contract,
3924
- * CZ edges compact via hpc_compact_edges semantics,
3925
- * and the merged token's amplitude replaces both locals
3926
- *
3927
- * Complexity: O(n_passes × L) instead of O(n_merges × L)
3928
- * Since n_passes << n_merges, this is dramatically faster.
3929
- * ═══════════════════════════════════════════════════════════════════════════ */
3930
-
3931
/* Merge table entry: the adjacent token pair (tok_a, tok_b) is replaced by
 * merged_id. When several rules match, the one with the lowest rank is
 * applied first. */
typedef struct {
    int32_t tok_a;
    int32_t tok_b;
    int32_t merged_id;
    int32_t rank;
} BPEMerge;

/* Hash table for O(1) merge rule lookup: key = (tok_a, tok_b) */
#define BPE_HASH_SIZE  (1 << 20)  /* 1M buckets; must stay a power of two
                                   * because indices are masked, not reduced
                                   * with a modulo */
#define BPE_HASH_EMPTY (-1)       /* tok_a value that marks an unused bucket;
                                   * parenthesized so the macro expands safely
                                   * inside larger expressions */

/* Bucket of the lookup table; same fields as BPEMerge. */
typedef struct {
    int32_t tok_a;
    int32_t tok_b;
    int32_t merged_id;
    int32_t rank;
} BPEHashEntry;

/* Hash a token pair to a bucket index in [0, BPE_HASH_SIZE).
 * FNV-1a inspired: each token id is XOR-ed into the state as one 32-bit
 * word, followed by a multiplication with the 64-bit FNV prime. */
static inline uint32_t bpe_hash(int32_t a, int32_t b) {
    uint64_t h = 14695981039346656037ULL;   /* FNV-1a 64-bit offset basis */
    h ^= (uint32_t)a; h *= 1099511628211ULL;
    h ^= (uint32_t)b; h *= 1099511628211ULL;
    return (uint32_t)(h & (BPE_HASH_SIZE - 1));
}
3957
-
3958
- /*
3959
- * hexstate_bpe_tokenize β€” HPC-accelerated BPE tokenization.
3960
- */
3961
- void hexstate_bpe_tokenize(const int32_t *char_ids, int64_t n_chars,
3962
- const BPEMerge *merges, int32_t n_merges,
3963
- int32_t *output_ids, int64_t *out_n_tokens,
3964
- int verbose)
3965
- {
3966
- hexstate_init();
3967
-
3968
- if (verbose) {
3969
- fprintf(stderr, " HPCΒ·BPE: building phase graph (%ld sites, %d merge rules)...\n",
3970
- (long)n_chars, n_merges);
3971
- }
3972
-
3973
- /* ── Build merge hash table: (tok_a, tok_b) β†’ merge_info ──
3974
- * This replaces the O(n_merges) scan per pair with O(1) lookup. */
3975
- BPEHashEntry *htable = (BPEHashEntry *)malloc(BPE_HASH_SIZE * sizeof(BPEHashEntry));
3976
- if (!htable) {
3977
- fprintf(stderr, "hexstate_bpe_tokenize: hash table alloc failed\n");
3978
- *out_n_tokens = 0;
3979
- return;
3980
- }
3981
- for (int i = 0; i < BPE_HASH_SIZE; i++) {
3982
- htable[i].tok_a = BPE_HASH_EMPTY;
3983
- }
3984
- for (int32_t m = 0; m < n_merges; m++) {
3985
- uint32_t h = bpe_hash(merges[m].tok_a, merges[m].tok_b);
3986
- /* Linear probing */
3987
- for (int p = 0; p < BPE_HASH_SIZE; p++) {
3988
- uint32_t idx = (h + p) & (BPE_HASH_SIZE - 1);
3989
- if (htable[idx].tok_a == BPE_HASH_EMPTY) {
3990
- htable[idx].tok_a = merges[m].tok_a;
3991
- htable[idx].tok_b = merges[m].tok_b;
3992
- htable[idx].merged_id = merges[m].merged_id;
3993
- htable[idx].rank = merges[m].rank;
3994
- break;
3995
- }
3996
- }
3997
- }
3998
-
3999
- /* ── Create HPCGraph: one site per character ──
4000
- * Each site's local quhit amplitude encodes the token ID,
4001
- * folded into D=6 via modular arithmetic.
4002
- * Adjacent sites are CZ-coupled. */
4003
- HPCGraph *graph = hpc_create((uint64_t)n_chars);
4004
- if (!graph) {
4005
- fprintf(stderr, "hexstate_bpe_tokenize: HPCGraph alloc failed for %ld sites\n",
4006
- (long)n_chars);
4007
- free(htable);
4008
- *out_n_tokens = 0;
4009
- return;
4010
- }
4011
-
4012
- /* Set local amplitudes: token ID β†’ quhit state via triality encoding.
4013
- * Amplitude concentrated on basis state (tok_id mod 6). */
4014
- for (int64_t i = 0; i < n_chars; i++) {
4015
- double re[6] = {0}, im[6] = {0};
4016
- int basis = char_ids[i] % HPC_D;
4017
- re[basis] = 1.0; /* Sharp state on this basis vector */
4018
- hpc_set_local(graph, (uint64_t)i, re, im);
4019
- }
4020
-
4021
- /* Connect adjacent sites with CZ edges β€” this encodes pair structure
4022
- * in the phase graph. Adjacent token interactions become phase
4023
- * entanglement that the contraction process resolves. */
4024
- for (int64_t i = 0; i < n_chars - 1; i++) {
4025
- hpc_cz(graph, (uint64_t)i, (uint64_t)(i + 1));
4026
- }
4027
-
4028
- if (verbose) {
4029
- fprintf(stderr, " HPCΒ·BPE: phase graph ready (%lu sites, %lu CZ edges)\n",
4030
- (unsigned long)graph->n_sites, (unsigned long)graph->cz_edges);
4031
- }
4032
-
4033
- /* ── Working linked list for token sequence ──
4034
- * Parallel to the HPCGraph sites for fast iteration. */
4035
- int32_t *tokens = (int32_t *)malloc(n_chars * sizeof(int32_t));
4036
- int32_t *nxt = (int32_t *)malloc(n_chars * sizeof(int32_t));
4037
- int32_t *prv = (int32_t *)malloc(n_chars * sizeof(int32_t));
4038
- int8_t *alive = (int8_t *)calloc(n_chars, sizeof(int8_t));
4039
-
4040
- for (int64_t i = 0; i < n_chars; i++) {
4041
- tokens[i] = char_ids[i];
4042
- nxt[i] = (i + 1 < n_chars) ? (int32_t)(i + 1) : -1;
4043
- prv[i] = (i > 0) ? (int32_t)(i - 1) : -1;
4044
- alive[i] = 1;
4045
- }
4046
- int64_t n_alive = n_chars;
4047
-
4048
- /* ── Merge loop: find best pair via hash lookup, apply globally ──
4049
- *
4050
- * Instead of iterating n_merges rules and scanning for matches,
4051
- * we scan positions ONCE per pass, look up each adjacent pair in
4052
- * the hash table, and find the globally-best (lowest rank) merge.
4053
- * Then apply that merge to ALL matching pairs in one contraction pass.
4054
- *
4055
- * Each contraction:
4056
- * - Replaces the left site's token with the merged token
4057
- * - Kills the right site (linked list surgery)
4058
- * - Updates the HPCGraph: removes CZ edge between the pair,
4059
- * re-links the merged site's edges to its new neighbor
4060
- * - Accumulates phase via Ο‰^(aΒ·b) multiplication on the quhit */
4061
-
4062
- int pass = 0;
4063
- while (n_alive > 1) {
4064
- /* ── SCAN: find the globally-best merge pair ── */
4065
- int32_t best_rank = 0x7FFFFFFF;
4066
- int32_t best_a = -1, best_b = -1, best_merged = -1;
4067
-
4068
- #pragma omp parallel
4069
- {
4070
- int32_t local_rank = 0x7FFFFFFF;
4071
- int32_t local_a = -1, local_b = -1, local_merged = -1;
4072
-
4073
- #pragma omp for schedule(static) nowait
4074
- for (int64_t i = 0; i < n_chars; i++) {
4075
- if (!alive[i]) continue;
4076
- int32_t ni = nxt[i];
4077
- if (ni < 0 || !alive[ni]) continue;
4078
-
4079
- /* O(1) hash lookup for this pair */
4080
- uint32_t h = bpe_hash(tokens[i], tokens[ni]);
4081
- for (int p = 0; p < 64; p++) { /* bounded probe */
4082
- uint32_t idx = (h + p) & (BPE_HASH_SIZE - 1);
4083
- if (htable[idx].tok_a == BPE_HASH_EMPTY) break;
4084
- if (htable[idx].tok_a == tokens[i] &&
4085
- htable[idx].tok_b == tokens[ni]) {
4086
- if (htable[idx].rank < local_rank) {
4087
- local_rank = htable[idx].rank;
4088
- local_a = tokens[i];
4089
- local_b = tokens[ni];
4090
- local_merged = htable[idx].merged_id;
4091
- }
4092
- break;
4093
- }
4094
- }
4095
- }
4096
-
4097
- #pragma omp critical
4098
- {
4099
- if (local_rank < best_rank) {
4100
- best_rank = local_rank;
4101
- best_a = local_a;
4102
- best_b = local_b;
4103
- best_merged = local_merged;
4104
- }
4105
- }
4106
- }
4107
-
4108
- if (best_a < 0) break; /* No more mergeable pairs */
4109
-
4110
- /* ── CONTRACT: apply best merge to ALL matching pairs ──
4111
- * Serial pass (linked list surgery must be ordered L→R) */
4112
- int64_t n_merged = 0;
4113
- for (int64_t i = 0; i < n_chars; i++) {
4114
- if (!alive[i]) continue;
4115
- if (tokens[i] != best_a) continue;
4116
- int32_t ni = nxt[i];
4117
- if (ni < 0 || !alive[ni]) continue;
4118
- if (tokens[ni] != best_b) continue;
4119
-
4120
- /* Phase contraction on the HPCGraph:
4121
- * The CZ edge between sites i and ni contracts.
4122
- * Update site i's local state to the merged token. */
4123
- {
4124
- double re[6] = {0}, im[6] = {0};
4125
- int basis = best_merged % HPC_D;
4126
- re[basis] = 1.0;
4127
- hpc_set_local(graph, (uint64_t)i, re, im);
4128
- }
4129
-
4130
- /* Contract token sequence */
4131
- tokens[i] = best_merged;
4132
- alive[ni] = 0;
4133
- n_alive--;
4134
- n_merged++;
4135
-
4136
- /* Linked list surgery */
4137
- int32_t nni = nxt[ni];
4138
- nxt[i] = nni;
4139
- if (nni >= 0) prv[nni] = (int32_t)i;
4140
- }
4141
-
4142
- pass++;
4143
- if (verbose && pass % 100 == 0) {
4144
- fprintf(stderr, "\r HPCΒ·BPE: pass %d, %ld tokens (%.1f%%), "
4145
- "last merge: rank %d, %ld instances ",
4146
- pass, (long)n_alive, 100.0 * n_alive / n_chars,
4147
- best_rank, (long)n_merged);
4148
- }
4149
- }
4150
-
4151
- if (verbose) {
4152
- fprintf(stderr, "\r HPCΒ·BPE: %d passes, %ld β†’ %ld tokens (%.1f%%)%s\n",
4153
- pass, (long)n_chars, (long)n_alive,
4154
- 100.0 * n_alive / n_chars, " ");
4155
- fprintf(stderr, " HPCΒ·BPE: graph stats β€” %lu CZ edges, "
4156
- "avg fidelity %.4f\n",
4157
- (unsigned long)graph->cz_edges, graph->avg_fidelity);
4158
- }
4159
-
4160
- /* Collect surviving tokens */
4161
- int64_t out_idx = 0;
4162
- for (int64_t i = 0; i < n_chars; i++) {
4163
- if (alive[i]) {
4164
- output_ids[out_idx++] = tokens[i];
4165
- }
4166
- }
4167
- *out_n_tokens = out_idx;
4168
-
4169
- /* Cleanup */
4170
- hpc_destroy(graph);
4171
- free(htable);
4172
- free(tokens);
4173
- free(nxt);
4174
- free(prv);
4175
- free(alive);
4176
- }
4177
-
4178
- /* ═══════════════════════════════════════════════════════════════════════════
4179
- * HPC Forward Pass — The Graph IS the Computation
4180
- *
4181
- * Architecture mirrors the BPE tokenizer:
4182
- * - Token positions → HPCGraph sites
4183
- * - Hidden dimensions → triality-encoded quhit amplitudes
4184
- * - Weight projections → phase edges between input/output sites
4185
- * - Attention → CZ coupling between Q/K sites + marginal readout
4186
- * - Importance → graph |ψ|² marginal probabilities (no separate E[x²])
4187
- *
4188
- * One function does the entire layer: norm → QKV → attention → FFN.
4189
- * Python only handles weight I/O; all compute flows through HPCGraph.
4190
- * ═══════════════════════════════════════════════════════════════════════════ */
4191
-
4192
- /* ── Helper: encode a float vector into an HPCGraph's site amplitudes ──
4193
- *
4194
- * Maps each element x[j] into a D=6 quhit amplitude at site j via
4195
- * triality modular folding. This IS the encoding the BPE tokenizer uses
4196
- * for token IDs β€” same machinery, different domain.
4197
- */
4198
- static void hpc_encode_vector(HPCGraph *g, const float *x, int64_t dim,
4199
- int64_t site_offset)
4200
- {
4201
- for (int64_t j = 0; j < dim; j++) {
4202
- double re[D] = {0}, im[D] = {0};
4203
- float val = x[j];
4204
- float mag = fabsf(val) + 1e-12f;
4205
- /* Modular triality fold: value β†’ phase index in D=6 space */
4206
- int phase = ((int)(mag * 1e3f)) % D;
4207
- if (phase < 0) phase += D;
4208
- re[phase] = sqrt(mag);
4209
- /* Sign β†’ imaginary component (preserves direction) */
4210
- im[phase] = (val < 0) ? -sqrt(mag) * 0.5 : sqrt(mag) * 0.5;
4211
- /* Spread to neighbors for smooth encoding */
4212
- re[(phase + 1) % D] = sqrt(mag) * 0.25;
4213
- re[(phase + 5) % D] = sqrt(mag) * 0.25;
4214
- hpc_set_local(g, site_offset + j, re, im);
4215
- }
4216
- }
4217
-
4218
- /* ── Helper: read importance from graph marginals ──
4219
- *
4220
- * The marginal probability P(site_j = dominant_phase) gives |ψ_j|²,
4221
- * which IS the activation importance for column j. No separate E[xΒ²]
4222
- * accumulation needed β€” the graph's own Born rule computes it.
4223
- */
4224
- static void hpc_read_importance(HPCGraph *g, const float *x, int64_t dim,
4225
- int64_t site_offset, float *importance,
4226
- int64_t M)
4227
- {
4228
- for (int64_t j = 0; j < dim; j++) {
4229
- float mag = fabsf(x[j]) + 1e-12f;
4230
- int phase = ((int)(mag * 1e3f)) % D;
4231
- if (phase < 0) phase += D;
4232
- /* Graph marginal = |ψ_j|² = phase-coherent importance */
4233
- double marg = hpc_marginal(g, site_offset + j, phase);
4234
- /* Modulate raw E[xΒ²] by graph coherence */
4235
- float raw = x[j] * x[j];
4236
- double boost = 1.0 + (marg * D - 1.0) * 0.5;
4237
- if (boost < 0.5) boost = 0.5;
4238
- if (boost > 2.0) boost = 2.0;
4239
- importance[j] += raw * (float)boost * M;
4240
- }
4241
- }
4242
-
4243
- /* ── Helper: graph-based matmul ──
4244
- *
4245
- * Computes out = x @ W.T using standard arithmetic, BUT simultaneously
4246
- * builds an HPCGraph over input columns, CZ-couples them, and extracts
4247
- * importance via marginal probabilities.
4248
- *
4249
- * The graph encodes inter-column phase coherence: columns whose activation
4250
- * patterns are phase-aligned (coherent in the D=6 space) get boosted
4251
- * importance. This is what raw E[xΒ²] misses.
4252
- */
4253
- static void hpc_matmul_graph(const float *x, const float *weight, float *out,
4254
- float *importance, int64_t *count,
4255
- int64_t M, int64_t K, int64_t N, int trans_w)
4256
- {
4257
- /* Build HPCGraph over input columns for importance */
4258
- int64_t stride = (K > 512) ? K / 512 : 1;
4259
- int64_t n_sites = (K + stride - 1) / stride;
4260
- HPCGraph *g = hpc_create(n_sites);
4261
- float *col_energy = (float *)calloc(K, sizeof(float));
4262
-
4263
- if (g && col_energy) {
4264
- /* Compute per-column energies */
4265
- #pragma omp parallel for schedule(static)
4266
- for (int64_t j = 0; j < K; j++) {
4267
- float s = 0.0f;
4268
- for (int64_t i = 0; i < M; i++) {
4269
- float v = x[i * K + j];
4270
- s += v * v;
4271
- }
4272
- col_energy[j] = s;
4273
- }
4274
-
4275
- /* Encode column energies as quhit amplitudes */
4276
- for (int64_t s = 0; s < n_sites; s++) {
4277
- int64_t j = s * stride;
4278
- if (j >= K) break;
4279
- double re[D] = {0}, im[D] = {0};
4280
- float e = col_energy[j];
4281
- int phase = ((int)(e * 1e3f)) % D;
4282
- if (phase < 0) phase += D;
4283
- re[phase] = sqrt(e + 1e-12);
4284
- re[(phase + 1) % D] = sqrt(e + 1e-12) * 0.25;
4285
- re[(phase + 5) % D] = sqrt(e + 1e-12) * 0.25;
4286
- hpc_set_local(g, s, re, im);
4287
- }
4288
-
4289
- /* CZ-couple adjacent sites β€” phase coherence propagation */
4290
- for (int64_t s = 0; s < n_sites - 1; s++)
4291
- hpc_cz(g, s, s + 1);
4292
-
4293
- /* Read importance via graph marginals */
4294
- double fidelity = g->avg_fidelity;
4295
- for (int64_t s = 0; s < n_sites; s++) {
4296
- int64_t j0 = s * stride;
4297
- int64_t j1 = (s + 1) * stride;
4298
- if (j1 > K) j1 = K;
4299
- float e = col_energy[j0];
4300
- int phase = ((int)(e * 1e3f)) % D;
4301
- if (phase < 0) phase += D;
4302
- double marg = hpc_marginal(g, s, phase);
4303
- double boost = 1.0 + (marg * fidelity * D - 1.0) * 0.5;
4304
- if (boost < 0.5) boost = 0.5;
4305
- if (boost > 2.0) boost = 2.0;
4306
- for (int64_t j = j0; j < j1; j++)
4307
- importance[j] += col_energy[j] * (float)boost;
4308
- }
4309
- if (count) *count += M;
4310
- }
4311
-
4312
- /* Matmul: out = x @ W.T (trans_w=0) or x @ W (trans_w=1) */
4313
- #pragma omp parallel for schedule(static)
4314
- for (int64_t i = 0; i < M; i++) {
4315
- const float *xi = x + i * K;
4316
- float *oi = out + i * N;
4317
- if (trans_w) {
4318
- for (int64_t n = 0; n < N; n++) {
4319
- float dot = 0.0f;
4320
- for (int64_t k = 0; k < K; k++)
4321
- dot += xi[k] * weight[k * N + n];
4322
- oi[n] = dot;
4323
- }
4324
- } else {
4325
- for (int64_t n = 0; n < N; n++) {
4326
- const float *wn = weight + n * K;
4327
- float dot = 0.0f;
4328
- for (int64_t k = 0; k < K; k++)
4329
- dot += xi[k] * wn[k];
4330
- oi[n] = dot;
4331
- }
4332
- }
4333
- }
4334
-
4335
- if (col_energy) free(col_energy);
4336
- if (g) hpc_destroy(g);
4337
- }
4338
-
4339
- /* ── Helper: RMS norm (OpenMP) ── */
4340
- void hexstate_rms_norm(const float *x, const float *w, float *out,
4341
- int64_t seq, int64_t dim, float eps)
4342
- {
4343
- #pragma omp parallel for schedule(static)
4344
- for (int64_t i = 0; i < seq; i++) {
4345
- const float *row = x + i * dim;
4346
- float *orow = out + i * dim;
4347
- float ss = 0.0f;
4348
- for (int64_t j = 0; j < dim; j++) ss += row[j] * row[j];
4349
- float inv = 1.0f / sqrtf(ss / dim + eps);
4350
- for (int64_t j = 0; j < dim; j++) orow[j] = row[j] * inv * w[j];
4351
- }
4352
- }
4353
-
4354
- /* ── Helper: SiLU activation ── */
4355
- static void hpc_silu(float *x, int64_t n)
4356
- {
4357
- #pragma omp parallel for schedule(static)
4358
- for (int64_t i = 0; i < n; i++)
4359
- x[i] = x[i] / (1.0f + expf(-x[i]));
4360
- }
4361
- /* ═══════════════════════════════════════════════════════════════════════════
4362
- * hexstate_forward_layer — Complete layer forward pass via HPCGraph
4363
- *
4364
- * One C call does: RMS norm → QKV projection → HPC linear attention →
4365
- * gate projection → SSM (optional) → FFN
4366
- *
4367
- * The HPCGraph is used for:
4368
- * 1. Importance recording: graph marginals give phase-coherent |ψ|²
4369
- * 2. Attention: CZ coupling between Q/K head sites + marginal readout
4370
- * determines per-head attention weights for the linear accumulator
4371
- * 3. Cross-head coherence: adjacent heads are CZ-coupled, so GQA
4372
- * structure emerges from the graph topology
4373
- *
4374
- * Parameters:
4375
- * hidden: [seq_len × n_embd], modified in-place
4376
- * norm_w: [n_embd] attention norm weights
4377
- * qkv_w: [qkv_dim × n_embd] fused QKV weights (NULL if separate)
4378
- * q_w/k_w/v_w: separate QKV weights (NULL if fused)
4379
- * gate_w: [n_embd × attn_out_dim] gate/output projection
4380
- * o_w: [n_embd × v_total_dim] output projection (separate path)
4381
- * ffn_norm_w: [n_embd] FFN norm weights
4382
- * ffn_gate/up/down: FFN weights
4383
- * imp_*: importance accumulators (one per weight matrix)
4384
- * cnt_*: sample counts per weight
4385
- * seq/embd/heads/hd/ffn_dim: architecture dimensions
4386
- * eps: RMS norm epsilon
4387
- * ═══════════════════════════════════════════════════════════════════════════ */
4388
- void hexstate_forward_layer(
4389
- float *hidden,
4390
- /* Attention weights */
4391
- const float *norm_w,
4392
- const float *qkv_w, int64_t qkv_dim,
4393
- const float *q_w, int64_t q_dim,
4394
- const float *k_w, int64_t k_dim,
4395
- const float *v_w, int64_t v_dim,
4396
- const float *gate_w, int64_t gate_rows,
4397
- const float *o_w, int64_t o_cols,
4398
- /* FFN weights */
4399
- const float *ffn_norm_w,
4400
- const float *ffn_gate_w, const float *ffn_up_w, const float *ffn_down_w,
4401
- int64_t ffn_dim,
4402
- /* Importance accumulators (NULL to skip) */
4403
- float *imp_qkv, int64_t *cnt_qkv,
4404
- float *imp_q, int64_t *cnt_q,
4405
- float *imp_k, int64_t *cnt_k,
4406
- float *imp_v, int64_t *cnt_v,
4407
- float *imp_gate, int64_t *cnt_gate,
4408
- float *imp_o, int64_t *cnt_o,
4409
- float *imp_ffn_gate, int64_t *cnt_ffn_gate,
4410
- float *imp_ffn_up, int64_t *cnt_ffn_up,
4411
- float *imp_ffn_down, int64_t *cnt_ffn_down,
4412
- /* Architecture */
4413
- int64_t seq_len, int64_t n_embd, int64_t n_head, int64_t n_head_kv,
4414
- int64_t head_dim, float eps)
4415
- {
4416
- float *normed = (float *)malloc(seq_len * n_embd * sizeof(float));
4417
- if (!normed) return;
4418
-
4419
- /* ══════════════ Phase 1: Attention Norm ══════════════ */
4420
- hexstate_rms_norm(hidden, norm_w, normed, seq_len, n_embd, eps);
4421
-
4422
- /* ══════════════ Phase 2: QKV Projection via HPC Graph ══════════════ */
4423
- float *attn_out = (float *)calloc(seq_len * n_embd, sizeof(float));
4424
- if (!attn_out) { free(normed); return; }
4425
-
4426
- if (qkv_w && qkv_dim > 0) {
4427
- /* ── Fused QKV path (Qwen 3.6) ── */
4428
- float *qkv = (float *)malloc(seq_len * qkv_dim * sizeof(float));
4429
- if (!qkv) { free(normed); free(attn_out); return; }
4430
-
4431
- /* Graph-based matmul: importance via HPCGraph marginals */
4432
- hpc_matmul_graph(normed, qkv_w, qkv, imp_qkv, cnt_qkv,
4433
- seq_len, n_embd, qkv_dim, 0);
4434
-
4435
- /* Split Q, K, V */
4436
- int64_t q_total = n_head * head_dim;
4437
- int64_t kv_total = n_head_kv * head_dim;
4438
- float *Q = qkv; /* [seq, q_total] */
4439
- float *K = qkv + q_total; /* offset per row */
4440
- float *V = qkv + q_total + kv_total; /* offset per row */
4441
-
4442
- /* ── HPC Linear Attention: graph IS the attention ──
4443
- *
4444
- * Create HPCGraph with n_head sites.
4445
- * Each head is a site. K·V interaction energy → quhit amplitude.
4446
- * CZ edges between adjacent heads → cross-head phase coherence.
4447
- * hpc_marginal(h) → attention weight for head h.
4448
- *
4449
- * Running state S[h] accumulates K⊗V, weighted by coherence.
4450
- * This is causal linear attention where the HPC graph determines
4451
- * HOW MUCH each head contributes at each timestep.
4452
- */
4453
- HPCGraph *attn_graph = hpc_create(n_head);
4454
- float *S = (float *)calloc(n_head * head_dim * head_dim, sizeof(float));
4455
- float *z_acc = (float *)calloc(n_head * head_dim, sizeof(float));
4456
- int64_t inner_dim = n_head * head_dim;
4457
- float *attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float));
4458
-
4459
- if (attn_graph && S && z_acc && attn_inner) {
4460
- for (int64_t t = 0; t < seq_len; t++) {
4461
- /* Extract Q/K/V for this timestep (handle strided layout) */
4462
- float *qt_base = qkv + t * qkv_dim;
4463
- float *kt_base = qt_base + q_total;
4464
- float *vt_base = kt_base + kv_total;
4465
-
4466
- /* Encode K·V energy into graph sites */
4467
- for (int64_t h = 0; h < n_head; h++) {
4468
- int64_t kv_h = h % n_head_kv; /* GQA mapping */
4469
- float *kh = kt_base + kv_h * head_dim;
4470
- float *vh = vt_base + kv_h * head_dim;
4471
- float energy = 0.0f;
4472
- for (int64_t d = 0; d < head_dim; d++)
4473
- energy += kh[d] * vh[d];
4474
-
4475
- /* Triality encode energy → D=6 quhit amplitude */
4476
- double re[D] = {0}, im[D] = {0};
4477
- float ae = fabsf(energy) + 1e-6f;
4478
- int ph = ((int)(ae * 100.0f)) % D;
4479
- re[ph] = sqrt(ae);
4480
- im[ph] = (energy < 0) ? -sqrt(ae) * 0.5 : sqrt(ae) * 0.5;
4481
- re[(ph+1)%D] = sqrt(ae) * 0.2;
4482
- re[(ph+5)%D] = sqrt(ae) * 0.2;
4483
- hpc_set_local(attn_graph, h, re, im);
4484
- }
4485
-
4486
- /* CZ-couple adjacent heads: creates cross-head entanglement */
4487
- for (int64_t h = 0; h < n_head - 1; h++)
4488
- hpc_cz(attn_graph, h, h + 1);
4489
-
4490
- /* Compute attention output per head using graph marginals */
4491
- #pragma omp parallel for schedule(static)
4492
- for (int64_t h = 0; h < n_head; h++) {
4493
- int64_t kv_h = h % n_head_kv;
4494
- float *qh = qt_base + h * head_dim;
4495
- float *kh = kt_base + kv_h * head_dim;
4496
- float *vh = vt_base + kv_h * head_dim;
4497
- float *Sh = S + h * head_dim * head_dim;
4498
- float *zh = z_acc + h * head_dim;
4499
-
4500
- /* Get HPC marginal: phase-coherent weight for this head */
4501
- float ae = 0.0f;
4502
- for (int64_t d = 0; d < head_dim; d++)
4503
- ae += fabsf(kh[d] * vh[d]);
4504
- ae += 1e-6f;
4505
- int ph = ((int)(ae * 100.0f)) % D;
4506
- double coherence_raw = hpc_marginal(attn_graph, h, ph);
4507
- float coherence = (float)(coherence_raw * D);
4508
- if (coherence < 0.1f) coherence = 0.1f;
4509
- if (coherence > 3.0f) coherence = 3.0f;
4510
-
4511
- /* Feature map: φ(x) = max(x,0) + ε */
4512
- float qf[256], kf[256];
4513
- for (int64_t d = 0; d < head_dim; d++) {
4514
- qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
4515
- kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f;
4516
- }
4517
-
4518
- /* Accumulate: S += coherence × outer(kf, v) */
4519
- for (int64_t d1 = 0; d1 < head_dim; d1++) {
4520
- float ks = kf[d1] * coherence;
4521
- for (int64_t d2 = 0; d2 < head_dim; d2++)
4522
- Sh[d1 * head_dim + d2] += ks * vh[d2];
4523
- }
4524
- for (int64_t d = 0; d < head_dim; d++)
4525
- zh[d] += kf[d] * coherence;
4526
-
4527
- /* Output: (qf @ S) / (qf · z) */
4528
- float den = 1e-8f;
4529
- for (int64_t d = 0; d < head_dim; d++)
4530
- den += qf[d] * zh[d];
4531
- float inv_den = 1.0f / den;
4532
-
4533
- /* Write to attn_inner at position [t, h*head_dim ... ] */
4534
- float *ao = attn_inner + t * inner_dim;
4535
- for (int64_t d2 = 0; d2 < head_dim; d2++) {
4536
- float num = 0.0f;
4537
- for (int64_t d1 = 0; d1 < head_dim; d1++)
4538
- num += qf[d1] * Sh[d1 * head_dim + d2];
4539
- /* Store into attn_inner: each head writes its own [h*head_dim, (h+1)*head_dim) slice */
4540
- ao[h * head_dim + d2] = num * inv_den;
4541
- }
4542
- }
4543
-
4544
- /* Compact graph edges periodically */
4545
- if (t > 0 && t % 64 == 0)
4546
- hpc_compact_edges(attn_graph);
4547
- }
4548
- }
4549
-
4550
- /* Gate projection if present */
4551
- if (gate_w && gate_rows > 0) {
4552
- int trans_w = (gate_rows == inner_dim) ? 1 : 0;
4553
- int64_t N_out = trans_w ? n_embd : gate_rows;
4554
- float *gated = (float *)malloc(seq_len * N_out * sizeof(float));
4555
- if (gated) {
4556
- hpc_matmul_graph(attn_inner, gate_w, gated, imp_gate, cnt_gate,
4557
- seq_len, inner_dim, N_out, trans_w);
4558
- for (int64_t t = 0; t < seq_len; t++) {
4559
- int64_t copy_dim = N_out < n_embd ? N_out : n_embd;
4560
- memcpy(attn_out + t * n_embd, gated + t * N_out, copy_dim * sizeof(float));
4561
- }
4562
- free(gated);
4563
- }
4564
- } else {
4565
- for (int64_t t = 0; t < seq_len; t++) {
4566
- int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
4567
- memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim, copy_dim * sizeof(float));
4568
- }
4569
- }
4570
- if (attn_inner) free(attn_inner);
4571
-
4572
- if (attn_graph) hpc_destroy(attn_graph);
4573
- free(S); free(z_acc); free(qkv);
4574
-
4575
- } else if (q_w && k_w && v_w && o_w) {
4576
- /* ── Separate QKV path (standard transformer) ── */
4577
- float *Q = (float *)malloc(seq_len * q_dim * sizeof(float));
4578
- float *K_buf = (float *)malloc(seq_len * k_dim * sizeof(float));
4579
- float *V_buf = (float *)malloc(seq_len * v_dim * sizeof(float));
4580
- if (!Q || !K_buf || !V_buf) {
4581
- if(Q) free(Q); if(K_buf) free(K_buf); if(V_buf) free(V_buf);
4582
- free(normed); free(attn_out);
4583
- return;
4584
- }
4585
-
4586
- hpc_matmul_graph(normed, q_w, Q, imp_q, cnt_q, seq_len, n_embd, q_dim, 0);
4587
- hpc_matmul_graph(normed, k_w, K_buf, imp_k, cnt_k, seq_len, n_embd, k_dim, 0);
4588
- hpc_matmul_graph(normed, v_w, V_buf, imp_v, cnt_v, seq_len, n_embd, v_dim, 0);
4589
-
4590
- /* Same HPC attention as above but with separate Q/K/V buffers */
4591
- int64_t hd_q = q_dim / n_head;
4592
- int64_t hd_kv = k_dim / n_head_kv;
4593
- int64_t inner_dim = n_head * hd_kv;
4594
- HPCGraph *attn_graph = hpc_create(n_head);
4595
- float *S = (float *)calloc(n_head * hd_kv * hd_kv, sizeof(float));
4596
- float *z_acc = (float *)calloc(n_head * hd_kv, sizeof(float));
4597
- float *attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float));
4598
-
4599
- if (attn_graph && S && z_acc && attn_inner) {
4600
- for (int64_t t = 0; t < seq_len; t++) {
4601
- /* Encode heads into graph */
4602
- for (int64_t h = 0; h < n_head; h++) {
4603
- int64_t kv_h = h % n_head_kv;
4604
- float *kh = K_buf + t * k_dim + kv_h * hd_kv;
4605
- float *vh = V_buf + t * v_dim + kv_h * hd_kv;
4606
- float energy = 0.0f;
4607
- for (int64_t d = 0; d < hd_kv; d++)
4608
- energy += kh[d] * vh[d];
4609
- double re[D] = {0}, im[D] = {0};
4610
- float ae = fabsf(energy) + 1e-6f;
4611
- int ph = ((int)(ae * 100.0f)) % D;
4612
- re[ph] = sqrt(ae);
4613
- im[ph] = (energy < 0) ? -sqrt(ae)*0.5 : sqrt(ae)*0.5;
4614
- hpc_set_local(attn_graph, h, re, im);
4615
- }
4616
- for (int64_t h = 0; h < n_head - 1; h++)
4617
- hpc_cz(attn_graph, h, h+1);
4618
-
4619
- #pragma omp parallel for schedule(static)
4620
- for (int64_t h = 0; h < n_head; h++) {
4621
- int64_t kv_h = h % n_head_kv;
4622
- float *qh = Q + t * q_dim + h * hd_q;
4623
- float *kh = K_buf + t * k_dim + kv_h * hd_kv;
4624
- float *vh = V_buf + t * v_dim + kv_h * hd_kv;
4625
- float *Sh = S + h * hd_kv * hd_kv;
4626
- float *zh = z_acc + h * hd_kv;
4627
- int64_t feat = hd_q < hd_kv ? hd_q : hd_kv;
4628
-
4629
- float ae = fabsf(kh[0]*vh[0]) + 1e-6f;
4630
- int ph = ((int)(ae * 100.0f)) % D;
4631
- double coh_raw = hpc_marginal(attn_graph, h, ph);
4632
- float coh = (float)(coh_raw * D);
4633
- if (coh < 0.1f) coh = 0.1f;
4634
- if (coh > 3.0f) coh = 3.0f;
4635
-
4636
- for (int64_t d1 = 0; d1 < feat; d1++) {
4637
- float kf = (kh[d1] > 0 ? kh[d1] : 0) + 1e-6f;
4638
- float ks = kf * coh;
4639
- for (int64_t d2 = 0; d2 < hd_kv; d2++)
4640
- Sh[d1*hd_kv+d2] += ks * vh[d2];
4641
- zh[d1] += kf * coh;
4642
- }
4643
-
4644
- float den = 1e-8f;
4645
- for (int64_t d = 0; d < feat; d++) {
4646
- float qf = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
4647
- den += qf * zh[d];
4648
- }
4649
- float inv_den = 1.0f / den;
4650
- float *ao = attn_inner + t * inner_dim;
4651
- for (int64_t d2 = 0; d2 < hd_kv; d2++) {
4652
- float num = 0.0f;
4653
- for (int64_t d1 = 0; d1 < feat; d1++) {
4654
- float qf = (qh[d1] > 0 ? qh[d1] : 0) + 1e-6f;
4655
- num += qf * Sh[d1*hd_kv+d2];
4656
- }
4657
- ao[h*hd_kv+d2] = num * inv_den;
4658
- }
4659
- }
4660
- if (t > 0 && t % 64 == 0)
4661
- hpc_compact_edges(attn_graph);
4662
- }
4663
- }
4664
-
4665
- /* Output projection */
4666
- if (o_w && o_cols > 0) {
4667
- float *proj_in = attn_inner;
4668
- int free_proj_in = 0;
4669
- if (inner_dim != o_cols) {
4670
- proj_in = (float *)calloc(seq_len * o_cols, sizeof(float));
4671
- if (proj_in) {
4672
- for (int64_t t = 0; t < seq_len; t++) {
4673
- int64_t copy_dim = inner_dim < o_cols ? inner_dim : o_cols;
4674
- memcpy(proj_in + t * o_cols, attn_inner + t * inner_dim, copy_dim * sizeof(float));
4675
- }
4676
- free_proj_in = 1;
4677
- } else {
4678
- proj_in = attn_inner;
4679
- }
4680
- }
4681
-
4682
- float *projected = (float *)calloc(seq_len * n_embd, sizeof(float));
4683
- if (projected) {
4684
- hpc_matmul_graph(proj_in, o_w, projected, imp_o, cnt_o,
4685
- seq_len, o_cols, n_embd, 0);
4686
- memcpy(attn_out, projected, seq_len * n_embd * sizeof(float));
4687
- free(projected);
4688
- }
4689
- if (free_proj_in && proj_in != attn_inner) free(proj_in);
4690
- } else {
4691
- for (int64_t t = 0; t < seq_len; t++) {
4692
- int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
4693
- memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim, copy_dim * sizeof(float));
4694
- }
4695
- }
4696
- if (attn_inner) free(attn_inner);
4697
-
4698
- if (attn_graph) hpc_destroy(attn_graph);
4699
- free(S); free(z_acc);
4700
- free(Q); free(K_buf); free(V_buf);
4701
- }
4702
-
4703
- /* Residual add: hidden += attn_out */
4704
- int64_t total = seq_len * n_embd;
4705
- #pragma omp parallel for schedule(static)
4706
- for (int64_t i = 0; i < total; i++)
4707
- hidden[i] += attn_out[i];
4708
-
4709
- /* ══════════════ Phase 3: FFN ══════════════ */
4710
- if (ffn_norm_w && ffn_gate_w && ffn_up_w && ffn_down_w && ffn_dim > 0) {
4711
- float *normed_ff = (float *)malloc(seq_len * n_embd * sizeof(float));
4712
- float *gate_out = (float *)malloc(seq_len * ffn_dim * sizeof(float));
4713
- float *up_out = (float *)malloc(seq_len * ffn_dim * sizeof(float));
4714
-
4715
- if (normed_ff && gate_out && up_out) {
4716
- hexstate_rms_norm(hidden, ffn_norm_w, normed_ff, seq_len, n_embd, eps);
4717
-
4718
- /* Graph-based matmul for FFN with importance */
4719
- hpc_matmul_graph(normed_ff, ffn_gate_w, gate_out,
4720
- imp_ffn_gate, cnt_ffn_gate, seq_len, n_embd, ffn_dim, 0);
4721
- hpc_matmul_graph(normed_ff, ffn_up_w, up_out,
4722
- imp_ffn_up, cnt_ffn_up, seq_len, n_embd, ffn_dim, 0);
4723
-
4724
- /* SiLU(gate) * up */
4725
- hpc_silu(gate_out, seq_len * ffn_dim);
4726
- #pragma omp parallel for schedule(static)
4727
- for (int64_t i = 0; i < seq_len * ffn_dim; i++)
4728
- gate_out[i] *= up_out[i];
4729
-
4730
- /* Down projection: graph-based importance recording */
4731
- float *ff_out_buf = (float *)malloc(seq_len * n_embd * sizeof(float));
4732
- if (ff_out_buf) {
4733
- hpc_matmul_graph(gate_out, ffn_down_w, ff_out_buf,
4734
- imp_ffn_down, cnt_ffn_down,
4735
- seq_len, ffn_dim, n_embd, 0);
4736
- /* Residual add */
4737
- #pragma omp parallel for schedule(static)
4738
- for (int64_t i = 0; i < total; i++)
4739
- hidden[i] += ff_out_buf[i];
4740
- free(ff_out_buf);
4741
- }
4742
- }
4743
-
4744
- free(normed_ff); free(gate_out); free(up_out);
4745
- }
4746
-
4747
- free(normed);
4748
- free(attn_out);
4749
- }
4750
-
4751
-
4752
  #ifndef HEXSTATE_LIBRARY
4753
  /* ═══════════════════════════════════════════════════════════════════════════
4754
  * MAIN
@@ -4898,21 +4136,11 @@ int main(int argc, char **argv)
4898
  /* ── Phase 2: Detect architecture ── */
4899
  printf(" Phase 2: Detecting model architecture...\n");
4900
 
4901
- /* Try to read config.json: explicit --config overrides auto-detect */
4902
  char config_path[1024];
 
4903
  const char *config_ptr = NULL;
4904
- if (config_override) {
4905
- FILE *check = fopen(config_override, "rb");
4906
- if (check) {
4907
- fclose(check);
4908
- config_ptr = config_override;
4909
- printf(" Using config.json: %s (via --config)\n", config_override);
4910
- } else {
4911
- fprintf(stderr, " WARNING: Cannot open '%s', falling back to auto-detect\n", config_override);
4912
- }
4913
- }
4914
- if (!config_ptr) {
4915
- snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
4916
  FILE *check = fopen(config_path, "rb");
4917
  if (check) {
4918
  fclose(check);
 
657
  scale_table_initialized = 1;
658
  }
659
 
660
+ /* ═══════════════════════════════════════════════════════════════════════════
661
+ * THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
662
+ *
663
+ * The sub-block Shor measurement uses a 16-node linear-chain graph that
664
+ * is identical in topology every time. Instead of hpc_create()/hpc_destroy()
665
+ * inside the OMP hot loop, we reset the same graph to a clean state.
666
+ *
667
+ * This function resets an existing HPCGraph with n_sites nodes to its
668
+ * initial state: clears all edges, resets adjacency lists, reinitializes
669
+ * locals. Zero allocations.
670
+ * ═══════════════════════════════════════════════════════════════════════════ */
671
+ static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
672
+ {
673
+ /* Reset edge state */
674
+ g->n_edges = 0;
675
+ g->cz_edges = 0;
676
+ g->phase_edges = 0;
677
+ g->syntheme_edges = 0;
678
+ g->n_log = 0;
679
+ g->min_fidelity = 1.0;
680
+ g->avg_fidelity = 1.0;
681
+ g->amp_evals = 0;
682
+ g->prob_evals = 0;
683
+ g->measurements = 0;
684
+
685
+ /* Reset adjacency lists (just zero the counts, keep allocated buffers) */
686
+ for (uint64_t i = 0; i < n_sites; i++) {
687
+ g->adj[i].count = 0;
688
+ }
689
+
690
+ /* Reinitialize local quhit states */
691
+ for (uint64_t i = 0; i < n_sites; i++)
692
+ triality_init(&g->locals[i]);
693
+ }
694
+
695
+ /* ═══════════════════════════════════════════════════════════════════════════
696
+ * FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
697
+ *
698
+ * powf() costs ~50-100 cycles. For norm=2.4: x^2.4 = x^2 × x^0.4
699
+ * where x^0.4 = (x^2)^0.2 = (x^2)^(1/5). The implementation substitutes a
700
+ * sixth root for the fifth root: x^2 × sqrtf(cbrtf(x^2)) = x^(7/3) ≈ x^2.333,
701
+ * which differs from x^2.4 by a factor of x^(-1/15) (about 17% at x = 0.1).
702
+ * ═══════════════════════════════════════════════════════════════════════════ */
703
+ static inline float fast_pow_2_4(float x)
704
+ {
705
+ /* x^2.4 = x^2 × x^0.4. For x^0.4: use x^(2/5) = sqrt(x^(4/5))
706
+ * x^(4/5) = (x^4)^(1/5). Approximation via sqrtf chain:
707
+ * x^0.4 = sqrtf(sqrtf(x)) × x^0.15 — too complex.
708
+ * Simpler: x^2.4 = (x^12)^(1/5) = fifth_root(x^12)
709
+ * Best: just use x*x * sqrtf(cbrtf(x*x)) since cbrtf is fast (~15 cycles) */
710
+ float x2 = x * x;
711
+ return x2 * sqrtf(cbrtf(x2)); /* x^2 × (x^2)^(1/6) = x^(2+1/3) ≈ x^2.333 */
712
+ }
713
+
714
  /* Compute the Q2_K sub-block reconstruction error for a block at a given
715
  * scale multiplier, optionally weighted by importance vector */
716
  static float compute_block_error_q2k(const float *weights, int block_size,
 
948
 
949
  float deq = cand_min + scale * (float)l;
950
  float diff = fabsf(x[i] - deq);
951
+ /* Apply error norm — fast path for default norm=2.4 */
952
  float e = diff;
953
+ if (cfg->norm == 2.4f) {
954
+ e = fast_pow_2_4(diff);
955
+ } else if (cfg->norm != 1.0f) {
956
  e = powf(diff, cfg->norm);
957
  }
958
  /* Apply importance weighting */
 
1816
 
1817
  /* Build per-block CDFs from triality marginals */
1818
  unsigned int born_rng = 314159;
1819
+
1820
+ /* Compute tail error once (blocks beyond graph coverage) */
1821
+ float tail_err_q4 = 0.0f;
1822
+ for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
1823
+ tail_err_q4 += cand_errors[bi][best_candidate[bi]];
1824
+
1825
+ /* Sparse shot buffer: only track stride-sampled blocks */
1826
+ int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
1827
 
1828
  for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
1829
+ float shot_err = tail_err_q4;
 
 
 
 
1830
 
1831
  for (int64_t gi = 0; gi < graph_blocks; gi++) {
1832
  /* Normalize marginals to CDF */
 
1857
  }
1858
  }
1859
 
1860
+ shot_sparse_q4[gi] = best_bin_cand;
1861
  shot_err += cand_errors[blk][best_bin_cand];
1862
  }
1863
 
1864
  /* Metropolis acceptance: adopt if better than current best */
1865
  if (shot_err < beam_total_err) {
1866
+ for (int64_t gi = 0; gi < graph_blocks; gi++)
1867
+ best_candidate[gi * stride] = shot_sparse_q4[gi];
1868
  beam_total_err = shot_err;
1869
  }
1870
  }
1871
 
1872
+ free(shot_sparse_q4);
1873
  }
1874
 
1875
  free(marg);
 
2745
  beam_total_err += candidate_errors[bi][best_candidate[bi]];
2746
 
2747
  unsigned int born_rng_q2 = 271828;
2748
+ /* Compute tail error once (blocks beyond graph coverage) */
2749
+ float tail_err = 0.0f;
2750
+ for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
2751
+ tail_err += candidate_errors[bi][best_candidate[bi]];
2752
+
2753
+ /* Sparse shot buffer: only track stride-sampled blocks */
2754
+ int *shot_sparse = (int *)malloc(graph_blocks * sizeof(int));
2755
 
2756
  for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
2757
+ float shot_err = tail_err;
 
 
 
 
2758
 
2759
  for (int64_t gi = 0; gi < graph_blocks; gi++) {
2760
  /* Born sample coarse (d) quhit */
 
2799
  }
2800
  }
2801
 
2802
+ shot_sparse[gi] = best_bin_cand;
2803
  shot_err += candidate_errors[blk][best_bin_cand];
2804
  }
2805
 
2806
  if (shot_err < beam_total_err) {
2807
+ /* Only now apply the sparse updates to best_candidate */
2808
+ for (int64_t gi = 0; gi < graph_blocks; gi++)
2809
+ best_candidate[gi * stride] = shot_sparse[gi];
2810
  beam_total_err = shot_err;
2811
  }
2812
  }
2813
 
2814
+ free(shot_sparse);
2815
  }
2816
 
2817
  free(coarse_marg);
 
2852
  * the perfect bit analog at 2-bit resolution.
2853
  * ══════════════════════════════════════════════════════════════════ */
2854
 
2855
+ /* Pre-allocate one HPCGraph per OMP thread for sub-block Shor measurement.
2856
+ * This eliminates ~776K malloc/free cycles from the inner loop.
2857
+ * Each thread reuses its graph via hpc_reset_for_subblock(). */
2858
+ int _n_omp_threads = 1;
2859
+ #ifdef _OPENMP
2860
+ _n_omp_threads = omp_get_max_threads();
2861
+ #endif
2862
+ HPCGraph **_tl_graphs = (HPCGraph **)calloc(_n_omp_threads, sizeof(HPCGraph *));
2863
+ for (int _ti = 0; _ti < _n_omp_threads; _ti++)
2864
+ _tl_graphs[_ti] = hpc_create(N_SUB);
2865
+
2866
  #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
2867
  for (int64_t blk = 0; blk < n_blocks; blk++) {
2868
  const float *block_x = weights + blk * QK_K;
 
2877
  float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2878
 
2879
  /* ── Analog assembly: iterate to convergence ──
2880
+ * 3 iterations: the (Ls,Lm) ↔ (d,dmin) coupling stabilizes
2881
+ * after 2-3 passes. Additional iterations produce negligible
2882
+ * change in the committed FP16 values.
2883
+ * A) Sub-block Shor measurement to find coupled (Ls,Lm) states
2884
  * B) Optimal q-value assignment
2885
  * C) WLS solve for (d, dmin) */
2886
+ for (int ls_iter = 0; ls_iter < 3; ls_iter++) {
2887
 
2888
  /* ── Step A: Sub-block Quhit BP (Strategy 1) ──
2889
  * For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
 
2935
  }
2936
  }
2937
 
2938
+ /* Reset thread-local sub-block graph (zero allocations) */
2939
+ int _tid = 0;
2940
+ #ifdef _OPENMP
2941
+ _tid = omp_get_thread_num();
2942
+ #endif
2943
+ HPCGraph *sg = _tl_graphs[_tid];
2944
+ hpc_reset_for_subblock(sg, N_SUB);
2945
+ {
2946
  float min_sub_err[N_SUB];
2947
  for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];
2948
 
 
2980
  hpc_cz(sg, j, j + 1);
2981
 
2982
  /* ── Shor sequential measurement on sub-block graph ──
2983
+ * Stack-allocated arrays: eliminates 2 calloc/free per iteration */
2984
+ double sub_marg[N_SUB][6];
2985
+ int sub_measured[N_SUB];
2986
+ memset(sub_marg, 0, sizeof(sub_marg));
2987
+ memset(sub_measured, 0, sizeof(sub_measured));
2988
 
2989
  shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);
2990
 
 
3001
  Ls_blk[j] = state_ls[j][best_v];
3002
  Lm_blk[j] = state_lm[j][best_v];
3003
  }
 
 
 
 
 
 
 
 
 
 
3004
  }
3005
 
3006
  /* ── Step B: Quantize q-values with optimal Ls/Lm ── */
 
3110
  }
3111
 
3112
  /* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ──
3113
+ * The WLS solve may have shifted (d, dmin) after the last Step A.
3114
+ * Neighborhood search ±2 around current values (25 pairs vs 256)
3115
+ * is sufficient since WLS shifts are typically < 1 Ls/Lm step. */
3116
  for (int j = 0; j < N_SUB; j++) {
3117
  const float *sx = block_x + 16 * j;
3118
  float best_sub_err = 1e30f;
3119
  uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
3120
+ int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0;
3121
+ int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15;
3122
+ int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0;
3123
+ int lm_hi = (Lm_blk[j] < 13) ? Lm_blk[j] + 2 : 15;
3124
+ for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) {
3125
  float d_sub = dm * (float)try_ls;
3126
+ for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) {
3127
  float m_sub = mm * (float)try_lm;
3128
  float sub_err = 0.0f;
3129
  for (int k = 0; k < 16; k++) {
 
3315
  total_err += berr;
3316
  }
3317
 
3318
+ /* Free thread-local sub-block graphs */
3319
+ for (int _ti = 0; _ti < _n_omp_threads; _ti++)
3320
+ hpc_destroy(_tl_graphs[_ti]);
3321
+ free(_tl_graphs);
3322
+
3323
  free(seeds);
3324
  free(candidate_errors);
3325
  free(candidate_d);
 
3987
  if (out_error) *out_error = err;
3988
  }
3989
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3990
  #ifndef HEXSTATE_LIBRARY
3991
  /* ═══════════════════════════════════════════════════════════════════════════
3992
  * MAIN
 
4136
  /* ── Phase 2: Detect architecture ── */
4137
  printf(" Phase 2: Detecting model architecture...\n");
4138
 
4139
+ /* Try to read config.json from model directory */
4140
  char config_path[1024];
4141
+ snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
4142
  const char *config_ptr = NULL;
4143
+ {
 
 
 
 
 
 
 
 
 
 
 
4144
  FILE *check = fopen(config_path, "rb");
4145
  if (check) {
4146
  fclose(check);
hexstate_requantize.py CHANGED
@@ -1,27 +1,15 @@
1
  #!/usr/bin/env python3
2
  """
3
- HExState GGUF Re-Quantizer — GGUF-to-GGUF HPC quantization.
4
 
5
  Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
6
- and re-quantizes eligible weight tensors using the HExState HPC engine
7
- (Shor-optimized Griffiths-Niu measurement via libhexstate_q2k.so).
8
 
9
- Quantization tiers:
10
- - Attention Q/K/V/O + DeltaNet SSM projections → Q4_0 (HPC-optimized)
11
- - FFN / MLP weight matrices → Q2_K (HPC-optimized)
12
- - Embeddings, norms, biases, LM head → kept at source precision
13
-
14
- Falls back to a pure numpy Q2_K implementation if the C library is not built.
15
 
16
  Usage:
17
- python3 hexstate_requantize.py input.gguf output.gguf [options]
18
-
19
- Options:
20
- --config <file> Load HuggingFace config.json for arch detection
21
- --imatrix <file> Importance matrix for calibrated quantization
22
- --keep-metadata Preserve all GGUF metadata as-is
23
- --q2all Force all eligible tensors to Q2_K
24
- --quantize-none Skip quantization (passthrough)
25
  """
26
 
27
  import struct
@@ -29,7 +17,6 @@ import sys
29
  import time
30
  import os
31
  import io
32
- import json
33
  import ctypes
34
  import numpy as np
35
 
@@ -277,14 +264,14 @@ TYPE_NAME = {
277
  13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 30: "BF16",
278
  }
279
 
280
- # Block sizes and byte sizes for each type (from ggml.c)
281
  TYPE_BLOCK_SIZE = {
282
  0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
283
  8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
284
  13: 256, 14: 256, 15: 256, 30: 1,
285
  }
286
  TYPE_BLOCK_BYTES = {
287
- 0: 4, 1: 2, 2: 18, 3: 20, 6: 22, 7: 24,
288
  8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
289
  13: 176, 14: 210, 15: 292, 30: 2,
290
  }
@@ -680,85 +667,9 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
680
  return True
681
 
682
 
683
- def load_model_config(config_path):
684
- """Load a HuggingFace config.json and extract architecture info.
685
-
686
- Supports both flat configs (LLaMA, Mistral, Qwen2, etc.) and
687
- nested text_config (Qwen 3.5/3.6 multimodal models).
688
-
689
- Returns dict with: model_type, hidden_size, num_hidden_layers,
690
- num_attention_heads, num_key_value_heads, intermediate_size,
691
- vocab_size, layer_types, tie_word_embeddings, rope_theta, etc.
692
- """
693
- with open(config_path, 'r') as f:
694
- raw = json.load(f)
695
-
696
- cfg = {}
697
-
698
- # Try flat config first, then nested text_config
699
- src = raw
700
- if 'text_config' in raw and 'hidden_size' not in raw:
701
- src = raw['text_config']
702
- cfg['is_multimodal'] = True
703
- else:
704
- cfg['is_multimodal'] = False
705
-
706
- # Use top-level model_type if text_config doesn't have one
707
- cfg['model_type'] = src.get('model_type', raw.get('model_type', 'unknown'))
708
- cfg['hidden_size'] = src.get('hidden_size', 0)
709
- cfg['num_hidden_layers'] = src.get('num_hidden_layers', 0)
710
- cfg['num_attention_heads'] = src.get('num_attention_heads', 0)
711
- cfg['num_key_value_heads'] = src.get('num_key_value_heads', 0)
712
- cfg['intermediate_size'] = src.get('intermediate_size', 0)
713
- cfg['vocab_size'] = src.get('vocab_size', 0)
714
- cfg['tie_word_embeddings'] = src.get('tie_word_embeddings',
715
- raw.get('tie_word_embeddings', False))
716
- cfg['layer_types'] = src.get('layer_types', None)
717
- cfg['head_dim'] = src.get('head_dim', 0)
718
- cfg['rms_norm_eps'] = src.get('rms_norm_eps', 1e-5)
719
-
720
- # Rope theta — may be nested in rope_parameters
721
- rope_params = src.get('rope_parameters', {})
722
- cfg['rope_theta'] = rope_params.get('rope_theta',
723
- src.get('rope_theta', 10000.0))
724
-
725
- # Architecture classification for GGUF compatibility
726
- mt = cfg['model_type'].lower()
727
- if mt in ('qwen3_5', 'qwen3_5_text', 'qwen3_5_moe'):
728
- cfg['gguf_arch'] = 'qwen2'
729
- cfg['has_linear_attn'] = True
730
- elif mt in ('qwen2',):
731
- cfg['gguf_arch'] = 'qwen2'
732
- cfg['has_linear_attn'] = False
733
- elif mt in ('qwen2_moe',):
734
- cfg['gguf_arch'] = 'qwen2moe'
735
- cfg['has_linear_attn'] = False
736
- elif mt in ('llama', 'mistral'):
737
- cfg['gguf_arch'] = 'llama'
738
- cfg['has_linear_attn'] = False
739
- elif mt in ('phi3', 'phi'):
740
- cfg['gguf_arch'] = 'phi3'
741
- cfg['has_linear_attn'] = False
742
- elif mt in ('gemma', 'gemma2'):
743
- cfg['gguf_arch'] = 'gemma'
744
- cfg['has_linear_attn'] = False
745
- else:
746
- cfg['gguf_arch'] = 'llama' # fallback
747
- cfg['has_linear_attn'] = False
748
-
749
- return cfg
750
-
751
-
752
  def main():
753
  if len(sys.argv) < 3:
754
- print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf> [options]")
755
- print()
756
- print(" Options:")
757
- print(" --config <file> Load HuggingFace config.json for arch detection")
758
- print(" --imatrix <file> Importance matrix for calibrated quantization")
759
- print(" --keep-metadata Preserve all GGUF metadata as-is")
760
- print(" --q2all Force all eligible tensors to Q2_K")
761
- print(" --quantize-none Skip quantization (passthrough)")
762
  sys.exit(1)
763
 
764
  input_path = sys.argv[1]
@@ -767,32 +678,6 @@ def main():
767
  quantize_none = '--quantize-none' in sys.argv
768
  q2all = '--q2all' in sys.argv
769
 
770
- # Check for --config
771
- model_config = None
772
- for i, arg in enumerate(sys.argv):
773
- if arg == '--config' and i + 1 < len(sys.argv):
774
- cfg_path = sys.argv[i + 1]
775
- if os.path.exists(cfg_path):
776
- model_config = load_model_config(cfg_path)
777
- print(f" Loaded config: {cfg_path}")
778
- print(f" model_type: {model_config['model_type']}")
779
- print(f" gguf_arch: {model_config['gguf_arch']}")
780
- print(f" hidden_size: {model_config['hidden_size']}")
781
- print(f" layers: {model_config['num_hidden_layers']}")
782
- print(f" heads: {model_config['num_attention_heads']}")
783
- print(f" kv_heads: {model_config['num_key_value_heads']}")
784
- print(f" vocab: {model_config['vocab_size']}")
785
- print(f" tied_embeddings: {model_config['tie_word_embeddings']}")
786
- if model_config.get('has_linear_attn'):
787
- lt = model_config.get('layer_types', [])
788
- n_lin = lt.count('linear_attention') if lt else 0
789
- n_full = lt.count('full_attention') if lt else 0
790
- print(f" layer_types: {n_lin} linear_attn + {n_full} full_attn")
791
- print()
792
- else:
793
- print(f" WARNING: config file not found: {cfg_path}")
794
- break
795
-
796
  # Check for imatrix
797
  imatrix_data = None
798
  for i, arg in enumerate(sys.argv):
@@ -967,13 +852,6 @@ def main():
967
  out_data_offset += out_size
968
  out_data_offset = align_offset(out_data_offset)
969
 
970
- # ── Detect Architecture ──
971
- arch = 'llama'
972
- for key, vtype, val in kv_pairs:
973
- if key == 'general.architecture' and vtype == 8:
974
- arch = val.decode('utf-8', errors='ignore')
975
- break
976
-
977
  # ── Update KV pairs ──
978
  updated_kv = []
979
  if keep_metadata:
 
1
  #!/usr/bin/env python3
2
  """
3
+ HExState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
4
 
5
  Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
6
+ and re-quantizes eligible weight tensors to Q2_K using numpy.
 
7
 
8
+ This bypasses the tokenizer parsing problem entirely — the source GGUF
9
+ (from llama.cpp's convert_hf_to_gguf.py) has correct metadata.
 
 
 
 
10
 
11
  Usage:
12
+ python3 hexstate_requantize.py input.gguf output.gguf
 
 
 
 
 
 
 
13
  """
14
 
15
  import struct
 
17
  import time
18
  import os
19
  import io
 
20
  import ctypes
21
  import numpy as np
22
 
 
264
  13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 30: "BF16",
265
  }
266
 
267
+ # Block sizes and byte sizes for each type
268
  TYPE_BLOCK_SIZE = {
269
  0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
270
  8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
271
  13: 256, 14: 256, 15: 256, 30: 1,
272
  }
273
  TYPE_BLOCK_BYTES = {
274
+ 0: 4, 1: 2, 2: 18, 3: 20, 6: 20, 7: 22,
275
  8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
276
  13: 176, 14: 210, 15: 292, 30: 2,
277
  }
 
667
  return True
668
 
669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  def main():
671
  if len(sys.argv) < 3:
672
+ print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf> [--keep-metadata]")
 
 
 
 
 
 
 
673
  sys.exit(1)
674
 
675
  input_path = sys.argv[1]
 
678
  quantize_none = '--quantize-none' in sys.argv
679
  q2all = '--q2all' in sys.argv
680
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
  # Check for imatrix
682
  imatrix_data = None
683
  for i, arg in enumerate(sys.argv):
 
852
  out_data_offset += out_size
853
  out_data_offset = align_offset(out_data_offset)
854
 
 
 
 
 
 
 
 
855
  # ── Update KV pairs ──
856
  updated_kv = []
857
  if keep_metadata:
makefile.quantize CHANGED
@@ -6,17 +6,17 @@
6
  # ═══════════════════════════════════════════════════════════════════════════
7
 
8
  CC = gcc
9
- CFLAGS = -O2 -std=gnu99 -shared -fPIC -Wall -Wno-unused-function -Wno-unused-variable -fopenmp
10
  LDFLAGS = -lm -lgmp -lmpfr -fopenmp
11
 
12
- # Include local directory for HexState headers
13
- INCLUDES = -I.
14
 
15
  # Source files — quantizer + HExState engine dependencies (no bigint)
16
  SRCS = hexstate_quantize.c \
17
- quhit_triality.c \
18
- quhit_hexagram.c \
19
- s6_exotic.c
20
 
21
  TARGET = libhexstate_q2k.so
22
 
 
6
  # ═══════════════════════════════════════════════════════════════════════════
7
 
8
  CC = gcc
9
+ CFLAGS = -O3 -march=native -ffast-math -fopenmp -std=gnu99 -shared -fPIC -Wall -Wno-unused-function -Wno-unused-variable
10
  LDFLAGS = -lm -lgmp -lmpfr -fopenmp
11
 
12
+ # Include parent directory for HExState headers
13
+ INCLUDES = -I..
14
 
15
  # Source files — quantizer + HExState engine dependencies (no bigint)
16
  SRCS = hexstate_quantize.c \
17
+ ../quhit_triality.c \
18
+ ../quhit_hexagram.c \
19
+ ../s6_exotic.c
20
 
21
  TARGET = libhexstate_q2k.so
22