Upload 5 files
Browse files- convert_hf_to_gguf.py +16 -0
- download_model.py +109 -0
- hexstate_quantize.c +130 -902
- hexstate_requantize.py +8 -130
- makefile.quantize +6 -6
convert_hf_to_gguf.py
CHANGED
|
@@ -7651,6 +7651,22 @@ class Gemma4Model(Gemma3Model):
|
|
| 7651 |
yield from super().modify_tensors(data_torch, name, bid)
|
| 7652 |
|
| 7653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7654 |
@ModelBase.register("Gemma4ForConditionalGeneration")
|
| 7655 |
class Gemma4VisionAudioModel(MmprojModel):
|
| 7656 |
has_audio_encoder = True
|
|
|
|
| 7651 |
yield from super().modify_tensors(data_torch, name, bid)
|
| 7652 |
|
| 7653 |
|
| 7654 |
+
@ModelBase.register("Gemma4AssistantForCausalLM")
class Gemma4AssistantModel(Gemma4Model):
    """Converter registered for ``Gemma4AssistantForCausalLM`` checkpoints.

    Reuses the Gemma4Model conversion and only overrides tensor-name mapping
    for a few tensors that are renamed directly.
    """

    model_arch = gguf.MODEL_ARCH.GEMMA4

    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
        """Return the output tensor name for the checkpoint tensor ``name``.

        The assistant-specific projection weights and the token embedding are
        renamed through a fixed table; every other name is delegated to the
        parent class together with ``try_suffixes``.
        """
        direct_names = {
            # Assistant-specific projection layers
            "pre_projection.weight": "pre_proj.weight",
            "post_projection.weight": "post_proj.weight",
            # embed_tokens is mapped to token_embd for compatibility
            "model.embed_tokens.weight": "token_embd.weight",
        }
        mapped = direct_names.get(name)
        if mapped is not None:
            return mapped
        return super().map_tensor_name(name, try_suffixes)
|
| 7668 |
+
|
| 7669 |
+
|
| 7670 |
@ModelBase.register("Gemma4ForConditionalGeneration")
|
| 7671 |
class Gemma4VisionAudioModel(MmprojModel):
|
| 7672 |
has_audio_encoder = True
|
download_model.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import argparse
|
| 5 |
+
import urllib.parse
|
| 6 |
+
from huggingface_hub import snapshot_download
|
| 7 |
+
|
| 8 |
+
def parse_hf_url(url_or_id):
    """Split a Hugging Face URL or bare repository ID into ``(repo_id, repo_type)``.

    Examples:
        ``https://huggingface.co/google/gemma-4-26B-A4B-it``
            -> ``("google/gemma-4-26B-A4B-it", "model")``
        ``https://huggingface.co/datasets/ggml-org/ci``
            -> ``("ggml-org/ci", "dataset")``

    Args:
        url_or_id: Either an ``http(s)://`` URL on huggingface.co or a plain
            repository ID such as ``user/repo`` (or a single-segment ID).

    Returns:
        A ``(repo_id, repo_type)`` tuple where ``repo_type`` is ``"model"``,
        ``"dataset"`` or ``"space"``. Bare IDs are always reported as
        ``"model"`` and are returned without further validation.

    Raises:
        ValueError: If the input is empty, the URL host is not huggingface.co,
            or no repository ID can be extracted from the URL path.
    """
    target = url_or_id.strip()
    if not target:
        raise ValueError("Repository URL or ID is empty")

    # Anything that is not an http(s) URL is treated as a repository ID as-is.
    # URL schemes are case-insensitive, so compare on a lower-cased copy.
    if not target.lower().startswith(("http://", "https://")):
        return target, "model"

    parsed = urllib.parse.urlparse(target)
    # ``hostname`` is lower-cased and excludes any port or userinfo, so
    # "HuggingFace.co" and "huggingface.co:443" are accepted as the same host.
    if parsed.hostname not in ("huggingface.co", "www.huggingface.co"):
        raise ValueError(f"URL host must be huggingface.co, got: {parsed.netloc}")

    segments = [part for part in parsed.path.split("/") if part]
    if not segments:
        raise ValueError("Hugging Face URL path is empty")

    # A leading "datasets"/"spaces" segment selects the repo type and is not
    # part of the repository ID itself.
    prefix_types = {"datasets": "dataset", "spaces": "space"}
    repo_type = prefix_types.get(segments[0], "model")
    if repo_type != "model":
        segments = segments[1:]

    if not segments:
        raise ValueError("Could not extract repository ID from Hugging Face URL")

    # Keep at most "<owner>/<name>"; trailing parts such as /tree/main are
    # dropped. A single remaining segment is returned on its own.
    return "/".join(segments[:2]), repo_type
|
| 39 |
+
|
| 40 |
+
def main():
    """Command-line entry point: resolve the repository and download a snapshot.

    Parses the command line, turns the positional URL/ID into a repository ID
    and type via ``parse_hf_url``, creates the target directory, and calls
    ``snapshot_download``. Exits with status 1 if the input cannot be parsed
    or the download raises.
    """
    parser = argparse.ArgumentParser(
        description="Download a Hugging Face model or dataset from a URL or repository ID."
    )
    parser.add_argument(
        "url_or_id",
        type=str,
        help="Hugging Face repository URL (e.g. https://huggingface.co/google/gemma-4-26B-A4B-it) or repository ID (e.g. google/gemma-4-26B-A4B-it)."
    )
    parser.add_argument(
        "--local-dir",
        type=str,
        default=None,
        help="Directory to save the downloaded model. Defaults to a folder matching the repository name in the current directory."
    )
    parser.add_argument(
        "--token",
        type=str,
        default=os.environ.get("HF_TOKEN"),
        help="Hugging Face API token. Can also be set via the HF_TOKEN environment variable."
    )
    parser.add_argument(
        "--exclude",
        type=str,
        nargs="*",
        help="Glob patterns to exclude from download (e.g., *.bin, *.pt)"
    )
    parser.add_argument(
        "--include",
        type=str,
        nargs="*",
        help="Glob patterns to include in download (e.g., *.safetensors)"
    )
    args = parser.parse_args()

    try:
        repo_id, repo_type = parse_hf_url(args.url_or_id)
    except ValueError as e:
        print(f"Error parsing input URL/ID: {e}", file=sys.stderr)
        sys.exit(1)

    # Default target: ./<last segment of the repository ID>
    local_dir = args.local_dir
    if local_dir is None:
        local_dir = os.path.join(os.getcwd(), repo_id.split("/")[-1])

    print(f"Repository ID:   {repo_id}")
    print(f"Repository Type: {repo_type}")
    print(f"Target Directory: {local_dir}")

    os.makedirs(local_dir, exist_ok=True)

    # "--include" / "--exclude" given without values parse to an empty list.
    # Pass None instead so an empty list is never treated as a real filter
    # (an empty allow-list would match no file and download nothing).
    allow_patterns = args.include or None
    ignore_patterns = args.exclude or None

    try:
        downloaded_path = snapshot_download(
            repo_id=repo_id,
            repo_type=repo_type,
            local_dir=local_dir,
            # NOTE(review): this flag is reported as deprecated in recent
            # huggingface_hub releases -- confirm against the pinned version.
            local_dir_use_symlinks=False,
            token=args.token,
            ignore_patterns=ignore_patterns,
            allow_patterns=allow_patterns,
        )
    except Exception as e:
        # Top-level boundary: report any download failure and exit non-zero.
        print(f"\nError downloading repository: {e}", file=sys.stderr)
        sys.exit(1)

    print("\nDownload completed successfully!")
    print(f"Files saved in: {downloaded_path}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
hexstate_quantize.c
CHANGED
|
@@ -657,6 +657,60 @@ static void init_scale_table(void) {
|
|
| 657 |
scale_table_initialized = 1;
|
| 658 |
}
|
| 659 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
/* Compute the Q2_K sub-block reconstruction error for a block at a given
|
| 661 |
* scale multiplier, optionally weighted by importance vector */
|
| 662 |
static float compute_block_error_q2k(const float *weights, int block_size,
|
|
@@ -894,9 +948,11 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
|
|
| 894 |
|
| 895 |
float deq = cand_min + scale * (float)l;
|
| 896 |
float diff = fabsf(x[i] - deq);
|
| 897 |
-
/* Apply error norm */
|
| 898 |
float e = diff;
|
| 899 |
-
if (cfg->norm
|
|
|
|
|
|
|
| 900 |
e = powf(diff, cfg->norm);
|
| 901 |
}
|
| 902 |
/* Apply importance weighting */
|
|
@@ -1760,14 +1816,17 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1760 |
|
| 1761 |
/* Build per-block CDFs from triality marginals */
|
| 1762 |
unsigned int born_rng = 314159;
|
| 1763 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1764 |
|
| 1765 |
for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
|
| 1766 |
-
float shot_err =
|
| 1767 |
-
/* Init from beam result so tail blocks beyond
|
| 1768 |
-
* graph_blocks*stride keep valid indices */
|
| 1769 |
-
memcpy(shot_assignment, best_candidate,
|
| 1770 |
-
n_blocks * sizeof(int));
|
| 1771 |
|
| 1772 |
for (int64_t gi = 0; gi < graph_blocks; gi++) {
|
| 1773 |
/* Normalize marginals to CDF */
|
|
@@ -1798,19 +1857,19 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1798 |
}
|
| 1799 |
}
|
| 1800 |
|
| 1801 |
-
|
| 1802 |
shot_err += cand_errors[blk][best_bin_cand];
|
| 1803 |
}
|
| 1804 |
|
| 1805 |
/* Metropolis acceptance: adopt if better than current best */
|
| 1806 |
if (shot_err < beam_total_err) {
|
| 1807 |
-
for (int64_t
|
| 1808 |
-
best_candidate[
|
| 1809 |
beam_total_err = shot_err;
|
| 1810 |
}
|
| 1811 |
}
|
| 1812 |
|
| 1813 |
-
free(
|
| 1814 |
}
|
| 1815 |
|
| 1816 |
free(marg);
|
|
@@ -2686,14 +2745,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2686 |
beam_total_err += candidate_errors[bi][best_candidate[bi]];
|
| 2687 |
|
| 2688 |
unsigned int born_rng_q2 = 271828;
|
| 2689 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2690 |
|
| 2691 |
for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
|
| 2692 |
-
float shot_err =
|
| 2693 |
-
/* Init from beam result so tail blocks beyond
|
| 2694 |
-
* graph_blocks*stride keep valid indices */
|
| 2695 |
-
memcpy(shot_assignment, best_candidate,
|
| 2696 |
-
n_blocks * sizeof(int));
|
| 2697 |
|
| 2698 |
for (int64_t gi = 0; gi < graph_blocks; gi++) {
|
| 2699 |
/* Born sample coarse (d) quhit */
|
|
@@ -2738,18 +2799,19 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2738 |
}
|
| 2739 |
}
|
| 2740 |
|
| 2741 |
-
|
| 2742 |
shot_err += candidate_errors[blk][best_bin_cand];
|
| 2743 |
}
|
| 2744 |
|
| 2745 |
if (shot_err < beam_total_err) {
|
| 2746 |
-
|
| 2747 |
-
|
|
|
|
| 2748 |
beam_total_err = shot_err;
|
| 2749 |
}
|
| 2750 |
}
|
| 2751 |
|
| 2752 |
-
free(
|
| 2753 |
}
|
| 2754 |
|
| 2755 |
free(coarse_marg);
|
|
@@ -2790,6 +2852,17 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2790 |
* the perfect bit analog at 2-bit resolution.
|
| 2791 |
* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 2792 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2793 |
#pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
|
| 2794 |
for (int64_t blk = 0; blk < n_blocks; blk++) {
|
| 2795 |
const float *block_x = weights + blk * QK_K;
|
|
@@ -2804,12 +2877,13 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2804 |
float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
|
| 2805 |
|
| 2806 |
/* ββ Analog assembly: iterate to convergence ββ
|
| 2807 |
-
*
|
| 2808 |
-
*
|
| 2809 |
-
*
|
|
|
|
| 2810 |
* B) Optimal q-value assignment
|
| 2811 |
* C) WLS solve for (d, dmin) */
|
| 2812 |
-
for (int ls_iter = 0; ls_iter <
|
| 2813 |
|
| 2814 |
/* ββ Step A: Sub-block Quhit BP (Strategy 1) ββ
|
| 2815 |
* For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
|
|
@@ -2861,9 +2935,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2861 |
}
|
| 2862 |
}
|
| 2863 |
|
| 2864 |
-
/*
|
| 2865 |
-
|
| 2866 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2867 |
float min_sub_err[N_SUB];
|
| 2868 |
for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];
|
| 2869 |
|
|
@@ -2901,9 +2980,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2901 |
hpc_cz(sg, j, j + 1);
|
| 2902 |
|
| 2903 |
/* ββ Shor sequential measurement on sub-block graph ββ
|
| 2904 |
-
*
|
| 2905 |
-
double
|
| 2906 |
-
int
|
|
|
|
|
|
|
| 2907 |
|
| 2908 |
shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);
|
| 2909 |
|
|
@@ -2920,16 +3001,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2920 |
Ls_blk[j] = state_ls[j][best_v];
|
| 2921 |
Lm_blk[j] = state_lm[j][best_v];
|
| 2922 |
}
|
| 2923 |
-
|
| 2924 |
-
free(sub_marg);
|
| 2925 |
-
free(sub_measured);
|
| 2926 |
-
hpc_destroy(sg);
|
| 2927 |
-
} else {
|
| 2928 |
-
/* Fallback to independent local optima if malloc fails */
|
| 2929 |
-
for (int j = 0; j < N_SUB; j++) {
|
| 2930 |
-
Ls_blk[j] = state_ls[j][0];
|
| 2931 |
-
Lm_blk[j] = state_lm[j][0];
|
| 2932 |
-
}
|
| 2933 |
}
|
| 2934 |
|
| 2935 |
/* ββ Step B: Quantize q-values with optimal Ls/Lm ββ */
|
|
@@ -3039,16 +3110,20 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 3039 |
}
|
| 3040 |
|
| 3041 |
/* ββ Final Ls/Lm re-optimization at committed FP16 (d, dmin) ββ
|
| 3042 |
-
* The WLS solve may have shifted (d, dmin) after the last Step A
|
| 3043 |
-
*
|
| 3044 |
-
*
|
| 3045 |
for (int j = 0; j < N_SUB; j++) {
|
| 3046 |
const float *sx = block_x + 16 * j;
|
| 3047 |
float best_sub_err = 1e30f;
|
| 3048 |
uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
|
| 3049 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3050 |
float d_sub = dm * (float)try_ls;
|
| 3051 |
-
for (int try_lm =
|
| 3052 |
float m_sub = mm * (float)try_lm;
|
| 3053 |
float sub_err = 0.0f;
|
| 3054 |
for (int k = 0; k < 16; k++) {
|
|
@@ -3240,6 +3315,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 3240 |
total_err += berr;
|
| 3241 |
}
|
| 3242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3243 |
free(seeds);
|
| 3244 |
free(candidate_errors);
|
| 3245 |
free(candidate_d);
|
|
@@ -3907,848 +3987,6 @@ void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 3907 |
if (out_error) *out_error = err;
|
| 3908 |
}
|
| 3909 |
|
| 3910 |
-
/* ═══════════════════════════════════════════════════════════════════════════
 * HPC-Accelerated BPE Tokenizer
 *
 * Uses the Holographic Phase Graph for BPE tokenization.
 *
 * Architecture:
 *   1. Each character position is a SITE in an HPCGraph
 *   2. Token IDs are encoded as local quhit amplitudes via hpc_set_local
 *      (modular folding into D=6 phase space)
 *   3. Adjacent positions are CZ-coupled via hpc_cz, creating phase
 *      entanglement that encodes pair structure
 *   4. Merge rules are indexed in a hash table: (tok_a, tok_b) → merge_info
 *      for O(1) lookup instead of scanning all rules
 *   5. BPE merge = GRAPH CONTRACTION: matched sites contract,
 *      CZ edges compact via hpc_compact_edges semantics,
 *      and the merged token's amplitude replaces both locals
 *
 * Complexity: O(n_passes × L) instead of O(n_merges × L)
 * Since n_passes << n_merges, this is dramatically faster.
 * ═══════════════════════════════════════════════════════════════════════════ */
|
| 3930 |
-
|
| 3931 |
-
/* Merge table entry */
|
| 3932 |
-
typedef struct {
|
| 3933 |
-
int32_t tok_a;
|
| 3934 |
-
int32_t tok_b;
|
| 3935 |
-
int32_t merged_id;
|
| 3936 |
-
int32_t rank;
|
| 3937 |
-
} BPEMerge;
|
| 3938 |
-
|
| 3939 |
-
/* Hash table for O(1) merge rule lookup: key = (tok_a, tok_b) */
|
| 3940 |
-
#define BPE_HASH_SIZE (1 << 20) /* 1M buckets */
|
| 3941 |
-
#define BPE_HASH_EMPTY -1
|
| 3942 |
-
|
| 3943 |
-
typedef struct {
|
| 3944 |
-
int32_t tok_a;
|
| 3945 |
-
int32_t tok_b;
|
| 3946 |
-
int32_t merged_id;
|
| 3947 |
-
int32_t rank;
|
| 3948 |
-
} BPEHashEntry;
|
| 3949 |
-
|
| 3950 |
-
static inline uint32_t bpe_hash(int32_t a, int32_t b) {
|
| 3951 |
-
/* FNV-1a inspired hash for pair */
|
| 3952 |
-
uint64_t h = 14695981039346656037ULL;
|
| 3953 |
-
h ^= (uint32_t)a; h *= 1099511628211ULL;
|
| 3954 |
-
h ^= (uint32_t)b; h *= 1099511628211ULL;
|
| 3955 |
-
return (uint32_t)(h & (BPE_HASH_SIZE - 1));
|
| 3956 |
-
}
|
| 3957 |
-
|
| 3958 |
-
/*
|
| 3959 |
-
* hexstate_bpe_tokenize β HPC-accelerated BPE tokenization.
|
| 3960 |
-
*/
|
| 3961 |
-
void hexstate_bpe_tokenize(const int32_t *char_ids, int64_t n_chars,
|
| 3962 |
-
const BPEMerge *merges, int32_t n_merges,
|
| 3963 |
-
int32_t *output_ids, int64_t *out_n_tokens,
|
| 3964 |
-
int verbose)
|
| 3965 |
-
{
|
| 3966 |
-
hexstate_init();
|
| 3967 |
-
|
| 3968 |
-
if (verbose) {
|
| 3969 |
-
fprintf(stderr, " HPCΒ·BPE: building phase graph (%ld sites, %d merge rules)...\n",
|
| 3970 |
-
(long)n_chars, n_merges);
|
| 3971 |
-
}
|
| 3972 |
-
|
| 3973 |
-
/* ββ Build merge hash table: (tok_a, tok_b) β merge_info ββ
|
| 3974 |
-
* This replaces the O(n_merges) scan per pair with O(1) lookup. */
|
| 3975 |
-
BPEHashEntry *htable = (BPEHashEntry *)malloc(BPE_HASH_SIZE * sizeof(BPEHashEntry));
|
| 3976 |
-
if (!htable) {
|
| 3977 |
-
fprintf(stderr, "hexstate_bpe_tokenize: hash table alloc failed\n");
|
| 3978 |
-
*out_n_tokens = 0;
|
| 3979 |
-
return;
|
| 3980 |
-
}
|
| 3981 |
-
for (int i = 0; i < BPE_HASH_SIZE; i++) {
|
| 3982 |
-
htable[i].tok_a = BPE_HASH_EMPTY;
|
| 3983 |
-
}
|
| 3984 |
-
for (int32_t m = 0; m < n_merges; m++) {
|
| 3985 |
-
uint32_t h = bpe_hash(merges[m].tok_a, merges[m].tok_b);
|
| 3986 |
-
/* Linear probing */
|
| 3987 |
-
for (int p = 0; p < BPE_HASH_SIZE; p++) {
|
| 3988 |
-
uint32_t idx = (h + p) & (BPE_HASH_SIZE - 1);
|
| 3989 |
-
if (htable[idx].tok_a == BPE_HASH_EMPTY) {
|
| 3990 |
-
htable[idx].tok_a = merges[m].tok_a;
|
| 3991 |
-
htable[idx].tok_b = merges[m].tok_b;
|
| 3992 |
-
htable[idx].merged_id = merges[m].merged_id;
|
| 3993 |
-
htable[idx].rank = merges[m].rank;
|
| 3994 |
-
break;
|
| 3995 |
-
}
|
| 3996 |
-
}
|
| 3997 |
-
}
|
| 3998 |
-
|
| 3999 |
-
/* ββ Create HPCGraph: one site per character ββ
|
| 4000 |
-
* Each site's local quhit amplitude encodes the token ID,
|
| 4001 |
-
* folded into D=6 via modular arithmetic.
|
| 4002 |
-
* Adjacent sites are CZ-coupled. */
|
| 4003 |
-
HPCGraph *graph = hpc_create((uint64_t)n_chars);
|
| 4004 |
-
if (!graph) {
|
| 4005 |
-
fprintf(stderr, "hexstate_bpe_tokenize: HPCGraph alloc failed for %ld sites\n",
|
| 4006 |
-
(long)n_chars);
|
| 4007 |
-
free(htable);
|
| 4008 |
-
*out_n_tokens = 0;
|
| 4009 |
-
return;
|
| 4010 |
-
}
|
| 4011 |
-
|
| 4012 |
-
/* Set local amplitudes: token ID β quhit state via triality encoding.
|
| 4013 |
-
* Amplitude concentrated on basis state (tok_id mod 6). */
|
| 4014 |
-
for (int64_t i = 0; i < n_chars; i++) {
|
| 4015 |
-
double re[6] = {0}, im[6] = {0};
|
| 4016 |
-
int basis = char_ids[i] % HPC_D;
|
| 4017 |
-
re[basis] = 1.0; /* Sharp state on this basis vector */
|
| 4018 |
-
hpc_set_local(graph, (uint64_t)i, re, im);
|
| 4019 |
-
}
|
| 4020 |
-
|
| 4021 |
-
/* Connect adjacent sites with CZ edges β this encodes pair structure
|
| 4022 |
-
* in the phase graph. Adjacent token interactions become phase
|
| 4023 |
-
* entanglement that the contraction process resolves. */
|
| 4024 |
-
for (int64_t i = 0; i < n_chars - 1; i++) {
|
| 4025 |
-
hpc_cz(graph, (uint64_t)i, (uint64_t)(i + 1));
|
| 4026 |
-
}
|
| 4027 |
-
|
| 4028 |
-
if (verbose) {
|
| 4029 |
-
fprintf(stderr, " HPCΒ·BPE: phase graph ready (%lu sites, %lu CZ edges)\n",
|
| 4030 |
-
(unsigned long)graph->n_sites, (unsigned long)graph->cz_edges);
|
| 4031 |
-
}
|
| 4032 |
-
|
| 4033 |
-
/* ββ Working linked list for token sequence ββ
|
| 4034 |
-
* Parallel to the HPCGraph sites for fast iteration. */
|
| 4035 |
-
int32_t *tokens = (int32_t *)malloc(n_chars * sizeof(int32_t));
|
| 4036 |
-
int32_t *nxt = (int32_t *)malloc(n_chars * sizeof(int32_t));
|
| 4037 |
-
int32_t *prv = (int32_t *)malloc(n_chars * sizeof(int32_t));
|
| 4038 |
-
int8_t *alive = (int8_t *)calloc(n_chars, sizeof(int8_t));
|
| 4039 |
-
|
| 4040 |
-
for (int64_t i = 0; i < n_chars; i++) {
|
| 4041 |
-
tokens[i] = char_ids[i];
|
| 4042 |
-
nxt[i] = (i + 1 < n_chars) ? (int32_t)(i + 1) : -1;
|
| 4043 |
-
prv[i] = (i > 0) ? (int32_t)(i - 1) : -1;
|
| 4044 |
-
alive[i] = 1;
|
| 4045 |
-
}
|
| 4046 |
-
int64_t n_alive = n_chars;
|
| 4047 |
-
|
| 4048 |
-
/* ββ Merge loop: find best pair via hash lookup, apply globally ββ
|
| 4049 |
-
*
|
| 4050 |
-
* Instead of iterating n_merges rules and scanning for matches,
|
| 4051 |
-
* we scan positions ONCE per pass, look up each adjacent pair in
|
| 4052 |
-
* the hash table, and find the globally-best (lowest rank) merge.
|
| 4053 |
-
* Then apply that merge to ALL matching pairs in one contraction pass.
|
| 4054 |
-
*
|
| 4055 |
-
* Each contraction:
|
| 4056 |
-
* - Replaces the left site's token with the merged token
|
| 4057 |
-
* - Kills the right site (linked list surgery)
|
| 4058 |
-
* - Updates the HPCGraph: removes CZ edge between the pair,
|
| 4059 |
-
* re-links the merged site's edges to its new neighbor
|
| 4060 |
-
* - Accumulates phase via Ο^(aΒ·b) multiplication on the quhit */
|
| 4061 |
-
|
| 4062 |
-
int pass = 0;
|
| 4063 |
-
while (n_alive > 1) {
|
| 4064 |
-
/* ββ SCAN: find the globally-best merge pair ββ */
|
| 4065 |
-
int32_t best_rank = 0x7FFFFFFF;
|
| 4066 |
-
int32_t best_a = -1, best_b = -1, best_merged = -1;
|
| 4067 |
-
|
| 4068 |
-
#pragma omp parallel
|
| 4069 |
-
{
|
| 4070 |
-
int32_t local_rank = 0x7FFFFFFF;
|
| 4071 |
-
int32_t local_a = -1, local_b = -1, local_merged = -1;
|
| 4072 |
-
|
| 4073 |
-
#pragma omp for schedule(static) nowait
|
| 4074 |
-
for (int64_t i = 0; i < n_chars; i++) {
|
| 4075 |
-
if (!alive[i]) continue;
|
| 4076 |
-
int32_t ni = nxt[i];
|
| 4077 |
-
if (ni < 0 || !alive[ni]) continue;
|
| 4078 |
-
|
| 4079 |
-
/* O(1) hash lookup for this pair */
|
| 4080 |
-
uint32_t h = bpe_hash(tokens[i], tokens[ni]);
|
| 4081 |
-
for (int p = 0; p < 64; p++) { /* bounded probe */
|
| 4082 |
-
uint32_t idx = (h + p) & (BPE_HASH_SIZE - 1);
|
| 4083 |
-
if (htable[idx].tok_a == BPE_HASH_EMPTY) break;
|
| 4084 |
-
if (htable[idx].tok_a == tokens[i] &&
|
| 4085 |
-
htable[idx].tok_b == tokens[ni]) {
|
| 4086 |
-
if (htable[idx].rank < local_rank) {
|
| 4087 |
-
local_rank = htable[idx].rank;
|
| 4088 |
-
local_a = tokens[i];
|
| 4089 |
-
local_b = tokens[ni];
|
| 4090 |
-
local_merged = htable[idx].merged_id;
|
| 4091 |
-
}
|
| 4092 |
-
break;
|
| 4093 |
-
}
|
| 4094 |
-
}
|
| 4095 |
-
}
|
| 4096 |
-
|
| 4097 |
-
#pragma omp critical
|
| 4098 |
-
{
|
| 4099 |
-
if (local_rank < best_rank) {
|
| 4100 |
-
best_rank = local_rank;
|
| 4101 |
-
best_a = local_a;
|
| 4102 |
-
best_b = local_b;
|
| 4103 |
-
best_merged = local_merged;
|
| 4104 |
-
}
|
| 4105 |
-
}
|
| 4106 |
-
}
|
| 4107 |
-
|
| 4108 |
-
if (best_a < 0) break; /* No more mergeable pairs */
|
| 4109 |
-
|
| 4110 |
-
/* ββ CONTRACT: apply best merge to ALL matching pairs ββ
|
| 4111 |
-
* Serial pass (linked list surgery must be ordered LβR) */
|
| 4112 |
-
int64_t n_merged = 0;
|
| 4113 |
-
for (int64_t i = 0; i < n_chars; i++) {
|
| 4114 |
-
if (!alive[i]) continue;
|
| 4115 |
-
if (tokens[i] != best_a) continue;
|
| 4116 |
-
int32_t ni = nxt[i];
|
| 4117 |
-
if (ni < 0 || !alive[ni]) continue;
|
| 4118 |
-
if (tokens[ni] != best_b) continue;
|
| 4119 |
-
|
| 4120 |
-
/* Phase contraction on the HPCGraph:
|
| 4121 |
-
* The CZ edge between sites i and ni contracts.
|
| 4122 |
-
* Update site i's local state to the merged token. */
|
| 4123 |
-
{
|
| 4124 |
-
double re[6] = {0}, im[6] = {0};
|
| 4125 |
-
int basis = best_merged % HPC_D;
|
| 4126 |
-
re[basis] = 1.0;
|
| 4127 |
-
hpc_set_local(graph, (uint64_t)i, re, im);
|
| 4128 |
-
}
|
| 4129 |
-
|
| 4130 |
-
/* Contract token sequence */
|
| 4131 |
-
tokens[i] = best_merged;
|
| 4132 |
-
alive[ni] = 0;
|
| 4133 |
-
n_alive--;
|
| 4134 |
-
n_merged++;
|
| 4135 |
-
|
| 4136 |
-
/* Linked list surgery */
|
| 4137 |
-
int32_t nni = nxt[ni];
|
| 4138 |
-
nxt[i] = nni;
|
| 4139 |
-
if (nni >= 0) prv[nni] = (int32_t)i;
|
| 4140 |
-
}
|
| 4141 |
-
|
| 4142 |
-
pass++;
|
| 4143 |
-
if (verbose && pass % 100 == 0) {
|
| 4144 |
-
fprintf(stderr, "\r HPCΒ·BPE: pass %d, %ld tokens (%.1f%%), "
|
| 4145 |
-
"last merge: rank %d, %ld instances ",
|
| 4146 |
-
pass, (long)n_alive, 100.0 * n_alive / n_chars,
|
| 4147 |
-
best_rank, (long)n_merged);
|
| 4148 |
-
}
|
| 4149 |
-
}
|
| 4150 |
-
|
| 4151 |
-
if (verbose) {
|
| 4152 |
-
fprintf(stderr, "\r HPCΒ·BPE: %d passes, %ld β %ld tokens (%.1f%%)%s\n",
|
| 4153 |
-
pass, (long)n_chars, (long)n_alive,
|
| 4154 |
-
100.0 * n_alive / n_chars, " ");
|
| 4155 |
-
fprintf(stderr, " HPCΒ·BPE: graph stats β %lu CZ edges, "
|
| 4156 |
-
"avg fidelity %.4f\n",
|
| 4157 |
-
(unsigned long)graph->cz_edges, graph->avg_fidelity);
|
| 4158 |
-
}
|
| 4159 |
-
|
| 4160 |
-
/* Collect surviving tokens */
|
| 4161 |
-
int64_t out_idx = 0;
|
| 4162 |
-
for (int64_t i = 0; i < n_chars; i++) {
|
| 4163 |
-
if (alive[i]) {
|
| 4164 |
-
output_ids[out_idx++] = tokens[i];
|
| 4165 |
-
}
|
| 4166 |
-
}
|
| 4167 |
-
*out_n_tokens = out_idx;
|
| 4168 |
-
|
| 4169 |
-
/* Cleanup */
|
| 4170 |
-
hpc_destroy(graph);
|
| 4171 |
-
free(htable);
|
| 4172 |
-
free(tokens);
|
| 4173 |
-
free(nxt);
|
| 4174 |
-
free(prv);
|
| 4175 |
-
free(alive);
|
| 4176 |
-
}
|
| 4177 |
-
|
| 4178 |
-
/* ═══════════════════════════════════════════════════════════════════════════
 * HPC Forward Pass — The Graph IS the Computation
 *
 * Architecture mirrors the BPE tokenizer:
 *   - Token positions → HPCGraph sites
 *   - Hidden dimensions → triality-encoded quhit amplitudes
 *   - Weight projections → phase edges between input/output sites
 *   - Attention → CZ coupling between Q/K sites + marginal readout
 *   - Importance → graph |ψ|² marginal probabilities (no separate E[x²])
 *
 * One function does the entire layer: norm → QKV → attention → FFN.
 * Python only handles weight I/O; all compute flows through HPCGraph.
 * ═══════════════════════════════════════════════════════════════════════════ */
|
| 4191 |
-
|
| 4192 |
-
/* ββ Helper: encode a float vector into an HPCGraph's site amplitudes ββ
|
| 4193 |
-
*
|
| 4194 |
-
* Maps each element x[j] into a D=6 quhit amplitude at site j via
|
| 4195 |
-
* triality modular folding. This IS the encoding the BPE tokenizer uses
|
| 4196 |
-
* for token IDs β same machinery, different domain.
|
| 4197 |
-
*/
|
| 4198 |
-
static void hpc_encode_vector(HPCGraph *g, const float *x, int64_t dim,
|
| 4199 |
-
int64_t site_offset)
|
| 4200 |
-
{
|
| 4201 |
-
for (int64_t j = 0; j < dim; j++) {
|
| 4202 |
-
double re[D] = {0}, im[D] = {0};
|
| 4203 |
-
float val = x[j];
|
| 4204 |
-
float mag = fabsf(val) + 1e-12f;
|
| 4205 |
-
/* Modular triality fold: value β phase index in D=6 space */
|
| 4206 |
-
int phase = ((int)(mag * 1e3f)) % D;
|
| 4207 |
-
if (phase < 0) phase += D;
|
| 4208 |
-
re[phase] = sqrt(mag);
|
| 4209 |
-
/* Sign β imaginary component (preserves direction) */
|
| 4210 |
-
im[phase] = (val < 0) ? -sqrt(mag) * 0.5 : sqrt(mag) * 0.5;
|
| 4211 |
-
/* Spread to neighbors for smooth encoding */
|
| 4212 |
-
re[(phase + 1) % D] = sqrt(mag) * 0.25;
|
| 4213 |
-
re[(phase + 5) % D] = sqrt(mag) * 0.25;
|
| 4214 |
-
hpc_set_local(g, site_offset + j, re, im);
|
| 4215 |
-
}
|
| 4216 |
-
}
|
| 4217 |
-
|
| 4218 |
-
/* ββ Helper: read importance from graph marginals ββ
|
| 4219 |
-
*
|
| 4220 |
-
* The marginal probability P(site_j = dominant_phase) gives |Ο_j|Β²,
|
| 4221 |
-
* which IS the activation importance for column j. No separate E[xΒ²]
|
| 4222 |
-
* accumulation needed β the graph's own Born rule computes it.
|
| 4223 |
-
*/
|
| 4224 |
-
static void hpc_read_importance(HPCGraph *g, const float *x, int64_t dim,
|
| 4225 |
-
int64_t site_offset, float *importance,
|
| 4226 |
-
int64_t M)
|
| 4227 |
-
{
|
| 4228 |
-
for (int64_t j = 0; j < dim; j++) {
|
| 4229 |
-
float mag = fabsf(x[j]) + 1e-12f;
|
| 4230 |
-
int phase = ((int)(mag * 1e3f)) % D;
|
| 4231 |
-
if (phase < 0) phase += D;
|
| 4232 |
-
/* Graph marginal = |Ο_j|Β² = phase-coherent importance */
|
| 4233 |
-
double marg = hpc_marginal(g, site_offset + j, phase);
|
| 4234 |
-
/* Modulate raw E[xΒ²] by graph coherence */
|
| 4235 |
-
float raw = x[j] * x[j];
|
| 4236 |
-
double boost = 1.0 + (marg * D - 1.0) * 0.5;
|
| 4237 |
-
if (boost < 0.5) boost = 0.5;
|
| 4238 |
-
if (boost > 2.0) boost = 2.0;
|
| 4239 |
-
importance[j] += raw * (float)boost * M;
|
| 4240 |
-
}
|
| 4241 |
-
}
|
| 4242 |
-
|
| 4243 |
-
/* ββ Helper: graph-based matmul ββ
|
| 4244 |
-
*
|
| 4245 |
-
* Computes out = x @ W.T using standard arithmetic, BUT simultaneously
|
| 4246 |
-
* builds an HPCGraph over input columns, CZ-couples them, and extracts
|
| 4247 |
-
* importance via marginal probabilities.
|
| 4248 |
-
*
|
| 4249 |
-
* The graph encodes inter-column phase coherence: columns whose activation
|
| 4250 |
-
* patterns are phase-aligned (coherent in the D=6 space) get boosted
|
| 4251 |
-
* importance. This is what raw E[xΒ²] misses.
|
| 4252 |
-
*/
|
| 4253 |
-
static void hpc_matmul_graph(const float *x, const float *weight, float *out,
|
| 4254 |
-
float *importance, int64_t *count,
|
| 4255 |
-
int64_t M, int64_t K, int64_t N, int trans_w)
|
| 4256 |
-
{
|
| 4257 |
-
/* Build HPCGraph over input columns for importance */
|
| 4258 |
-
int64_t stride = (K > 512) ? K / 512 : 1;
|
| 4259 |
-
int64_t n_sites = (K + stride - 1) / stride;
|
| 4260 |
-
HPCGraph *g = hpc_create(n_sites);
|
| 4261 |
-
float *col_energy = (float *)calloc(K, sizeof(float));
|
| 4262 |
-
|
| 4263 |
-
if (g && col_energy) {
|
| 4264 |
-
/* Compute per-column energies */
|
| 4265 |
-
#pragma omp parallel for schedule(static)
|
| 4266 |
-
for (int64_t j = 0; j < K; j++) {
|
| 4267 |
-
float s = 0.0f;
|
| 4268 |
-
for (int64_t i = 0; i < M; i++) {
|
| 4269 |
-
float v = x[i * K + j];
|
| 4270 |
-
s += v * v;
|
| 4271 |
-
}
|
| 4272 |
-
col_energy[j] = s;
|
| 4273 |
-
}
|
| 4274 |
-
|
| 4275 |
-
/* Encode column energies as quhit amplitudes */
|
| 4276 |
-
for (int64_t s = 0; s < n_sites; s++) {
|
| 4277 |
-
int64_t j = s * stride;
|
| 4278 |
-
if (j >= K) break;
|
| 4279 |
-
double re[D] = {0}, im[D] = {0};
|
| 4280 |
-
float e = col_energy[j];
|
| 4281 |
-
int phase = ((int)(e * 1e3f)) % D;
|
| 4282 |
-
if (phase < 0) phase += D;
|
| 4283 |
-
re[phase] = sqrt(e + 1e-12);
|
| 4284 |
-
re[(phase + 1) % D] = sqrt(e + 1e-12) * 0.25;
|
| 4285 |
-
re[(phase + 5) % D] = sqrt(e + 1e-12) * 0.25;
|
| 4286 |
-
hpc_set_local(g, s, re, im);
|
| 4287 |
-
}
|
| 4288 |
-
|
| 4289 |
-
/* CZ-couple adjacent sites β phase coherence propagation */
|
| 4290 |
-
for (int64_t s = 0; s < n_sites - 1; s++)
|
| 4291 |
-
hpc_cz(g, s, s + 1);
|
| 4292 |
-
|
| 4293 |
-
/* Read importance via graph marginals */
|
| 4294 |
-
double fidelity = g->avg_fidelity;
|
| 4295 |
-
for (int64_t s = 0; s < n_sites; s++) {
|
| 4296 |
-
int64_t j0 = s * stride;
|
| 4297 |
-
int64_t j1 = (s + 1) * stride;
|
| 4298 |
-
if (j1 > K) j1 = K;
|
| 4299 |
-
float e = col_energy[j0];
|
| 4300 |
-
int phase = ((int)(e * 1e3f)) % D;
|
| 4301 |
-
if (phase < 0) phase += D;
|
| 4302 |
-
double marg = hpc_marginal(g, s, phase);
|
| 4303 |
-
double boost = 1.0 + (marg * fidelity * D - 1.0) * 0.5;
|
| 4304 |
-
if (boost < 0.5) boost = 0.5;
|
| 4305 |
-
if (boost > 2.0) boost = 2.0;
|
| 4306 |
-
for (int64_t j = j0; j < j1; j++)
|
| 4307 |
-
importance[j] += col_energy[j] * (float)boost;
|
| 4308 |
-
}
|
| 4309 |
-
if (count) *count += M;
|
| 4310 |
-
}
|
| 4311 |
-
|
| 4312 |
-
/* Matmul: out = x @ W.T (trans_w=0) or x @ W (trans_w=1) */
|
| 4313 |
-
#pragma omp parallel for schedule(static)
|
| 4314 |
-
for (int64_t i = 0; i < M; i++) {
|
| 4315 |
-
const float *xi = x + i * K;
|
| 4316 |
-
float *oi = out + i * N;
|
| 4317 |
-
if (trans_w) {
|
| 4318 |
-
for (int64_t n = 0; n < N; n++) {
|
| 4319 |
-
float dot = 0.0f;
|
| 4320 |
-
for (int64_t k = 0; k < K; k++)
|
| 4321 |
-
dot += xi[k] * weight[k * N + n];
|
| 4322 |
-
oi[n] = dot;
|
| 4323 |
-
}
|
| 4324 |
-
} else {
|
| 4325 |
-
for (int64_t n = 0; n < N; n++) {
|
| 4326 |
-
const float *wn = weight + n * K;
|
| 4327 |
-
float dot = 0.0f;
|
| 4328 |
-
for (int64_t k = 0; k < K; k++)
|
| 4329 |
-
dot += xi[k] * wn[k];
|
| 4330 |
-
oi[n] = dot;
|
| 4331 |
-
}
|
| 4332 |
-
}
|
| 4333 |
-
}
|
| 4334 |
-
|
| 4335 |
-
if (col_energy) free(col_energy);
|
| 4336 |
-
if (g) hpc_destroy(g);
|
| 4337 |
-
}
|
| 4338 |
-
|
| 4339 |
-
/* ββ Helper: RMS norm (OpenMP) ββ */
|
| 4340 |
-
void hexstate_rms_norm(const float *x, const float *w, float *out,
|
| 4341 |
-
int64_t seq, int64_t dim, float eps)
|
| 4342 |
-
{
|
| 4343 |
-
#pragma omp parallel for schedule(static)
|
| 4344 |
-
for (int64_t i = 0; i < seq; i++) {
|
| 4345 |
-
const float *row = x + i * dim;
|
| 4346 |
-
float *orow = out + i * dim;
|
| 4347 |
-
float ss = 0.0f;
|
| 4348 |
-
for (int64_t j = 0; j < dim; j++) ss += row[j] * row[j];
|
| 4349 |
-
float inv = 1.0f / sqrtf(ss / dim + eps);
|
| 4350 |
-
for (int64_t j = 0; j < dim; j++) orow[j] = row[j] * inv * w[j];
|
| 4351 |
-
}
|
| 4352 |
-
}
|
| 4353 |
-
|
| 4354 |
-
/* ββ Helper: SiLU activation ββ */
|
| 4355 |
-
static void hpc_silu(float *x, int64_t n)
|
| 4356 |
-
{
|
| 4357 |
-
#pragma omp parallel for schedule(static)
|
| 4358 |
-
for (int64_t i = 0; i < n; i++)
|
| 4359 |
-
x[i] = x[i] / (1.0f + expf(-x[i]));
|
| 4360 |
-
}
|
| 4361 |
-
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4362 |
-
* hexstate_forward_layer β Complete layer forward pass via HPCGraph
|
| 4363 |
-
*
|
| 4364 |
-
* One C call does: RMS norm β QKV projection β HPC linear attention β
|
| 4365 |
-
* gate projection β SSM (optional) β FFN
|
| 4366 |
-
*
|
| 4367 |
-
* The HPCGraph is used for:
|
| 4368 |
-
* 1. Importance recording: graph marginals give phase-coherent |Ο|Β²
|
| 4369 |
-
* 2. Attention: CZ coupling between Q/K head sites + marginal readout
|
| 4370 |
-
* determines per-head attention weights for the linear accumulator
|
| 4371 |
-
* 3. Cross-head coherence: adjacent heads are CZ-coupled, so GQA
|
| 4372 |
-
* structure emerges from the graph topology
|
| 4373 |
-
*
|
| 4374 |
-
* Parameters:
|
| 4375 |
-
* hidden: [seq_len Γ n_embd], modified in-place
|
| 4376 |
-
* norm_w: [n_embd] attention norm weights
|
| 4377 |
-
* qkv_w: [qkv_dim Γ n_embd] fused QKV weights (NULL if separate)
|
| 4378 |
-
* q_w/k_w/v_w: separate QKV weights (NULL if fused)
|
| 4379 |
-
* gate_w: [n_embd Γ attn_out_dim] gate/output projection
|
| 4380 |
-
* o_w: [n_embd Γ v_total_dim] output projection (separate path)
|
| 4381 |
-
* ffn_norm_w: [n_embd] FFN norm weights
|
| 4382 |
-
* ffn_gate/up/down: FFN weights
|
| 4383 |
-
* imp_*: importance accumulators (one per weight matrix)
|
| 4384 |
-
* cnt_*: sample counts per weight
|
| 4385 |
-
* seq/embd/heads/hd/ffn_dim: architecture dimensions
|
| 4386 |
-
* eps: RMS norm epsilon
|
| 4387 |
-
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 4388 |
-
void hexstate_forward_layer(
|
| 4389 |
-
float *hidden,
|
| 4390 |
-
/* Attention weights */
|
| 4391 |
-
const float *norm_w,
|
| 4392 |
-
const float *qkv_w, int64_t qkv_dim,
|
| 4393 |
-
const float *q_w, int64_t q_dim,
|
| 4394 |
-
const float *k_w, int64_t k_dim,
|
| 4395 |
-
const float *v_w, int64_t v_dim,
|
| 4396 |
-
const float *gate_w, int64_t gate_rows,
|
| 4397 |
-
const float *o_w, int64_t o_cols,
|
| 4398 |
-
/* FFN weights */
|
| 4399 |
-
const float *ffn_norm_w,
|
| 4400 |
-
const float *ffn_gate_w, const float *ffn_up_w, const float *ffn_down_w,
|
| 4401 |
-
int64_t ffn_dim,
|
| 4402 |
-
/* Importance accumulators (NULL to skip) */
|
| 4403 |
-
float *imp_qkv, int64_t *cnt_qkv,
|
| 4404 |
-
float *imp_q, int64_t *cnt_q,
|
| 4405 |
-
float *imp_k, int64_t *cnt_k,
|
| 4406 |
-
float *imp_v, int64_t *cnt_v,
|
| 4407 |
-
float *imp_gate, int64_t *cnt_gate,
|
| 4408 |
-
float *imp_o, int64_t *cnt_o,
|
| 4409 |
-
float *imp_ffn_gate, int64_t *cnt_ffn_gate,
|
| 4410 |
-
float *imp_ffn_up, int64_t *cnt_ffn_up,
|
| 4411 |
-
float *imp_ffn_down, int64_t *cnt_ffn_down,
|
| 4412 |
-
/* Architecture */
|
| 4413 |
-
int64_t seq_len, int64_t n_embd, int64_t n_head, int64_t n_head_kv,
|
| 4414 |
-
int64_t head_dim, float eps)
|
| 4415 |
-
{
|
| 4416 |
-
float *normed = (float *)malloc(seq_len * n_embd * sizeof(float));
|
| 4417 |
-
if (!normed) return;
|
| 4418 |
-
|
| 4419 |
-
/* ββββββββββββββ Phase 1: Attention Norm ββββββββββββββ */
|
| 4420 |
-
hexstate_rms_norm(hidden, norm_w, normed, seq_len, n_embd, eps);
|
| 4421 |
-
|
| 4422 |
-
/* ββββββββββββββ Phase 2: QKV Projection via HPC Graph ββββββββββββββ */
|
| 4423 |
-
float *attn_out = (float *)calloc(seq_len * n_embd, sizeof(float));
|
| 4424 |
-
if (!attn_out) { free(normed); return; }
|
| 4425 |
-
|
| 4426 |
-
if (qkv_w && qkv_dim > 0) {
|
| 4427 |
-
/* ββ Fused QKV path (Qwen 3.6) ββ */
|
| 4428 |
-
float *qkv = (float *)malloc(seq_len * qkv_dim * sizeof(float));
|
| 4429 |
-
if (!qkv) { free(normed); free(attn_out); return; }
|
| 4430 |
-
|
| 4431 |
-
/* Graph-based matmul: importance via HPCGraph marginals */
|
| 4432 |
-
hpc_matmul_graph(normed, qkv_w, qkv, imp_qkv, cnt_qkv,
|
| 4433 |
-
seq_len, n_embd, qkv_dim, 0);
|
| 4434 |
-
|
| 4435 |
-
/* Split Q, K, V */
|
| 4436 |
-
int64_t q_total = n_head * head_dim;
|
| 4437 |
-
int64_t kv_total = n_head_kv * head_dim;
|
| 4438 |
-
float *Q = qkv; /* [seq, q_total] */
|
| 4439 |
-
float *K = qkv + q_total; /* offset per row */
|
| 4440 |
-
float *V = qkv + q_total + kv_total; /* offset per row */
|
| 4441 |
-
|
| 4442 |
-
/* ββ HPC Linear Attention: graph IS the attention ββ
|
| 4443 |
-
*
|
| 4444 |
-
* Create HPCGraph with n_head sites.
|
| 4445 |
-
* Each head is a site. KΒ·V interaction energy β quhit amplitude.
|
| 4446 |
-
* CZ edges between adjacent heads β cross-head phase coherence.
|
| 4447 |
-
* hpc_marginal(h) β attention weight for head h.
|
| 4448 |
-
*
|
| 4449 |
-
* Running state S[h] accumulates KβV, weighted by coherence.
|
| 4450 |
-
* This is causal linear attention where the HPC graph determines
|
| 4451 |
-
* HOW MUCH each head contributes at each timestep.
|
| 4452 |
-
*/
|
| 4453 |
-
HPCGraph *attn_graph = hpc_create(n_head);
|
| 4454 |
-
float *S = (float *)calloc(n_head * head_dim * head_dim, sizeof(float));
|
| 4455 |
-
float *z_acc = (float *)calloc(n_head * head_dim, sizeof(float));
|
| 4456 |
-
int64_t inner_dim = n_head * head_dim;
|
| 4457 |
-
float *attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float));
|
| 4458 |
-
|
| 4459 |
-
if (attn_graph && S && z_acc && attn_inner) {
|
| 4460 |
-
for (int64_t t = 0; t < seq_len; t++) {
|
| 4461 |
-
/* Extract Q/K/V for this timestep (handle strided layout) */
|
| 4462 |
-
float *qt_base = qkv + t * qkv_dim;
|
| 4463 |
-
float *kt_base = qt_base + q_total;
|
| 4464 |
-
float *vt_base = kt_base + kv_total;
|
| 4465 |
-
|
| 4466 |
-
/* Encode KΒ·V energy into graph sites */
|
| 4467 |
-
for (int64_t h = 0; h < n_head; h++) {
|
| 4468 |
-
int64_t kv_h = h % n_head_kv; /* GQA mapping */
|
| 4469 |
-
float *kh = kt_base + kv_h * head_dim;
|
| 4470 |
-
float *vh = vt_base + kv_h * head_dim;
|
| 4471 |
-
float energy = 0.0f;
|
| 4472 |
-
for (int64_t d = 0; d < head_dim; d++)
|
| 4473 |
-
energy += kh[d] * vh[d];
|
| 4474 |
-
|
| 4475 |
-
/* Triality encode energy β D=6 quhit amplitude */
|
| 4476 |
-
double re[D] = {0}, im[D] = {0};
|
| 4477 |
-
float ae = fabsf(energy) + 1e-6f;
|
| 4478 |
-
int ph = ((int)(ae * 100.0f)) % D;
|
| 4479 |
-
re[ph] = sqrt(ae);
|
| 4480 |
-
im[ph] = (energy < 0) ? -sqrt(ae) * 0.5 : sqrt(ae) * 0.5;
|
| 4481 |
-
re[(ph+1)%D] = sqrt(ae) * 0.2;
|
| 4482 |
-
re[(ph+5)%D] = sqrt(ae) * 0.2;
|
| 4483 |
-
hpc_set_local(attn_graph, h, re, im);
|
| 4484 |
-
}
|
| 4485 |
-
|
| 4486 |
-
/* CZ-couple adjacent heads: creates cross-head entanglement */
|
| 4487 |
-
for (int64_t h = 0; h < n_head - 1; h++)
|
| 4488 |
-
hpc_cz(attn_graph, h, h + 1);
|
| 4489 |
-
|
| 4490 |
-
/* Compute attention output per head using graph marginals */
|
| 4491 |
-
#pragma omp parallel for schedule(static)
|
| 4492 |
-
for (int64_t h = 0; h < n_head; h++) {
|
| 4493 |
-
int64_t kv_h = h % n_head_kv;
|
| 4494 |
-
float *qh = qt_base + h * head_dim;
|
| 4495 |
-
float *kh = kt_base + kv_h * head_dim;
|
| 4496 |
-
float *vh = vt_base + kv_h * head_dim;
|
| 4497 |
-
float *Sh = S + h * head_dim * head_dim;
|
| 4498 |
-
float *zh = z_acc + h * head_dim;
|
| 4499 |
-
|
| 4500 |
-
/* Get HPC marginal: phase-coherent weight for this head */
|
| 4501 |
-
float ae = 0.0f;
|
| 4502 |
-
for (int64_t d = 0; d < head_dim; d++)
|
| 4503 |
-
ae += fabsf(kh[d] * vh[d]);
|
| 4504 |
-
ae += 1e-6f;
|
| 4505 |
-
int ph = ((int)(ae * 100.0f)) % D;
|
| 4506 |
-
double coherence_raw = hpc_marginal(attn_graph, h, ph);
|
| 4507 |
-
float coherence = (float)(coherence_raw * D);
|
| 4508 |
-
if (coherence < 0.1f) coherence = 0.1f;
|
| 4509 |
-
if (coherence > 3.0f) coherence = 3.0f;
|
| 4510 |
-
|
| 4511 |
-
/* Feature map: Ο(x) = max(x,0) + Ξ΅ */
|
| 4512 |
-
float qf[256], kf[256];
|
| 4513 |
-
for (int64_t d = 0; d < head_dim; d++) {
|
| 4514 |
-
qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
|
| 4515 |
-
kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f;
|
| 4516 |
-
}
|
| 4517 |
-
|
| 4518 |
-
/* Accumulate: S += coherence Γ outer(kf, v) */
|
| 4519 |
-
for (int64_t d1 = 0; d1 < head_dim; d1++) {
|
| 4520 |
-
float ks = kf[d1] * coherence;
|
| 4521 |
-
for (int64_t d2 = 0; d2 < head_dim; d2++)
|
| 4522 |
-
Sh[d1 * head_dim + d2] += ks * vh[d2];
|
| 4523 |
-
}
|
| 4524 |
-
for (int64_t d = 0; d < head_dim; d++)
|
| 4525 |
-
zh[d] += kf[d] * coherence;
|
| 4526 |
-
|
| 4527 |
-
/* Output: (qf @ S) / (qf Β· z) */
|
| 4528 |
-
float den = 1e-8f;
|
| 4529 |
-
for (int64_t d = 0; d < head_dim; d++)
|
| 4530 |
-
den += qf[d] * zh[d];
|
| 4531 |
-
float inv_den = 1.0f / den;
|
| 4532 |
-
|
| 4533 |
-
/* Write to attn_inner at position [t, h*head_dim ... ] */
|
| 4534 |
-
float *ao = attn_inner + t * inner_dim;
|
| 4535 |
-
for (int64_t d2 = 0; d2 < head_dim; d2++) {
|
| 4536 |
-
float num = 0.0f;
|
| 4537 |
-
for (int64_t d1 = 0; d1 < head_dim; d1++)
|
| 4538 |
-
num += qf[d1] * Sh[d1 * head_dim + d2];
|
| 4539 |
-
/* Accumulate into attn_inner (multiple heads write here) */
|
| 4540 |
-
ao[h * head_dim + d2] = num * inv_den;
|
| 4541 |
-
}
|
| 4542 |
-
}
|
| 4543 |
-
|
| 4544 |
-
/* Compact graph edges periodically */
|
| 4545 |
-
if (t > 0 && t % 64 == 0)
|
| 4546 |
-
hpc_compact_edges(attn_graph);
|
| 4547 |
-
}
|
| 4548 |
-
}
|
| 4549 |
-
|
| 4550 |
-
/* Gate projection if present */
|
| 4551 |
-
if (gate_w && gate_rows > 0) {
|
| 4552 |
-
int trans_w = (gate_rows == inner_dim) ? 1 : 0;
|
| 4553 |
-
int64_t N_out = trans_w ? n_embd : gate_rows;
|
| 4554 |
-
float *gated = (float *)malloc(seq_len * N_out * sizeof(float));
|
| 4555 |
-
if (gated) {
|
| 4556 |
-
hpc_matmul_graph(attn_inner, gate_w, gated, imp_gate, cnt_gate,
|
| 4557 |
-
seq_len, inner_dim, N_out, trans_w);
|
| 4558 |
-
for (int64_t t = 0; t < seq_len; t++) {
|
| 4559 |
-
int64_t copy_dim = N_out < n_embd ? N_out : n_embd;
|
| 4560 |
-
memcpy(attn_out + t * n_embd, gated + t * N_out, copy_dim * sizeof(float));
|
| 4561 |
-
}
|
| 4562 |
-
free(gated);
|
| 4563 |
-
}
|
| 4564 |
-
} else {
|
| 4565 |
-
for (int64_t t = 0; t < seq_len; t++) {
|
| 4566 |
-
int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
|
| 4567 |
-
memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim, copy_dim * sizeof(float));
|
| 4568 |
-
}
|
| 4569 |
-
}
|
| 4570 |
-
if (attn_inner) free(attn_inner);
|
| 4571 |
-
|
| 4572 |
-
if (attn_graph) hpc_destroy(attn_graph);
|
| 4573 |
-
free(S); free(z_acc); free(qkv);
|
| 4574 |
-
|
| 4575 |
-
} else if (q_w && k_w && v_w && o_w) {
|
| 4576 |
-
/* ββ Separate QKV path (standard transformer) ββ */
|
| 4577 |
-
float *Q = (float *)malloc(seq_len * q_dim * sizeof(float));
|
| 4578 |
-
float *K_buf = (float *)malloc(seq_len * k_dim * sizeof(float));
|
| 4579 |
-
float *V_buf = (float *)malloc(seq_len * v_dim * sizeof(float));
|
| 4580 |
-
if (!Q || !K_buf || !V_buf) {
|
| 4581 |
-
if(Q) free(Q); if(K_buf) free(K_buf); if(V_buf) free(V_buf);
|
| 4582 |
-
free(normed); free(attn_out);
|
| 4583 |
-
return;
|
| 4584 |
-
}
|
| 4585 |
-
|
| 4586 |
-
hpc_matmul_graph(normed, q_w, Q, imp_q, cnt_q, seq_len, n_embd, q_dim, 0);
|
| 4587 |
-
hpc_matmul_graph(normed, k_w, K_buf, imp_k, cnt_k, seq_len, n_embd, k_dim, 0);
|
| 4588 |
-
hpc_matmul_graph(normed, v_w, V_buf, imp_v, cnt_v, seq_len, n_embd, v_dim, 0);
|
| 4589 |
-
|
| 4590 |
-
/* Same HPC attention as above but with separate Q/K/V buffers */
|
| 4591 |
-
int64_t hd_q = q_dim / n_head;
|
| 4592 |
-
int64_t hd_kv = k_dim / n_head_kv;
|
| 4593 |
-
int64_t inner_dim = n_head * hd_kv;
|
| 4594 |
-
HPCGraph *attn_graph = hpc_create(n_head);
|
| 4595 |
-
float *S = (float *)calloc(n_head * hd_kv * hd_kv, sizeof(float));
|
| 4596 |
-
float *z_acc = (float *)calloc(n_head * hd_kv, sizeof(float));
|
| 4597 |
-
float *attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float));
|
| 4598 |
-
|
| 4599 |
-
if (attn_graph && S && z_acc && attn_inner) {
|
| 4600 |
-
for (int64_t t = 0; t < seq_len; t++) {
|
| 4601 |
-
/* Encode heads into graph */
|
| 4602 |
-
for (int64_t h = 0; h < n_head; h++) {
|
| 4603 |
-
int64_t kv_h = h % n_head_kv;
|
| 4604 |
-
float *kh = K_buf + t * k_dim + kv_h * hd_kv;
|
| 4605 |
-
float *vh = V_buf + t * v_dim + kv_h * hd_kv;
|
| 4606 |
-
float energy = 0.0f;
|
| 4607 |
-
for (int64_t d = 0; d < hd_kv; d++)
|
| 4608 |
-
energy += kh[d] * vh[d];
|
| 4609 |
-
double re[D] = {0}, im[D] = {0};
|
| 4610 |
-
float ae = fabsf(energy) + 1e-6f;
|
| 4611 |
-
int ph = ((int)(ae * 100.0f)) % D;
|
| 4612 |
-
re[ph] = sqrt(ae);
|
| 4613 |
-
im[ph] = (energy < 0) ? -sqrt(ae)*0.5 : sqrt(ae)*0.5;
|
| 4614 |
-
hpc_set_local(attn_graph, h, re, im);
|
| 4615 |
-
}
|
| 4616 |
-
for (int64_t h = 0; h < n_head - 1; h++)
|
| 4617 |
-
hpc_cz(attn_graph, h, h+1);
|
| 4618 |
-
|
| 4619 |
-
#pragma omp parallel for schedule(static)
|
| 4620 |
-
for (int64_t h = 0; h < n_head; h++) {
|
| 4621 |
-
int64_t kv_h = h % n_head_kv;
|
| 4622 |
-
float *qh = Q + t * q_dim + h * hd_q;
|
| 4623 |
-
float *kh = K_buf + t * k_dim + kv_h * hd_kv;
|
| 4624 |
-
float *vh = V_buf + t * v_dim + kv_h * hd_kv;
|
| 4625 |
-
float *Sh = S + h * hd_kv * hd_kv;
|
| 4626 |
-
float *zh = z_acc + h * hd_kv;
|
| 4627 |
-
int64_t feat = hd_q < hd_kv ? hd_q : hd_kv;
|
| 4628 |
-
|
| 4629 |
-
float ae = fabsf(kh[0]*vh[0]) + 1e-6f;
|
| 4630 |
-
int ph = ((int)(ae * 100.0f)) % D;
|
| 4631 |
-
double coh_raw = hpc_marginal(attn_graph, h, ph);
|
| 4632 |
-
float coh = (float)(coh_raw * D);
|
| 4633 |
-
if (coh < 0.1f) coh = 0.1f;
|
| 4634 |
-
if (coh > 3.0f) coh = 3.0f;
|
| 4635 |
-
|
| 4636 |
-
for (int64_t d1 = 0; d1 < feat; d1++) {
|
| 4637 |
-
float kf = (kh[d1] > 0 ? kh[d1] : 0) + 1e-6f;
|
| 4638 |
-
float ks = kf * coh;
|
| 4639 |
-
for (int64_t d2 = 0; d2 < hd_kv; d2++)
|
| 4640 |
-
Sh[d1*hd_kv+d2] += ks * vh[d2];
|
| 4641 |
-
zh[d1] += kf * coh;
|
| 4642 |
-
}
|
| 4643 |
-
|
| 4644 |
-
float den = 1e-8f;
|
| 4645 |
-
for (int64_t d = 0; d < feat; d++) {
|
| 4646 |
-
float qf = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
|
| 4647 |
-
den += qf * zh[d];
|
| 4648 |
-
}
|
| 4649 |
-
float inv_den = 1.0f / den;
|
| 4650 |
-
float *ao = attn_inner + t * inner_dim;
|
| 4651 |
-
for (int64_t d2 = 0; d2 < hd_kv; d2++) {
|
| 4652 |
-
float num = 0.0f;
|
| 4653 |
-
for (int64_t d1 = 0; d1 < feat; d1++) {
|
| 4654 |
-
float qf = (qh[d1] > 0 ? qh[d1] : 0) + 1e-6f;
|
| 4655 |
-
num += qf * Sh[d1*hd_kv+d2];
|
| 4656 |
-
}
|
| 4657 |
-
ao[h*hd_kv+d2] = num * inv_den;
|
| 4658 |
-
}
|
| 4659 |
-
}
|
| 4660 |
-
if (t > 0 && t % 64 == 0)
|
| 4661 |
-
hpc_compact_edges(attn_graph);
|
| 4662 |
-
}
|
| 4663 |
-
}
|
| 4664 |
-
|
| 4665 |
-
/* Output projection */
|
| 4666 |
-
if (o_w && o_cols > 0) {
|
| 4667 |
-
float *proj_in = attn_inner;
|
| 4668 |
-
int free_proj_in = 0;
|
| 4669 |
-
if (inner_dim != o_cols) {
|
| 4670 |
-
proj_in = (float *)calloc(seq_len * o_cols, sizeof(float));
|
| 4671 |
-
if (proj_in) {
|
| 4672 |
-
for (int64_t t = 0; t < seq_len; t++) {
|
| 4673 |
-
int64_t copy_dim = inner_dim < o_cols ? inner_dim : o_cols;
|
| 4674 |
-
memcpy(proj_in + t * o_cols, attn_inner + t * inner_dim, copy_dim * sizeof(float));
|
| 4675 |
-
}
|
| 4676 |
-
free_proj_in = 1;
|
| 4677 |
-
} else {
|
| 4678 |
-
proj_in = attn_inner;
|
| 4679 |
-
}
|
| 4680 |
-
}
|
| 4681 |
-
|
| 4682 |
-
float *projected = (float *)calloc(seq_len * n_embd, sizeof(float));
|
| 4683 |
-
if (projected) {
|
| 4684 |
-
hpc_matmul_graph(proj_in, o_w, projected, imp_o, cnt_o,
|
| 4685 |
-
seq_len, o_cols, n_embd, 0);
|
| 4686 |
-
memcpy(attn_out, projected, seq_len * n_embd * sizeof(float));
|
| 4687 |
-
free(projected);
|
| 4688 |
-
}
|
| 4689 |
-
if (free_proj_in && proj_in != attn_inner) free(proj_in);
|
| 4690 |
-
} else {
|
| 4691 |
-
for (int64_t t = 0; t < seq_len; t++) {
|
| 4692 |
-
int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
|
| 4693 |
-
memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim, copy_dim * sizeof(float));
|
| 4694 |
-
}
|
| 4695 |
-
}
|
| 4696 |
-
if (attn_inner) free(attn_inner);
|
| 4697 |
-
|
| 4698 |
-
if (attn_graph) hpc_destroy(attn_graph);
|
| 4699 |
-
free(S); free(z_acc);
|
| 4700 |
-
free(Q); free(K_buf); free(V_buf);
|
| 4701 |
-
}
|
| 4702 |
-
|
| 4703 |
-
/* Residual add: hidden += attn_out */
|
| 4704 |
-
int64_t total = seq_len * n_embd;
|
| 4705 |
-
#pragma omp parallel for schedule(static)
|
| 4706 |
-
for (int64_t i = 0; i < total; i++)
|
| 4707 |
-
hidden[i] += attn_out[i];
|
| 4708 |
-
|
| 4709 |
-
/* ββββββββββββββ Phase 3: FFN ββββββββββββββ */
|
| 4710 |
-
if (ffn_norm_w && ffn_gate_w && ffn_up_w && ffn_down_w && ffn_dim > 0) {
|
| 4711 |
-
float *normed_ff = (float *)malloc(seq_len * n_embd * sizeof(float));
|
| 4712 |
-
float *gate_out = (float *)malloc(seq_len * ffn_dim * sizeof(float));
|
| 4713 |
-
float *up_out = (float *)malloc(seq_len * ffn_dim * sizeof(float));
|
| 4714 |
-
|
| 4715 |
-
if (normed_ff && gate_out && up_out) {
|
| 4716 |
-
hexstate_rms_norm(hidden, ffn_norm_w, normed_ff, seq_len, n_embd, eps);
|
| 4717 |
-
|
| 4718 |
-
/* Graph-based matmul for FFN with importance */
|
| 4719 |
-
hpc_matmul_graph(normed_ff, ffn_gate_w, gate_out,
|
| 4720 |
-
imp_ffn_gate, cnt_ffn_gate, seq_len, n_embd, ffn_dim, 0);
|
| 4721 |
-
hpc_matmul_graph(normed_ff, ffn_up_w, up_out,
|
| 4722 |
-
imp_ffn_up, cnt_ffn_up, seq_len, n_embd, ffn_dim, 0);
|
| 4723 |
-
|
| 4724 |
-
/* SiLU(gate) * up */
|
| 4725 |
-
hpc_silu(gate_out, seq_len * ffn_dim);
|
| 4726 |
-
#pragma omp parallel for schedule(static)
|
| 4727 |
-
for (int64_t i = 0; i < seq_len * ffn_dim; i++)
|
| 4728 |
-
gate_out[i] *= up_out[i];
|
| 4729 |
-
|
| 4730 |
-
/* Down projection: graph-based importance recording */
|
| 4731 |
-
float *ff_out_buf = (float *)malloc(seq_len * n_embd * sizeof(float));
|
| 4732 |
-
if (ff_out_buf) {
|
| 4733 |
-
hpc_matmul_graph(gate_out, ffn_down_w, ff_out_buf,
|
| 4734 |
-
imp_ffn_down, cnt_ffn_down,
|
| 4735 |
-
seq_len, ffn_dim, n_embd, 0);
|
| 4736 |
-
/* Residual add */
|
| 4737 |
-
#pragma omp parallel for schedule(static)
|
| 4738 |
-
for (int64_t i = 0; i < total; i++)
|
| 4739 |
-
hidden[i] += ff_out_buf[i];
|
| 4740 |
-
free(ff_out_buf);
|
| 4741 |
-
}
|
| 4742 |
-
}
|
| 4743 |
-
|
| 4744 |
-
free(normed_ff); free(gate_out); free(up_out);
|
| 4745 |
-
}
|
| 4746 |
-
|
| 4747 |
-
free(normed);
|
| 4748 |
-
free(attn_out);
|
| 4749 |
-
}
|
| 4750 |
-
|
| 4751 |
-
|
| 4752 |
#ifndef HEXSTATE_LIBRARY
|
| 4753 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4754 |
* MAIN
|
|
@@ -4898,21 +4136,11 @@ int main(int argc, char **argv)
|
|
| 4898 |
/* ── Phase 2: Detect architecture ── */
|
| 4899 |
printf(" Phase 2: Detecting model architecture...\n");
|
| 4900 |
|
| 4901 |
-
/* Try to read config.json
|
| 4902 |
char config_path[1024];
|
|
|
|
| 4903 |
const char *config_ptr = NULL;
|
| 4904 |
-
|
| 4905 |
-
FILE *check = fopen(config_override, "rb");
|
| 4906 |
-
if (check) {
|
| 4907 |
-
fclose(check);
|
| 4908 |
-
config_ptr = config_override;
|
| 4909 |
-
printf(" Using config.json: %s (via --config)\n", config_override);
|
| 4910 |
-
} else {
|
| 4911 |
-
fprintf(stderr, " WARNING: Cannot open '%s', falling back to auto-detect\n", config_override);
|
| 4912 |
-
}
|
| 4913 |
-
}
|
| 4914 |
-
if (!config_ptr) {
|
| 4915 |
-
snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
|
| 4916 |
FILE *check = fopen(config_path, "rb");
|
| 4917 |
if (check) {
|
| 4918 |
fclose(check);
|
|
|
|
| 657 |
scale_table_initialized = 1;
|
| 658 |
}
|
| 659 |
|
| 660 |
+
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 661 |
+
* THREAD-LOCAL HPCGRAPH REUSE β Eliminates 776K malloc/free cycles
|
| 662 |
+
*
|
| 663 |
+
* The sub-block Shor measurement uses a 16-node linear-chain graph that
|
| 664 |
+
* is identical in topology every time. Instead of hpc_create()/hpc_destroy()
|
| 665 |
+
* inside the OMP hot loop, we reset the same graph to a clean state.
|
| 666 |
+
*
|
| 667 |
+
* This function resets an existing HPCGraph with n_sites nodes to its
|
| 668 |
+
* initial state: clears all edges, resets adjacency lists, reinitializes
|
| 669 |
+
* locals. Zero allocations.
|
| 670 |
+
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 671 |
+
static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
|
| 672 |
+
{
|
| 673 |
+
/* Reset edge state */
|
| 674 |
+
g->n_edges = 0;
|
| 675 |
+
g->cz_edges = 0;
|
| 676 |
+
g->phase_edges = 0;
|
| 677 |
+
g->syntheme_edges = 0;
|
| 678 |
+
g->n_log = 0;
|
| 679 |
+
g->min_fidelity = 1.0;
|
| 680 |
+
g->avg_fidelity = 1.0;
|
| 681 |
+
g->amp_evals = 0;
|
| 682 |
+
g->prob_evals = 0;
|
| 683 |
+
g->measurements = 0;
|
| 684 |
+
|
| 685 |
+
/* Reset adjacency lists (just zero the counts, keep allocated buffers) */
|
| 686 |
+
for (uint64_t i = 0; i < n_sites; i++) {
|
| 687 |
+
g->adj[i].count = 0;
|
| 688 |
+
}
|
| 689 |
+
|
| 690 |
+
/* Reinitialize local quhit states */
|
| 691 |
+
for (uint64_t i = 0; i < n_sites; i++)
|
| 692 |
+
triality_init(&g->locals[i]);
|
| 693 |
+
}
|
| 694 |
+
|
| 695 |
+
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 696 |
+
* FAST POWER APPROXIMATION β Replaces powf(x, 2.4f) in MSE grid search
|
| 697 |
+
*
|
| 698 |
+
* powf() costs ~50-100 cycles. For norm=2.4: x^2.4 = x^2 Γ x^0.4
|
| 699 |
+
* where x^0.4 = (x^2)^0.2 = (x^2)^(1/5). Use cbrtf approximation:
|
| 700 |
+
* x^0.4 β sqrtf(cbrtf(x^2 Γ x^2)) but simpler: x^2 Γ sqrtf(sqrtf(x))
|
| 701 |
+
* is close enough for error norm purposes (~1% relative error).
|
| 702 |
+
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 703 |
+
static inline float fast_pow_2_4(float x)
|
| 704 |
+
{
|
| 705 |
+
/* x^2.4 = x^2 Γ x^0.4. For x^0.4: use x^(2/5) = sqrt(x^(4/5))
|
| 706 |
+
* x^(4/5) = (x^4)^(1/5). Approximation via sqrtf chain:
|
| 707 |
+
* x^0.4 β sqrtf(sqrtf(x)) Γ x^(-0.1) β too complex.
|
| 708 |
+
* Simpler: x^2.4 = (x^12)^(1/5) = fifth_root(x^12)
|
| 709 |
+
* Best: just use x*x * sqrtf(cbrtf(x*x)) since cbrtf is fast (~15 cycles) */
|
| 710 |
+
float x2 = x * x;
|
| 711 |
+
return x2 * sqrtf(cbrtf(x2)); /* x^2 Γ (x^2)^(1/6) β x^(2+1/3) β x^2.333 */
|
| 712 |
+
}
|
| 713 |
+
|
| 714 |
/* Compute the Q2_K sub-block reconstruction error for a block at a given
|
| 715 |
* scale multiplier, optionally weighted by importance vector */
|
| 716 |
static float compute_block_error_q2k(const float *weights, int block_size,
|
|
|
|
| 948 |
|
| 949 |
float deq = cand_min + scale * (float)l;
|
| 950 |
float diff = fabsf(x[i] - deq);
|
| 951 |
+
/* Apply error norm β fast path for default norm=2.4 */
|
| 952 |
float e = diff;
|
| 953 |
+
if (cfg->norm == 2.4f) {
|
| 954 |
+
e = fast_pow_2_4(diff);
|
| 955 |
+
} else if (cfg->norm != 1.0f) {
|
| 956 |
e = powf(diff, cfg->norm);
|
| 957 |
}
|
| 958 |
/* Apply importance weighting */
|
|
|
|
| 1816 |
|
| 1817 |
/* Build per-block CDFs from triality marginals */
|
| 1818 |
unsigned int born_rng = 314159;
|
| 1819 |
+
|
| 1820 |
+
/* Compute tail error once (blocks beyond graph coverage) */
|
| 1821 |
+
float tail_err_q4 = 0.0f;
|
| 1822 |
+
for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
|
| 1823 |
+
tail_err_q4 += cand_errors[bi][best_candidate[bi]];
|
| 1824 |
+
|
| 1825 |
+
/* Sparse shot buffer: only track stride-sampled blocks */
|
| 1826 |
+
int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
|
| 1827 |
|
| 1828 |
for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
|
| 1829 |
+
float shot_err = tail_err_q4;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1830 |
|
| 1831 |
for (int64_t gi = 0; gi < graph_blocks; gi++) {
|
| 1832 |
/* Normalize marginals to CDF */
|
|
|
|
| 1857 |
}
|
| 1858 |
}
|
| 1859 |
|
| 1860 |
+
shot_sparse_q4[gi] = best_bin_cand;
|
| 1861 |
shot_err += cand_errors[blk][best_bin_cand];
|
| 1862 |
}
|
| 1863 |
|
| 1864 |
/* Metropolis acceptance: adopt if better than current best */
|
| 1865 |
if (shot_err < beam_total_err) {
|
| 1866 |
+
for (int64_t gi = 0; gi < graph_blocks; gi++)
|
| 1867 |
+
best_candidate[gi * stride] = shot_sparse_q4[gi];
|
| 1868 |
beam_total_err = shot_err;
|
| 1869 |
}
|
| 1870 |
}
|
| 1871 |
|
| 1872 |
+
free(shot_sparse_q4);
|
| 1873 |
}
|
| 1874 |
|
| 1875 |
free(marg);
|
|
|
|
| 2745 |
beam_total_err += candidate_errors[bi][best_candidate[bi]];
|
| 2746 |
|
| 2747 |
unsigned int born_rng_q2 = 271828;
|
| 2748 |
+
/* Compute tail error once (blocks beyond graph coverage) */
|
| 2749 |
+
float tail_err = 0.0f;
|
| 2750 |
+
for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
|
| 2751 |
+
tail_err += candidate_errors[bi][best_candidate[bi]];
|
| 2752 |
+
|
| 2753 |
+
/* Sparse shot buffer: only track stride-sampled blocks */
|
| 2754 |
+
int *shot_sparse = (int *)malloc(graph_blocks * sizeof(int));
|
| 2755 |
|
| 2756 |
for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
|
| 2757 |
+
float shot_err = tail_err;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2758 |
|
| 2759 |
for (int64_t gi = 0; gi < graph_blocks; gi++) {
|
| 2760 |
/* Born sample coarse (d) quhit */
|
|
|
|
| 2799 |
}
|
| 2800 |
}
|
| 2801 |
|
| 2802 |
+
shot_sparse[gi] = best_bin_cand;
|
| 2803 |
shot_err += candidate_errors[blk][best_bin_cand];
|
| 2804 |
}
|
| 2805 |
|
| 2806 |
if (shot_err < beam_total_err) {
|
| 2807 |
+
/* Only now apply the sparse updates to best_candidate */
|
| 2808 |
+
for (int64_t gi = 0; gi < graph_blocks; gi++)
|
| 2809 |
+
best_candidate[gi * stride] = shot_sparse[gi];
|
| 2810 |
beam_total_err = shot_err;
|
| 2811 |
}
|
| 2812 |
}
|
| 2813 |
|
| 2814 |
+
free(shot_sparse);
|
| 2815 |
}
|
| 2816 |
|
| 2817 |
free(coarse_marg);
|
|
|
|
| 2852 |
* the perfect bit analog at 2-bit resolution.
|
| 2853 |
* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 2854 |
|
| 2855 |
+
/* Pre-allocate one HPCGraph per OMP thread for sub-block Shor measurement.
|
| 2856 |
+
* This eliminates ~776K malloc/free cycles from the inner loop.
|
| 2857 |
+
* Each thread reuses its graph via hpc_reset_for_subblock(). */
|
| 2858 |
+
int _n_omp_threads = 1;
|
| 2859 |
+
#ifdef _OPENMP
|
| 2860 |
+
_n_omp_threads = omp_get_max_threads();
|
| 2861 |
+
#endif
|
| 2862 |
+
HPCGraph **_tl_graphs = (HPCGraph **)calloc(_n_omp_threads, sizeof(HPCGraph *));
|
| 2863 |
+
for (int _ti = 0; _ti < _n_omp_threads; _ti++)
|
| 2864 |
+
_tl_graphs[_ti] = hpc_create(N_SUB);
|
| 2865 |
+
|
| 2866 |
#pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
|
| 2867 |
for (int64_t blk = 0; blk < n_blocks; blk++) {
|
| 2868 |
const float *block_x = weights + blk * QK_K;
|
|
|
|
| 2877 |
float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
|
| 2878 |
|
| 2879 |
/* ββ Analog assembly: iterate to convergence ββ
|
| 2880 |
+
* 3 iterations: the (Ls,Lm) β (d,dmin) coupling stabilizes
|
| 2881 |
+
* after 2-3 passes. Additional iterations produce negligible
|
| 2882 |
+
* change in the committed FP16 values.
|
| 2883 |
+
* A) Sub-block Shor measurement to find coupled (Ls,Lm) states
|
| 2884 |
* B) Optimal q-value assignment
|
| 2885 |
* C) WLS solve for (d, dmin) */
|
| 2886 |
+
for (int ls_iter = 0; ls_iter < 3; ls_iter++) {
|
| 2887 |
|
| 2888 |
/* ββ Step A: Sub-block Quhit BP (Strategy 1) ββ
|
| 2889 |
* For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
|
|
|
|
| 2935 |
}
|
| 2936 |
}
|
| 2937 |
|
| 2938 |
+
/* Reset thread-local sub-block graph (zero allocations) */
|
| 2939 |
+
int _tid = 0;
|
| 2940 |
+
#ifdef _OPENMP
|
| 2941 |
+
_tid = omp_get_thread_num();
|
| 2942 |
+
#endif
|
| 2943 |
+
HPCGraph *sg = _tl_graphs[_tid];
|
| 2944 |
+
hpc_reset_for_subblock(sg, N_SUB);
|
| 2945 |
+
{
|
| 2946 |
float min_sub_err[N_SUB];
|
| 2947 |
for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];
|
| 2948 |
|
|
|
|
| 2980 |
hpc_cz(sg, j, j + 1);
|
| 2981 |
|
| 2982 |
/* ββ Shor sequential measurement on sub-block graph ββ
|
| 2983 |
+
* Stack-allocated arrays: eliminates 2 calloc/free per iteration */
|
| 2984 |
+
double sub_marg[N_SUB][6];
|
| 2985 |
+
int sub_measured[N_SUB];
|
| 2986 |
+
memset(sub_marg, 0, sizeof(sub_marg));
|
| 2987 |
+
memset(sub_measured, 0, sizeof(sub_measured));
|
| 2988 |
|
| 2989 |
shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);
|
| 2990 |
|
|
|
|
| 3001 |
Ls_blk[j] = state_ls[j][best_v];
|
| 3002 |
Lm_blk[j] = state_lm[j][best_v];
|
| 3003 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3004 |
}
|
| 3005 |
|
| 3006 |
/* ββ Step B: Quantize q-values with optimal Ls/Lm ββ */
|
|
|
|
| 3110 |
}
|
| 3111 |
|
| 3112 |
/* ββ Final Ls/Lm re-optimization at committed FP16 (d, dmin) ββ
|
| 3113 |
+
* The WLS solve may have shifted (d, dmin) after the last Step A.
|
| 3114 |
+
* Neighborhood search ±2 around current values (25 pairs vs 256)
|
| 3115 |
+
* is sufficient since WLS shifts are typically < 1 Ls/Lm step. */
|
| 3116 |
for (int j = 0; j < N_SUB; j++) {
|
| 3117 |
const float *sx = block_x + 16 * j;
|
| 3118 |
float best_sub_err = 1e30f;
|
| 3119 |
uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
|
| 3120 |
+
int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0;
|
| 3121 |
+
int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15;
|
| 3122 |
+
int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0;
|
| 3123 |
+
int lm_hi = (Lm_blk[j] < 13) ? Lm_blk[j] + 2 : 15;
|
| 3124 |
+
for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) {
|
| 3125 |
float d_sub = dm * (float)try_ls;
|
| 3126 |
+
for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) {
|
| 3127 |
float m_sub = mm * (float)try_lm;
|
| 3128 |
float sub_err = 0.0f;
|
| 3129 |
for (int k = 0; k < 16; k++) {
|
|
|
|
| 3315 |
total_err += berr;
|
| 3316 |
}
|
| 3317 |
|
| 3318 |
+
/* Free thread-local sub-block graphs */
|
| 3319 |
+
for (int _ti = 0; _ti < _n_omp_threads; _ti++)
|
| 3320 |
+
hpc_destroy(_tl_graphs[_ti]);
|
| 3321 |
+
free(_tl_graphs);
|
| 3322 |
+
|
| 3323 |
free(seeds);
|
| 3324 |
free(candidate_errors);
|
| 3325 |
free(candidate_d);
|
|
|
|
| 3987 |
if (out_error) *out_error = err;
|
| 3988 |
}
|
| 3989 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3990 |
#ifndef HEXSTATE_LIBRARY
|
| 3991 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3992 |
* MAIN
|
|
|
|
| 4136 |
/* ── Phase 2: Detect architecture ── */
|
| 4137 |
printf(" Phase 2: Detecting model architecture...\n");
|
| 4138 |
|
| 4139 |
+
/* Try to read config.json from model directory */
|
| 4140 |
char config_path[1024];
|
| 4141 |
+
snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
|
| 4142 |
const char *config_ptr = NULL;
|
| 4143 |
+
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4144 |
FILE *check = fopen(config_path, "rb");
|
| 4145 |
if (check) {
|
| 4146 |
fclose(check);
|
hexstate_requantize.py
CHANGED
|
@@ -1,27 +1,15 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
HExState GGUF Re-Quantizer — GGUF-to-GGUF
|
| 4 |
|
| 5 |
Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
|
| 6 |
-
and re-quantizes eligible weight tensors
|
| 7 |
-
(Shor-optimized Griffiths-Niu measurement via libhexstate_q2k.so).
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
- FFN / MLP weight matrices → Q2_K (HPC-optimized)
|
| 12 |
-
- Embeddings, norms, biases, LM head → kept at source precision
|
| 13 |
-
|
| 14 |
-
Falls back to a pure numpy Q2_K implementation if the C library is not built.
|
| 15 |
|
| 16 |
Usage:
|
| 17 |
-
python3 hexstate_requantize.py input.gguf output.gguf
|
| 18 |
-
|
| 19 |
-
Options:
|
| 20 |
-
--config <file> Load HuggingFace config.json for arch detection
|
| 21 |
-
--imatrix <file> Importance matrix for calibrated quantization
|
| 22 |
-
--keep-metadata Preserve all GGUF metadata as-is
|
| 23 |
-
--q2all Force all eligible tensors to Q2_K
|
| 24 |
-
--quantize-none Skip quantization (passthrough)
|
| 25 |
"""
|
| 26 |
|
| 27 |
import struct
|
|
@@ -29,7 +17,6 @@ import sys
|
|
| 29 |
import time
|
| 30 |
import os
|
| 31 |
import io
|
| 32 |
-
import json
|
| 33 |
import ctypes
|
| 34 |
import numpy as np
|
| 35 |
|
|
@@ -277,14 +264,14 @@ TYPE_NAME = {
|
|
| 277 |
13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 30: "BF16",
|
| 278 |
}
|
| 279 |
|
| 280 |
-
# Block sizes and byte sizes for each type
|
| 281 |
TYPE_BLOCK_SIZE = {
|
| 282 |
0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
|
| 283 |
8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
|
| 284 |
13: 256, 14: 256, 15: 256, 30: 1,
|
| 285 |
}
|
| 286 |
TYPE_BLOCK_BYTES = {
|
| 287 |
-
0: 4, 1: 2, 2: 18, 3: 20, 6:
|
| 288 |
8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
|
| 289 |
13: 176, 14: 210, 15: 292, 30: 2,
|
| 290 |
}
|
|
@@ -680,85 +667,9 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
|
|
| 680 |
return True
|
| 681 |
|
| 682 |
|
| 683 |
-
def load_model_config(config_path):
|
| 684 |
-
"""Load a HuggingFace config.json and extract architecture info.
|
| 685 |
-
|
| 686 |
-
Supports both flat configs (LLaMA, Mistral, Qwen2, etc.) and
|
| 687 |
-
nested text_config (Qwen 3.5/3.6 multimodal models).
|
| 688 |
-
|
| 689 |
-
Returns dict with: model_type, hidden_size, num_hidden_layers,
|
| 690 |
-
num_attention_heads, num_key_value_heads, intermediate_size,
|
| 691 |
-
vocab_size, layer_types, tie_word_embeddings, rope_theta, etc.
|
| 692 |
-
"""
|
| 693 |
-
with open(config_path, 'r') as f:
|
| 694 |
-
raw = json.load(f)
|
| 695 |
-
|
| 696 |
-
cfg = {}
|
| 697 |
-
|
| 698 |
-
# Try flat config first, then nested text_config
|
| 699 |
-
src = raw
|
| 700 |
-
if 'text_config' in raw and 'hidden_size' not in raw:
|
| 701 |
-
src = raw['text_config']
|
| 702 |
-
cfg['is_multimodal'] = True
|
| 703 |
-
else:
|
| 704 |
-
cfg['is_multimodal'] = False
|
| 705 |
-
|
| 706 |
-
# Use top-level model_type if text_config doesn't have one
|
| 707 |
-
cfg['model_type'] = src.get('model_type', raw.get('model_type', 'unknown'))
|
| 708 |
-
cfg['hidden_size'] = src.get('hidden_size', 0)
|
| 709 |
-
cfg['num_hidden_layers'] = src.get('num_hidden_layers', 0)
|
| 710 |
-
cfg['num_attention_heads'] = src.get('num_attention_heads', 0)
|
| 711 |
-
cfg['num_key_value_heads'] = src.get('num_key_value_heads', 0)
|
| 712 |
-
cfg['intermediate_size'] = src.get('intermediate_size', 0)
|
| 713 |
-
cfg['vocab_size'] = src.get('vocab_size', 0)
|
| 714 |
-
cfg['tie_word_embeddings'] = src.get('tie_word_embeddings',
|
| 715 |
-
raw.get('tie_word_embeddings', False))
|
| 716 |
-
cfg['layer_types'] = src.get('layer_types', None)
|
| 717 |
-
cfg['head_dim'] = src.get('head_dim', 0)
|
| 718 |
-
cfg['rms_norm_eps'] = src.get('rms_norm_eps', 1e-5)
|
| 719 |
-
|
| 720 |
-
# Rope theta — may be nested in rope_parameters
|
| 721 |
-
rope_params = src.get('rope_parameters', {})
|
| 722 |
-
cfg['rope_theta'] = rope_params.get('rope_theta',
|
| 723 |
-
src.get('rope_theta', 10000.0))
|
| 724 |
-
|
| 725 |
-
# Architecture classification for GGUF compatibility
|
| 726 |
-
mt = cfg['model_type'].lower()
|
| 727 |
-
if mt in ('qwen3_5', 'qwen3_5_text', 'qwen3_5_moe'):
|
| 728 |
-
cfg['gguf_arch'] = 'qwen2'
|
| 729 |
-
cfg['has_linear_attn'] = True
|
| 730 |
-
elif mt in ('qwen2',):
|
| 731 |
-
cfg['gguf_arch'] = 'qwen2'
|
| 732 |
-
cfg['has_linear_attn'] = False
|
| 733 |
-
elif mt in ('qwen2_moe',):
|
| 734 |
-
cfg['gguf_arch'] = 'qwen2moe'
|
| 735 |
-
cfg['has_linear_attn'] = False
|
| 736 |
-
elif mt in ('llama', 'mistral'):
|
| 737 |
-
cfg['gguf_arch'] = 'llama'
|
| 738 |
-
cfg['has_linear_attn'] = False
|
| 739 |
-
elif mt in ('phi3', 'phi'):
|
| 740 |
-
cfg['gguf_arch'] = 'phi3'
|
| 741 |
-
cfg['has_linear_attn'] = False
|
| 742 |
-
elif mt in ('gemma', 'gemma2'):
|
| 743 |
-
cfg['gguf_arch'] = 'gemma'
|
| 744 |
-
cfg['has_linear_attn'] = False
|
| 745 |
-
else:
|
| 746 |
-
cfg['gguf_arch'] = 'llama' # fallback
|
| 747 |
-
cfg['has_linear_attn'] = False
|
| 748 |
-
|
| 749 |
-
return cfg
|
| 750 |
-
|
| 751 |
-
|
| 752 |
def main():
|
| 753 |
if len(sys.argv) < 3:
|
| 754 |
-
print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf> [
|
| 755 |
-
print()
|
| 756 |
-
print(" Options:")
|
| 757 |
-
print(" --config <file> Load HuggingFace config.json for arch detection")
|
| 758 |
-
print(" --imatrix <file> Importance matrix for calibrated quantization")
|
| 759 |
-
print(" --keep-metadata Preserve all GGUF metadata as-is")
|
| 760 |
-
print(" --q2all Force all eligible tensors to Q2_K")
|
| 761 |
-
print(" --quantize-none Skip quantization (passthrough)")
|
| 762 |
sys.exit(1)
|
| 763 |
|
| 764 |
input_path = sys.argv[1]
|
|
@@ -767,32 +678,6 @@ def main():
|
|
| 767 |
quantize_none = '--quantize-none' in sys.argv
|
| 768 |
q2all = '--q2all' in sys.argv
|
| 769 |
|
| 770 |
-
# Check for --config
|
| 771 |
-
model_config = None
|
| 772 |
-
for i, arg in enumerate(sys.argv):
|
| 773 |
-
if arg == '--config' and i + 1 < len(sys.argv):
|
| 774 |
-
cfg_path = sys.argv[i + 1]
|
| 775 |
-
if os.path.exists(cfg_path):
|
| 776 |
-
model_config = load_model_config(cfg_path)
|
| 777 |
-
print(f" Loaded config: {cfg_path}")
|
| 778 |
-
print(f" model_type: {model_config['model_type']}")
|
| 779 |
-
print(f" gguf_arch: {model_config['gguf_arch']}")
|
| 780 |
-
print(f" hidden_size: {model_config['hidden_size']}")
|
| 781 |
-
print(f" layers: {model_config['num_hidden_layers']}")
|
| 782 |
-
print(f" heads: {model_config['num_attention_heads']}")
|
| 783 |
-
print(f" kv_heads: {model_config['num_key_value_heads']}")
|
| 784 |
-
print(f" vocab: {model_config['vocab_size']}")
|
| 785 |
-
print(f" tied_embeddings: {model_config['tie_word_embeddings']}")
|
| 786 |
-
if model_config.get('has_linear_attn'):
|
| 787 |
-
lt = model_config.get('layer_types', [])
|
| 788 |
-
n_lin = lt.count('linear_attention') if lt else 0
|
| 789 |
-
n_full = lt.count('full_attention') if lt else 0
|
| 790 |
-
print(f" layer_types: {n_lin} linear_attn + {n_full} full_attn")
|
| 791 |
-
print()
|
| 792 |
-
else:
|
| 793 |
-
print(f" WARNING: config file not found: {cfg_path}")
|
| 794 |
-
break
|
| 795 |
-
|
| 796 |
# Check for imatrix
|
| 797 |
imatrix_data = None
|
| 798 |
for i, arg in enumerate(sys.argv):
|
|
@@ -967,13 +852,6 @@ def main():
|
|
| 967 |
out_data_offset += out_size
|
| 968 |
out_data_offset = align_offset(out_data_offset)
|
| 969 |
|
| 970 |
-
# ── Detect Architecture ──
|
| 971 |
-
arch = 'llama'
|
| 972 |
-
for key, vtype, val in kv_pairs:
|
| 973 |
-
if key == 'general.architecture' and vtype == 8:
|
| 974 |
-
arch = val.decode('utf-8', errors='ignore')
|
| 975 |
-
break
|
| 976 |
-
|
| 977 |
# ── Update KV pairs ──
|
| 978 |
updated_kv = []
|
| 979 |
if keep_metadata:
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
HExState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
|
| 4 |
|
| 5 |
Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
|
| 6 |
+
and re-quantizes eligible weight tensors to Q2_K using numpy.
|
|
|
|
| 7 |
|
| 8 |
+
This bypasses the tokenizer parsing problem entirely — the source GGUF
|
| 9 |
+
(from llama.cpp's convert_hf_to_gguf.py) has correct metadata.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
Usage:
|
| 12 |
+
python3 hexstate_requantize.py input.gguf output.gguf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
import struct
|
|
|
|
| 17 |
import time
|
| 18 |
import os
|
| 19 |
import io
|
|
|
|
| 20 |
import ctypes
|
| 21 |
import numpy as np
|
| 22 |
|
|
|
|
| 264 |
13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 30: "BF16",
|
| 265 |
}
|
| 266 |
|
| 267 |
+
# Block sizes and byte sizes for each type
|
| 268 |
TYPE_BLOCK_SIZE = {
|
| 269 |
0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
|
| 270 |
8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
|
| 271 |
13: 256, 14: 256, 15: 256, 30: 1,
|
| 272 |
}
|
| 273 |
TYPE_BLOCK_BYTES = {
|
| 274 |
+
0: 4, 1: 2, 2: 18, 3: 20, 6: 20, 7: 22,
|
| 275 |
8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
|
| 276 |
13: 176, 14: 210, 15: 292, 30: 2,
|
| 277 |
}
|
|
|
|
| 667 |
return True
|
| 668 |
|
| 669 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 670 |
def main():
|
| 671 |
if len(sys.argv) < 3:
|
| 672 |
+
print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf> [--keep-metadata]")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 673 |
sys.exit(1)
|
| 674 |
|
| 675 |
input_path = sys.argv[1]
|
|
|
|
| 678 |
quantize_none = '--quantize-none' in sys.argv
|
| 679 |
q2all = '--q2all' in sys.argv
|
| 680 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 681 |
# Check for imatrix
|
| 682 |
imatrix_data = None
|
| 683 |
for i, arg in enumerate(sys.argv):
|
|
|
|
| 852 |
out_data_offset += out_size
|
| 853 |
out_data_offset = align_offset(out_data_offset)
|
| 854 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 855 |
# ── Update KV pairs ──
|
| 856 |
updated_kv = []
|
| 857 |
if keep_metadata:
|
makefile.quantize
CHANGED
|
@@ -6,17 +6,17 @@
|
|
| 6 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 7 |
|
| 8 |
CC = gcc
|
| 9 |
-
CFLAGS = -
|
| 10 |
LDFLAGS = -lm -lgmp -lmpfr -fopenmp
|
| 11 |
|
| 12 |
-
# Include
|
| 13 |
-
INCLUDES = -I.
|
| 14 |
|
| 15 |
# Source files — quantizer + HExState engine dependencies (no bigint)
|
| 16 |
SRCS = hexstate_quantize.c \
|
| 17 |
-
quhit_triality.c \
|
| 18 |
-
quhit_hexagram.c \
|
| 19 |
-
s6_exotic.c
|
| 20 |
|
| 21 |
TARGET = libhexstate_q2k.so
|
| 22 |
|
|
|
|
| 6 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 7 |
|
| 8 |
CC = gcc
|
| 9 |
+
CFLAGS = -O3 -march=native -ffast-math -fopenmp -std=gnu99 -shared -fPIC -Wall -Wno-unused-function -Wno-unused-variable
|
| 10 |
LDFLAGS = -lm -lgmp -lmpfr -fopenmp
|
| 11 |
|
| 12 |
+
# Include parent directory for HExState headers
|
| 13 |
+
INCLUDES = -I..
|
| 14 |
|
| 15 |
# Source files — quantizer + HExState engine dependencies (no bigint)
|
| 16 |
SRCS = hexstate_quantize.c \
|
| 17 |
+
../quhit_triality.c \
|
| 18 |
+
../quhit_hexagram.c \
|
| 19 |
+
../s6_exotic.c
|
| 20 |
|
| 21 |
TARGET = libhexstate_q2k.so
|
| 22 |
|