CompressedGemma
/

HPC-Quantize

Model card Files and versions

xet

Community

CompressedGemma committed on May 7

Commit

5c1c396

verified ·

1 Parent(s): ae8c38d

Experimental support for other LLMs

Browse files

Files changed (1) hide show

hexstate_quantize.c +111 -11

hexstate_quantize.c CHANGED Viewed

@@ -1,5 +1,5 @@
 /* ═══════════════════════════════════════════════════════════════════════════
- * hexstate_quantize.c — HExState GGUF Quantizer
  *
  * ╔═══════════════════════════════════════════════════════════════╗
  * ║  HPC-Optimized GGUF Quantization Engine                      ║
@@ -204,6 +204,46 @@ static ConfigJson parse_config_json(const char *path)
     p = tok_find_key(json, "tie_word_embeddings");
     if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
     free(json);
     return cfg;
 }
@@ -235,6 +275,10 @@ static void detect_architecture(const STMultiFile *mf, ModelArchitecture *arch,
             strcpy(arch->architecture, "qwen2");
         } else if (strcmp(cfg.model_type, "qwen2_moe") == 0) {
             strcpy(arch->architecture, "qwen2moe");
         } else if (strcmp(cfg.model_type, "phi3") == 0 ||
                    strcmp(cfg.model_type, "phi") == 0) {
             strcpy(arch->architecture, "phi3");
@@ -373,6 +417,11 @@ static int should_skip_tensor(const char *hf_name)
     if (strstr(hf_name, "rotary_emb.inv_freq") != NULL) return 1;
     if (strstr(hf_name, "rotary_emb.cos_cached") != NULL) return 1;
     if (strstr(hf_name, "rotary_emb.sin_cached") != NULL) return 1;
     return 0;
 }
@@ -384,12 +433,14 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
     /* Top-level mappings (common to all architectures) */
     struct { const char *from; const char *to; } mappings[] = {
-        {"model.embed_tokens.weight",       "token_embd.weight"},
-        {"model.norm.weight",               "output_norm.weight"},
-        {"model.final_norm.weight",         "output_norm.weight"},  /* Gemma */
-        {"lm_head.weight",                  "output.weight"},
-        {"model.embed_tokens.bias",         "token_embd.bias"},
-        {"model.norm.bias",                 "output_norm.bias"},
         {NULL, NULL}
     };
@@ -451,6 +502,19 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
                 /* Gemma pre/post feedforward norm */
                 {"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
                 {"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
                 {NULL, NULL}
             };
@@ -516,7 +580,12 @@ static int should_quantize(const STTensorInfo *ti, const char *gguf_name)
     /* Never quantize MoE gate routing weights */
     if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;
-    /* Quantize everything else (attention projections, FFN weights) */
     return 1;
 }
@@ -532,6 +601,10 @@ static int is_attention_tensor(const char *gguf_name)
     if (strstr(gguf_name, "attn_v.weight") != NULL) return 1;
     if (strstr(gguf_name, "attn_output.weight") != NULL) return 1;
     if (strstr(gguf_name, "attn_qkv.weight") != NULL) return 1;
     /* HuggingFace style (fallthrough names) */
     if (strstr(gguf_name, "self_attn.q_proj.weight") != NULL) return 1;
     if (strstr(gguf_name, "self_attn.k_proj.weight") != NULL) return 1;
@@ -3869,6 +3942,8 @@ int main(int argc, char **argv)
         printf("  Options:\n");
         printf("    --optimizer hpc|mse|hybrid   Scale optimization (default: hybrid)\n");
         printf("    --imatrix <file>             Importance matrix for Q2_K quality\n");
         printf("    --verbose                    Per-block diagnostics\n\n");
         return 1;
     }
@@ -3877,7 +3952,9 @@ int main(int argc, char **argv)
     const char *output_path = argv[2];
     OptimizerMode opt_mode = OPT_HYBRID;
     const char *imatrix_path = NULL;
     int verbose = 0;
     /* Parse options */
     for (int i = 3; i < argc; i++) {
@@ -3892,6 +3969,10 @@ int main(int argc, char **argv)
             }
         } else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) {
             imatrix_path = argv[++i];
         } else if (strcmp(argv[i], "--verbose") == 0) {
             verbose = 1;
         } else {
@@ -3906,6 +3987,8 @@ int main(int argc, char **argv)
     printf("  Quant type: Q2_K (2.625 bpw)\n");
     printf("  Optimizer:  %s\n", opt_names[opt_mode]);
     if (imatrix_path) printf("  iMatrix:    %s\n", imatrix_path);
     printf("\n");
     /* ── Phase 1: Load model ── */
@@ -3973,11 +4056,21 @@ int main(int argc, char **argv)
     /* ── Phase 2: Detect architecture ── */
     printf("  Phase 2: Detecting model architecture...\n");
-    /* Try to read config.json from model directory */
     char config_path[1024];
-    snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
     const char *config_ptr = NULL;
-    {
         FILE *check = fopen(config_path, "rb");
         if (check) {
             fclose(check);
@@ -3989,6 +4082,13 @@ int main(int argc, char **argv)
     ModelArchitecture arch;
     detect_architecture(mf, &arch, config_ptr);
     printf("  ╔═══════════════════════════════════════════════════════════════╗\n");
     printf("  ║  Model Architecture                                         ║\n");
     printf("  ╠═══════════════════════════════════════════════════════════════╣\n");

 /* ═══════════════════════════════════════════════════════════════════════════
+ * hexstate_quantize.c — HexState GGUF Quantizer
  *
  * ╔═══════════════════════════════════════════════════════════════╗
  * ║  HPC-Optimized GGUF Quantization Engine                      ║
     p = tok_find_key(json, "tie_word_embeddings");
     if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
+    /* ── Qwen 3.5/3.6: parameters are nested inside "text_config" ── */
+    if (cfg.hidden_size == 0) {
+        const char *tc = strstr(json, "\"text_config\"");
+        if (tc) {
+            const char *tc_brace = strchr(tc, '{');
+            if (tc_brace) {
+                p = tok_find_key(tc_brace, "hidden_size");
+                if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10);
+                p = tok_find_key(tc_brace, "intermediate_size");
+                if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10);
+                p = tok_find_key(tc_brace, "num_attention_heads");
+                if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10);
+                p = tok_find_key(tc_brace, "num_key_value_heads");
+                if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10);
+                p = tok_find_key(tc_brace, "num_hidden_layers");
+                if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10);
+                p = tok_find_key(tc_brace, "vocab_size");
+                if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10);
+                p = tok_find_key(tc_brace, "max_position_embeddings");
+                if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10);
+                p = tok_find_key(tc_brace, "rms_norm_eps");
+                if (p) cfg.rms_norm_eps = (float)strtod(p, NULL);
+                p = tok_find_key(tc_brace, "model_type");
+                if (p && *p == '"') {
+                    char buf2[64];
+                    tok_extract_string(p, buf2, sizeof(buf2));
+                    strncpy(cfg.model_type, buf2, sizeof(cfg.model_type) - 1);
+                }
+                p = tok_find_key(tc_brace, "tie_word_embeddings");
+                if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
+                /* Qwen3.6 rope_theta is nested in rope_parameters */
+                const char *rp = strstr(tc_brace, "\"rope_parameters\"");
+                if (rp) {
+                    p = tok_find_key(rp, "rope_theta");
+                    if (p) cfg.rope_theta = (float)strtod(p, NULL);
+                }
+            }
+        }
+    }
     free(json);
     return cfg;
 }
             strcpy(arch->architecture, "qwen2");
         } else if (strcmp(cfg.model_type, "qwen2_moe") == 0) {
             strcpy(arch->architecture, "qwen2moe");
+        } else if (strcmp(cfg.model_type, "qwen3_5") == 0 ||
+                   strcmp(cfg.model_type, "qwen3_5_text") == 0 ||
+                   strcmp(cfg.model_type, "qwen3_5_moe") == 0) {
+            strcpy(arch->architecture, "qwen2");  /* GGUF arch: qwen2 compat */
         } else if (strcmp(cfg.model_type, "phi3") == 0 ||
                    strcmp(cfg.model_type, "phi") == 0) {
             strcpy(arch->architecture, "phi3");
     if (strstr(hf_name, "rotary_emb.inv_freq") != NULL) return 1;
     if (strstr(hf_name, "rotary_emb.cos_cached") != NULL) return 1;
     if (strstr(hf_name, "rotary_emb.sin_cached") != NULL) return 1;
+    /* Qwen 3.6 vision encoder — skip all visual.* tensors */
+    if (strncmp(hf_name, "model.visual.", 13) == 0) return 1;
+    if (strncmp(hf_name, "visual.", 7) == 0) return 1;
+    /* MTP (multi-token prediction) layers — not needed for inference */
+    if (strstr(hf_name, "model.language_model.mtp_") != NULL) return 1;
     return 0;
 }
     /* Top-level mappings (common to all architectures) */
     struct { const char *from; const char *to; } mappings[] = {
+        {"model.embed_tokens.weight",              "token_embd.weight"},
+        {"model.language_model.embed_tokens.weight","token_embd.weight"},  /* Qwen 3.6 */
+        {"model.norm.weight",                      "output_norm.weight"},
+        {"model.language_model.norm.weight",        "output_norm.weight"},  /* Qwen 3.6 */
+        {"model.final_norm.weight",                "output_norm.weight"},  /* Gemma */
+        {"lm_head.weight",                         "output.weight"},
+        {"model.embed_tokens.bias",                "token_embd.bias"},
+        {"model.norm.bias",                        "output_norm.bias"},
         {NULL, NULL}
     };
                 /* Gemma pre/post feedforward norm */
                 {"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
                 {"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
+                /* Qwen 3.6 full attention QK norms */
+                {"self_attn.q_norm.weight",         "attn_q_norm.weight"},
+                {"self_attn.k_norm.weight",         "attn_k_norm.weight"},
+                /* Qwen 3.6 DeltaNet (Gated Linear Attention) */
+                {"linear_attn.in_proj_qkv.weight", "ssm_in_qkv.weight"},
+                {"linear_attn.in_proj_z.weight",   "ssm_in_z.weight"},
+                {"linear_attn.in_proj_a.weight",   "ssm_in_a.weight"},
+                {"linear_attn.in_proj_b.weight",   "ssm_in_b.weight"},
+                {"linear_attn.out_proj.weight",    "ssm_out.weight"},
+                {"linear_attn.conv1d.weight",      "ssm_conv1d.weight"},
+                {"linear_attn.norm.weight",        "ssm_norm.weight"},
+                {"linear_attn.A_log",              "ssm_a"},
+                {"linear_attn.dt_bias",            "ssm_dt.bias"},
                 {NULL, NULL}
             };
     /* Never quantize MoE gate routing weights */
     if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;
+    /* Never quantize DeltaNet state-space parameters (1D or small) */
+    if (strstr(gguf_name, "ssm_a") != NULL) return 0;      /* A_log */
+    if (strstr(gguf_name, "ssm_dt") != NULL) return 0;     /* dt_bias */
+    if (strstr(gguf_name, "ssm_conv1d") != NULL) return 0; /* conv kernel */
+    /* Quantize everything else (attention projections, FFN weights, SSM projections) */
     return 1;
 }
     if (strstr(gguf_name, "attn_v.weight") != NULL) return 1;
     if (strstr(gguf_name, "attn_output.weight") != NULL) return 1;
     if (strstr(gguf_name, "attn_qkv.weight") != NULL) return 1;
+    /* Qwen 3.6 DeltaNet SSM projections — treat as attention-class (Q4_0) */
+    if (strstr(gguf_name, "ssm_in_qkv.weight") != NULL) return 1;
+    if (strstr(gguf_name, "ssm_in_z.weight") != NULL) return 1;
+    if (strstr(gguf_name, "ssm_out.weight") != NULL) return 1;
     /* HuggingFace style (fallthrough names) */
     if (strstr(gguf_name, "self_attn.q_proj.weight") != NULL) return 1;
     if (strstr(gguf_name, "self_attn.k_proj.weight") != NULL) return 1;
         printf("  Options:\n");
         printf("    --optimizer hpc|mse|hybrid   Scale optimization (default: hybrid)\n");
         printf("    --imatrix <file>             Importance matrix for Q2_K quality\n");
+        printf("    --config <file>              Explicit config.json for arch detection\n");
+        printf("    --qwen                       Force Qwen 3.5/3.6 architecture\n");
         printf("    --verbose                    Per-block diagnostics\n\n");
         return 1;
     }
     const char *output_path = argv[2];
     OptimizerMode opt_mode = OPT_HYBRID;
     const char *imatrix_path = NULL;
+    const char *config_override = NULL;
     int verbose = 0;
+    int force_qwen = 0;
     /* Parse options */
     for (int i = 3; i < argc; i++) {
             }
         } else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) {
             imatrix_path = argv[++i];
+        } else if (strcmp(argv[i], "--config") == 0 && i + 1 < argc) {
+            config_override = argv[++i];
+        } else if (strcmp(argv[i], "--qwen") == 0) {
+            force_qwen = 1;
         } else if (strcmp(argv[i], "--verbose") == 0) {
             verbose = 1;
         } else {
     printf("  Quant type: Q2_K (2.625 bpw)\n");
     printf("  Optimizer:  %s\n", opt_names[opt_mode]);
     if (imatrix_path) printf("  iMatrix:    %s\n", imatrix_path);
+    if (config_override) printf("  Config:     %s\n", config_override);
+    if (force_qwen) printf("  Model:      Qwen 3.5/3.6 (forced via --qwen)\n");
     printf("\n");
     /* ── Phase 1: Load model ── */
     /* ── Phase 2: Detect architecture ── */
     printf("  Phase 2: Detecting model architecture...\n");
+    /* Try to read config.json: explicit --config overrides auto-detect */
     char config_path[1024];
     const char *config_ptr = NULL;
+    if (config_override) {
+        FILE *check = fopen(config_override, "rb");
+        if (check) {
+            fclose(check);
+            config_ptr = config_override;
+            printf("  Using config.json: %s (via --config)\n", config_override);
+        } else {
+            fprintf(stderr, "  WARNING: Cannot open '%s', falling back to auto-detect\n", config_override);
+        }
+    }
+    if (!config_ptr) {
+        snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
         FILE *check = fopen(config_path, "rb");
         if (check) {
             fclose(check);
     ModelArchitecture arch;
     detect_architecture(mf, &arch, config_ptr);
+    /* --qwen override: force Qwen 3.5/3.6 architecture parameters */
+    if (force_qwen) {
+        strcpy(arch.architecture, "qwen2");
+        strcpy(arch.name, "Qwen3.6-HExState-Q2K");
+        printf("  [--qwen] Forcing qwen2-compatible architecture\n");
+    }
     printf("  ╔═══════════════════════════════════════════════════════════════╗\n");
     printf("  ║  Model Architecture                                         ║\n");
     printf("  ╠═══════════════════════════════════════════════════════════════╣\n");