CompressedGemma committed on
Commit
5c1c396
·
verified ·
1 Parent(s): ae8c38d

Experimental support for other LLMs

Browse files
Files changed (1) hide show
  1. hexstate_quantize.c +111 -11
hexstate_quantize.c CHANGED
@@ -1,5 +1,5 @@
1
  /* ═══════════════════════════════════════════════════════════════════════════
2
- * hexstate_quantize.c — HExState GGUF Quantizer
3
  *
4
  * ╔═══════════════════════════════════════════════════════════════╗
5
  * ║ HPC-Optimized GGUF Quantization Engine ║
@@ -204,6 +204,46 @@ static ConfigJson parse_config_json(const char *path)
204
  p = tok_find_key(json, "tie_word_embeddings");
205
  if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  free(json);
208
  return cfg;
209
  }
@@ -235,6 +275,10 @@ static void detect_architecture(const STMultiFile *mf, ModelArchitecture *arch,
235
  strcpy(arch->architecture, "qwen2");
236
  } else if (strcmp(cfg.model_type, "qwen2_moe") == 0) {
237
  strcpy(arch->architecture, "qwen2moe");
 
 
 
 
238
  } else if (strcmp(cfg.model_type, "phi3") == 0 ||
239
  strcmp(cfg.model_type, "phi") == 0) {
240
  strcpy(arch->architecture, "phi3");
@@ -373,6 +417,11 @@ static int should_skip_tensor(const char *hf_name)
373
  if (strstr(hf_name, "rotary_emb.inv_freq") != NULL) return 1;
374
  if (strstr(hf_name, "rotary_emb.cos_cached") != NULL) return 1;
375
  if (strstr(hf_name, "rotary_emb.sin_cached") != NULL) return 1;
 
 
 
 
 
376
  return 0;
377
  }
378
 
@@ -384,12 +433,14 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
384
 
385
  /* Top-level mappings (common to all architectures) */
386
  struct { const char *from; const char *to; } mappings[] = {
387
- {"model.embed_tokens.weight", "token_embd.weight"},
388
- {"model.norm.weight", "output_norm.weight"},
389
- {"model.final_norm.weight", "output_norm.weight"}, /* Gemma */
390
- {"lm_head.weight", "output.weight"},
391
- {"model.embed_tokens.bias", "token_embd.bias"},
392
- {"model.norm.bias", "output_norm.bias"},
 
 
393
  {NULL, NULL}
394
  };
395
 
@@ -451,6 +502,19 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
451
  /* Gemma pre/post feedforward norm */
452
  {"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
453
  {"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  {NULL, NULL}
455
  };
456
 
@@ -516,7 +580,12 @@ static int should_quantize(const STTensorInfo *ti, const char *gguf_name)
516
  /* Never quantize MoE gate routing weights */
517
  if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;
518
 
519
- /* Quantize everything else (attention projections, FFN weights) */
 
 
 
 
 
520
  return 1;
521
  }
522
 
@@ -532,6 +601,10 @@ static int is_attention_tensor(const char *gguf_name)
532
  if (strstr(gguf_name, "attn_v.weight") != NULL) return 1;
533
  if (strstr(gguf_name, "attn_output.weight") != NULL) return 1;
534
  if (strstr(gguf_name, "attn_qkv.weight") != NULL) return 1;
 
 
 
 
535
  /* HuggingFace style (fallthrough names) */
536
  if (strstr(gguf_name, "self_attn.q_proj.weight") != NULL) return 1;
537
  if (strstr(gguf_name, "self_attn.k_proj.weight") != NULL) return 1;
@@ -3869,6 +3942,8 @@ int main(int argc, char **argv)
3869
  printf(" Options:\n");
3870
  printf(" --optimizer hpc|mse|hybrid Scale optimization (default: hybrid)\n");
3871
  printf(" --imatrix <file> Importance matrix for Q2_K quality\n");
 
 
3872
  printf(" --verbose Per-block diagnostics\n\n");
3873
  return 1;
3874
  }
@@ -3877,7 +3952,9 @@ int main(int argc, char **argv)
3877
  const char *output_path = argv[2];
3878
  OptimizerMode opt_mode = OPT_HYBRID;
3879
  const char *imatrix_path = NULL;
 
3880
  int verbose = 0;
 
3881
 
3882
  /* Parse options */
3883
  for (int i = 3; i < argc; i++) {
@@ -3892,6 +3969,10 @@ int main(int argc, char **argv)
3892
  }
3893
  } else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) {
3894
  imatrix_path = argv[++i];
 
 
 
 
3895
  } else if (strcmp(argv[i], "--verbose") == 0) {
3896
  verbose = 1;
3897
  } else {
@@ -3906,6 +3987,8 @@ int main(int argc, char **argv)
3906
  printf(" Quant type: Q2_K (2.625 bpw)\n");
3907
  printf(" Optimizer: %s\n", opt_names[opt_mode]);
3908
  if (imatrix_path) printf(" iMatrix: %s\n", imatrix_path);
 
 
3909
  printf("\n");
3910
 
3911
  /* ── Phase 1: Load model ── */
@@ -3973,11 +4056,21 @@ int main(int argc, char **argv)
3973
  /* ── Phase 2: Detect architecture ── */
3974
  printf(" Phase 2: Detecting model architecture...\n");
3975
 
3976
- /* Try to read config.json from model directory */
3977
  char config_path[1024];
3978
- snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
3979
  const char *config_ptr = NULL;
3980
- {
 
 
 
 
 
 
 
 
 
 
 
3981
  FILE *check = fopen(config_path, "rb");
3982
  if (check) {
3983
  fclose(check);
@@ -3989,6 +4082,13 @@ int main(int argc, char **argv)
3989
  ModelArchitecture arch;
3990
  detect_architecture(mf, &arch, config_ptr);
3991
 
 
 
 
 
 
 
 
3992
  printf(" ╔═══════════════════════════════════════════════════════════════╗\n");
3993
  printf(" β•‘ Model Architecture β•‘\n");
3994
  printf(" ╠═══════════════════════════════════════════════════════════════╣\n");
 
1
  /* ═══════════════════════════════════════════════════════════════════════════
2
+ * hexstate_quantize.c — HexState GGUF Quantizer
3
  *
4
  * ╔═══════════════════════════════════════════════════════════════╗
5
  * ║ HPC-Optimized GGUF Quantization Engine ║
 
204
  p = tok_find_key(json, "tie_word_embeddings");
205
  if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
206
 
207
+ /* ── Qwen 3.5/3.6: parameters are nested inside "text_config" ── */
208
+ if (cfg.hidden_size == 0) {
209
+ const char *tc = strstr(json, "\"text_config\"");
210
+ if (tc) {
211
+ const char *tc_brace = strchr(tc, '{');
212
+ if (tc_brace) {
213
+ p = tok_find_key(tc_brace, "hidden_size");
214
+ if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10);
215
+ p = tok_find_key(tc_brace, "intermediate_size");
216
+ if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10);
217
+ p = tok_find_key(tc_brace, "num_attention_heads");
218
+ if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10);
219
+ p = tok_find_key(tc_brace, "num_key_value_heads");
220
+ if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10);
221
+ p = tok_find_key(tc_brace, "num_hidden_layers");
222
+ if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10);
223
+ p = tok_find_key(tc_brace, "vocab_size");
224
+ if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10);
225
+ p = tok_find_key(tc_brace, "max_position_embeddings");
226
+ if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10);
227
+ p = tok_find_key(tc_brace, "rms_norm_eps");
228
+ if (p) cfg.rms_norm_eps = (float)strtod(p, NULL);
229
+ p = tok_find_key(tc_brace, "model_type");
230
+ if (p && *p == '"') {
231
+ char buf2[64];
232
+ tok_extract_string(p, buf2, sizeof(buf2));
233
+ strncpy(cfg.model_type, buf2, sizeof(cfg.model_type) - 1);
234
+ }
235
+ p = tok_find_key(tc_brace, "tie_word_embeddings");
236
+ if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
237
+ /* Qwen3.6 rope_theta is nested in rope_parameters */
238
+ const char *rp = strstr(tc_brace, "\"rope_parameters\"");
239
+ if (rp) {
240
+ p = tok_find_key(rp, "rope_theta");
241
+ if (p) cfg.rope_theta = (float)strtod(p, NULL);
242
+ }
243
+ }
244
+ }
245
+ }
246
+
247
  free(json);
248
  return cfg;
249
  }
 
275
  strcpy(arch->architecture, "qwen2");
276
  } else if (strcmp(cfg.model_type, "qwen2_moe") == 0) {
277
  strcpy(arch->architecture, "qwen2moe");
278
+ } else if (strcmp(cfg.model_type, "qwen3_5") == 0 ||
279
+ strcmp(cfg.model_type, "qwen3_5_text") == 0 ||
280
+ strcmp(cfg.model_type, "qwen3_5_moe") == 0) {
281
+ strcpy(arch->architecture, "qwen2"); /* GGUF arch: qwen2 compat */
282
  } else if (strcmp(cfg.model_type, "phi3") == 0 ||
283
  strcmp(cfg.model_type, "phi") == 0) {
284
  strcpy(arch->architecture, "phi3");
 
417
  if (strstr(hf_name, "rotary_emb.inv_freq") != NULL) return 1;
418
  if (strstr(hf_name, "rotary_emb.cos_cached") != NULL) return 1;
419
  if (strstr(hf_name, "rotary_emb.sin_cached") != NULL) return 1;
420
+ /* Qwen 3.6 vision encoder — skip all visual.* tensors */
421
+ if (strncmp(hf_name, "model.visual.", 13) == 0) return 1;
422
+ if (strncmp(hf_name, "visual.", 7) == 0) return 1;
423
+ /* MTP (multi-token prediction) layers — not needed for inference */
424
+ if (strstr(hf_name, "model.language_model.mtp_") != NULL) return 1;
425
  return 0;
426
  }
427
 
 
433
 
434
  /* Top-level mappings (common to all architectures) */
435
  struct { const char *from; const char *to; } mappings[] = {
436
+ {"model.embed_tokens.weight", "token_embd.weight"},
437
+ {"model.language_model.embed_tokens.weight","token_embd.weight"}, /* Qwen 3.6 */
438
+ {"model.norm.weight", "output_norm.weight"},
439
+ {"model.language_model.norm.weight", "output_norm.weight"}, /* Qwen 3.6 */
440
+ {"model.final_norm.weight", "output_norm.weight"}, /* Gemma */
441
+ {"lm_head.weight", "output.weight"},
442
+ {"model.embed_tokens.bias", "token_embd.bias"},
443
+ {"model.norm.bias", "output_norm.bias"},
444
  {NULL, NULL}
445
  };
446
 
 
502
  /* Gemma pre/post feedforward norm */
503
  {"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
504
  {"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
505
+ /* Qwen 3.6 full attention QK norms */
506
+ {"self_attn.q_norm.weight", "attn_q_norm.weight"},
507
+ {"self_attn.k_norm.weight", "attn_k_norm.weight"},
508
+ /* Qwen 3.6 DeltaNet (Gated Linear Attention) */
509
+ {"linear_attn.in_proj_qkv.weight", "ssm_in_qkv.weight"},
510
+ {"linear_attn.in_proj_z.weight", "ssm_in_z.weight"},
511
+ {"linear_attn.in_proj_a.weight", "ssm_in_a.weight"},
512
+ {"linear_attn.in_proj_b.weight", "ssm_in_b.weight"},
513
+ {"linear_attn.out_proj.weight", "ssm_out.weight"},
514
+ {"linear_attn.conv1d.weight", "ssm_conv1d.weight"},
515
+ {"linear_attn.norm.weight", "ssm_norm.weight"},
516
+ {"linear_attn.A_log", "ssm_a"},
517
+ {"linear_attn.dt_bias", "ssm_dt.bias"},
518
  {NULL, NULL}
519
  };
520
 
 
580
  /* Never quantize MoE gate routing weights */
581
  if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;
582
 
583
+ /* Never quantize DeltaNet state-space parameters (1D or small) */
584
+ if (strstr(gguf_name, "ssm_a") != NULL) return 0; /* A_log */
585
+ if (strstr(gguf_name, "ssm_dt") != NULL) return 0; /* dt_bias */
586
+ if (strstr(gguf_name, "ssm_conv1d") != NULL) return 0; /* conv kernel */
587
+
588
+ /* Quantize everything else (attention projections, FFN weights, SSM projections) */
589
  return 1;
590
  }
591
 
 
601
  if (strstr(gguf_name, "attn_v.weight") != NULL) return 1;
602
  if (strstr(gguf_name, "attn_output.weight") != NULL) return 1;
603
  if (strstr(gguf_name, "attn_qkv.weight") != NULL) return 1;
604
+ /* Qwen 3.6 DeltaNet SSM projections — treat as attention-class (Q4_0) */
605
+ if (strstr(gguf_name, "ssm_in_qkv.weight") != NULL) return 1;
606
+ if (strstr(gguf_name, "ssm_in_z.weight") != NULL) return 1;
607
+ if (strstr(gguf_name, "ssm_out.weight") != NULL) return 1;
608
  /* HuggingFace style (fallthrough names) */
609
  if (strstr(gguf_name, "self_attn.q_proj.weight") != NULL) return 1;
610
  if (strstr(gguf_name, "self_attn.k_proj.weight") != NULL) return 1;
 
3942
  printf(" Options:\n");
3943
  printf(" --optimizer hpc|mse|hybrid Scale optimization (default: hybrid)\n");
3944
  printf(" --imatrix <file> Importance matrix for Q2_K quality\n");
3945
+ printf(" --config <file> Explicit config.json for arch detection\n");
3946
+ printf(" --qwen Force Qwen 3.5/3.6 architecture\n");
3947
  printf(" --verbose Per-block diagnostics\n\n");
3948
  return 1;
3949
  }
 
3952
  const char *output_path = argv[2];
3953
  OptimizerMode opt_mode = OPT_HYBRID;
3954
  const char *imatrix_path = NULL;
3955
+ const char *config_override = NULL;
3956
  int verbose = 0;
3957
+ int force_qwen = 0;
3958
 
3959
  /* Parse options */
3960
  for (int i = 3; i < argc; i++) {
 
3969
  }
3970
  } else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) {
3971
  imatrix_path = argv[++i];
3972
+ } else if (strcmp(argv[i], "--config") == 0 && i + 1 < argc) {
3973
+ config_override = argv[++i];
3974
+ } else if (strcmp(argv[i], "--qwen") == 0) {
3975
+ force_qwen = 1;
3976
  } else if (strcmp(argv[i], "--verbose") == 0) {
3977
  verbose = 1;
3978
  } else {
 
3987
  printf(" Quant type: Q2_K (2.625 bpw)\n");
3988
  printf(" Optimizer: %s\n", opt_names[opt_mode]);
3989
  if (imatrix_path) printf(" iMatrix: %s\n", imatrix_path);
3990
+ if (config_override) printf(" Config: %s\n", config_override);
3991
+ if (force_qwen) printf(" Model: Qwen 3.5/3.6 (forced via --qwen)\n");
3992
  printf("\n");
3993
 
3994
  /* ── Phase 1: Load model ── */
 
4056
  /* ── Phase 2: Detect architecture ── */
4057
  printf(" Phase 2: Detecting model architecture...\n");
4058
 
4059
+ /* Try to read config.json: explicit --config overrides auto-detect */
4060
  char config_path[1024];
 
4061
  const char *config_ptr = NULL;
4062
+ if (config_override) {
4063
+ FILE *check = fopen(config_override, "rb");
4064
+ if (check) {
4065
+ fclose(check);
4066
+ config_ptr = config_override;
4067
+ printf(" Using config.json: %s (via --config)\n", config_override);
4068
+ } else {
4069
+ fprintf(stderr, " WARNING: Cannot open '%s', falling back to auto-detect\n", config_override);
4070
+ }
4071
+ }
4072
+ if (!config_ptr) {
4073
+ snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
4074
  FILE *check = fopen(config_path, "rb");
4075
  if (check) {
4076
  fclose(check);
 
4082
  ModelArchitecture arch;
4083
  detect_architecture(mf, &arch, config_ptr);
4084
 
4085
+ /* --qwen override: force Qwen 3.5/3.6 architecture parameters */
4086
+ if (force_qwen) {
4087
+ strcpy(arch.architecture, "qwen2");
4088
+ strcpy(arch.name, "Qwen3.6-HExState-Q2K");
4089
+ printf(" [--qwen] Forcing qwen2-compatible architecture\n");
4090
+ }
4091
+
4092
  printf(" ╔═══════════════════════════════════════════════════════════════╗\n");
4093
  printf(" β•‘ Model Architecture β•‘\n");
4094
  printf(" ╠═══════════════════════════════════════════════════════════════╣\n");