Experimental support for other LLMs
Browse files- hexstate_quantize.c +111 -11
hexstate_quantize.c
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2 |
-
* hexstate_quantize.c —
|
| 3 |
*
|
| 4 |
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 5 |
* β HPC-Optimized GGUF Quantization Engine β
|
|
@@ -204,6 +204,46 @@ static ConfigJson parse_config_json(const char *path)
|
|
| 204 |
p = tok_find_key(json, "tie_word_embeddings");
|
| 205 |
if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
free(json);
|
| 208 |
return cfg;
|
| 209 |
}
|
|
@@ -235,6 +275,10 @@ static void detect_architecture(const STMultiFile *mf, ModelArchitecture *arch,
|
|
| 235 |
strcpy(arch->architecture, "qwen2");
|
| 236 |
} else if (strcmp(cfg.model_type, "qwen2_moe") == 0) {
|
| 237 |
strcpy(arch->architecture, "qwen2moe");
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
} else if (strcmp(cfg.model_type, "phi3") == 0 ||
|
| 239 |
strcmp(cfg.model_type, "phi") == 0) {
|
| 240 |
strcpy(arch->architecture, "phi3");
|
|
@@ -373,6 +417,11 @@ static int should_skip_tensor(const char *hf_name)
|
|
| 373 |
if (strstr(hf_name, "rotary_emb.inv_freq") != NULL) return 1;
|
| 374 |
if (strstr(hf_name, "rotary_emb.cos_cached") != NULL) return 1;
|
| 375 |
if (strstr(hf_name, "rotary_emb.sin_cached") != NULL) return 1;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
return 0;
|
| 377 |
}
|
| 378 |
|
|
@@ -384,12 +433,14 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
|
|
| 384 |
|
| 385 |
/* Top-level mappings (common to all architectures) */
|
| 386 |
struct { const char *from; const char *to; } mappings[] = {
|
| 387 |
-
{"model.embed_tokens.weight",
|
| 388 |
-
{"model.
|
| 389 |
-
{"model.
|
| 390 |
-
{"
|
| 391 |
-
{"model.
|
| 392 |
-
{"
|
|
|
|
|
|
|
| 393 |
{NULL, NULL}
|
| 394 |
};
|
| 395 |
|
|
@@ -451,6 +502,19 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
|
|
| 451 |
/* Gemma pre/post feedforward norm */
|
| 452 |
{"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
|
| 453 |
{"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
{NULL, NULL}
|
| 455 |
};
|
| 456 |
|
|
@@ -516,7 +580,12 @@ static int should_quantize(const STTensorInfo *ti, const char *gguf_name)
|
|
| 516 |
/* Never quantize MoE gate routing weights */
|
| 517 |
if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;
|
| 518 |
|
| 519 |
-
/*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
return 1;
|
| 521 |
}
|
| 522 |
|
|
@@ -532,6 +601,10 @@ static int is_attention_tensor(const char *gguf_name)
|
|
| 532 |
if (strstr(gguf_name, "attn_v.weight") != NULL) return 1;
|
| 533 |
if (strstr(gguf_name, "attn_output.weight") != NULL) return 1;
|
| 534 |
if (strstr(gguf_name, "attn_qkv.weight") != NULL) return 1;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
/* HuggingFace style (fallthrough names) */
|
| 536 |
if (strstr(gguf_name, "self_attn.q_proj.weight") != NULL) return 1;
|
| 537 |
if (strstr(gguf_name, "self_attn.k_proj.weight") != NULL) return 1;
|
|
@@ -3869,6 +3942,8 @@ int main(int argc, char **argv)
|
|
| 3869 |
printf(" Options:\n");
|
| 3870 |
printf(" --optimizer hpc|mse|hybrid Scale optimization (default: hybrid)\n");
|
| 3871 |
printf(" --imatrix <file> Importance matrix for Q2_K quality\n");
|
|
|
|
|
|
|
| 3872 |
printf(" --verbose Per-block diagnostics\n\n");
|
| 3873 |
return 1;
|
| 3874 |
}
|
|
@@ -3877,7 +3952,9 @@ int main(int argc, char **argv)
|
|
| 3877 |
const char *output_path = argv[2];
|
| 3878 |
OptimizerMode opt_mode = OPT_HYBRID;
|
| 3879 |
const char *imatrix_path = NULL;
|
|
|
|
| 3880 |
int verbose = 0;
|
|
|
|
| 3881 |
|
| 3882 |
/* Parse options */
|
| 3883 |
for (int i = 3; i < argc; i++) {
|
|
@@ -3892,6 +3969,10 @@ int main(int argc, char **argv)
|
|
| 3892 |
}
|
| 3893 |
} else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) {
|
| 3894 |
imatrix_path = argv[++i];
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3895 |
} else if (strcmp(argv[i], "--verbose") == 0) {
|
| 3896 |
verbose = 1;
|
| 3897 |
} else {
|
|
@@ -3906,6 +3987,8 @@ int main(int argc, char **argv)
|
|
| 3906 |
printf(" Quant type: Q2_K (2.625 bpw)\n");
|
| 3907 |
printf(" Optimizer: %s\n", opt_names[opt_mode]);
|
| 3908 |
if (imatrix_path) printf(" iMatrix: %s\n", imatrix_path);
|
|
|
|
|
|
|
| 3909 |
printf("\n");
|
| 3910 |
|
| 3911 |
/* ── Phase 1: Load model ── */
|
|
@@ -3973,11 +4056,21 @@ int main(int argc, char **argv)
|
|
| 3973 |
/* ── Phase 2: Detect architecture ── */
|
| 3974 |
printf(" Phase 2: Detecting model architecture...\n");
|
| 3975 |
|
| 3976 |
-
/* Try to read config.json
|
| 3977 |
char config_path[1024];
|
| 3978 |
-
snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
|
| 3979 |
const char *config_ptr = NULL;
|
| 3980 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3981 |
FILE *check = fopen(config_path, "rb");
|
| 3982 |
if (check) {
|
| 3983 |
fclose(check);
|
|
@@ -3989,6 +4082,13 @@ int main(int argc, char **argv)
|
|
| 3989 |
ModelArchitecture arch;
|
| 3990 |
detect_architecture(mf, &arch, config_ptr);
|
| 3991 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3992 |
printf(" βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n");
|
| 3993 |
printf(" β Model Architecture β\n");
|
| 3994 |
printf(" β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n");
|
|
|
|
| 1 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2 |
+
* hexstate_quantize.c — HexState GGUF Quantizer
|
| 3 |
*
|
| 4 |
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 5 |
* β HPC-Optimized GGUF Quantization Engine β
|
|
|
|
| 204 |
p = tok_find_key(json, "tie_word_embeddings");
|
| 205 |
if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
|
| 206 |
|
| 207 |
+
/* ── Qwen 3.5/3.6: parameters are nested inside "text_config" ── */
|
| 208 |
+
if (cfg.hidden_size == 0) {
|
| 209 |
+
const char *tc = strstr(json, "\"text_config\"");
|
| 210 |
+
if (tc) {
|
| 211 |
+
const char *tc_brace = strchr(tc, '{');
|
| 212 |
+
if (tc_brace) {
|
| 213 |
+
p = tok_find_key(tc_brace, "hidden_size");
|
| 214 |
+
if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10);
|
| 215 |
+
p = tok_find_key(tc_brace, "intermediate_size");
|
| 216 |
+
if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10);
|
| 217 |
+
p = tok_find_key(tc_brace, "num_attention_heads");
|
| 218 |
+
if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10);
|
| 219 |
+
p = tok_find_key(tc_brace, "num_key_value_heads");
|
| 220 |
+
if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10);
|
| 221 |
+
p = tok_find_key(tc_brace, "num_hidden_layers");
|
| 222 |
+
if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10);
|
| 223 |
+
p = tok_find_key(tc_brace, "vocab_size");
|
| 224 |
+
if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10);
|
| 225 |
+
p = tok_find_key(tc_brace, "max_position_embeddings");
|
| 226 |
+
if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10);
|
| 227 |
+
p = tok_find_key(tc_brace, "rms_norm_eps");
|
| 228 |
+
if (p) cfg.rms_norm_eps = (float)strtod(p, NULL);
|
| 229 |
+
p = tok_find_key(tc_brace, "model_type");
|
| 230 |
+
if (p && *p == '"') {
|
| 231 |
+
char buf2[64];
|
| 232 |
+
tok_extract_string(p, buf2, sizeof(buf2));
|
| 233 |
+
strncpy(cfg.model_type, buf2, sizeof(cfg.model_type) - 1);
|
| 234 |
+
}
|
| 235 |
+
p = tok_find_key(tc_brace, "tie_word_embeddings");
|
| 236 |
+
if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
|
| 237 |
+
/* Qwen3.6 rope_theta is nested in rope_parameters */
|
| 238 |
+
const char *rp = strstr(tc_brace, "\"rope_parameters\"");
|
| 239 |
+
if (rp) {
|
| 240 |
+
p = tok_find_key(rp, "rope_theta");
|
| 241 |
+
if (p) cfg.rope_theta = (float)strtod(p, NULL);
|
| 242 |
+
}
|
| 243 |
+
}
|
| 244 |
+
}
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
free(json);
|
| 248 |
return cfg;
|
| 249 |
}
|
|
|
|
| 275 |
strcpy(arch->architecture, "qwen2");
|
| 276 |
} else if (strcmp(cfg.model_type, "qwen2_moe") == 0) {
|
| 277 |
strcpy(arch->architecture, "qwen2moe");
|
| 278 |
+
} else if (strcmp(cfg.model_type, "qwen3_5") == 0 ||
|
| 279 |
+
strcmp(cfg.model_type, "qwen3_5_text") == 0 ||
|
| 280 |
+
strcmp(cfg.model_type, "qwen3_5_moe") == 0) {
|
| 281 |
+
strcpy(arch->architecture, "qwen2"); /* GGUF arch: qwen2 compat */
|
| 282 |
} else if (strcmp(cfg.model_type, "phi3") == 0 ||
|
| 283 |
strcmp(cfg.model_type, "phi") == 0) {
|
| 284 |
strcpy(arch->architecture, "phi3");
|
|
|
|
| 417 |
if (strstr(hf_name, "rotary_emb.inv_freq") != NULL) return 1;
|
| 418 |
if (strstr(hf_name, "rotary_emb.cos_cached") != NULL) return 1;
|
| 419 |
if (strstr(hf_name, "rotary_emb.sin_cached") != NULL) return 1;
|
| 420 |
+
/* Qwen 3.6 vision encoder — skip all visual.* tensors */
|
| 421 |
+
if (strncmp(hf_name, "model.visual.", 13) == 0) return 1;
|
| 422 |
+
if (strncmp(hf_name, "visual.", 7) == 0) return 1;
|
| 423 |
+
/* MTP (multi-token prediction) layers — not needed for inference */
|
| 424 |
+
if (strstr(hf_name, "model.language_model.mtp_") != NULL) return 1;
|
| 425 |
return 0;
|
| 426 |
}
|
| 427 |
|
|
|
|
| 433 |
|
| 434 |
/* Top-level mappings (common to all architectures) */
|
| 435 |
struct { const char *from; const char *to; } mappings[] = {
|
| 436 |
+
{"model.embed_tokens.weight", "token_embd.weight"},
|
| 437 |
+
{"model.language_model.embed_tokens.weight","token_embd.weight"}, /* Qwen 3.6 */
|
| 438 |
+
{"model.norm.weight", "output_norm.weight"},
|
| 439 |
+
{"model.language_model.norm.weight", "output_norm.weight"}, /* Qwen 3.6 */
|
| 440 |
+
{"model.final_norm.weight", "output_norm.weight"}, /* Gemma */
|
| 441 |
+
{"lm_head.weight", "output.weight"},
|
| 442 |
+
{"model.embed_tokens.bias", "token_embd.bias"},
|
| 443 |
+
{"model.norm.bias", "output_norm.bias"},
|
| 444 |
{NULL, NULL}
|
| 445 |
};
|
| 446 |
|
|
|
|
| 502 |
/* Gemma pre/post feedforward norm */
|
| 503 |
{"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
|
| 504 |
{"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
|
| 505 |
+
/* Qwen 3.6 full attention QK norms */
|
| 506 |
+
{"self_attn.q_norm.weight", "attn_q_norm.weight"},
|
| 507 |
+
{"self_attn.k_norm.weight", "attn_k_norm.weight"},
|
| 508 |
+
/* Qwen 3.6 DeltaNet (Gated Linear Attention) */
|
| 509 |
+
{"linear_attn.in_proj_qkv.weight", "ssm_in_qkv.weight"},
|
| 510 |
+
{"linear_attn.in_proj_z.weight", "ssm_in_z.weight"},
|
| 511 |
+
{"linear_attn.in_proj_a.weight", "ssm_in_a.weight"},
|
| 512 |
+
{"linear_attn.in_proj_b.weight", "ssm_in_b.weight"},
|
| 513 |
+
{"linear_attn.out_proj.weight", "ssm_out.weight"},
|
| 514 |
+
{"linear_attn.conv1d.weight", "ssm_conv1d.weight"},
|
| 515 |
+
{"linear_attn.norm.weight", "ssm_norm.weight"},
|
| 516 |
+
{"linear_attn.A_log", "ssm_a"},
|
| 517 |
+
{"linear_attn.dt_bias", "ssm_dt.bias"},
|
| 518 |
{NULL, NULL}
|
| 519 |
};
|
| 520 |
|
|
|
|
| 580 |
/* Never quantize MoE gate routing weights */
|
| 581 |
if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;
|
| 582 |
|
| 583 |
+
/* Never quantize DeltaNet state-space parameters (1D or small) */
|
| 584 |
+
if (strstr(gguf_name, "ssm_a") != NULL) return 0; /* A_log */
|
| 585 |
+
if (strstr(gguf_name, "ssm_dt") != NULL) return 0; /* dt_bias */
|
| 586 |
+
if (strstr(gguf_name, "ssm_conv1d") != NULL) return 0; /* conv kernel */
|
| 587 |
+
|
| 588 |
+
/* Quantize everything else (attention projections, FFN weights, SSM projections) */
|
| 589 |
return 1;
|
| 590 |
}
|
| 591 |
|
|
|
|
| 601 |
if (strstr(gguf_name, "attn_v.weight") != NULL) return 1;
|
| 602 |
if (strstr(gguf_name, "attn_output.weight") != NULL) return 1;
|
| 603 |
if (strstr(gguf_name, "attn_qkv.weight") != NULL) return 1;
|
| 604 |
+
/* Qwen 3.6 DeltaNet SSM projections — treat as attention-class (Q4_0) */
|
| 605 |
+
if (strstr(gguf_name, "ssm_in_qkv.weight") != NULL) return 1;
|
| 606 |
+
if (strstr(gguf_name, "ssm_in_z.weight") != NULL) return 1;
|
| 607 |
+
if (strstr(gguf_name, "ssm_out.weight") != NULL) return 1;
|
| 608 |
/* HuggingFace style (fallthrough names) */
|
| 609 |
if (strstr(gguf_name, "self_attn.q_proj.weight") != NULL) return 1;
|
| 610 |
if (strstr(gguf_name, "self_attn.k_proj.weight") != NULL) return 1;
|
|
|
|
| 3942 |
printf(" Options:\n");
|
| 3943 |
printf(" --optimizer hpc|mse|hybrid Scale optimization (default: hybrid)\n");
|
| 3944 |
printf(" --imatrix <file> Importance matrix for Q2_K quality\n");
|
| 3945 |
+
printf(" --config <file> Explicit config.json for arch detection\n");
|
| 3946 |
+
printf(" --qwen Force Qwen 3.5/3.6 architecture\n");
|
| 3947 |
printf(" --verbose Per-block diagnostics\n\n");
|
| 3948 |
return 1;
|
| 3949 |
}
|
|
|
|
| 3952 |
const char *output_path = argv[2];
|
| 3953 |
OptimizerMode opt_mode = OPT_HYBRID;
|
| 3954 |
const char *imatrix_path = NULL;
|
| 3955 |
+
const char *config_override = NULL;
|
| 3956 |
int verbose = 0;
|
| 3957 |
+
int force_qwen = 0;
|
| 3958 |
|
| 3959 |
/* Parse options */
|
| 3960 |
for (int i = 3; i < argc; i++) {
|
|
|
|
| 3969 |
}
|
| 3970 |
} else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) {
|
| 3971 |
imatrix_path = argv[++i];
|
| 3972 |
+
} else if (strcmp(argv[i], "--config") == 0 && i + 1 < argc) {
|
| 3973 |
+
config_override = argv[++i];
|
| 3974 |
+
} else if (strcmp(argv[i], "--qwen") == 0) {
|
| 3975 |
+
force_qwen = 1;
|
| 3976 |
} else if (strcmp(argv[i], "--verbose") == 0) {
|
| 3977 |
verbose = 1;
|
| 3978 |
} else {
|
|
|
|
| 3987 |
printf(" Quant type: Q2_K (2.625 bpw)\n");
|
| 3988 |
printf(" Optimizer: %s\n", opt_names[opt_mode]);
|
| 3989 |
if (imatrix_path) printf(" iMatrix: %s\n", imatrix_path);
|
| 3990 |
+
if (config_override) printf(" Config: %s\n", config_override);
|
| 3991 |
+
if (force_qwen) printf(" Model: Qwen 3.5/3.6 (forced via --qwen)\n");
|
| 3992 |
printf("\n");
|
| 3993 |
|
| 3994 |
/* ── Phase 1: Load model ── */
|
|
|
|
| 4056 |
/* ── Phase 2: Detect architecture ── */
|
| 4057 |
printf(" Phase 2: Detecting model architecture...\n");
|
| 4058 |
|
| 4059 |
+
/* Try to read config.json: explicit --config overrides auto-detect */
|
| 4060 |
char config_path[1024];
|
|
|
|
| 4061 |
const char *config_ptr = NULL;
|
| 4062 |
+
if (config_override) {
|
| 4063 |
+
FILE *check = fopen(config_override, "rb");
|
| 4064 |
+
if (check) {
|
| 4065 |
+
fclose(check);
|
| 4066 |
+
config_ptr = config_override;
|
| 4067 |
+
printf(" Using config.json: %s (via --config)\n", config_override);
|
| 4068 |
+
} else {
|
| 4069 |
+
fprintf(stderr, " WARNING: Cannot open '%s', falling back to auto-detect\n", config_override);
|
| 4070 |
+
}
|
| 4071 |
+
}
|
| 4072 |
+
if (!config_ptr) {
|
| 4073 |
+
snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
|
| 4074 |
FILE *check = fopen(config_path, "rb");
|
| 4075 |
if (check) {
|
| 4076 |
fclose(check);
|
|
|
|
| 4082 |
ModelArchitecture arch;
|
| 4083 |
detect_architecture(mf, &arch, config_ptr);
|
| 4084 |
|
| 4085 |
+
/* --qwen override: force Qwen 3.5/3.6 architecture parameters */
|
| 4086 |
+
if (force_qwen) {
|
| 4087 |
+
strcpy(arch.architecture, "qwen2");
|
| 4088 |
+
strcpy(arch.name, "Qwen3.6-HExState-Q2K");
|
| 4089 |
+
printf(" [--qwen] Forcing qwen2-compatible architecture\n");
|
| 4090 |
+
}
|
| 4091 |
+
|
| 4092 |
printf(" βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n");
|
| 4093 |
printf(" β Model Architecture β\n");
|
| 4094 |
printf(" β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n");
|