{
  "quantization_method": "autoround",
  "quantization_tool": "auto_round",
  "output_format": "auto_round",
  "strategy": "mixed_precision_4bit_8bit_with_calibration",
  "based_on": "quantize_autoround_simple_fast_non_prod_only.py + quantize_autoround_4bit_hf_dataset.py",
  "dataset_source": "same_as_layer_observer",
  "mixed_precision": {
    "expert_bits": 4,
    "default_bits": 8,
    "description": "4-bit for MoE experts, 8-bit for attention/FFN, FP16 for router/norms/embeddings"
  },
  "group_size": 128,
  "symmetric": true,
  "vllm_compatible": "check_docs_for_mixed_precision_support",
  "source_model": "artifacts/GLM-4.7/agent_calibration_mix_v5.jsonl/pruned_models/reap-seed_42-0.25",
  "calibration": {
    "samples": 8192,
    "seq_len": 2048,
    "dataset_name": "artifacts/agent_calibration_mix_v6.jsonl",
    "dataset_config_file": null,
    "seed": 42
  },
  "optimization_iters": 200,
  "layer_stats": {
    "ignored_fp16": 268,
    "expert_4bit": 31773,
    "other_8bit": 377,
    "total_quantized": 32150
  },
  "ignore_patterns": [
    ".*embed_tokens.*",
    ".*lm_head.*",
    ".*shared_experts.*",
    ".*shared_head.*",
    ".*router.*",
    ".*ffn_gate_inp.*",
    ".*norm.*\\.weight$",
    ".*layernorm.*",
    ".*rmsnorm.*",
    ".*\\.bias$"
  ]
}