{
  "quantization_method": "autoround",
  "quantization_tool": "auto_round",
  "output_format": "auto_round",
  "strategy": "mixed_precision_4bit_8bit_with_calibration",
  "based_on": "quantize_autoround_simple_fast_non_prod_only.py + quantize_autoround_4bit_hf_dataset.py",
  "dataset_source": "same_as_layer_observer",
  "mixed_precision": {
    "expert_bits": 4,
    "default_bits": 8,
    "description": "4-bit for MoE experts, 8-bit for attention/FFN, FP16 for router/norms/embeddings"
  },
  "group_size": 128,
  "symmetric": true,
  "vllm_compatible": "check_docs_for_mixed_precision_support",
  "source_model": "artifacts/GLM-4.7/agent_calibration_mix_v5.jsonl/pruned_models/reap-seed_42-0.25",
  "calibration": {
    "samples": 8192,
    "seq_len": 2048,
    "dataset_name": "artifacts/agent_calibration_mix_v6.jsonl",
    "dataset_config_file": null,
    "seed": 42
  },
  "optimization_iters": 200,
  "layer_stats": {
    "ignored_fp16": 268,
    "expert_4bit": 31773,
    "other_8bit": 377,
    "total_quantized": 32150
  },
  "ignore_patterns": [
    ".*embed_tokens.*",
    ".*lm_head.*",
    ".*shared_experts.*",
    ".*shared_head.*",
    ".*router.*",
    ".*ffn_gate_inp.*",
    ".*norm.*\\.weight$",
    ".*layernorm.*",
    ".*rmsnorm.*",
    ".*\\.bias$"
  ]
}