amd
/

Qwen3-Coder-Next-MXFP4

@@ -32,6 +32,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
 from datasets import load_dataset
 from quark.torch import LLMTemplate, ModelQuantizer, export_safetensors
 from quark.contrib.llm_eval import ppl_eval
 # Register qwen3_next template
 # Builds an LLMTemplate for model_type "qwen3_next" and hands it to
 # LLMTemplate.register_template. exclude_layers_name carries layer-name
 # patterns (most with a leading "*"); presumably the matching layers are
 # left unquantized -- confirm against the Quark LLMTemplate documentation.
 qwen3_next_template = LLMTemplate(
     model_type="qwen3_next",
@@ -40,28 +41,34 @@ qwen3_next_template = LLMTemplate(
     exclude_layers_name=["lm_head", "*linear_attn.in_proj_ba", "*linear_attn.in_proj_qkvz","*mlp.gate", "*mlp.shared_expert_gate", "*self_attn.k_proj", "*self_attn.q_proj", "*self_attn.v_proj"],
 )
 LLMTemplate.register_template(qwen3_next_template)
 # Configuration
 # ckpt_path:      model id passed to every from_pretrained call below.
 # output_dir:     destination for the exported model, tokenizer and processor.
 # quant_scheme:   scheme name passed to template.get_config.
 # exclude_layers: layer-name patterns passed to template.get_config; the list
 #                 repeats the template's exclude_layers_name entries verbatim.
 ckpt_path = "Qwen/Qwen3-Coder-Next"
 output_dir = "amd/Qwen3-Coder-Next-MXFP4"
 quant_scheme = "mxfp4"
 exclude_layers = ["lm_head", "*linear_attn.in_proj_ba", "*linear_attn.in_proj_qkvz","*mlp.gate", "*mlp.shared_expert_gate", "*self_attn.k_proj", "*self_attn.q_proj", "*self_attn.v_proj"]
 # Load model
 # torch_dtype="auto" and device_map="auto" defer dtype selection and device
 # placement to the library's automatic choice.
 model = AutoModelForCausalLM.from_pretrained(ckpt_path, torch_dtype="auto", device_map="auto")
 # Switch the model to evaluation mode before quantizing.
 model.eval()
 # Tokenizer and processor come from the same checkpoint; both are saved next
 # to the exported model further down.
 tokenizer = AutoTokenizer.from_pretrained(ckpt_path, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=True)
 # Get quant config from template
 # The template is looked up by the loaded model's config.model_type
 # (presumably "qwen3_next", matching the template registered earlier -- confirm).
 template = LLMTemplate.get(model.config.model_type)
 quant_config = template.get_config(scheme=quant_scheme, exclude_layers=exclude_layers)
 # Quantize
 # quantize_model and freeze each return a model, which is rebound to `model`,
 # so everything below operates on the quantized, frozen result.
 quantizer = ModelQuantizer(quant_config)
 model = quantizer.quantize_model(model)
 model = quantizer.freeze(model)
 # Export hf_format
 # Writes the model to output_dir with custom_mode="quark", then saves the
 # tokenizer and processor into the same directory.
 export_safetensors(model, output_dir, custom_mode="quark")
 tokenizer.save_pretrained(output_dir)
 processor.save_pretrained(output_dir)
-# Evaluate PPL
 # Load the wikitext-2-raw-v1 test split, join every text row with a blank
 # line, and tokenize the result as a single sequence of PyTorch tensors.
 testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
 testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
 # Evaluate on the quantized model, using the device the model reports.
 # NOTE(review): ppl is assigned but not printed within the visible lines.
 ppl = ppl_eval(model, testenc, model.device)

 from datasets import load_dataset
 from quark.torch import LLMTemplate, ModelQuantizer, export_safetensors
 from quark.contrib.llm_eval import ppl_eval
 # Register qwen3_next template
 # Builds an LLMTemplate for model_type "qwen3_next" and hands it to
 # LLMTemplate.register_template. exclude_layers_name carries layer-name
 # patterns (most with a leading "*"); presumably the matching layers are
 # left unquantized -- confirm against the Quark LLMTemplate documentation.
 qwen3_next_template = LLMTemplate(
     model_type="qwen3_next",
     exclude_layers_name=["lm_head", "*linear_attn.in_proj_ba", "*linear_attn.in_proj_qkvz","*mlp.gate", "*mlp.shared_expert_gate", "*self_attn.k_proj", "*self_attn.q_proj", "*self_attn.v_proj"],
 )
 LLMTemplate.register_template(qwen3_next_template)
 # Configuration
 # ckpt_path:      model id passed to every from_pretrained call below.
 # output_dir:     destination for the exported model, tokenizer and processor.
 # quant_scheme:   scheme name passed to template.get_config.
 # exclude_layers: layer-name patterns passed to template.get_config; the list
 #                 repeats the template's exclude_layers_name entries verbatim.
 ckpt_path = "Qwen/Qwen3-Coder-Next"
 output_dir = "amd/Qwen3-Coder-Next-MXFP4"
 quant_scheme = "mxfp4"
 exclude_layers = ["lm_head", "*linear_attn.in_proj_ba", "*linear_attn.in_proj_qkvz","*mlp.gate", "*mlp.shared_expert_gate", "*self_attn.k_proj", "*self_attn.q_proj", "*self_attn.v_proj"]
 # Load model
 # torch_dtype="auto" and device_map="auto" defer dtype selection and device
 # placement to the library's automatic choice.
 model = AutoModelForCausalLM.from_pretrained(ckpt_path, torch_dtype="auto", device_map="auto")
 # Switch the model to evaluation mode before quantizing.
 model.eval()
 # Tokenizer and processor come from the same checkpoint; both are saved next
 # to the exported model further down.
 tokenizer = AutoTokenizer.from_pretrained(ckpt_path, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=True)
 # Get quant config from template
 # The template is looked up by the loaded model's config.model_type
 # (presumably "qwen3_next", matching the template registered earlier -- confirm).
 template = LLMTemplate.get(model.config.model_type)
 quant_config = template.get_config(scheme=quant_scheme, exclude_layers=exclude_layers)
 # Quantize
 # quantize_model and freeze each return a model, which is rebound to `model`,
 # so everything below operates on the quantized, frozen result.
 quantizer = ModelQuantizer(quant_config)
 model = quantizer.quantize_model(model)
 model = quantizer.freeze(model)
 # Export hf_format
 # Writes the model to output_dir with custom_mode="quark", then saves the
 # tokenizer and processor into the same directory.
 export_safetensors(model, output_dir, custom_mode="quark")
 tokenizer.save_pretrained(output_dir)
 processor.save_pretrained(output_dir)
+# Evaluate PPL (optional)
 # Load the wikitext-2-raw-v1 test split, join every text row with a blank
 # line, and tokenize the result as a single sequence of PyTorch tensors.
 testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
 testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
 # Evaluate on the quantized model, using the device the model reports.
 # NOTE(review): ppl is assigned but not printed within the visible lines.
 ppl = ppl_eval(model, testenc, model.device)