| --- |
| base_model: |
| - Qwen/Qwen3-Coder-30B-A3B-Instruct |
| --- |
| |
| # Generation |
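| The quantized model was generated with the script below. It requires the changes from this llm-compressor pull request: https://github.com/vllm-project/llm-compressor/pull/1788 |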
|
|
|
|
| ```python |
| |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| |
| from llmcompressor import oneshot |
| from llmcompressor.modifiers.awq import AWQModifier |
| |
# Hugging Face Hub ID of the model to quantize.
MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"

# Output directory: the model name (text after the last "/") plus a suffix
# naming the quantization scheme.
SAVE_DIR = f"{MODEL_ID.rsplit('/', 1)[-1]}-W4A16-awq"


# Modules excluded from quantization. Entries prefixed with "re:" are
# regular-expression patterns rather than literal names.
_AWQ_IGNORE = [
    "lm_head",
    "re:.*mlp.gate$",
    "re:.*mlp.shared_expert_gate$",
    "re:visual.*",
]

# Quantization recipe: a single AWQ pass over the Linear layers using the
# W4A16 scheme, with duo scaling switched off.
recipe = [
    AWQModifier(
        scheme="W4A16",
        targets=["Linear"],
        ignore=_AWQ_IGNORE,
        duo_scaling=False,
    ),
]

# Calibration data source: dataset ID and the split to sample from.
DATASET_ID = "codeparrot/self-instruct-starcoder"
DATASET_SPLIT = "curated"

# Number of calibration samples. 256 is a reasonable starting point;
# using more samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
# Forwarded to oneshot() as max_seq_length.
MAX_SEQUENCE_LENGTH = 2048
| |
| |
def get_calib_dataset(tokenizer):
    """Build the tokenized calibration dataset.

    Loads up to ``NUM_CALIBRATION_SAMPLES * 10`` rows of ``DATASET_SPLIT``
    from ``DATASET_ID``, shuffles them with a fixed seed, keeps the first
    ``NUM_CALIBRATION_SAMPLES`` rows, and renders each kept row through the
    tokenizer's chat template.

    Args:
        tokenizer: Tokenizer object providing ``apply_chat_template``.

    Returns:
        A dataset with ``NUM_CALIBRATION_SAMPLES`` rows whose only column is
        ``input_ids`` (all original columns are removed).
    """
    from datasets import load_dataset

    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES * 10}]",
    )

    def preprocess(example):
        # Each row becomes a single user/assistant exchange.
        chat_messages = [
            {"role": "user", "content": example["instruction"].strip()},
            {"role": "assistant", "content": example["output"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(
            chat_messages, tokenize=True
        )
        return {"input_ids": tokenized_messages}

    # Subsample BEFORE tokenizing. map() works row by row, so selecting first
    # yields exactly the same rows as map-then-select while skipping the
    # tokenization of rows that would be thrown away afterwards.
    return (
        ds.shuffle(seed=42)
        .select(range(NUM_CALIBRATION_SAMPLES))
        .map(preprocess, remove_columns=ds.column_names)
    )
| |
| |
if __name__ == "__main__":
    # Load the tokenizer and the model that will be quantized.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        trust_remote_code=True,
    )

    # Build the calibration set, then apply the recipe to the model in a
    # single one-shot pass.
    calibration_data = get_calib_dataset(tokenizer)
    oneshot(
        model=model,
        dataset=calibration_data,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        log_dir=None,
        trust_remote_code_model=True,
    )

    # Write the model and its tokenizer side by side into SAVE_DIR.
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
| ``` |
|
|
| # Evaluation |
| The model was evaluated on the HumanEval and HumanEval+ benchmarks using the Neural Magic fork of the EvalPlus benchmark implementation and the vLLM engine, with the following commands: |
|
|
| ```bash |
| # Step 1: generate samples for the humaneval dataset with the vLLM backend (temperature 0.2, n_samples 50); output is written under ./results. |
| python evalplus/codegen/generate.py --model nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq --bs 16 --temperature 0.2 --n_samples 50 --root "./results" --dataset humaneval --backend vllm --dtype auto |
| |
| # Step 2: run the sanitize script on the samples generated in step 1. |
| python evalplus/evalplus/sanitize.py results/humaneval/nm-testing--Qwen3-Coder-30B-A3B-Instruct-W4A16-awq_vllm_temp_0.2 |
| |
| # Step 3: evaluate the sanitized samples on the humaneval dataset. |
| evalplus.evaluate --dataset humaneval --samples results/humaneval/nm-testing--Qwen3-Coder-30B-A3B-Instruct-W4A16-awq_vllm_temp_0.2-sanitized |
| |
| ``` |
|
|
|
|
| | Metric | Qwen/Qwen3-Coder-30B-A3B-Instruct | nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq | |
| |------------------------|:---------------------------------:|:-------------------------------------------------:| |
| | HumanEval pass@1 | 93.0 | 93.7 | |
| | HumanEval pass@10 | 93.9 | 94.5 | |
| | HumanEval+ pass@1 | 88.7 | 89.3 | |
| | HumanEval+ pass@10 | 89.8 | 90.2 | |
| | **Average Score** | **91.35** | **91.93** | |
|
|
|
|