amd
/

GLM-5.1-NVFP4

@@ -11,6 +11,8 @@ base_model:
   - **Output:** Text
 - **Supported Hardware Microarchitecture:** AMD MI300/MI350/MI355 (emulation)
 - **ROCm:** 7.2.2
 - **Operating System(s):** Linux
 - **Inference Engine:** [vLLM](https://docs.vllm.ai/en/latest/)
 - **Model Optimizer:** [AMD-Quark](https://quark.docs.amd.com/latest/index.html) (V0.12)
@@ -29,8 +31,8 @@ The model was quantized from [zai-org/GLM-5.1](https://huggingface.co/zai-org/GL
 sudo sysctl -w vm.max_map_count=4194304
 cd Quark/examples/torch/language_modeling/llm_ptq/
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-export MODEL_DIR=/zai-org/GLM-5.1
-export output_dir=/amd/GLM-5.1-NVFP4
 exclude_layers="*self_attn* *mlp.gate lm_head *mlp.gate_proj *mlp.up_proj *mlp.down_proj"
 python3 quantize_quark.py --model_dir $MODEL_DIR \
                           --quant_scheme nvfp4 \
@@ -91,7 +93,7 @@ pip install lm-eval[api]
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_USE_AITER_FP8BMM=0
 export VLLM_ROCM_USE_AITER_FP4BMM=0
-HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve /amd/GLM-5.1-NVFP4 \
   -tp 4 \
   --block-size 1 \
   --trust-remote-code \
@@ -103,7 +105,7 @@ HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve /amd/GLM-5.1-NVFP4 \
 ```
 lm_eval \
   --model local-completions \
-  --model_args '{"model": "/amd/GLM-5.1-NVFP4", "base_url": "http://localhost:8082/v1/completions", "num_concurrent": 32, "max_retries": 10, "max_gen_toks": 2048, "tokenizer_backend": null, "tokenized_requests": false}' \
   --tasks gsm8k \
   --batch_size auto \
   --num_fewshot 5 \
@@ -113,4 +115,4 @@ lm_eval \
 # License
-Modifications Copyright(c) 2026 Advanced Micro Devices, Inc. All rights reserved.

   - **Output:** Text
 - **Supported Hardware Microarchitecture:** AMD MI300/MI350/MI355 (emulation)
 - **ROCm:** 7.2.2
+- **PyTorch:** 2.10.0
+- **Transformers:** 5.2.0
 - **Operating System(s):** Linux
 - **Inference Engine:** [vLLM](https://docs.vllm.ai/en/latest/)
 - **Model Optimizer:** [AMD-Quark](https://quark.docs.amd.com/latest/index.html) (V0.12)
 sudo sysctl -w vm.max_map_count=4194304
 cd Quark/examples/torch/language_modeling/llm_ptq/
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export MODEL_DIR=zai-org/GLM-5.1
+export output_dir=amd/GLM-5.1-NVFP4
 exclude_layers="*self_attn* *mlp.gate lm_head *mlp.gate_proj *mlp.up_proj *mlp.down_proj"
 python3 quantize_quark.py --model_dir $MODEL_DIR \
                           --quant_scheme nvfp4 \
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_USE_AITER_FP8BMM=0
 export VLLM_ROCM_USE_AITER_FP4BMM=0
+HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve amd/GLM-5.1-NVFP4 \
   -tp 4 \
   --block-size 1 \
   --trust-remote-code \
 ```
 lm_eval \
   --model local-completions \
+  --model_args '{"model": "amd/GLM-5.1-NVFP4", "base_url": "http://localhost:8082/v1/completions", "num_concurrent": 32, "max_retries": 10, "max_gen_toks": 2048, "tokenizer_backend": null, "tokenized_requests": false}' \
   --tasks gsm8k \
   --batch_size auto \
   --num_fewshot 5 \
 # License
+Modifications Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.