update model card
#4
by linzhao-amd - opened
- .README.md.swp +0 -0
- README.md +7 -5
.README.md.swp
ADDED
|
Binary file (12.3 kB). View file
|
|
|
README.md
CHANGED
|
@@ -11,6 +11,8 @@ base_model:
|
|
| 11 |
- **Output:** Text
|
| 12 |
- **Supported Hardware Microarchitecture:** AMD MI300/MI350/MI355 (emulation)
|
| 13 |
- **ROCm:** 7.2.2
|
|
|
|
|
|
|
| 14 |
- **Operating System(s):** Linux
|
| 15 |
- **Inference Engine:** [vLLM](https://docs.vllm.ai/en/latest/)
|
| 16 |
- **Model Optimizer:** [AMD-Quark](https://quark.docs.amd.com/latest/index.html) (V0.12)
|
|
@@ -29,8 +31,8 @@ The model was quantized from [zai-org/GLM-5.1](https://huggingface.co/zai-org/GL
|
|
| 29 |
sudo sysctl -w vm.max_map_count=4194304
|
| 30 |
cd Quark/examples/torch/language_modeling/llm_ptq/
|
| 31 |
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
| 32 |
-
export MODEL_DIR=
|
| 33 |
-
export output_dir=
|
| 34 |
exclude_layers="*self_attn* *mlp.gate lm_head *mlp.gate_proj *mlp.up_proj *mlp.down_proj"
|
| 35 |
python3 quantize_quark.py --model_dir $MODEL_DIR \
|
| 36 |
--quant_scheme nvfp4 \
|
|
@@ -91,7 +93,7 @@ pip install lm-eval[api]
|
|
| 91 |
export VLLM_ROCM_USE_AITER=1
|
| 92 |
export VLLM_ROCM_USE_AITER_FP8BMM=0
|
| 93 |
export VLLM_ROCM_USE_AITER_FP4BMM=0
|
| 94 |
-
HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve /amd/GLM-5.1-NVFP4 \
|
| 95 |
-tp 4 \
|
| 96 |
--block-size 1 \
|
| 97 |
--trust-remote-code \
|
|
@@ -103,7 +105,7 @@ HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve /amd/GLM-5.1-NVFP4 \
|
|
| 103 |
```
|
| 104 |
lm_eval \
|
| 105 |
--model local-completions \
|
| 106 |
-
--model_args '{"model": "
|
| 107 |
--tasks gsm8k \
|
| 108 |
--batch_size auto \
|
| 109 |
--num_fewshot 5 \
|
|
@@ -113,4 +115,4 @@ lm_eval \
|
|
| 113 |
|
| 114 |
|
| 115 |
# License
|
| 116 |
-
Modifications Copyright(c) 2026 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
| 11 |
- **Output:** Text
|
| 12 |
- **Supported Hardware Microarchitecture:** AMD MI300/MI350/MI355 (emulation)
|
| 13 |
- **ROCm:** 7.2.2
|
| 14 |
+
- **PyTorch:** 2.10.0
|
| 15 |
+
- **Transformers:** 5.2.0
|
| 16 |
- **Operating System(s):** Linux
|
| 17 |
- **Inference Engine:** [vLLM](https://docs.vllm.ai/en/latest/)
|
| 18 |
- **Model Optimizer:** [AMD-Quark](https://quark.docs.amd.com/latest/index.html) (V0.12)
|
|
|
|
| 31 |
sudo sysctl -w vm.max_map_count=4194304
|
| 32 |
cd Quark/examples/torch/language_modeling/llm_ptq/
|
| 33 |
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
| 34 |
+
export MODEL_DIR=zai-org/GLM-5.1
|
| 35 |
+
export output_dir=amd/GLM-5.1-NVFP4
|
| 36 |
exclude_layers="*self_attn* *mlp.gate lm_head *mlp.gate_proj *mlp.up_proj *mlp.down_proj"
|
| 37 |
python3 quantize_quark.py --model_dir $MODEL_DIR \
|
| 38 |
--quant_scheme nvfp4 \
|
|
|
|
| 93 |
export VLLM_ROCM_USE_AITER=1
|
| 94 |
export VLLM_ROCM_USE_AITER_FP8BMM=0
|
| 95 |
export VLLM_ROCM_USE_AITER_FP4BMM=0
|
| 96 |
+
HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve amd/GLM-5.1-NVFP4 \
|
| 97 |
-tp 4 \
|
| 98 |
--block-size 1 \
|
| 99 |
--trust-remote-code \
|
|
|
|
| 105 |
```
|
| 106 |
lm_eval \
|
| 107 |
--model local-completions \
|
| 108 |
+
--model_args '{"model": "amd/GLM-5.1-NVFP4", "base_url": "http://localhost:8082/v1/completions", "num_concurrent": 32, "max_retries": 10, "max_gen_toks": 2048, "tokenizer_backend": null, "tokenized_requests": false}' \
|
| 109 |
--tasks gsm8k \
|
| 110 |
--batch_size auto \
|
| 111 |
--num_fewshot 5 \
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
# License
|
| 118 |
+
Modifications Copyright(c) 2026 Advanced Micro Devices, Inc. All rights reserved.
|