Files changed (2) hide show
  1. .README.md.swp +0 -0
  2. README.md +7 -5
.README.md.swp ADDED
Binary file (12.3 kB). View file
 
README.md CHANGED
@@ -11,6 +11,8 @@ base_model:
11
  - **Output:** Text
12
  - **Supported Hardware Microarchitecture:** AMD MI300/MI350/MI355 (emulation)
13
  - **ROCm:** 7.2.2
 
 
14
  - **Operating System(s):** Linux
15
  - **Inference Engine:** [vLLM](https://docs.vllm.ai/en/latest/)
16
  - **Model Optimizer:** [AMD-Quark](https://quark.docs.amd.com/latest/index.html) (V0.12)
@@ -29,8 +31,8 @@ The model was quantized from [zai-org/GLM-5.1](https://huggingface.co/zai-org/GL
29
  sudo sysctl -w vm.max_map_count=4194304
30
  cd Quark/examples/torch/language_modeling/llm_ptq/
31
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
32
- export MODEL_DIR=/zai-org/GLM-5.1
33
- export output_dir=/amd/GLM-5.1-NVFP4
34
  exclude_layers="*self_attn* *mlp.gate lm_head *mlp.gate_proj *mlp.up_proj *mlp.down_proj"
35
  python3 quantize_quark.py --model_dir $MODEL_DIR \
36
  --quant_scheme nvfp4 \
@@ -91,7 +93,7 @@ pip install lm-eval[api]
91
  export VLLM_ROCM_USE_AITER=1
92
  export VLLM_ROCM_USE_AITER_FP8BMM=0
93
  export VLLM_ROCM_USE_AITER_FP4BMM=0
94
- HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve /amd/GLM-5.1-NVFP4 \
95
  -tp 4 \
96
  --block-size 1 \
97
  --trust-remote-code \
@@ -103,7 +105,7 @@ HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve /amd/GLM-5.1-NVFP4 \
103
  ```
104
  lm_eval \
105
  --model local-completions \
106
- --model_args '{"model": "/amd/GLM-5.1-NVFP4", "base_url": "http://localhost:8082/v1/completions", "num_concurrent": 32, "max_retries": 10, "max_gen_toks": 2048, "tokenizer_backend": null, "tokenized_requests": false}' \
107
  --tasks gsm8k \
108
  --batch_size auto \
109
  --num_fewshot 5 \
@@ -113,4 +115,4 @@ lm_eval \
113
 
114
 
115
  # License
116
- Modifications Copyright(c) 2026 Advanced Micro Devices, Inc. All rights reserved.
 
11
  - **Output:** Text
12
  - **Supported Hardware Microarchitecture:** AMD MI300/MI350/MI355 (emulation)
13
  - **ROCm:** 7.2.2
14
+ - **PyTorch:** 2.10.0
15
+ - **Transformers:** 5.2.0
16
  - **Operating System(s):** Linux
17
  - **Inference Engine:** [vLLM](https://docs.vllm.ai/en/latest/)
18
  - **Model Optimizer:** [AMD-Quark](https://quark.docs.amd.com/latest/index.html) (V0.12)
 
31
  sudo sysctl -w vm.max_map_count=4194304
32
  cd Quark/examples/torch/language_modeling/llm_ptq/
33
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
34
+ export MODEL_DIR=zai-org/GLM-5.1
35
+ export output_dir=amd/GLM-5.1-NVFP4
36
  exclude_layers="*self_attn* *mlp.gate lm_head *mlp.gate_proj *mlp.up_proj *mlp.down_proj"
37
  python3 quantize_quark.py --model_dir $MODEL_DIR \
38
  --quant_scheme nvfp4 \
 
93
  export VLLM_ROCM_USE_AITER=1
94
  export VLLM_ROCM_USE_AITER_FP8BMM=0
95
  export VLLM_ROCM_USE_AITER_FP4BMM=0
96
+ HIP_VISIBLE_DEVICES=4,5,6,7 vllm serve amd/GLM-5.1-NVFP4 \
97
  -tp 4 \
98
  --block-size 1 \
99
  --trust-remote-code \
 
105
  ```
106
  lm_eval \
107
  --model local-completions \
108
+ --model_args '{"model": "amd/GLM-5.1-NVFP4", "base_url": "http://localhost:8082/v1/completions", "num_concurrent": 32, "max_retries": 10, "max_gen_toks": 2048, "tokenizer_backend": null, "tokenized_requests": false}' \
109
  --tasks gsm8k \
110
  --batch_size auto \
111
  --num_fewshot 5 \
 
115
 
116
 
117
  # License
118
+ Modifications Copyright(c) 2026 Advanced Micro Devices, Inc. All rights reserved.