Update README.md
Browse files
README.md
CHANGED
|
@@ -47,7 +47,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
|
|
| 47 |
|
| 48 |
2. Initialize vLLM server:
|
| 49 |
```
|
| 50 |
-
vllm serve RedHatAI/granite-4.0-h-small-FP8-
|
| 51 |
```
|
| 52 |
|
| 53 |
3. Send requests to the server:
|
|
@@ -64,7 +64,7 @@ client = OpenAI(
|
|
| 64 |
base_url=openai_api_base,
|
| 65 |
)
|
| 66 |
|
| 67 |
-
model = "RedHatAI/granite-4.0-h-small-FP8-
|
| 68 |
|
| 69 |
messages = [
|
| 70 |
{"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
|
|
@@ -109,11 +109,11 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
|
| 109 |
|
| 110 |
model = replace_modules_for_calibration(model)
|
| 111 |
|
| 112 |
-
ignore_lay = ["lm_head", "re:.*block_sparse_moe.router"
|
| 113 |
|
| 114 |
recipe = QuantizationModifier(
|
| 115 |
targets=["Linear"],
|
| 116 |
-
scheme="
|
| 117 |
ignore=ignore_lay,
|
| 118 |
)
|
| 119 |
|
|
@@ -128,7 +128,7 @@ output = model.generate(input_ids, max_new_tokens=35)
|
|
| 128 |
print(tokenizer.decode(output[0]))
|
| 129 |
print("==========================================")
|
| 130 |
|
| 131 |
-
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-
|
| 132 |
print(f"Saving to {SAVE_DIR}")
|
| 133 |
|
| 134 |
model.save_pretrained(SAVE_DIR)
|
|
@@ -158,7 +158,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
|
|
| 158 |
```
|
| 159 |
lm_eval \
|
| 160 |
--model vllm \
|
| 161 |
-
--model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-
|
| 162 |
--tasks openllm \
|
| 163 |
--write_out \
|
| 164 |
--batch_size auto \
|
|
@@ -170,7 +170,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
|
|
| 170 |
```
|
| 171 |
lm_eval \
|
| 172 |
--model vllm \
|
| 173 |
-
--model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-
|
| 174 |
--tasks leaderboard \
|
| 175 |
--apply_chat_template \
|
| 176 |
--fewshot_as_multiturn \
|
|
@@ -183,13 +183,13 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
|
|
| 183 |
**Coding Benchmarks**
|
| 184 |
|
| 185 |
```
|
| 186 |
-
evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-
|
| 187 |
--dataset "humaneval" \
|
| 188 |
--backend vllm \
|
| 189 |
--tp 1 \
|
| 190 |
--greedy
|
| 191 |
|
| 192 |
-
evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-
|
| 193 |
--dataset "mbpp" \
|
| 194 |
--backend vllm \
|
| 195 |
--tp 1 \
|
|
|
|
| 47 |
|
| 48 |
2. Initialize vLLM server:
|
| 49 |
```
|
| 50 |
+
vllm serve RedHatAI/granite-4.0-h-small-FP8-dynamic --tensor_parallel_size 1
|
| 51 |
```
|
| 52 |
|
| 53 |
3. Send requests to the server:
|
|
|
|
| 64 |
base_url=openai_api_base,
|
| 65 |
)
|
| 66 |
|
| 67 |
+
model = "RedHatAI/granite-4.0-h-small-FP8-dynamic"
|
| 68 |
|
| 69 |
messages = [
|
| 70 |
{"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
|
|
|
|
| 109 |
|
| 110 |
model = replace_modules_for_calibration(model)
|
| 111 |
|
| 112 |
+
ignore_lay = ["lm_head", "re:.*block_sparse_moe.router"]
|
| 113 |
|
| 114 |
recipe = QuantizationModifier(
|
| 115 |
targets=["Linear"],
|
| 116 |
+
scheme="FP8_DYNAMIC",
|
| 117 |
ignore=ignore_lay,
|
| 118 |
)
|
| 119 |
|
|
|
|
| 128 |
print(tokenizer.decode(output[0]))
|
| 129 |
print("==========================================")
|
| 130 |
|
| 131 |
+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-dynamic"
|
| 132 |
print(f"Saving to {SAVE_DIR}")
|
| 133 |
|
| 134 |
model.save_pretrained(SAVE_DIR)
|
|
|
|
| 158 |
```
|
| 159 |
lm_eval \
|
| 160 |
--model vllm \
|
| 161 |
+
--model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.9,enable_chunked_prefill=True,trust_remote_code=True \
|
| 162 |
--tasks openllm \
|
| 163 |
--write_out \
|
| 164 |
--batch_size auto \
|
|
|
|
| 170 |
```
|
| 171 |
lm_eval \
|
| 172 |
--model vllm \
|
| 173 |
+
--model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.7,disable_log_stats=True,enable_chunked_prefill=True,trust_remote_code=True \
|
| 174 |
--tasks leaderboard \
|
| 175 |
--apply_chat_template \
|
| 176 |
--fewshot_as_multiturn \
|
|
|
|
| 183 |
**Coding Benchmarks**
|
| 184 |
|
| 185 |
```
|
| 186 |
+
evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-dynamic" \
|
| 187 |
--dataset "humaneval" \
|
| 188 |
--backend vllm \
|
| 189 |
--tp 1 \
|
| 190 |
--greedy
|
| 191 |
|
| 192 |
+
evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-dynamic" \
|
| 193 |
--dataset "mbpp" \
|
| 194 |
--backend vllm \
|
| 195 |
--tp 1 \
|