nm-testing
/

granite-4.0-h-small-FP8-dynamic

@@ -47,7 +47,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
 2. Initialize vLLM server:
 ```
-vllm serve RedHatAI/granite-4.0-h-small-FP8-block --tensor_parallel_size 1
 ```
 3. Send requests to the server:
@@ -64,7 +64,7 @@ client = OpenAI(
     base_url=openai_api_base,
 )
-model = "RedHatAI/granite-4.0-h-small-FP8-block"
 messages = [
     {"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
@@ -109,11 +109,11 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = replace_modules_for_calibration(model)
-ignore_lay = ["lm_head", "re:.*block_sparse_moe.router", "re:.*mamba.in_proj", "re:.*shared_mlp.input_linear"]
 recipe = QuantizationModifier(
     targets=["Linear"],
-    scheme="FP8_BLOCK",
     ignore=ignore_lay,
 )
@@ -128,7 +128,7 @@ output = model.generate(input_ids, max_new_tokens=35)
 print(tokenizer.decode(output[0]))
 print("==========================================")
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-block"
 print(f"Saving to {SAVE_DIR}")
 model.save_pretrained(SAVE_DIR)
@@ -158,7 +158,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
   ```
   lm_eval \
     --model vllm \
-    --model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-block",dtype=auto,add_bos_token=True,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.9,enable_chunked_prefill=True,trust_remote_code=True \
     --tasks openllm \
     --write_out \
     --batch_size auto \
@@ -170,7 +170,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
   ```
   lm_eval \
     --model vllm \
-    --model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-block",dtype=auto,add_bos_token=False,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.7,disable_log_stats=True,enable_chunked_prefill=True,trust_remote_code=True \
     --tasks leaderboard \
     --apply_chat_template \
     --fewshot_as_multiturn \
@@ -183,13 +183,13 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
   **Coding Benchmarks**
   ```
-  evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-block" \
                     --dataset "humaneval" \
                     --backend vllm \
                     --tp 1 \
                     --greedy
-  evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-block" \
                     --dataset "mbpp" \
                     --backend vllm \
                     --tp 1 \

 2. Initialize vLLM server:
 ```
+vllm serve RedHatAI/granite-4.0-h-small-FP8-dynamic --tensor_parallel_size 1
 ```
 3. Send requests to the server:
     base_url=openai_api_base,
 )
+model = "RedHatAI/granite-4.0-h-small-FP8-dynamic"
 messages = [
     {"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
 model = replace_modules_for_calibration(model)
+ignore_lay = ["lm_head", "re:.*block_sparse_moe.router"]
 recipe = QuantizationModifier(
     targets=["Linear"],
+    scheme="FP8_DYNAMIC",
     ignore=ignore_lay,
 )
 print(tokenizer.decode(output[0]))
 print("==========================================")
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-dynamic"
 print(f"Saving to {SAVE_DIR}")
 model.save_pretrained(SAVE_DIR)
   ```
   lm_eval \
     --model vllm \
+    --model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.9,enable_chunked_prefill=True,trust_remote_code=True \
     --tasks openllm \
     --write_out \
     --batch_size auto \
   ```
   lm_eval \
     --model vllm \
+    --model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.7,disable_log_stats=True,enable_chunked_prefill=True,trust_remote_code=True \
     --tasks leaderboard \
     --apply_chat_template \
     --fewshot_as_multiturn \
   **Coding Benchmarks**
   ```
+  evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-dynamic" \
                     --dataset "humaneval" \
                     --backend vllm \
                     --tp 1 \
                     --greedy
+  evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-dynamic" \
                     --dataset "mbpp" \
                     --backend vllm \
                     --tp 1 \