krishnateja95 committed on
Commit
ce48396
·
verified ·
1 Parent(s): 0b2da09

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +9 -9
README.md CHANGED
@@ -47,7 +47,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
47
 
48
  2. Initialize vLLM server:
49
  ```
50
- vllm serve RedHatAI/granite-4.0-h-small-FP8-block --tensor_parallel_size 1
51
  ```
52
 
53
  3. Send requests to the server:
@@ -64,7 +64,7 @@ client = OpenAI(
64
  base_url=openai_api_base,
65
  )
66
 
67
- model = "RedHatAI/granite-4.0-h-small-FP8-block"
68
 
69
  messages = [
70
  {"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
@@ -109,11 +109,11 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
109
 
110
  model = replace_modules_for_calibration(model)
111
 
112
- ignore_lay = ["lm_head", "re:.*block_sparse_moe.router", "re:.*mamba.in_proj", "re:.*shared_mlp.input_linear"]
113
 
114
  recipe = QuantizationModifier(
115
  targets=["Linear"],
116
- scheme="FP8_BLOCK",
117
  ignore=ignore_lay,
118
  )
119
 
@@ -128,7 +128,7 @@ output = model.generate(input_ids, max_new_tokens=35)
128
  print(tokenizer.decode(output[0]))
129
  print("==========================================")
130
 
131
- SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-block"
132
  print(f"Saving to {SAVE_DIR}")
133
 
134
  model.save_pretrained(SAVE_DIR)
@@ -158,7 +158,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
158
  ```
159
  lm_eval \
160
  --model vllm \
161
- --model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-block",dtype=auto,add_bos_token=True,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.9,enable_chunked_prefill=True,trust_remote_code=True \
162
  --tasks openllm \
163
  --write_out \
164
  --batch_size auto \
@@ -170,7 +170,7 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
170
  ```
171
  lm_eval \
172
  --model vllm \
173
- --model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-block",dtype=auto,add_bos_token=False,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.7,disable_log_stats=True,enable_chunked_prefill=True,trust_remote_code=True \
174
  --tasks leaderboard \
175
  --apply_chat_template \
176
  --fewshot_as_multiturn \
@@ -183,13 +183,13 @@ uv pip install -U git+https://github.com/vllm-project/vllm.git@refs/pull/28398/h
183
  **Coding Benchmarks**
184
 
185
  ```
186
- evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-block" \
187
  --dataset "humaneval" \
188
  --backend vllm \
189
  --tp 1 \
190
  --greedy
191
 
192
- evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-block" \
193
  --dataset "mbpp" \
194
  --backend vllm \
195
  --tp 1 \
 
47
 
48
  2. Initialize vLLM server:
49
  ```
50
+ vllm serve RedHatAI/granite-4.0-h-small-FP8-dynamic --tensor_parallel_size 1
51
  ```
52
 
53
  3. Send requests to the server:
 
64
  base_url=openai_api_base,
65
  )
66
 
67
+ model = "RedHatAI/granite-4.0-h-small-FP8-dynamic"
68
 
69
  messages = [
70
  {"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
 
109
 
110
  model = replace_modules_for_calibration(model)
111
 
112
+ ignore_lay = ["lm_head", "re:.*block_sparse_moe.router"]
113
 
114
  recipe = QuantizationModifier(
115
  targets=["Linear"],
116
+ scheme="FP8_DYNAMIC",
117
  ignore=ignore_lay,
118
  )
119
 
 
128
  print(tokenizer.decode(output[0]))
129
  print("==========================================")
130
 
131
+ SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-dynamic"
132
  print(f"Saving to {SAVE_DIR}")
133
 
134
  model.save_pretrained(SAVE_DIR)
 
158
  ```
159
  lm_eval \
160
  --model vllm \
161
+ --model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.9,enable_chunked_prefill=True,trust_remote_code=True \
162
  --tasks openllm \
163
  --write_out \
164
  --batch_size auto \
 
170
  ```
171
  lm_eval \
172
  --model vllm \
173
+ --model_args pretrained="RedHatAI/granite-4.0-h-small-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=16384,tensor_parallel_size=1,gpu_memory_utilization=0.7,disable_log_stats=True,enable_chunked_prefill=True,trust_remote_code=True \
174
  --tasks leaderboard \
175
  --apply_chat_template \
176
  --fewshot_as_multiturn \
 
183
  **Coding Benchmarks**
184
 
185
  ```
186
+ evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-dynamic" \
187
  --dataset "humaneval" \
188
  --backend vllm \
189
  --tp 1 \
190
  --greedy
191
 
192
+ evalplus.evaluate --model "RedHatAI/granite-4.0-h-small-FP8-dynamic" \
193
  --dataset "mbpp" \
194
  --backend vllm \
195
  --tp 1 \