Joshua Odmark committed on
Commit
2e4500a
·
1 Parent(s): 717a3b2

NeuralMagic acquired by Red Hat, updating model references

Browse files
README.md CHANGED
@@ -25,8 +25,8 @@ tags:
25
  base_model:
26
  - NousResearch/Hermes-3-Llama-3.1-70B-FP8
27
  - nvidia/Llama-3.3-70B-Instruct-FP8
28
- - neuralmagic/Qwen2-72B-Instruct-FP8
29
- - neuralmagic/Mistral-Nemo-Instruct-2407-FP8
30
  ---
31
 
32
  # VLLM Tool Calling Guide
@@ -473,10 +473,10 @@ All models listed below have been verified to exist on Hugging Face and work wit
473
  **70B+ Models (High Performance):**
474
  - [NousResearch/Hermes-3-Llama-3.1-70B-FP8](https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B-FP8) — Best tool calling
475
  - [nvidia/Llama-3.3-70B-Instruct-FP8](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8) — Best Open WebUI support
476
- - [neuralmagic/Qwen2-72B-Instruct-FP8](https://huggingface.co/neuralmagic/Qwen2-72B-Instruct-FP8) — Best multilingual
477
 
478
  **12B Models (Fast Iteration):**
479
- - [neuralmagic/Mistral-Nemo-Instruct-2407-FP8](https://huggingface.co/neuralmagic/Mistral-Nemo-Instruct-2407-FP8) — 100-150 tok/s
480
 
481
  **Memory Requirements (single GPU):**
482
  - 70B FP8: ~40-50GB
@@ -501,7 +501,7 @@ If you find this guide useful, please star the repository and share it.
501
 
502
  - [NousResearch](https://huggingface.co/NousResearch) for Hermes-3 and pioneering open source tool calling
503
  - [vLLM Project](https://github.com/vllm-project/vllm) for the inference engine
504
- - [NVIDIA](https://huggingface.co/nvidia) and [NeuralMagic](https://huggingface.co/neuralmagic) for FP8 quantized models
505
 
506
  ## License
507
 
 
25
  base_model:
26
  - NousResearch/Hermes-3-Llama-3.1-70B-FP8
27
  - nvidia/Llama-3.3-70B-Instruct-FP8
28
+ - RedHatAI/Qwen2-72B-Instruct-FP8
29
+ - RedHatAI/Mistral-Nemo-Instruct-2407-FP8
30
  ---
31
 
32
  # VLLM Tool Calling Guide
 
473
  **70B+ Models (High Performance):**
474
  - [NousResearch/Hermes-3-Llama-3.1-70B-FP8](https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B-FP8) — Best tool calling
475
  - [nvidia/Llama-3.3-70B-Instruct-FP8](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8) — Best Open WebUI support
476
+ - [RedHatAI/Qwen2-72B-Instruct-FP8](https://huggingface.co/RedHatAI/Qwen2-72B-Instruct-FP8) — Best multilingual
477
 
478
  **12B Models (Fast Iteration):**
479
+ - [RedHatAI/Mistral-Nemo-Instruct-2407-FP8](https://huggingface.co/RedHatAI/Mistral-Nemo-Instruct-2407-FP8) — 100-150 tok/s
480
 
481
  **Memory Requirements (single GPU):**
482
  - 70B FP8: ~40-50GB
 
501
 
502
  - [NousResearch](https://huggingface.co/NousResearch) for Hermes-3 and pioneering open source tool calling
503
  - [vLLM Project](https://github.com/vllm-project/vllm) for the inference engine
504
+ - [NVIDIA](https://huggingface.co/nvidia) and [Red Hat AI / NeuralMagic](https://huggingface.co/RedHatAI) for FP8 quantized models
505
 
506
  ## License
507
 
configs/mistral_nemo_12b_fp8.sh CHANGED
@@ -3,7 +3,7 @@
3
  # VLLM Launch Config: Mistral-Nemo-Instruct-2407-FP8
4
  # ============================================================================
5
  #
6
- # Model: neuralmagic/Mistral-Nemo-Instruct-2407-FP8
7
  # Purpose: Fast tool calling for rapid iteration and testing
8
  # Parser: mistral (native Mistral tool call format)
9
  # Memory: ~15GB (leaves tons of VRAM for other tasks)
@@ -21,7 +21,7 @@ echo "=========================================="
21
  echo "Starting VLLM: Mistral-Nemo-12B-FP8"
22
  echo "=========================================="
23
  echo ""
24
- echo "Model: neuralmagic/Mistral-Nemo-Instruct-2407-FP8"
25
  echo "Context: 128K tokens"
26
  echo "Parser: mistral"
27
  echo "Quantization: FP8"
@@ -35,7 +35,7 @@ export VLLM_ATTENTION_BACKEND=FLASH_ATTN
35
  export VLLM_USE_FLASHINFER=0
36
 
37
  python -m vllm.entrypoints.openai.api_server \
38
- --model neuralmagic/Mistral-Nemo-Instruct-2407-FP8 \
39
  --host 0.0.0.0 \
40
  --port 8000 \
41
  --dtype auto \
 
3
  # VLLM Launch Config: Mistral-Nemo-Instruct-2407-FP8
4
  # ============================================================================
5
  #
6
+ # Model: RedHatAI/Mistral-Nemo-Instruct-2407-FP8
7
  # Purpose: Fast tool calling for rapid iteration and testing
8
  # Parser: mistral (native Mistral tool call format)
9
  # Memory: ~15GB (leaves tons of VRAM for other tasks)
 
21
  echo "Starting VLLM: Mistral-Nemo-12B-FP8"
22
  echo "=========================================="
23
  echo ""
24
+ echo "Model: RedHatAI/Mistral-Nemo-Instruct-2407-FP8"
25
  echo "Context: 128K tokens"
26
  echo "Parser: mistral"
27
  echo "Quantization: FP8"
 
35
  export VLLM_USE_FLASHINFER=0
36
 
37
  python -m vllm.entrypoints.openai.api_server \
38
+ --model RedHatAI/Mistral-Nemo-Instruct-2407-FP8 \
39
  --host 0.0.0.0 \
40
  --port 8000 \
41
  --dtype auto \
configs/qwen2_72b_fp8.sh CHANGED
@@ -3,7 +3,7 @@
3
  # VLLM Launch Config: Qwen2-72B-Instruct-FP8
4
  # ============================================================================
5
  #
6
- # Model: neuralmagic/Qwen2-72B-Instruct-FP8
7
  # Purpose: Strong multilingual tool calling with excellent reasoning
8
  # Parser: hermes (Qwen2 uses ChatML-compatible format)
9
  # Memory: ~45GB model + KV cache
@@ -15,7 +15,7 @@ echo "=========================================="
15
  echo "Starting VLLM: Qwen2-72B-Instruct-FP8"
16
  echo "=========================================="
17
  echo ""
18
- echo "Model: neuralmagic/Qwen2-72B-Instruct-FP8"
19
  echo "Context: 128K tokens"
20
  echo "Parser: hermes"
21
  echo "Quantization: FP8"
@@ -29,7 +29,7 @@ export VLLM_ATTENTION_BACKEND=FLASH_ATTN
29
  export VLLM_USE_FLASHINFER=0
30
 
31
  python -m vllm.entrypoints.openai.api_server \
32
- --model neuralmagic/Qwen2-72B-Instruct-FP8 \
33
  --host 0.0.0.0 \
34
  --port 8000 \
35
  --dtype auto \
 
3
  # VLLM Launch Config: Qwen2-72B-Instruct-FP8
4
  # ============================================================================
5
  #
6
+ # Model: RedHatAI/Qwen2-72B-Instruct-FP8
7
  # Purpose: Strong multilingual tool calling with excellent reasoning
8
  # Parser: hermes (Qwen2 uses ChatML-compatible format)
9
  # Memory: ~45GB model + KV cache
 
15
  echo "Starting VLLM: Qwen2-72B-Instruct-FP8"
16
  echo "=========================================="
17
  echo ""
18
+ echo "Model: RedHatAI/Qwen2-72B-Instruct-FP8"
19
  echo "Context: 128K tokens"
20
  echo "Parser: hermes"
21
  echo "Quantization: FP8"
 
29
  export VLLM_USE_FLASHINFER=0
30
 
31
  python -m vllm.entrypoints.openai.api_server \
32
+ --model RedHatAI/Qwen2-72B-Instruct-FP8 \
33
  --host 0.0.0.0 \
34
  --port 8000 \
35
  --dtype auto \
guides/MODEL_COMPARISON.md CHANGED
@@ -6,7 +6,7 @@ Detailed comparison of open source models tested for tool calling with VLLM on N
6
 
7
  | | Hermes-3 70B | Llama-3.3 70B | Qwen2 72B | Mistral-Nemo 12B |
8
  |---|---|---|---|---|
9
- | **Model ID** | `NousResearch/Hermes-3-Llama-3.1-70B-FP8` | `nvidia/Llama-3.3-70B-Instruct-FP8` | `neuralmagic/Qwen2-72B-Instruct-FP8` | `neuralmagic/Mistral-Nemo-Instruct-2407-FP8` |
10
  | **Size** | 70B | 70B | 72B | 12B |
11
  | **Quantization** | FP8 (compressed-tensors) | FP8 (native e4m3) | FP8 | FP8 |
12
  | **VLLM Parser** | `hermes` | `llama3_json` | `hermes` | `mistral` |
 
6
 
7
  | | Hermes-3 70B | Llama-3.3 70B | Qwen2 72B | Mistral-Nemo 12B |
8
  |---|---|---|---|---|
9
+ | **Model ID** | `NousResearch/Hermes-3-Llama-3.1-70B-FP8` | `nvidia/Llama-3.3-70B-Instruct-FP8` | `RedHatAI/Qwen2-72B-Instruct-FP8` | `RedHatAI/Mistral-Nemo-Instruct-2407-FP8` |
10
  | **Size** | 70B | 70B | 72B | 12B |
11
  | **Quantization** | FP8 (compressed-tensors) | FP8 (native e4m3) | FP8 | FP8 |
12
  | **VLLM Parser** | `hermes` | `llama3_json` | `hermes` | `mistral` |