Update README.md
Browse files
README.md
CHANGED
|
@@ -660,7 +660,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 660 |
<details>
|
| 661 |
<summary>Benchmarking Command</summary>
|
| 662 |
```
|
| 663 |
-
guidellm --model
|
| 664 |
```
|
| 665 |
|
| 666 |
</details>
|
|
@@ -695,7 +695,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 695 |
<tr>
|
| 696 |
<th rowspan="3" valign="top">A100</th>
|
| 697 |
<td>4</td>
|
| 698 |
-
<td>
|
| 699 |
<td></td>
|
| 700 |
<td>7.5</td>
|
| 701 |
<td>67</td>
|
|
@@ -706,7 +706,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 706 |
</tr>
|
| 707 |
<tr>
|
| 708 |
<td>2</td>
|
| 709 |
-
<td>
|
| 710 |
<td>1.86</td>
|
| 711 |
<td>8.1</td>
|
| 712 |
<td>124</td>
|
|
@@ -717,7 +717,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 717 |
</tr>
|
| 718 |
<tr>
|
| 719 |
<td>2</td>
|
| 720 |
-
<td>
|
| 721 |
<td>2.52</td>
|
| 722 |
<td>6.9</td>
|
| 723 |
<td>147</td>
|
|
@@ -729,7 +729,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 729 |
<tr>
|
| 730 |
<th rowspan="3" valign="top">H100</th>
|
| 731 |
<td>4</td>
|
| 732 |
-
<td>
|
| 733 |
<td></td>
|
| 734 |
<td>4.4</td>
|
| 735 |
<td>67</td>
|
|
@@ -740,7 +740,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 740 |
</tr>
|
| 741 |
<tr>
|
| 742 |
<td>2</td>
|
| 743 |
-
<td>
|
| 744 |
<td>1.82</td>
|
| 745 |
<td>4.7</td>
|
| 746 |
<td>120</td>
|
|
@@ -751,7 +751,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 751 |
</tr>
|
| 752 |
<tr>
|
| 753 |
<td>2</td>
|
| 754 |
-
<td>
|
| 755 |
<td>1.87</td>
|
| 756 |
<td>4.7</td>
|
| 757 |
<td>120</td>
|
|
@@ -794,7 +794,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 794 |
<tbody style="text-align: center">
|
| 795 |
<tr>
|
| 796 |
<th rowspan="3" valign="top">A100x4</th>
|
| 797 |
-
<td>
|
| 798 |
<td></td>
|
| 799 |
<td>0.4</td>
|
| 800 |
<td>222</td>
|
|
@@ -804,28 +804,28 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 804 |
<td>399</td>
|
| 805 |
</tr>
|
| 806 |
<tr>
|
| 807 |
-
<td>
|
| 808 |
<td>1.70</td>
|
| 809 |
<td>1.6</td>
|
| 810 |
-
<td>
|
| 811 |
<td>2.2</td>
|
| 812 |
-
<td>
|
| 813 |
<td>2.6</td>
|
| 814 |
-
<td>
|
| 815 |
</tr>
|
| 816 |
<tr>
|
| 817 |
-
<td>
|
| 818 |
<td>1.48</td>
|
| 819 |
<td>1.0</td>
|
| 820 |
-
<td>
|
| 821 |
<td>2.0</td>
|
| 822 |
-
<td>
|
| 823 |
<td>2.8</td>
|
| 824 |
-
<td>
|
| 825 |
</tr>
|
| 826 |
<tr>
|
| 827 |
<th rowspan="3" valign="top">H100x4</th>
|
| 828 |
-
<td>
|
| 829 |
<td></td>
|
| 830 |
<td>1.0</td>
|
| 831 |
<td>284</td>
|
|
@@ -835,24 +835,24 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 835 |
<td>511</td>
|
| 836 |
</tr>
|
| 837 |
<tr>
|
| 838 |
-
<td>
|
| 839 |
<td>1.61</td>
|
| 840 |
<td>3.4</td>
|
| 841 |
-
<td>
|
| 842 |
<td>5.2</td>
|
| 843 |
-
<td>
|
| 844 |
<td>6.4</td>
|
| 845 |
-
<td>
|
| 846 |
</tr>
|
| 847 |
<tr>
|
| 848 |
-
<td>
|
| 849 |
<td>1.33</td>
|
| 850 |
<td>2.8</td>
|
| 851 |
-
<td>
|
| 852 |
<td>4.4</td>
|
| 853 |
-
<td>
|
| 854 |
<td>5.4</td>
|
| 855 |
-
<td>
|
| 856 |
</tr>
|
| 857 |
</tbody>
|
| 858 |
</table>
|
|
@@ -861,7 +861,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
| 861 |
|
| 862 |
**QPS: Queries per second.
|
| 863 |
|
| 864 |
-
**QPD: Queries per dollar, based on on-demand cost at [Lambda Labs](https://lambdalabs.com/service/gpu-cloud) (observed on 2/18/2025).
|
| 865 |
|
| 866 |
## The Mistral AI Team
|
| 867 |
|
|
|
|
| 660 |
<details>
|
| 661 |
<summary>Benchmarking Command</summary>
|
| 662 |
```
|
| 663 |
+
guidellm --model neuralmagic/Pixtral-Large-Instruct-2411-hf-FP8-dynamic --target "http://localhost:8000/v1" --data-type emulated --data prompt_tokens=<prompt_tokens>,generated_tokens=<generated_tokens>,images=<num_images>,width=<image_width>,height=<image_height> --max seconds 120 --backend aiohttp_server
|
| 664 |
```
|
| 665 |
|
| 666 |
</details>
|
|
|
|
| 695 |
<tr>
|
| 696 |
<th rowspan="3" valign="top">A100</th>
|
| 697 |
<td>4</td>
|
| 698 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf</td>
|
| 699 |
<td></td>
|
| 700 |
<td>7.5</td>
|
| 701 |
<td>67</td>
|
|
|
|
| 706 |
</tr>
|
| 707 |
<tr>
|
| 708 |
<td>2</td>
|
| 709 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w8a8</td>
|
| 710 |
<td>1.86</td>
|
| 711 |
<td>8.1</td>
|
| 712 |
<td>124</td>
|
|
|
|
| 717 |
</tr>
|
| 718 |
<tr>
|
| 719 |
<td>2</td>
|
| 720 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w4a16</td>
|
| 721 |
<td>2.52</td>
|
| 722 |
<td>6.9</td>
|
| 723 |
<td>147</td>
|
|
|
|
| 729 |
<tr>
|
| 730 |
<th rowspan="3" valign="top">H100</th>
|
| 731 |
<td>4</td>
|
| 732 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf</td>
|
| 733 |
<td></td>
|
| 734 |
<td>4.4</td>
|
| 735 |
<td>67</td>
|
|
|
|
| 740 |
</tr>
|
| 741 |
<tr>
|
| 742 |
<td>2</td>
|
| 743 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-FP8-Dynamic</td>
|
| 744 |
<td>1.82</td>
|
| 745 |
<td>4.7</td>
|
| 746 |
<td>120</td>
|
|
|
|
| 751 |
</tr>
|
| 752 |
<tr>
|
| 753 |
<td>2</td>
|
| 754 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w4a16</td>
|
| 755 |
<td>1.87</td>
|
| 756 |
<td>4.7</td>
|
| 757 |
<td>120</td>
|
|
|
|
| 794 |
<tbody style="text-align: center">
|
| 795 |
<tr>
|
| 796 |
<th rowspan="3" valign="top">A100x4</th>
|
| 797 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf</td>
|
| 798 |
<td></td>
|
| 799 |
<td>0.4</td>
|
| 800 |
<td>222</td>
|
|
|
|
| 804 |
<td>399</td>
|
| 805 |
</tr>
|
| 806 |
<tr>
|
| 807 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w8a8</td>
|
| 808 |
<td>1.70</td>
|
| 809 |
<td>1.6</td>
|
| 810 |
+
<td>766</td>
|
| 811 |
<td>2.2</td>
|
| 812 |
+
<td>1142</td>
|
| 813 |
<td>2.6</td>
|
| 814 |
+
<td>1348</td>
|
| 815 |
</tr>
|
| 816 |
<tr>
|
| 817 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w4a16</td>
|
| 818 |
<td>1.48</td>
|
| 819 |
<td>1.0</td>
|
| 820 |
+
<td>552</td>
|
| 821 |
<td>2.0</td>
|
| 822 |
+
<td>1010</td>
|
| 823 |
<td>2.8</td>
|
| 824 |
+
<td>1360</td>
|
| 825 |
</tr>
|
| 826 |
<tr>
|
| 827 |
<th rowspan="3" valign="top">H100x4</th>
|
| 828 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf</td>
|
| 829 |
<td></td>
|
| 830 |
<td>1.0</td>
|
| 831 |
<td>284</td>
|
|
|
|
| 835 |
<td>511</td>
|
| 836 |
</tr>
|
| 837 |
<tr>
|
| 838 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-FP8-Dynamic</td>
|
| 839 |
<td>1.61</td>
|
| 840 |
<td>3.4</td>
|
| 841 |
+
<td>905</td>
|
| 842 |
<td>5.2</td>
|
| 843 |
+
<td>1406</td>
|
| 844 |
<td>6.4</td>
|
| 845 |
+
<td>1759</td>
|
| 846 |
</tr>
|
| 847 |
<tr>
|
| 848 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w4a16</td>
|
| 849 |
<td>1.33</td>
|
| 850 |
<td>2.8</td>
|
| 851 |
+
<td>761</td>
|
| 852 |
<td>4.4</td>
|
| 853 |
+
<td>1228</td>
|
| 854 |
<td>5.4</td>
|
| 855 |
+
<td>1480</td>
|
| 856 |
</tr>
|
| 857 |
</tbody>
|
| 858 |
</table>
|
|
|
|
| 861 |
|
| 862 |
**QPS: Queries per second.
|
| 863 |
|
| 864 |
+
**QPD: Queries per dollar, based on on-demand cost at [Lambda Labs](https://lambdalabs.com/service/gpu-cloud) (observed on 2/18/2025).
|
| 865 |
|
| 866 |
## The Mistral AI Team
|
| 867 |
|