Yet another excellent quant suite from AesSedai
Thanks a lot for the work you put into such good quants. This version of "Step-3.5-Flash" --> Base-Midtrain-GGUF seems even more competent than the initial version, more coherent and focused.
I've started to get addicted to your Q5_K_M quant: it's such a bliss on my setup (8x RTX3090 + AMD Epyc 9443p + 256 GB DDR-3200 ECC). I love that Q5_K_M works nicely with llama.cpp mainline, but it's a divine treat with ik_llama. Below I will leave a couple of benchmarks (ik_llama suite) for reference, in case some other lunatics are looking for relevant figures:
aessedai/Step-3.5-Flash-Base-Midtrain-Q5_K_M
main: n_kv_max = 65536, n_batch = 2048, n_ubatch = 1024, flash_attn = 1, n_gpu_layers = 999, n_threads = 24, n_threads_batch = 24, sm = graph
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|---|---|---|---|---|---|---|
| 1024 | 256 | 0 | 0.816 | 1254.55 | 3.352 | 76.37 |
| 1024 | 256 | 1024 | 0.650 | 1575.42 | 3.424 | 74.77 |
| 1024 | 256 | 2048 | 0.615 | 1665.26 | 3.507 | 72.99 |
| 1024 | 256 | 3072 | 0.618 | 1657.53 | 3.592 | 71.26 |
| 1024 | 256 | 4096 | 0.620 | 1651.30 | 3.524 | 72.65 |
| 1024 | 256 | 5120 | 0.625 | 1639.60 | 3.586 | 71.39 |
| 1024 | 256 | 6144 | 0.636 | 1610.26 | 3.583 | 71.45 |
| 1024 | 256 | 7168 | 0.633 | 1616.57 | 3.569 | 71.73 |
| 1024 | 256 | 8192 | 0.636 | 1610.33 | 3.678 | 69.60 |
| 1024 | 256 | 9216 | 0.643 | 1591.79 | 3.680 | 69.57 |
| 1024 | 256 | 10240 | 0.682 | 1501.14 | 3.673 | 69.69 |
| 1024 | 256 | 11264 | 0.655 | 1564.46 | 3.681 | 69.54 |
| 1024 | 256 | 12288 | 0.653 | 1567.21 | 3.679 | 69.59 |
main: n_kv_max = 16384, n_batch = 2048, n_ubatch = 1024, flash_attn = 1, n_gpu_layers = 999, n_threads = 24, n_threads_batch = 24, sm = graph, -smgs
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|---|---|---|---|---|---|---|
| 1024 | 256 | 0 | 0.847 | 1208.38 | 3.421 | 74.83 |
| 1024 | 256 | 1024 | 0.608 | 1685.24 | 3.403 | 75.23 |
| 1024 | 256 | 2048 | 0.613 | 1670.91 | 3.464 | 73.90 |
| 1024 | 256 | 3072 | 0.619 | 1655.17 | 3.445 | 74.32 |
| 1024 | 256 | 4096 | 0.623 | 1644.96 | 3.546 | 72.19 |
| 1024 | 256 | 5120 | 0.626 | 1635.93 | 3.619 | 70.74 |
| 1024 | 256 | 6144 | 0.629 | 1627.50 | 3.535 | 72.41 |
| 1024 | 256 | 7168 | 0.634 | 1616.08 | 3.571 | 71.68 |
| 1024 | 256 | 8192 | 0.640 | 1600.27 | 3.620 | 70.71 |
| 1024 | 256 | 9216 | 0.642 | 1593.94 | 3.572 | 71.67 |
| 1024 | 256 | 10240 | 0.650 | 1575.76 | 3.648 | 70.18 |
| 1024 | 256 | 11264 | 0.651 | 1573.98 | 3.696 | 69.26 |
| 1024 | 256 | 12288 | 0.741 | 1381.43 | 3.647 | 70.19 |
main: n_kv_max = 16384, n_batch = 2048, n_ubatch = 1024, flash_attn = 1, n_gpu_layers = 999, n_threads = 24, n_threads_batch = 24, sm = graph, -smgs -muge
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|---|---|---|---|---|---|---|
| 1024 | 256 | 0 | 0.851 | 1203.81 | 3.333 | 76.80 |
| 1024 | 256 | 1024 | 0.655 | 1563.67 | 3.435 | 74.52 |
| 1024 | 256 | 2048 | 0.612 | 1671.88 | 3.528 | 72.55 |
| 1024 | 256 | 3072 | 0.619 | 1653.30 | 3.547 | 72.18 |
| 1024 | 256 | 4096 | 0.620 | 1651.91 | 3.563 | 71.86 |
| 1024 | 256 | 5120 | 0.626 | 1636.43 | 3.628 | 70.57 |
| 1024 | 256 | 6144 | 0.629 | 1629.16 | 3.631 | 70.50 |
| 1024 | 256 | 7168 | 0.633 | 1618.34 | 3.546 | 72.20 |
| 1024 | 256 | 8192 | 0.637 | 1607.67 | 3.611 | 70.89 |
| 1024 | 256 | 9216 | 0.640 | 1601.13 | 3.647 | 70.19 |
| 1024 | 256 | 10240 | 0.648 | 1579.93 | 3.616 | 70.80 |
| 1024 | 256 | 11264 | 0.649 | 1578.06 | 3.725 | 68.72 |
| 1024 | 256 | 12288 | 0.653 | 1567.47 | 3.783 | 67.67 |
main: n_kv_max = 16384, n_batch = 2048, n_ubatch = 1024, flash_attn = 1, n_gpu_layers = 999, n_threads = 24, n_threads_batch = 24, sm = graph, -muge
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|---|---|---|---|---|---|---|
| 1024 | 256 | 0 | 0.848 | 1207.61 | 3.299 | 77.59 |
| 1024 | 256 | 1024 | 0.607 | 1686.05 | 3.512 | 72.90 |
| 1024 | 256 | 2048 | 0.658 | 1555.56 | 3.505 | 73.03 |
| 1024 | 256 | 3072 | 0.619 | 1653.75 | 3.480 | 73.56 |
| 1024 | 256 | 4096 | 0.623 | 1644.57 | 3.496 | 73.22 |
| 1024 | 256 | 5120 | 0.628 | 1631.21 | 3.662 | 69.91 |
| 1024 | 256 | 6144 | 0.631 | 1623.54 | 3.622 | 70.68 |
| 1024 | 256 | 7168 | 0.634 | 1615.02 | 3.549 | 72.13 |
| 1024 | 256 | 8192 | 0.638 | 1604.29 | 3.643 | 70.28 |
| 1024 | 256 | 9216 | 0.642 | 1595.85 | 3.687 | 69.42 |
| 1024 | 256 | 10240 | 0.692 | 1479.69 | 3.621 | 70.69 |
| 1024 | 256 | 11264 | 0.648 | 1579.19 | 3.691 | 69.36 |
| 1024 | 256 | 12288 | 0.712 | 1438.54 | 3.713 | 68.95 |
main: n_kv_max = 16384, n_batch = 2048, n_ubatch = 1024, flash_attn = 1, n_gpu_layers = 999, n_threads = 24, n_threads_batch = 24, sm = graph, -smgs -khad
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|---|---|---|---|---|---|---|
| 1024 | 256 | 0 | 0.783 | 1307.95 | 3.411 | 75.06 |
| 1024 | 256 | 1024 | 0.609 | 1680.83 | 3.514 | 72.85 |
| 1024 | 256 | 2048 | 0.616 | 1661.08 | 3.520 | 72.74 |
| 1024 | 256 | 3072 | 0.619 | 1653.15 | 3.602 | 71.07 |
| 1024 | 256 | 4096 | 0.623 | 1643.74 | 3.618 | 70.77 |
| 1024 | 256 | 5120 | 0.628 | 1631.07 | 3.569 | 71.73 |
| 1024 | 256 | 6144 | 0.630 | 1626.17 | 3.645 | 70.23 |
| 1024 | 256 | 7168 | 0.634 | 1615.83 | 3.622 | 70.67 |
| 1024 | 256 | 8192 | 0.639 | 1603.20 | 3.616 | 70.79 |
| 1024 | 256 | 9216 | 0.642 | 1595.86 | 3.700 | 69.19 |
| 1024 | 256 | 10240 | 0.648 | 1580.02 | 3.768 | 67.95 |
| 1024 | 256 | 11264 | 0.649 | 1577.76 | 3.765 | 67.99 |
| 1024 | 256 | 12288 | 0.657 | 1559.53 | 3.804 | 67.31 |
main: n_kv_max = 16384, n_batch = 2048, n_ubatch = 1024, flash_attn = 1, n_gpu_layers = 999, n_threads = 24, n_threads_batch = 24, sm = graph, -khad
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|---|---|---|---|---|---|---|
| 1024 | 256 | 0 | 0.840 | 1218.99 | 3.342 | 76.59 |
| 1024 | 256 | 1024 | 0.657 | 1557.66 | 3.575 | 71.61 |
| 1024 | 256 | 2048 | 0.618 | 1656.98 | 3.608 | 70.95 |
| 1024 | 256 | 3072 | 0.619 | 1653.62 | 3.597 | 71.18 |
| 1024 | 256 | 4096 | 0.627 | 1633.86 | 3.628 | 70.56 |
| 1024 | 256 | 5120 | 0.628 | 1630.51 | 3.555 | 72.01 |
| 1024 | 256 | 6144 | 0.631 | 1622.14 | 3.658 | 69.99 |
| 1024 | 256 | 7168 | 0.634 | 1614.13 | 3.663 | 69.88 |
| 1024 | 256 | 8192 | 0.639 | 1602.55 | 3.646 | 70.21 |
| 1024 | 256 | 9216 | 0.642 | 1594.93 | 3.704 | 69.11 |
| 1024 | 256 | 10240 | 0.651 | 1573.72 | 4.037 | 63.42 |
| 1024 | 256 | 11264 | 0.650 | 1574.63 | 3.788 | 67.57 |
| 1024 | 256 | 12288 | 0.656 | 1561.63 | 3.781 | 67.70 |
main: n_kv_max = 16384, n_batch = 2048, n_ubatch = 1024, flash_attn = 1, n_gpu_layers = 999, n_threads = 24, n_threads_batch = 24, sm = graph, --mqkv
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|---|---|---|---|---|---|---|
| 1024 | 256 | 0 | 0.779 | 1314.41 | 3.332 | 76.84 |
| 1024 | 256 | 1024 | 0.610 | 1677.49 | 3.482 | 73.52 |
| 1024 | 256 | 2048 | 0.614 | 1667.13 | 3.509 | 72.94 |
| 1024 | 256 | 3072 | 0.620 | 1652.32 | 3.473 | 73.71 |
| 1024 | 256 | 4096 | 0.621 | 1648.67 | 3.593 | 71.24 |
| 1024 | 256 | 5120 | 0.645 | 1588.76 | 3.561 | 71.89 |
| 1024 | 256 | 6144 | 0.629 | 1628.31 | 3.571 | 71.69 |
| 1024 | 256 | 7168 | 0.631 | 1622.07 | 3.662 | 69.91 |
| 1024 | 256 | 8192 | 0.637 | 1608.50 | 3.639 | 70.34 |
| 1024 | 256 | 9216 | 0.640 | 1599.41 | 3.605 | 71.01 |
| 1024 | 256 | 10240 | 0.645 | 1586.43 | 3.694 | 69.30 |
| 1024 | 256 | 11264 | 0.649 | 1577.20 | 3.707 | 69.06 |
| 1024 | 256 | 12288 | 0.654 | 1566.84 | 3.757 | 68.15 |