tachyphylaxis and ubergarm committed
Commit de653b9 · verified · 0 parent(s)

Duplicate from ubergarm/DeepSeek-R1-0528-GGUF

Co-authored-by: John Leimgruber III <ubergarm@users.noreply.huggingface.co>

Files changed (46)
  1. .gitattributes +39 -0
  2. IQ1_S/DeepSeek-R1-0528-IQ1_S-00001-of-00003.gguf +3 -0
  3. IQ1_S/DeepSeek-R1-0528-IQ1_S-00002-of-00003.gguf +3 -0
  4. IQ1_S/DeepSeek-R1-0528-IQ1_S-00003-of-00003.gguf +3 -0
  5. IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf +3 -0
  6. IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00002-of-00003.gguf +3 -0
  7. IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00003-of-00003.gguf +3 -0
  8. IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00001-of-00005.gguf +3 -0
  9. IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00002-of-00005.gguf +3 -0
  10. IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00003-of-00005.gguf +3 -0
  11. IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00004-of-00005.gguf +3 -0
  12. IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00005-of-00005.gguf +3 -0
  13. IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00001-of-00007.gguf +3 -0
  14. IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00002-of-00007.gguf +3 -0
  15. IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00003-of-00007.gguf +3 -0
  16. IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00004-of-00007.gguf +3 -0
  17. IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00005-of-00007.gguf +3 -0
  18. IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00006-of-00007.gguf +3 -0
  19. IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00007-of-00007.gguf +3 -0
  20. IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00001-of-00006.gguf +3 -0
  21. IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00002-of-00006.gguf +3 -0
  22. IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00003-of-00006.gguf +3 -0
  23. IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00004-of-00006.gguf +3 -0
  24. IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00005-of-00006.gguf +3 -0
  25. IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00006-of-00006.gguf +3 -0
  26. IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00001-of-00007.gguf +3 -0
  27. IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00002-of-00007.gguf +3 -0
  28. IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00003-of-00007.gguf +3 -0
  29. IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00004-of-00007.gguf +3 -0
  30. IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00005-of-00007.gguf +3 -0
  31. IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00006-of-00007.gguf +3 -0
  32. IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00007-of-00007.gguf +3 -0
  33. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00001-of-00009.gguf +3 -0
  34. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00002-of-00009.gguf +3 -0
  35. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00003-of-00009.gguf +3 -0
  36. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00004-of-00009.gguf +3 -0
  37. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00005-of-00009.gguf +3 -0
  38. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00006-of-00009.gguf +3 -0
  39. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00007-of-00009.gguf +3 -0
  40. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00008-of-00009.gguf +3 -0
  41. IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00009-of-00009.gguf +3 -0
  42. README.md +697 -0
  43. images/buff-mokey-meme.png +3 -0
  44. images/kld-r1-0528-smol-bois.png +3 -0
  45. images/perplexity.png +3 -0
  46. imatrix-DeepSeek-R1-0528.dat +3 -0
.gitattributes ADDED
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
DeepSeek-V3-0324.imatrix filter=lfs diff=lfs merge=lfs -text
*.gguf filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
imatrix-DeepSeek-R1-0528.dat filter=lfs diff=lfs merge=lfs -text
IQ1_S/DeepSeek-R1-0528-IQ1_S-00001-of-00003.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:6951b6843b65d20157cb4cc473a356405f6e6a7b026ed44d6bcf530d16ea6bd4
size 47764001792
IQ1_S/DeepSeek-R1-0528-IQ1_S-00002-of-00003.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:2e4e20050439e219306ef7821901ff29c0e6417b08aebc415aaaf4a4969702b1
size 47723160160
IQ1_S/DeepSeek-R1-0528-IQ1_S-00003-of-00003.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c2e374bf278fbf99615ed76a3d098a10e8be7030ea20a588cb3003ffd07c865a
size 47393949376
IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:12ab4fdf6b3c33c49088ad2f39e25341d63bb2b06595a526968313c9e7731af7
size 46757893120
IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00002-of-00003.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:dcbe7cd7dd74e21b326f9ed393ae9b77b01ee963d79f049e469fd2f519af20fc
size 46692409952
IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00003-of-00003.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:bea8e8dad9d175ac173fc19f7851642b7bfc2084544c9e99ab01bf89d8d82143
size 46359529152
IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00001-of-00005.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:278c5945dbf624a237f48b2bb973931a684cd4056e7df41dfaf0a1419e495794
size 47725766208
IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00002-of-00005.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:3adc61617b63662c6fbb8a92a3b9d1b8d0f529914c38d7e3c2861bb3e354181a
size 46544229696
IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00003-of-00005.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:7dfd1aa57580cbd8730e1ce76e6e800b0249159e0b6138f525e1c2d1e03ff359
size 47999330656
IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00004-of-00005.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:8cc2d7b21d36d94d67488905bfd5391d19fe068874bc0bdf341188dece7e1a44
size 47035981952
IQ2_K_R4/DeepSeek-R1-0528-IQ2_K_R4-00005-of-00005.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:3682bb66bfde2f2b8af3ccf3347bb4babb51074f638c0c2f5e403ebca4a32611
size 45870176416
IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00001-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:5c499e74d5f595257a58a3b1c7f81a0531ee289e55e535d2cbfa4346ff7c5424
size 43236408512
IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00002-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:68384185ce2dbf61aeecc1057f091462e982ced729db52b4de7a088bc8ea2641
size 43345710784
IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00003-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:50372e7714872c1887dceda5d3a391527fc478ef26703886441ba249e1415fbf
size 42840297152
IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00004-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c9d1183252025e56c258238fbaab79f2c8c865098d3f287e7d2174fc35653381
size 43007373216
IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00005-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:a823094f186f2885c367b91161a6ebe16d01a216e807fab946c4cfa4061e6614
size 43345710784
IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00006-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:a9fc82a29fd87d7e37cca862a790b1b69b70c1558b9d521e603bf11a8119f46d
size 42840297152
IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00007-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:823847fdbd85b40a20bafd6de72f49b394f21fb7b91d8f672aea42c8fc4fe065
size 43607760736
IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00001-of-00006.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:336a29561f1df314437ea16acf90ee2b43d1cb5d214b23c397b58c19f04d6b91
size 49207254720
IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00002-of-00006.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:2b9ca938bcabb7e8b2ac7c1cd28cda499aed88e4a4afd7e62ccf5086516563f6
size 49973416448
IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00003-of-00006.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:9275d8e1cafbaef21fb73cb1aad0779959a6aa0cfc1b80a6dec7788b186945b4
size 49973416448
IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00004-of-00006.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:b6d10759ec189cc127a7f97c56b7ec1a791b7814f518215d32084c873d2831c3
size 49973416448
IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00005-of-00006.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c6a5925ca468c9ce01eaf31c6a97fb0156f5e6d82f1ed0dd13f22c5495fdf003
size 49973416448
IQ3_KT/DeepSeek-R1-0528-IQ3_KT-00006-of-00006.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:58f110b16d714d09fd2968c62e36032bbb7300010944cc4e3d70d7c9e98f43fb
size 43527648192
IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00001-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:1d033ac1cbab3a1faa55f84170d8366d8a4f7a1139157d540a7ad76db8514087
size 46995982048
IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00002-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:e0f063ed7445eb34a4ff58e33abb736291a9911bcc0f85b98f6e6695cf155ac8
size 46250250432
IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00003-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:1ee522cb2f4ab9b0b7b42ac207e2ba632c7602ebb2e6bb38441c78be6f0b622e
size 45648955072
IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00004-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:42e2d3cfb8b884dd68d4f3b0bd069fa766c33ffd081f095a98f744866fe8f49a
size 45919780512
IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00005-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:5107c84d9bad42baf0dcdeaeee08d133cae028d9538813dd50d03d7e00209e9b
size 46037976768
IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00006-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d8ccbdb523f81e1c478813e731c2f7e3e548789e9711ac55499550fd9eccdba7
size 45648955072
IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00007-of-00007.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:47cdad7da61f1d9117359ba6c94b2db207cb0be2abe4097a931f0a484176d0e2
size 46633609056
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00001-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f87f395bc0a64414bd3e57d15abab71f4fa9209bea6d4eff123f45cea262db5b
size 43894404800
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00002-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:5fd3419fdc12491ebbc9a1ea2ab4ebd8ac9647c40f4448e9a75b10c9433dd76f
size 44650937888
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00003-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c8be5a0156c1eff2468bc68f102cdee8bea3210fd8314f6b5f582a1618dd91c7
size 42718940448
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00004-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:0e58c6d71b66414c99cab08f189288e124a35e0078643e4baf88016b27b1b4e2
size 44921705888
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00005-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:3aea383e57d57ac3f49e87e2fa3c4aff6c45e8ddfb36f4db13f9af96e9c09376
size 42718940448
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00006-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:e39141a052d295cb691d2788e47259d50008b3bc7f9f944f1820d42ec1bc191d
size 44921705888
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00007-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d3f7e238d3665d38bc70d34bb63f55c9ce24031e94551a89476b311a85fb72e2
size 42718940448
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00008-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:a17e92a86862e69f6df3a7564d480b9a70b59855b8e4d98638b3fa44bdb43dea
size 44921705888
IQ4_KS_R4/DeepSeek-R1-0528-IQ4_KS_R4-00009-of-00009.gguf ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:b23b4426bb4c73e6c1a3aa4c49f6102109091f207d2f9af13f394b1668cd1026
size 43432768992
README.md ADDED
---
quantized_by: ubergarm
pipeline_tag: text-generation
base_model: deepseek-ai/DeepSeek-R1-0528
license: mit
base_model_relation: quantized
tags:
- mla
- imatrix
- conversational
- ik_llama.cpp
---

## `ik_llama.cpp` imatrix MLA Quantizations of DeepSeek-R1-0528
This quant collection **REQUIRES** the [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp/) fork to support advanced non-linear SotA quants and Multi-Head Latent Attention (MLA). Do **not** download these big files and expect them to run on mainline vanilla llama.cpp, ollama, LM Studio, KoboldCpp, etc.!
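
A minimal build sketch, assuming the usual llama.cpp-style cmake workflow that `ik_llama.cpp` follows (adjust flags for your hardware):

```bash
# Sketch: build ik_llama.cpp from source on a CUDA box (flags are illustrative)
git clone https://github.com/ikawrakow/ik_llama.cpp
cd ik_llama.cpp
cmake -B build -DGGML_CUDA=ON      # drop -DGGML_CUDA=ON for CPU-only rigs
cmake --build build --config Release -j "$(nproc)"
# binaries (llama-server, llama-quantize, llama-perplexity, ...) land in ./build/bin/
```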

*NOTE*: `ik_llama.cpp` can also run your existing GGUFs from bartowski, unsloth, mradermacher, etc., if you want to try it out before downloading my quants.

These quants provide best-in-class perplexity for the given memory footprint. MLA support allows 32k+ context length in under 24GB GPU VRAM for `R1` and `V3` while offloading MoE layers to RAM.

These quants are specifically designed for CPU+GPU systems with under 16GB or 24GB VRAM, as well as CPU-*only* rigs, using dynamic quant repacking (for maximum memory throughput). If you have more VRAM, you can now load `_R4` repacked quants onto GPUs as of [ik_llama.cpp PR462](https://github.com/ikawrakow/ik_llama.cpp/pull/462), so these quants are good for multi-GPU setups as well now!

You could try `ik_llama.cpp` quickly with your *existing* quants, as it computes MLA tensors and repacks quants on the fly at startup (if you have enough RAM+VRAM to fit the entire model). Then come check out these fat quants here once you see the difference.
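
A rough sketch of that quick-start path (the flags are the same ones used in the Quick Start section below; the model path is a placeholder for whatever GGUF you already have):

```bash
# Sketch: run an existing third-party GGUF under ik_llama.cpp.
# -mla 3 -fa  computes MLA tensors and enables flash attention
# -rtr        repacks quants on the fly at startup (pre-allocates RAM instead of mmap)
./build/bin/llama-server \
    --model /path/to/your/existing/DeepSeek-R1-0528-quant.gguf \
    -mla 3 -fa \
    -fmoe \
    -rtr \
    --ctx-size 32768 \
    -ctk q8_0 \
    --n-gpu-layers 63 \
    --override-tensor exps=CPU \
    --threads 16
```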

## Big Thanks
Shout out to Wendell and the **Level1Techs** crew, the community [Forums](https://forum.level1techs.com/t/deepseek-deep-dive-r1-at-home/225826), and [YouTube Channel](https://www.youtube.com/@Level1Techs)! **BIG thanks** for providing **BIG hardware** expertise and access to run these experiments and make these great quants available to the community!!!

Also thanks to all the folks in the quanting and inferencing community here and on `r/LocalLLaMA` for tips and tricks helping each other run all the fun new models!

Excited to share and learn together. Thanks!

## Quant Collection
So far these are my best recipes, offering the lowest perplexity per GiB, suitable for a wide variety of CPU+GPU or CPU-*only* rigs.

![Perplexity Chart](images/perplexity.png "Chart showing Perplexity improving as BPW increases.")

* `DeepSeek-R1-0528-Q8_0` 666GiB
  - `Final estimate: PPL = 3.2130 +/- 0.01698`
  - I didn't upload this; it is for baseline reference only.
* `DeepSeek-R1-0528-IQ4_KS_R4` 368GiB
  - `Final estimate: PPL = 3.2286 +/- 0.01710`
  - Fits 32k context in under 24GiB VRAM
* `DeepSeek-R1-0528-IQ3_K_R4` 301GiB
  - `Final estimate: PPL = 3.2730 +/- 0.01738`
  - Fits 32k context in under 24GiB VRAM
* `DeepSeek-R1-0528-IQ3_KS` 282GiB
  - `Final estimate: PPL = 3.2983 +/- 0.01759`
  - Fits 32k context in under 16GiB VRAM
  - Fits 64k context in under 24GiB VRAM
* `DeepSeek-R1-0528-IQ2_K_R4` 220GiB
  - `Final estimate: PPL = 3.5069 +/- 0.01893`
  - Fits 32k context in under 16GiB VRAM
  - Fits 64k context in under 24GiB VRAM
* `DeepSeek-R1-0528-IQ1_S_R4` 131GiB
  - `Final estimate: PPL = 4.8805 +/- 0.02876`
  - The world's smallest working DeepSeek-R1-0528 quant!
  - Runs on an AM5-class gaming rig with a 2x64GB DDR5 DIMM kit and a single GPU!
  - Support for this is bleeding edge; you need [PR494](https://github.com/ikawrakow/ik_llama.cpp/pull/494)
  - Fits 32k+ context in under 16GiB VRAM
  - Should fit in 128GiB RAM + 24GB VRAM by offloading layers to GPU.
  - "Only for the desperate."
  - Technically "better" (lower) PPL than `Qwen3-235B-A22B-Q8_0 @ ~5.31`, though you can't really make comparisons like this.
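
To download just one quant folder instead of the whole repo, something like this should work (a sketch using the standard Hugging Face CLI; swap the `--include` pattern for the quant you want):

```bash
pip install -U "huggingface_hub[cli]"
huggingface-cli download ubergarm/DeepSeek-R1-0528-GGUF \
    --include "IQ2_K_R4/*" \
    --local-dir ./DeepSeek-R1-0528-GGUF
```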

#### `IQ4_KS_R4` 4.701 BPW (368GiB)
Special mix: `IQ5_KS_R4` `ffn_down` and `IQ4_KS_R4` `ffn_(up|gate)` routed experts. All other layers `q8_0` for CPU+GPU offload. For max speed on CPU-*only* rigs use `--run-time-repack`.

<details>

<summary>👈 Secret Recipe</summary>

This quant might be fairly fast despite the larger size, given the `_KS` quant inferencing optimizations. I made this as there were some requests for a larger size. This one *might* fit on a 368GB RAM rig if you have more than average VRAM, or comfortably on a 512GB RAM rig, preferably with 24GB VRAM, though it is fine for CPU-only as well.

```bash
#!/usr/bin/env bash

custom="
# Token embedding and output tensors (GPU)
token_embd\.weight=q8_0
output\.weight=q8_0
output_norm\.weight=q8_0

# First 3 dense layers (0-2) (GPU)
blk\.[0-2]\..*=q8_0

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
blk\.[3-9]\.attn_.*=q8_0
blk\.[1-5][0-9]\.attn_.*=q8_0
blk\.60\.attn_.*=q8_0

blk\.[3-9]\.ffn_norm\.weight=q8_0
blk\.[1-5][0-9]\.ffn_norm\.weight=q8_0
blk\.60\.ffn_norm\.weight=q8_0

blk\.[3-9]\.exp_probs_b\.bias=q8_0
blk\.[1-5][0-9]\.exp_probs_b\.bias=q8_0
blk\.60\.exp_probs_b\.bias=q8_0

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=q8_0
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=q8_0
blk\.60\.ffn_down_shexp\.weight=q8_0

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=q8_0
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=q8_0
blk\.60\.ffn_(gate|up)_shexp\.weight=q8_0

# MoE Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq5_ks_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq5_ks_r4
blk\.60\.ffn_down_exps\.weight=iq5_ks_r4

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ4_KS_R4.gguf \
    IQ4_KS_R4 \
    24
```

</details>
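
The `custom=$( ... )` pipeline in these recipes just flattens the commented, newline-separated rule list into the single comma-separated string that `--custom-q` expects. A tiny stand-alone demo with a hypothetical two-rule list:

```bash
rules='
# comment lines are dropped by grep, blank runs collapse into commas
token_embd\.weight=q8_0
output\.weight=q8_0
'
echo "$rules" | grep -v '^#' | sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
# prints: token_embd\.weight=q8_0,output\.weight=q8_0
```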

#### `IQ3_K_R4` 3.847 BPW (301GiB)
Special mix: `IQ4_KS_R4` `ffn_down` and `IQ3_K_R4` `ffn_(up|gate)` routed experts. All other layers `q8_0` for CPU+GPU offload. For max speed on CPU-*only* rigs use `--run-time-repack`.

<details>

<summary>👈 Possible VRAM & RAM Combinations</summary>

This is probably a good size quant for a 368GB RAM rig, preferably with at least a single 24GB VRAM GPU.
It is probably a little out of reach for a 256GB RAM rig unless you have 80+GB VRAM.
You could still run "troll rig" style and page off disk for maybe 5 tok/sec with some hot NVMe drives hahah...

I'm still testing this out, but in initial tests I'm seeing ~12 tok/sec with 256GB RAM and 2x RTX A6000 48GB VRAM on a 24-core Threadripper Pro rig. I can probably get more by offloading a couple more layers.

Feel free to report your configuration in the comments section for others to see too. Thanks!

```bash
--n-gpu-layers 63 \
-ot "blk\.(3|4|5|6|7)\.ffn_.*=CUDA0" \
-ot "blk\.(8|9|10|11|12)\.ffn_.*=CUDA1" \
--override-tensor exps=CPU \

llm_load_tensors: CPU buffer size = 252646.07 MiB
llm_load_tensors: CPU buffer size = 938.98 MiB
llm_load_tensors: CUDA0 buffer size = 33753.38 MiB
llm_load_tensors: CUDA1 buffer size = 33900.64 MiB
...
llama_kv_cache_init: CUDA0 KV buffer size = 592.89 MiB
llama_kv_cache_init: CUDA1 KV buffer size = 573.76 MiB
llama_new_context_with_model: KV self size = 1166.62 MiB, c^KV (q8_0): 1166.62 MiB, kv^T: not used
llama_new_context_with_model: CUDA_Host output buffer size = 0.99 MiB
llama_new_context_with_model: pipeline parallelism enabled (n_copies=1)
llama_new_context_with_model: CUDA0 compute buffer size = 3425.00 MiB
llama_new_context_with_model: CUDA1 compute buffer size = 3386.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 78.01 MiB
```

</details>

<details>

<summary>👈 Secret Recipe</summary>

```bash
#!/usr/bin/env bash

custom="
# Token embedding and output tensors (GPU)
token_embd\.weight=q8_0
output\.weight=q8_0
output_norm\.weight=q8_0

# First 3 dense layers (0-2) (GPU)
blk\.[0-2]\..*=q8_0

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
blk\.[3-9]\.attn_.*=q8_0
blk\.[1-5][0-9]\.attn_.*=q8_0
blk\.60\.attn_.*=q8_0

blk\.[3-9]\.ffn_norm\.weight=q8_0
blk\.[1-5][0-9]\.ffn_norm\.weight=q8_0
blk\.60\.ffn_norm\.weight=q8_0

blk\.[3-9]\.exp_probs_b\.bias=q8_0
blk\.[1-5][0-9]\.exp_probs_b\.bias=q8_0
blk\.60\.exp_probs_b\.bias=q8_0

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=q8_0
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=q8_0
blk\.60\.ffn_down_shexp\.weight=q8_0

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=q8_0
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=q8_0
blk\.60\.ffn_(gate|up)_shexp\.weight=q8_0

# MoE Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq4_ks_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq4_ks_r4
blk\.60\.ffn_down_exps\.weight=iq4_ks_r4

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq3_k_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq3_k_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq3_k_r4
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    IQ3_K_R4 \
    24
```

</details>

#### `IQ3_KS` 281.463 GiB (3.598 BPW)
Special mix with the all-new `IQ3_KS` `ffn_(gate|up)_exps` and `IQ4_KS` `ffn_down_exps` routed experts. Mostly `iq5_ks`/`iq4_ks` for attn and shared experts; `iq5_k` `token_embd` and `iq6_k` `output` "head".

<details>

<summary>👈 Secret Recipe</summary>

```bash
#!/usr/bin/env bash

custom="
# First 3 dense layers (0-2) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q5_0
blk\.[0-2]\.attn_.*=iq5_ks
blk\.[0-2]\.ffn_down.*=iq5_ks
blk\.[0-2]\.ffn_(gate|up).*=iq4_ks
blk\.[0-2]\..*=iq5_ks

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q5_0
blk\.[1-5][0-9]\.attn_k_b.*=q5_0
blk\.60\.attn_k_b.*=q5_0

blk\.[3-9]\.attn_.*=iq5_ks
blk\.[1-5][0-9]\.attn_.*=iq5_ks
blk\.60\.attn_.*=iq5_ks

#blk\.[3-9]\.ffn_norm\.weight=iq5_ks
#blk\.[1-5][0-9]\.ffn_norm\.weight=iq5_ks
#blk\.60\.ffn_norm\.weight=iq5_ks

#blk\.[3-9]\.exp_probs_b\.bias=iq5_ks
#blk\.[1-5][0-9]\.exp_probs_b\.bias=iq5_ks
#blk\.60\.exp_probs_b\.bias=iq5_ks

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.60\.ffn_down_shexp\.weight=iq5_ks

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks

# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq4_ks
blk\.60\.ffn_down_exps\.weight=iq4_ks

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq3_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq3_ks
blk\.60\.ffn_(gate|up)_exps\.weight=iq3_ks

# put last so output weight doesn't catch all the attn ones
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq5_k
output\.weight=iq6_k
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_KS.gguf \
    IQ3_KS \
    24
```

</details>

#### `IQ2_K_R4` 2.799 BPW (220GiB)
Special mix: `IQ3_K_R4` `ffn_down` and `IQ2_K_R4` `ffn_(up|gate)` routed experts. All other layers *roughly* `iq5_ks` for CPU+GPU offload. For max speed on CPU-*only* rigs use `--run-time-repack`, or manually repack offline if you want to mmap() off disk.

It can fit 32k context in under 16GB VRAM, and I'm getting almost 15 tok/sec in early testing! It could go faster by offloading more exps layers!

<details>

<summary>👈 Secret Recipe</summary>

```bash
#!/usr/bin/env bash

# Notes:
# https://github.com/ikawrakow/ik_llama.cpp/issues/296#issuecomment-2765210993
# https://github.com/ikawrakow/ik_llama.cpp/issues/296#issuecomment-2768567062
custom="
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq5_ks
output\.weight=iq5_ks
output_norm\.weight=iq5_ks

# First 3 dense layers (0-2) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q5_0
blk\.[0-2]\.attn_.*=iq5_ks
blk\.[0-2]\..*=iq5_ks

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q5_0
blk\.[1-5][0-9]\.attn_k_b.*=q5_0
blk\.60\.attn_k_b.*=q5_0

blk\.[3-9]\.attn_.*=iq5_ks
blk\.[1-5][0-9]\.attn_.*=iq5_ks
blk\.60\.attn_.*=iq5_ks

blk\.[3-9]\.ffn_norm\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_norm\.weight=iq5_ks
blk\.60\.ffn_norm\.weight=iq5_ks

blk\.[3-9]\.exp_probs_b\.bias=iq5_ks
blk\.[1-5][0-9]\.exp_probs_b\.bias=iq5_ks
blk\.60\.exp_probs_b\.bias=iq5_ks

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.60\.ffn_down_shexp\.weight=iq5_ks

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks

# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq3_k_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq3_k_r4
blk\.60\.ffn_down_exps\.weight=iq3_k_r4

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq2_k_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq2_k_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq2_k_r4
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ2_K_R4.gguf \
    IQ2_K_R4 \
    24
```

</details>

#### `IQ2_KT` Not Yet Released
I might release my `iq2_kt` "QTIP/exl3/trellis"-style quant, but it is rather experimental and the inferencing implementation needs more time to bake. It is slightly smaller than the above `IQ2_K_R4`, with slightly worse perplexity and KLD.

#### `IQ1_S_R4` 130.203 GiB (1.664 BPW)

The world's smallest working DeepSeek-R1-0528 quant!

![KLD Smol Boi Comparison](images/kld-r1-0528-smol-bois.png "Chart showing competitive KLD quality of smallest R1-0528 quants.")
The Δp numbers show the average RMS, 99th-percentile, and absolute-max divergence from the baseline pure `Q8_0`. Lower is better.
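
For reference, these are the usual definitions behind that chart (a sketch assuming the standard `llama-perplexity` KLD conventions, with $p$ the `Q8_0` baseline token distribution and $q$ the quantized model's):

$$
D_{\mathrm{KL}}(p \,\|\, q) = \sum_i p_i \log \frac{p_i}{q_i},
\qquad
\Delta p_t = q(x_t \mid x_{<t}) - p(x_t \mid x_{<t})
$$

The RMS, 99th-percentile, and max statistics are then taken over $|\Delta p_t|$ across all evaluated tokens.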

If you can fit a larger model completely in RAM+VRAM I would recommend that, but if you have 128GB RAM + 24GB VRAM then give this a try, as it is surprisingly usable despite the heavy quantization.

Support for this is bleeding edge; you need [PR494](https://github.com/ikawrakow/ik_llama.cpp/pull/494)!

Special mix: `IQ1_M_R4` `ffn_down` and `IQ1_S_R4` `ffn_(up|gate)` routed experts. All other layers mostly `iq4_ks` for CPU+GPU offload. For max speed on CPU-*only* rigs use `--run-time-repack` (it only applies to the `iq4_ks` tensors etc.).

Also released: [ubergarm/DeepSeek-V3-0324-IQ1_S_R4](https://huggingface.co/ubergarm/DeepSeek-V3-0324-GGUF/tree/main/DeepSeek-V3-0324-IQ1_S_R4) with the same recipe and size, if you don't want thinking.

<details>

<summary>👈 How to run in 128GiB RAM + 24GB VRAM</summary>

Thanks for all the help and feedback figuring this out; that is also why I uploaded the non-`_R4` variant, which *does* allow GPU offload.

There is a lot of [good discussion](https://huggingface.co/ubergarm/DeepSeek-R1-0528-GGUF/discussions/6#683fbbb9c43f1c9609843e08) on [running this quant](https://github.com/ikawrakow/ik_llama.cpp/discussions/477#discussioncomment-13361099).

Keep in mind that if you can fit the next size up, it will likely run faster, as it uses more optimized quant types.

This will fit in ~116.1GiB RAM plus 22448MiB VRAM. You could strip it down further to get another layer on GPU, or increase context. Good luck!
```bash
# You can use more CUDA devices: just make them all visible and do *not* use `-ts ...` with this `-ot ...` strategy.
CUDA_VISIBLE_DEVICES="0" \
./build/bin/llama-server \
    --model /mnt/raid/hf/DeepSeek-R1-0528-GGUF/IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf \
    --alias ubergarm/DeepSeek-R1-0528-IQ1_S_R4 \
    --ctx-size 32768 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 256 \
    -fmoe \
    --n-gpu-layers 99 \
    -ot "blk\.(3|4|5|6)\.ffn_.*=CUDA0" \
    --override-tensor exps=CPU \
    -rtr \
    --parallel 1 \
    --threads 24 \
    --host 127.0.0.1 \
    --port 8080

llm_load_tensors: CPU buffer size = 117936.00 MiB
llm_load_tensors: CUDA_Host buffer size = 469.99 MiB
llm_load_tensors: CUDA0 buffer size = 17851.01 MiB
....................................................................................................
llama_kv_cache_init: CUDA0 KV buffer size = 2196.00 MiB
llama_new_context_with_model: KV self size = 2196.00 MiB, c^KV (f16): 2196.00 MiB, kv^T: not used
llama_new_context_with_model: CUDA_Host output buffer size = 0.99 MiB
llama_new_context_with_model: CUDA0 compute buffer size = 3041.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 78.01 MiB
```

</details>

![Reverse Buff Mokey Meme](images/buff-mokey-meme.png "Reverse Buff Mokey Meme Comparing full R1-671B fp8 to smol iq1_s_r4 quant.")

Possibly useful for 128GiB RAM + 16GB+ VRAM? Maybe? It does actually work and can read Python code okay. For all I know it might be better than Qwen3-235B-A22B, given the `iq1_s_r4` actually has lower PPL!

Not recommended, and slower than a larger quant, unless this is the *only* thing you can fit completely in RAM+VRAM: this quant seems less optimized for inferencing, and in testing has slower TG and worse quality (higher perplexity). Plus I'm not sure that you can use it with multi-GPU offload, so check the ik_llama.cpp PRs, as these tiny quants are less used.

I recommend *not* using the `IQ1_S`; use the `IQ1_S_R4` now that recent updates support GPU offload and better speeds with the repacked quant on CUDA.

<details>

<summary>👈 Secret Recipe</summary>

```bash
#!/usr/bin/env bash

custom="
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq4_ks
output\.weight=iq4_ks
output_norm\.weight=iq4_ks

# First 3 dense layers (0-2) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q4_0
blk\.[0-2]\.attn_.*=iq4_ks
blk\.[0-2]\..*=iq4_ks

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q4_0
blk\.[1-5][0-9]\.attn_k_b.*=q4_0
blk\.60\.attn_k_b.*=q4_0

blk\.[3-9]\.attn_.*=iq4_ks
blk\.[1-5][0-9]\.attn_.*=iq4_ks
blk\.60\.attn_.*=iq4_ks

blk\.[3-9]\.ffn_norm\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_norm\.weight=iq4_ks
blk\.60\.ffn_norm\.weight=iq4_ks

blk\.[3-9]\.exp_probs_b\.bias=iq4_ks
blk\.[1-5][0-9]\.exp_probs_b\.bias=iq4_ks
blk\.60\.exp_probs_b\.bias=iq4_ks

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq4_ks
blk\.60\.ffn_down_shexp\.weight=iq4_ks

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks

# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq1_m_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq1_m_r4
blk\.60\.ffn_down_exps\.weight=iq1_m_r4

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq1_s_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq1_s_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq1_s_r4
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ1_S_R4.gguf \
    IQ1_S_R4 \
    24
```

</details>

## Quick Start
#### `ik_llama.cpp` API server for GPU+CPU
```bash
# Fits 32k context in under 24GB VRAM
# Optional `-ser 6,1` improves speed at some cost to quality
# Recommended sampling: --temp 0.6 --top-p 0.95
CUDA_VISIBLE_DEVICES="0," \
./build/bin/llama-server \
    --model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    --alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
    --ctx-size 32768 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 512 \
    -fmoe \
    --n-gpu-layers 63 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 16 \
    --host 127.0.0.1 \
    --port 8080
```
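
Once the server is up, a quick smoke test (a sketch assuming the server exposes the usual llama.cpp OpenAI-compatible `/v1/chat/completions` endpoint):

```bash
curl -s http://127.0.0.1:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "ubergarm/DeepSeek-R1-0528-IQ3_K_R4",
          "temperature": 0.6,
          "top_p": 0.95,
          "messages": [{"role": "user", "content": "Hello!"}]
        }'
```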

#### `ik_llama.cpp` API server for MultiGPU(+CPU)
```bash
# Adjust the number of routed expert layers for additional VRAM on each GPU
# Compile with -DGGML_SCHED_MAX_COPIES=1 for multi-GPU
# Compile with -DGGML_CUDA_IQK_FORCE_BF16=1 if putting `_R4` tensors on GPU (for DeepSeek only)
# (might go faster or slower with FORCE_BF16 depending on GPU model)
# If you have extra VRAM go with `-b 4096 -ub 4096` for potential big PP gains!
./build/bin/llama-server \
    --model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    --alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
    --ctx-size 32768 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 512 \
    -fmoe \
    --n-gpu-layers 63 \
    -ot "blk\.(3|4)\.ffn_.*=CUDA0" \
    -ot "blk\.(5|6)\.ffn_.*=CUDA1" \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 16 \
    --host 127.0.0.1 \
    --port 8080
```
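
For completeness, the compile flags mentioned in the comments above would be passed at build time roughly like this (a sketch; combine only the flags your setup needs):

```bash
cmake -B build \
    -DGGML_CUDA=ON \
    -DGGML_SCHED_MAX_COPIES=1 \
    -DGGML_CUDA_IQK_FORCE_BF16=1
cmake --build build --config Release -j "$(nproc)"
```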

#### `ik_llama.cpp` API server for CPU *only*
```bash
# The goal for now is as much RAM bandwidth in a single NUMA node, e.g.
# use BIOS `NPS0` on AMD Epyc, or a single socket of Intel Xeon with BIOS `SNC=Disable` & Snoop Interleave.
# Tune your `--threads` for token generation, and `--threads-batch` for prompt processing (prefill).
# Note `--run-time-repack` will pre-allocate enough RAM for model weights instead of mmap()'ing off disk.
# Note there are options for both Explicit and Transparent Huge Pages, with tuning discussions at
# https://github.com/ikawrakow/ik_llama.cpp/pull/278#issuecomment-2746381515
numactl -N 0 -m 0 \
./build/bin/llama-server \
    --model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    --alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
    --run-time-repack \
    --ctx-size 65536 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 512 \
    -fmoe \
    --parallel 1 \
    --threads 88 \
    --threads-batch 128 \
    --numa numactl \
    --host 127.0.0.1 \
    --port 8080
```
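
Before settling on `numactl -N 0 -m 0`, it can help to inspect the topology first (standard numactl; node numbering depends on your BIOS NPS/SNC settings):

```bash
numactl --hardware   # list NUMA nodes, their CPUs, and attached memory
```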

## Quant Comparisons
Check out [The Great Quant Wars of 2025](https://www.reddit.com/r/LocalLLaMA/comments/1khwxal/the_great_quant_wars_of_2025/) post on r/LocalLLaMA for some more discussion on quantization and methodology.

#### imatrix

<details>

<summary>Importance Matrix Details Here</summary>

This time I threw in extra material from [turboderp-org/exllamav3](https://github.com/turboderp-org/exllamav3/tree/master/exllamav3/conversion/standard_cal_data)'s `standard_cal_data` in addition to my usual `calibration_data_v5_rc.txt` linked below.

```bash
cat calibration_data_v5_rc.txt > ubergarm-imatrix-calibration-corpus-v02.txt
cat c4.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
cat code.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
cat multilingual.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
cat technical.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
cat tiny.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
# Do *not* use the wiki.utf8 to avoid potential over-fitting on the common wiki.test.raw test corpus
# 1.7MiB total size of ubergarm-imatrix-calibration-corpus-v02.txt

./build/bin/llama-imatrix \
    -m /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-Q8_0.gguf \
    -f ubergarm-imatrix-calibration-corpus-v02.txt \
    -o /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    --verbosity 1 \
    --ctx-size 512 \
    --layer-similarity \
    --threads 128
```

</details>

#### Perplexity

I use the `Q8_0` without imatrix as the baseline against `wiki.test.raw`:

<details>

<summary>👈 Perplexity Logs</summary>

```bash
$ ./build/bin/llama-perplexity \
    --model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    -f wiki.test.raw \
    --seed 1337 \
    --ctx-size 512 \
    -mla 3 -fa \
    -amb 512 \
    -fmoe \
    --n-gpu-layers 63 \
    -ot "blk\.(3|4|5|6|7|8)\.ffn_.*=CUDA0" \
    -ot "blk\.(9|10|11|12|13)\.ffn_.*=CUDA1" \
    --override-tensor exps=CPU \
    --threads 24

Final estimate: PPL = 3.2730 +/- 0.01738
```

</details>

#### Split

<details>

<summary>👈 Split GGUF</summary>

*TODO*: Add key-value metadata information before publishing.

```bash
$ ./build/bin/llama-gguf-split \
    --dry-run \
    --split \
    --split-max-size 50G \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    /mnt/raid/hf/DeepSeek-R1-0528-GGUF/IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4
```

</details>

## References
* [ik_llama.cpp DeepSeek-R1-0528 Discussion](https://github.com/ikawrakow/ik_llama.cpp/discussions/477)
* [turboderp-org/exllamav3](https://github.com/turboderp-org/exllamav3/pull/26)
* [imatrix calibration_data_v5_rc.txt](https://gist.github.com/tristandruyen/9e207a95c7d75ddf37525d353e00659c#file-calibration_data_v5_rc-txt)
images/buff-mokey-meme.png ADDED

Git LFS Details

  • SHA256: 15e8f8fc60de0158eb5639513b9629edaa91c47d996becbc496d0cdea4c56eb1
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
images/kld-r1-0528-smol-bois.png ADDED

Git LFS Details

  • SHA256: 37287f8bb732ecf623a63a3d0cc67c171349cf73cfb5317f4cfdb0f16f64bac2
  • Pointer size: 131 Bytes
  • Size of remote file: 129 kB
images/perplexity.png ADDED

Git LFS Details

  • SHA256: 4295b96c2ba5d0d14d6a2a14baf0dc35f3431773a56095d3b67d25d61c89821b
  • Pointer size: 131 Bytes
  • Size of remote file: 128 kB
imatrix-DeepSeek-R1-0528.dat ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:a8def19f2ae208a1ff85f34bd170f9d68ee73735c7bc62741d66b42f683d831f
size 986986781