Update README.md
README.md CHANGED
@@ -13,17 +13,6 @@ pipeline_tag: text-generation
- [Output and embed tensors quantized to q8_0, all other tensors quantized to q4_k.](https://huggingface.co/RobertSinclair)
- [Output and embed tensors quantized to bf16, all other tensors quantized to q5_k, q6_k, q8_0 and q8_0 --pure.](https://huggingface.co/RobertSinclair)
- BF16 and imatrix q5_k, q6_k available.
- ```
- python convert_hf_to_gguf.py --outtype bf16 phi-4 --outfile phi-4.bf16.gguf
-
- llama-quantize --allow-requantize --output-tensor-type q8_0 --token-embedding-type q8_0 phi-4.bf16.gguf phi-4.q8.q4.gguf q4_k
- llama-quantize --allow-requantize --output-tensor-type bf16 --token-embedding-type bf16 phi-4.bf16.gguf phi-4.bf16.q5.gguf q5_k
- llama-quantize --imatrix imatrix.dat --leave-output-tensor phi-4.bf16.gguf phi-4.bf16.q5.im.gguf q5_k
- llama-quantize --allow-requantize --output-tensor-type bf16 --token-embedding-type bf16 phi-4.bf16.gguf phi-4.bf16.q6.gguf q6_k
- llama-quantize --imatrix imatrix.dat --leave-output-tensor phi-4.bf16.gguf phi-4.bf16.q6.im.gguf q6_k
- llama-quantize --allow-requantize --output-tensor-type bf16 --token-embedding-type bf16 phi-4.bf16.gguf phi-4.bf16.q8.gguf q8_0
- llama-quantize --allow-requantize --pure phi-4.bf16.gguf phi-4.bf16.q8p.gguf q8_0
- ```
| Quant type | File Size | ~VRAM* |
| ---------- | --------- | ------ |
@@ -38,6 +27,19 @@ llama-quantize --allow-requantize --pure phi-4.bf16.gguf phi-4.bf16.q8p.gguf q8_
<sub>*approximate value at **16k context, FP16 cache**.</sub>
+ ```
+ python convert_hf_to_gguf.py --outtype bf16 phi-4 --outfile phi-4.bf16.gguf
+
+ llama-quantize --allow-requantize --output-tensor-type q8_0 --token-embedding-type q8_0 phi-4.bf16.gguf phi-4.q8.q4.gguf q4_k
+ llama-quantize --allow-requantize --output-tensor-type bf16 --token-embedding-type bf16 phi-4.bf16.gguf phi-4.bf16.q5.gguf q5_k
+ llama-quantize --imatrix imatrix.dat --leave-output-tensor phi-4.bf16.gguf phi-4.bf16.q5.im.gguf q5_k
+ llama-quantize --allow-requantize --output-tensor-type bf16 --token-embedding-type bf16 phi-4.bf16.gguf phi-4.bf16.q6.gguf q6_k
+ llama-quantize --imatrix imatrix.dat --leave-output-tensor phi-4.bf16.gguf phi-4.bf16.q6.im.gguf q6_k
+ llama-quantize --allow-requantize --output-tensor-type bf16 --token-embedding-type bf16 phi-4.bf16.gguf phi-4.bf16.q8.gguf q8_0
+ llama-quantize --allow-requantize --pure phi-4.bf16.gguf phi-4.bf16.q8p.gguf q8_0
+ ```
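Note: the conversion step assumes a local `phi-4` directory holding the original Hugging Face checkpoint, which this commit does not show being fetched. A minimal sketch of one way to obtain it, assuming the upstream repo id `microsoft/phi-4`:

```
# Assumption: phi-4/ is a local snapshot of the original HF checkpoint.
# huggingface-cli ships with the huggingface_hub package (pip install huggingface_hub).
huggingface-cli download microsoft/phi-4 --local-dir phi-4
```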
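The two `--imatrix imatrix.dat` runs presuppose an importance matrix that the README never shows being built. A minimal sketch using llama.cpp's `llama-imatrix` tool, where `calibration.txt` stands in for whatever calibration text was actually used:

```
# Assumption: calibration.txt is a representative plain-text sample; the real
# calibration data is not named in the commit. llama-imatrix records activation
# statistics from the bf16 model and writes them to imatrix.dat.
llama-imatrix -m phi-4.bf16.gguf -f calibration.txt -o imatrix.dat
```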
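To check that the per-tensor overrides took effect (for example, that `output.weight` and `token_embd.weight` stayed q8_0 in the q4_k file), the `gguf-dump` script from the `gguf` Python package (llama.cpp's gguf-py) lists each tensor's quantization type; a sketch, assuming `pip install gguf`:

```
# Prints GGUF metadata plus name, shape, and quant type for every tensor;
# output.weight and token_embd.weight should report Q8_0 in the q8.q4 file.
gguf-dump phi-4.q8.q4.gguf
```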
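Finally, a quick smoke test of one of the resulting files with `llama-cli` (also part of llama.cpp); `-c 16384` matches the 16k context assumed by the ~VRAM column above, and the prompt is only illustrative:

```
# Load the q6_k quant at 16k context and generate from a short prompt.
llama-cli -m phi-4.bf16.q6.gguf -c 16384 -p "Explain GGUF quantization in one sentence."
```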
---------------------------------------------
# Phi-4 Model Card