Upload 21 files
Browse files- GPT2.cbp +63 -0
- GPT2.cscope_file_list +22 -0
- GPT2.depend +240 -0
- GPT2.layout +80 -0
- common-ggml.cpp +244 -0
- common-ggml.h +18 -0
- common.cpp +911 -0
- common.h +343 -0
- dr_wav.h +0 -0
- ggml-aarch64.c +0 -0
- ggml-aarch64.h +39 -0
- ggml-common.h +0 -0
- ggml-cpu-impl.h +614 -0
- ggml-impl.h +209 -0
- ggml-model-gpt-2-774M.bin +3 -0
- ggml-quants.c +0 -0
- ggml-quants.h +147 -0
- ggml.c +0 -0
- ggml.h +0 -0
- main-ctx.cpp +841 -0
- quantize.cpp +184 -0
GPT2.cbp
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
| 2 |
+
<CodeBlocks_project_file>
|
| 3 |
+
<FileVersion major="1" minor="6" />
|
| 4 |
+
<Project>
|
| 5 |
+
<Option title="GPT2" />
|
| 6 |
+
<Option pch_mode="2" />
|
| 7 |
+
<Option compiler="gcc" />
|
| 8 |
+
<Build>
|
| 9 |
+
<Target title="Debug">
|
| 10 |
+
<Option output="bin/Debug/GPT2" prefix_auto="1" extension_auto="1" />
|
| 11 |
+
<Option object_output="obj/Debug/" />
|
| 12 |
+
<Option type="1" />
|
| 13 |
+
<Option compiler="gcc" />
|
| 14 |
+
<Compiler>
|
| 15 |
+
<Add option="-g" />
|
| 16 |
+
</Compiler>
|
| 17 |
+
</Target>
|
| 18 |
+
<Target title="Release">
|
| 19 |
+
<Option output="bin/Release/GPT2" prefix_auto="1" extension_auto="1" />
|
| 20 |
+
<Option object_output="obj/Release/" />
|
| 21 |
+
<Option type="1" />
|
| 22 |
+
<Option compiler="gcc" />
|
| 23 |
+
<Compiler>
|
| 24 |
+
<Add option="-O2" />
|
| 25 |
+
</Compiler>
|
| 26 |
+
<Linker>
|
| 27 |
+
<Add option="-s" />
|
| 28 |
+
</Linker>
|
| 29 |
+
</Target>
|
| 30 |
+
</Build>
|
| 31 |
+
<Compiler>
|
| 32 |
+
<Add option="-Wall" />
|
| 33 |
+
<Add option="-fexceptions" />
|
| 34 |
+
</Compiler>
|
| 35 |
+
<Unit filename="GPT2.cbp" />
|
| 36 |
+
<Unit filename="GPT2.layout" />
|
| 37 |
+
<Unit filename="common-ggml.cpp" />
|
| 38 |
+
<Unit filename="common-ggml.h" />
|
| 39 |
+
<Unit filename="common.cpp" />
|
| 40 |
+
<Unit filename="common.h" />
|
| 41 |
+
<Unit filename="dr_wav.h" />
|
| 42 |
+
<Unit filename="ggml-aarch64.c">
|
| 43 |
+
<Option compilerVar="CC" />
|
| 44 |
+
</Unit>
|
| 45 |
+
<Unit filename="ggml-aarch64.h" />
|
| 46 |
+
<Unit filename="ggml-common.h" />
|
| 47 |
+
<Unit filename="ggml-cpu-impl.h" />
|
| 48 |
+
<Unit filename="ggml-impl.h" />
|
| 49 |
+
<Unit filename="ggml-quants.c">
|
| 50 |
+
<Option compilerVar="CC" />
|
| 51 |
+
</Unit>
|
| 52 |
+
<Unit filename="ggml-quants.h" />
|
| 53 |
+
<Unit filename="ggml.c">
|
| 54 |
+
<Option compilerVar="CC" />
|
| 55 |
+
</Unit>
|
| 56 |
+
<Unit filename="ggml.h" />
|
| 57 |
+
<Unit filename="main-ctx.cpp" />
|
| 58 |
+
<Unit filename="quantize.cpp" />
|
| 59 |
+
<Extensions>
|
| 60 |
+
<lib_finder disable_auto="1" />
|
| 61 |
+
</Extensions>
|
| 62 |
+
</Project>
|
| 63 |
+
</CodeBlocks_project_file>
|
GPT2.cscope_file_list
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-alloc.h"
|
| 2 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-alloc.c"
|
| 3 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-impl.h"
|
| 4 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.cpp"
|
| 5 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.c"
|
| 6 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.h"
|
| 7 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common.cpp"
|
| 8 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.h"
|
| 9 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend-impl.h"
|
| 10 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml.h"
|
| 11 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-common.h"
|
| 12 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml.c"
|
| 13 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend.h"
|
| 14 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common.h"
|
| 15 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.cbp"
|
| 16 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.h"
|
| 17 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-cpu-impl.h"
|
| 18 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.c"
|
| 19 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.layout"
|
| 20 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend.cpp"
|
| 21 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\dr_wav.h"
|
| 22 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\main-ctx.cpp"
|
GPT2.depend
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# depslib dependency file v1.0
|
| 2 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common-ggml.cpp
|
| 3 |
+
"common-ggml.h"
|
| 4 |
+
<regex>
|
| 5 |
+
<map>
|
| 6 |
+
|
| 7 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common-ggml.h
|
| 8 |
+
"ggml.h"
|
| 9 |
+
<fstream>
|
| 10 |
+
<vector>
|
| 11 |
+
<string>
|
| 12 |
+
|
| 13 |
+
1730691388 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml.h
|
| 14 |
+
<stdbool.h>
|
| 15 |
+
<stddef.h>
|
| 16 |
+
<stdint.h>
|
| 17 |
+
<stdio.h>
|
| 18 |
+
|
| 19 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-alloc.c
|
| 20 |
+
"ggml-alloc.h"
|
| 21 |
+
"ggml-backend-impl.h"
|
| 22 |
+
"ggml.h"
|
| 23 |
+
"ggml-impl.h"
|
| 24 |
+
<assert.h>
|
| 25 |
+
<limits.h>
|
| 26 |
+
<stdarg.h>
|
| 27 |
+
<stdio.h>
|
| 28 |
+
<stdlib.h>
|
| 29 |
+
<string.h>
|
| 30 |
+
|
| 31 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-alloc.h
|
| 32 |
+
"ggml.h"
|
| 33 |
+
|
| 34 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend-impl.h
|
| 35 |
+
"ggml-backend.h"
|
| 36 |
+
|
| 37 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend.h
|
| 38 |
+
"ggml.h"
|
| 39 |
+
"ggml-alloc.h"
|
| 40 |
+
|
| 41 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-impl.h
|
| 42 |
+
"ggml.h"
|
| 43 |
+
<assert.h>
|
| 44 |
+
<stdlib.h>
|
| 45 |
+
<stdbool.h>
|
| 46 |
+
<stdint.h>
|
| 47 |
+
|
| 48 |
+
1730735604 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common.cpp
|
| 49 |
+
"common.h"
|
| 50 |
+
"dr_wav.h"
|
| 51 |
+
<cmath>
|
| 52 |
+
<cstring>
|
| 53 |
+
<fstream>
|
| 54 |
+
<regex>
|
| 55 |
+
<locale>
|
| 56 |
+
<codecvt>
|
| 57 |
+
<sstream>
|
| 58 |
+
<fcntl.h>
|
| 59 |
+
<io.h>
|
| 60 |
+
|
| 61 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common.h
|
| 62 |
+
<string>
|
| 63 |
+
<map>
|
| 64 |
+
<vector>
|
| 65 |
+
<random>
|
| 66 |
+
<thread>
|
| 67 |
+
<ctime>
|
| 68 |
+
<fstream>
|
| 69 |
+
<sstream>
|
| 70 |
+
|
| 71 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\dr_wav.h
|
| 72 |
+
"dr_wav.h"
|
| 73 |
+
<stddef.h>
|
| 74 |
+
<stdlib.h>
|
| 75 |
+
<string.h>
|
| 76 |
+
<limits.h>
|
| 77 |
+
<stdio.h>
|
| 78 |
+
<wchar.h>
|
| 79 |
+
<assert.h>
|
| 80 |
+
<errno.h>
|
| 81 |
+
|
| 82 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend.cpp
|
| 83 |
+
<windows.h>
|
| 84 |
+
"ggml-backend-impl.h"
|
| 85 |
+
"ggml-alloc.h"
|
| 86 |
+
"ggml-impl.h"
|
| 87 |
+
<assert.h>
|
| 88 |
+
<limits.h>
|
| 89 |
+
<stdarg.h>
|
| 90 |
+
<stdio.h>
|
| 91 |
+
<stdlib.h>
|
| 92 |
+
<string.h>
|
| 93 |
+
<string>
|
| 94 |
+
<vector>
|
| 95 |
+
<sys/types.h>
|
| 96 |
+
<sys/sysctl.h>
|
| 97 |
+
"ggml-cuda.h"
|
| 98 |
+
"ggml-metal.h"
|
| 99 |
+
"ggml-sycl.h"
|
| 100 |
+
"ggml-vulkan.h"
|
| 101 |
+
"ggml-blas.h"
|
| 102 |
+
"ggml-rpc.h"
|
| 103 |
+
"ggml-amx.h"
|
| 104 |
+
"ggml-cann.h"
|
| 105 |
+
<hbwmalloc.h>
|
| 106 |
+
|
| 107 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-aarch64.c
|
| 108 |
+
"ggml-common.h"
|
| 109 |
+
"ggml-quants.h"
|
| 110 |
+
"ggml-impl.h"
|
| 111 |
+
"ggml-cpu-impl.h"
|
| 112 |
+
<math.h>
|
| 113 |
+
<string.h>
|
| 114 |
+
<assert.h>
|
| 115 |
+
<float.h>
|
| 116 |
+
<stdlib.h>
|
| 117 |
+
<stdio.h>
|
| 118 |
+
"ggml-aarch64.h"
|
| 119 |
+
|
| 120 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-common.h
|
| 121 |
+
<stdint.h>
|
| 122 |
+
<metal_stdlib>
|
| 123 |
+
<musa_fp16.h>
|
| 124 |
+
<cuda_fp16.h>
|
| 125 |
+
<cstdint>
|
| 126 |
+
<hip/hip_fp16.h>
|
| 127 |
+
<cstdint>
|
| 128 |
+
<sycl/half_type.hpp>
|
| 129 |
+
<cstdint>
|
| 130 |
+
<stdint.h>
|
| 131 |
+
<metal_stdlib>
|
| 132 |
+
<cstdint>
|
| 133 |
+
<cstdint>
|
| 134 |
+
|
| 135 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-quants.h
|
| 136 |
+
"ggml-common.h"
|
| 137 |
+
"ggml.h"
|
| 138 |
+
|
| 139 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-cpu-impl.h
|
| 140 |
+
"ggml.h"
|
| 141 |
+
"ggml-impl.h"
|
| 142 |
+
<stdlib.h>
|
| 143 |
+
<stdbool.h>
|
| 144 |
+
<string.h>
|
| 145 |
+
<math.h>
|
| 146 |
+
<arm_sve.h>
|
| 147 |
+
<sys/prctl.h>
|
| 148 |
+
<arm_neon.h>
|
| 149 |
+
<wasm_simd128.h>
|
| 150 |
+
<altivec.h>
|
| 151 |
+
<intrin.h>
|
| 152 |
+
<immintrin.h>
|
| 153 |
+
<riscv_vector.h>
|
| 154 |
+
<lasxintrin.h>
|
| 155 |
+
<lsxintrin.h>
|
| 156 |
+
<arm_sve.h>
|
| 157 |
+
|
| 158 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-aarch64.h
|
| 159 |
+
"ggml-common.h"
|
| 160 |
+
"ggml.h"
|
| 161 |
+
|
| 162 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-quants.c
|
| 163 |
+
"ggml-common.h"
|
| 164 |
+
"ggml-quants.h"
|
| 165 |
+
"ggml-impl.h"
|
| 166 |
+
"ggml-cpu-impl.h"
|
| 167 |
+
<math.h>
|
| 168 |
+
<string.h>
|
| 169 |
+
<assert.h>
|
| 170 |
+
<float.h>
|
| 171 |
+
<stdlib.h>
|
| 172 |
+
<stdio.h>
|
| 173 |
+
|
| 174 |
+
1730734998 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml.c
|
| 175 |
+
"ggml-impl.h"
|
| 176 |
+
"ggml-cpu-impl.h"
|
| 177 |
+
"ggml-quants.h"
|
| 178 |
+
"ggml.h"
|
| 179 |
+
"ggml-aarch64.h"
|
| 180 |
+
<malloc.h>
|
| 181 |
+
<alloca.h>
|
| 182 |
+
<assert.h>
|
| 183 |
+
<errno.h>
|
| 184 |
+
<time.h>
|
| 185 |
+
<math.h>
|
| 186 |
+
<stdlib.h>
|
| 187 |
+
<string.h>
|
| 188 |
+
<stdint.h>
|
| 189 |
+
<inttypes.h>
|
| 190 |
+
<stdio.h>
|
| 191 |
+
<float.h>
|
| 192 |
+
<limits.h>
|
| 193 |
+
<stdarg.h>
|
| 194 |
+
<signal.h>
|
| 195 |
+
<syscall.h>
|
| 196 |
+
<omp.h>
|
| 197 |
+
<llamafile/sgemm.h>
|
| 198 |
+
<windows.h>
|
| 199 |
+
<stdatomic.h>
|
| 200 |
+
<pthread.h>
|
| 201 |
+
<stdatomic.h>
|
| 202 |
+
<sched.h>
|
| 203 |
+
<pthread_np.h>
|
| 204 |
+
<sys/types.h>
|
| 205 |
+
<sys/stat.h>
|
| 206 |
+
<unistd.h>
|
| 207 |
+
<hbwmalloc.h>
|
| 208 |
+
<unistd.h>
|
| 209 |
+
<mach/mach.h>
|
| 210 |
+
<TargetConditionals.h>
|
| 211 |
+
<sys/wait.h>
|
| 212 |
+
<unwind.h>
|
| 213 |
+
<dlfcn.h>
|
| 214 |
+
<stdio.h>
|
| 215 |
+
<execinfo.h>
|
| 216 |
+
<Accelerate/Accelerate.h>
|
| 217 |
+
<sys/auxv.h>
|
| 218 |
+
<sys/sysctl.h>
|
| 219 |
+
"windows.h"
|
| 220 |
+
<sys/types.h>
|
| 221 |
+
<sys/resource.h>
|
| 222 |
+
|
| 223 |
+
1730683892 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-alloc.cpp
|
| 224 |
+
|
| 225 |
+
1730737838 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-ctx.cpp
|
| 226 |
+
|
| 227 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\quantize.cpp
|
| 228 |
+
"ggml.h"
|
| 229 |
+
"common.h"
|
| 230 |
+
"common-ggml.h"
|
| 231 |
+
<cassert>
|
| 232 |
+
<cmath>
|
| 233 |
+
<cstdio>
|
| 234 |
+
<cstring>
|
| 235 |
+
<fstream>
|
| 236 |
+
<map>
|
| 237 |
+
<string>
|
| 238 |
+
<vector>
|
| 239 |
+
<regex>
|
| 240 |
+
|
GPT2.layout
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
| 2 |
+
<CodeBlocks_layout_file>
|
| 3 |
+
<FileVersion major="1" minor="0" />
|
| 4 |
+
<ActiveTarget name="Debug" />
|
| 5 |
+
<File name="ggml-impl.h" open="1" top="0" tabpos="10" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 6 |
+
<Cursor>
|
| 7 |
+
<Cursor1 position="6388" topLine="0" />
|
| 8 |
+
</Cursor>
|
| 9 |
+
</File>
|
| 10 |
+
<File name="common-ggml.cpp" open="1" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 11 |
+
<Cursor>
|
| 12 |
+
<Cursor1 position="223" topLine="135" />
|
| 13 |
+
</Cursor>
|
| 14 |
+
</File>
|
| 15 |
+
<File name="ggml-quants.c" open="1" top="0" tabpos="11" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 16 |
+
<Cursor>
|
| 17 |
+
<Cursor1 position="2705" topLine="0" />
|
| 18 |
+
</Cursor>
|
| 19 |
+
</File>
|
| 20 |
+
<File name="ggml-aarch64.h" open="1" top="0" tabpos="8" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 21 |
+
<Cursor>
|
| 22 |
+
<Cursor1 position="1519" topLine="0" />
|
| 23 |
+
</Cursor>
|
| 24 |
+
</File>
|
| 25 |
+
<File name="common.cpp" open="1" top="0" tabpos="5" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 26 |
+
<Cursor>
|
| 27 |
+
<Cursor1 position="152" topLine="0" />
|
| 28 |
+
</Cursor>
|
| 29 |
+
</File>
|
| 30 |
+
<File name="ggml-quants.h" open="1" top="0" tabpos="13" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 31 |
+
<Cursor>
|
| 32 |
+
<Cursor1 position="0" topLine="128" />
|
| 33 |
+
</Cursor>
|
| 34 |
+
</File>
|
| 35 |
+
<File name="quantize.cpp" open="1" top="1" tabpos="15" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 36 |
+
<Cursor>
|
| 37 |
+
<Cursor1 position="4241" topLine="139" />
|
| 38 |
+
</Cursor>
|
| 39 |
+
</File>
|
| 40 |
+
<File name="ggml.h" open="1" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 41 |
+
<Cursor>
|
| 42 |
+
<Cursor1 position="8069" topLine="212" />
|
| 43 |
+
</Cursor>
|
| 44 |
+
</File>
|
| 45 |
+
<File name="ggml-common.h" open="1" top="0" tabpos="14" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 46 |
+
<Cursor>
|
| 47 |
+
<Cursor1 position="0" topLine="0" />
|
| 48 |
+
</Cursor>
|
| 49 |
+
</File>
|
| 50 |
+
<File name="ggml.c" open="1" top="0" tabpos="6" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 51 |
+
<Cursor>
|
| 52 |
+
<Cursor1 position="522" topLine="0" />
|
| 53 |
+
</Cursor>
|
| 54 |
+
</File>
|
| 55 |
+
<File name="common.h" open="1" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 56 |
+
<Cursor>
|
| 57 |
+
<Cursor1 position="0" topLine="0" />
|
| 58 |
+
</Cursor>
|
| 59 |
+
</File>
|
| 60 |
+
<File name="common-ggml.h" open="1" top="0" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 61 |
+
<Cursor>
|
| 62 |
+
<Cursor1 position="141" topLine="0" />
|
| 63 |
+
</Cursor>
|
| 64 |
+
</File>
|
| 65 |
+
<File name="ggml-cpu-impl.h" open="1" top="0" tabpos="9" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 66 |
+
<Cursor>
|
| 67 |
+
<Cursor1 position="0" topLine="0" />
|
| 68 |
+
</Cursor>
|
| 69 |
+
</File>
|
| 70 |
+
<File name="ggml-aarch64.c" open="1" top="0" tabpos="7" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 71 |
+
<Cursor>
|
| 72 |
+
<Cursor1 position="442" topLine="0" />
|
| 73 |
+
</Cursor>
|
| 74 |
+
</File>
|
| 75 |
+
<File name="main-ctx.cpp" open="1" top="0" tabpos="12" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
| 76 |
+
<Cursor>
|
| 77 |
+
<Cursor1 position="114" topLine="659" />
|
| 78 |
+
</Cursor>
|
| 79 |
+
</File>
|
| 80 |
+
</CodeBlocks_layout_file>
|
common-ggml.cpp
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common-ggml.h"
|
| 2 |
+
|
| 3 |
+
#include <regex>
|
| 4 |
+
#include <map>
|
| 5 |
+
|
| 6 |
+
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
|
| 7 |
+
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
|
| 8 |
+
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
|
| 9 |
+
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
|
| 10 |
+
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
|
| 11 |
+
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
|
| 12 |
+
{"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
|
| 13 |
+
{"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
|
| 14 |
+
{"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
|
| 15 |
+
{"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
|
| 16 |
+
{"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
|
| 17 |
+
};
|
| 18 |
+
|
| 19 |
+
void ggml_print_ftypes(FILE * fp) {
|
| 20 |
+
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
|
| 21 |
+
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
enum ggml_ftype ggml_parse_ftype(const char * str) {
|
| 26 |
+
enum ggml_ftype ftype;
|
| 27 |
+
if (str[0] == 'q') {
|
| 28 |
+
const auto it = GGML_FTYPE_MAP.find(str);
|
| 29 |
+
if (it == GGML_FTYPE_MAP.end()) {
|
| 30 |
+
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
|
| 31 |
+
return GGML_FTYPE_UNKNOWN;
|
| 32 |
+
}
|
| 33 |
+
ftype = it->second;
|
| 34 |
+
} else {
|
| 35 |
+
ftype = (enum ggml_ftype) atoi(str);
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
return ftype;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
bool ggml_common_quantize_0(
|
| 42 |
+
std::ifstream & finp,
|
| 43 |
+
std::ofstream & fout,
|
| 44 |
+
const ggml_ftype ftype,
|
| 45 |
+
const std::vector<std::string> & to_quant,
|
| 46 |
+
const std::vector<std::string> & to_skip) {
|
| 47 |
+
|
| 48 |
+
ggml_type qtype = GGML_TYPE_F32;
|
| 49 |
+
|
| 50 |
+
switch (ftype) {
|
| 51 |
+
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
|
| 52 |
+
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
|
| 53 |
+
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
|
| 54 |
+
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
|
| 55 |
+
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
|
| 56 |
+
case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
|
| 57 |
+
case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
|
| 58 |
+
case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
|
| 59 |
+
case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
|
| 60 |
+
case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
|
| 61 |
+
case GGML_FTYPE_UNKNOWN:
|
| 62 |
+
case GGML_FTYPE_ALL_F32:
|
| 63 |
+
case GGML_FTYPE_MOSTLY_F16:
|
| 64 |
+
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
| 65 |
+
case GGML_FTYPE_MOSTLY_IQ2_XXS:
|
| 66 |
+
case GGML_FTYPE_MOSTLY_IQ2_XS:
|
| 67 |
+
case GGML_FTYPE_MOSTLY_IQ2_S:
|
| 68 |
+
case GGML_FTYPE_MOSTLY_IQ3_XXS:
|
| 69 |
+
case GGML_FTYPE_MOSTLY_IQ3_S:
|
| 70 |
+
case GGML_FTYPE_MOSTLY_IQ1_S:
|
| 71 |
+
case GGML_FTYPE_MOSTLY_IQ4_NL:
|
| 72 |
+
case GGML_FTYPE_MOSTLY_IQ4_XS:
|
| 73 |
+
case GGML_FTYPE_MOSTLY_IQ1_M:
|
| 74 |
+
case GGML_FTYPE_MOSTLY_BF16:
|
| 75 |
+
case GGML_FTYPE_MOSTLY_Q4_0_4_4:
|
| 76 |
+
case GGML_FTYPE_MOSTLY_Q4_0_4_8:
|
| 77 |
+
case GGML_FTYPE_MOSTLY_Q4_0_8_8:
|
| 78 |
+
{
|
| 79 |
+
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
| 80 |
+
return false;
|
| 81 |
+
}
|
| 82 |
+
};
|
| 83 |
+
|
| 84 |
+
if (!ggml_is_quantized(qtype)) {
|
| 85 |
+
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
|
| 86 |
+
return false;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
size_t total_size_org = 0;
|
| 90 |
+
size_t total_size_new = 0;
|
| 91 |
+
|
| 92 |
+
std::vector<float> work;
|
| 93 |
+
|
| 94 |
+
std::vector<uint8_t> data_u8;
|
| 95 |
+
std::vector<ggml_fp16_t> data_f16;
|
| 96 |
+
std::vector<float> data_f32;
|
| 97 |
+
|
| 98 |
+
while (true) {
|
| 99 |
+
int32_t n_dims;
|
| 100 |
+
int32_t length;
|
| 101 |
+
int32_t ttype;
|
| 102 |
+
|
| 103 |
+
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
| 104 |
+
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
|
| 105 |
+
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
| 106 |
+
|
| 107 |
+
if (finp.eof()) {
|
| 108 |
+
break;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
int32_t nelements = 1;
|
| 112 |
+
int32_t ne[4] = { 1, 1, 1, 1 };
|
| 113 |
+
for (int i = 0; i < n_dims; ++i) {
|
| 114 |
+
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
| 115 |
+
nelements *= ne[i];
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
std::string name(length, 0);
|
| 119 |
+
finp.read (&name[0], length);
|
| 120 |
+
|
| 121 |
+
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
|
| 122 |
+
|
| 123 |
+
bool quantize = false;
|
| 124 |
+
|
| 125 |
+
// check if we should quantize this tensor
|
| 126 |
+
for (const auto & s : to_quant) {
|
| 127 |
+
if (std::regex_match(name, std::regex(s))) {
|
| 128 |
+
quantize = true;
|
| 129 |
+
break;
|
| 130 |
+
}
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
// check if we should skip this tensor
|
| 134 |
+
for (const auto & s : to_skip) {
|
| 135 |
+
if (std::regex_match(name, std::regex(s))) {
|
| 136 |
+
quantize = false;
|
| 137 |
+
break;
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
// quantize only 2D tensors
|
| 142 |
+
quantize &= (n_dims == 2);
|
| 143 |
+
|
| 144 |
+
if (quantize) {
|
| 145 |
+
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
|
| 146 |
+
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
| 147 |
+
return false;
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
if (ttype == GGML_TYPE_F16) {
|
| 151 |
+
data_f16.resize(nelements);
|
| 152 |
+
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
| 153 |
+
data_f32.resize(nelements);
|
| 154 |
+
for (int i = 0; i < nelements; ++i) {
|
| 155 |
+
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
| 156 |
+
}
|
| 157 |
+
} else {
|
| 158 |
+
data_f32.resize(nelements);
|
| 159 |
+
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
ttype = qtype;
|
| 163 |
+
} else {
|
| 164 |
+
const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
|
| 165 |
+
|
| 166 |
+
data_u8.resize(nelements*bpe);
|
| 167 |
+
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
| 171 |
+
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
|
| 172 |
+
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
| 173 |
+
for (int i = 0; i < n_dims; ++i) {
|
| 174 |
+
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
| 175 |
+
}
|
| 176 |
+
fout.write(&name[0], length);
|
| 177 |
+
|
| 178 |
+
if (quantize) {
|
| 179 |
+
work.resize(nelements); // for quantization
|
| 180 |
+
|
| 181 |
+
size_t cur_size = 0;
|
| 182 |
+
switch ((ggml_type) ttype) {
|
| 183 |
+
case GGML_TYPE_Q4_0:
|
| 184 |
+
case GGML_TYPE_Q4_1:
|
| 185 |
+
case GGML_TYPE_Q5_0:
|
| 186 |
+
case GGML_TYPE_Q5_1:
|
| 187 |
+
case GGML_TYPE_Q8_0:
|
| 188 |
+
case GGML_TYPE_Q2_K:
|
| 189 |
+
case GGML_TYPE_Q3_K:
|
| 190 |
+
case GGML_TYPE_Q4_K:
|
| 191 |
+
case GGML_TYPE_Q5_K:
|
| 192 |
+
case GGML_TYPE_Q6_K:
|
| 193 |
+
{
|
| 194 |
+
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
|
| 195 |
+
} break;
|
| 196 |
+
case GGML_TYPE_F32:
|
| 197 |
+
case GGML_TYPE_F16:
|
| 198 |
+
case GGML_TYPE_I8:
|
| 199 |
+
case GGML_TYPE_I16:
|
| 200 |
+
case GGML_TYPE_I32:
|
| 201 |
+
case GGML_TYPE_I64:
|
| 202 |
+
case GGML_TYPE_F64:
|
| 203 |
+
case GGML_TYPE_Q8_1:
|
| 204 |
+
case GGML_TYPE_Q8_K:
|
| 205 |
+
case GGML_TYPE_IQ2_XXS:
|
| 206 |
+
case GGML_TYPE_IQ2_XS:
|
| 207 |
+
case GGML_TYPE_IQ2_S:
|
| 208 |
+
case GGML_TYPE_IQ3_XXS:
|
| 209 |
+
case GGML_TYPE_IQ3_S:
|
| 210 |
+
case GGML_TYPE_IQ1_S:
|
| 211 |
+
case GGML_TYPE_IQ4_NL:
|
| 212 |
+
case GGML_TYPE_IQ4_XS:
|
| 213 |
+
case GGML_TYPE_IQ1_M:
|
| 214 |
+
case GGML_TYPE_BF16:
|
| 215 |
+
case GGML_TYPE_Q4_0_4_4:
|
| 216 |
+
case GGML_TYPE_Q4_0_4_8:
|
| 217 |
+
case GGML_TYPE_Q4_0_8_8:
|
| 218 |
+
case GGML_TYPE_TQ1_0:
|
| 219 |
+
case GGML_TYPE_TQ2_0:
|
| 220 |
+
case GGML_TYPE_COUNT:
|
| 221 |
+
{
|
| 222 |
+
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
| 223 |
+
return false;
|
| 224 |
+
}
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
|
| 228 |
+
total_size_new += cur_size;
|
| 229 |
+
|
| 230 |
+
printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
|
| 231 |
+
} else {
|
| 232 |
+
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
|
| 233 |
+
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
|
| 234 |
+
total_size_new += data_u8.size();
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
total_size_org += nelements * sizeof(float);
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
| 241 |
+
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
|
| 242 |
+
|
| 243 |
+
return true;
|
| 244 |
+
}
|
common-ggml.h
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "ggml.h"
|
| 4 |
+
|
| 5 |
+
#include <fstream>
|
| 6 |
+
#include <vector>
|
| 7 |
+
#include <string>
|
| 8 |
+
|
| 9 |
+
enum ggml_ftype ggml_parse_ftype(const char * str);
|
| 10 |
+
|
| 11 |
+
void ggml_print_ftypes(FILE * fp = stderr);
|
| 12 |
+
|
| 13 |
+
bool ggml_common_quantize_0(
|
| 14 |
+
std::ifstream & finp,
|
| 15 |
+
std::ofstream & fout,
|
| 16 |
+
const ggml_ftype ftype,
|
| 17 |
+
const std::vector<std::string> & to_quant,
|
| 18 |
+
const std::vector<std::string> & to_skip);
|
common.cpp
ADDED
|
@@ -0,0 +1,911 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#define _USE_MATH_DEFINES // for M_PI
|
| 2 |
+
|
| 3 |
+
#include "common.h"
|
| 4 |
+
|
| 5 |
+
// third-party utilities
|
| 6 |
+
// use your favorite implementations
|
| 7 |
+
#define DR_WAV_IMPLEMENTATION
|
| 8 |
+
#include "dr_wav.h"
|
| 9 |
+
|
| 10 |
+
#include <cmath>
|
| 11 |
+
#include <cstring>
|
| 12 |
+
#include <fstream>
|
| 13 |
+
#include <regex>
|
| 14 |
+
#include <locale>
|
| 15 |
+
#include <codecvt>
|
| 16 |
+
#include <sstream>
|
| 17 |
+
|
| 18 |
+
#if defined(_MSC_VER)
|
| 19 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
| 20 |
+
#endif
|
| 21 |
+
|
| 22 |
+
#ifdef _WIN32
|
| 23 |
+
#include <fcntl.h>
|
| 24 |
+
#include <io.h>
|
| 25 |
+
#endif
|
| 26 |
+
|
| 27 |
+
#ifdef WHISPER_FFMPEG
|
| 28 |
+
// as implemented in ffmpeg_trancode.cpp only embedded in common lib if whisper built with ffmpeg support
|
| 29 |
+
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
|
| 30 |
+
#endif
|
| 31 |
+
|
| 32 |
+
// Function to check if the next argument exists
|
| 33 |
+
// Fetch the value that must follow a command-line flag.
// On success, advances `i` past the value and returns it.
// If the value is missing (or looks like another flag), prints an error
// plus the usage text and terminates the process.
static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
    const bool has_value = (i + 1 < argc) && (argv[i + 1][0] != '-');
    if (!has_value) {
        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
        gpt_print_usage(argc, argv, params);
        exit(0);
    }
    return argv[++i];
}
|
| 42 |
+
|
| 43 |
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
| 44 |
+
for (int i = 1; i < argc; i++) {
|
| 45 |
+
std::string arg = argv[i];
|
| 46 |
+
|
| 47 |
+
if (arg == "-s" || arg == "--seed") {
|
| 48 |
+
params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 49 |
+
} else if (arg == "-t" || arg == "--threads") {
|
| 50 |
+
params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 51 |
+
} else if (arg == "-p" || arg == "--prompt") {
|
| 52 |
+
params.prompt = get_next_arg(i, argc, argv, arg, params);
|
| 53 |
+
} else if (arg == "-n" || arg == "--n_predict") {
|
| 54 |
+
params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 55 |
+
} else if (arg == "-np" || arg == "--n_parallel") {
|
| 56 |
+
params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 57 |
+
} else if (arg == "--top_k") {
|
| 58 |
+
params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 59 |
+
} else if (arg == "--top_p") {
|
| 60 |
+
params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
|
| 61 |
+
} else if (arg == "--temp") {
|
| 62 |
+
params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
|
| 63 |
+
} else if (arg == "--repeat-last-n") {
|
| 64 |
+
params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 65 |
+
} else if (arg == "--repeat-penalty") {
|
| 66 |
+
params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
|
| 67 |
+
} else if (arg == "-b" || arg == "--batch_size") {
|
| 68 |
+
params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 69 |
+
} else if (arg == "-c" || arg == "--context") {
|
| 70 |
+
params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 71 |
+
} else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
|
| 72 |
+
params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 73 |
+
} else if (arg == "--ignore-eos") {
|
| 74 |
+
params.ignore_eos = true;
|
| 75 |
+
} else if (arg == "-m" || arg == "--model") {
|
| 76 |
+
params.model = get_next_arg(i, argc, argv, arg, params);
|
| 77 |
+
} else if (arg == "-i" || arg == "--interactive") {
|
| 78 |
+
params.interactive = true;
|
| 79 |
+
} else if (arg == "-ip" || arg == "--interactive-port") {
|
| 80 |
+
params.interactive = true;
|
| 81 |
+
params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
| 82 |
+
} else if (arg == "-h" || arg == "--help") {
|
| 83 |
+
gpt_print_usage(argc, argv, params);
|
| 84 |
+
exit(0);
|
| 85 |
+
} else if (arg == "-f" || arg == "--file") {
|
| 86 |
+
get_next_arg(i, argc, argv, arg, params);
|
| 87 |
+
std::ifstream file(argv[i]);
|
| 88 |
+
if (!file) {
|
| 89 |
+
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
| 90 |
+
break;
|
| 91 |
+
}
|
| 92 |
+
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
| 93 |
+
if (params.prompt.back() == '\n') {
|
| 94 |
+
params.prompt.pop_back();
|
| 95 |
+
}
|
| 96 |
+
} else if (arg == "-tt" || arg == "--token_test") {
|
| 97 |
+
params.token_test = get_next_arg(i, argc, argv, arg, params);
|
| 98 |
+
}
|
| 99 |
+
else {
|
| 100 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 101 |
+
gpt_print_usage(argc, argv, params);
|
| 102 |
+
exit(0);
|
| 103 |
+
}
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
return true;
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
// Print the tool's command-line usage/help text to stderr.
// `argc` is unused (kept for signature symmetry with gpt_params_parse);
// current defaults are read out of `params` so the help stays accurate.
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        load prompt from a file\n");
    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
    fprintf(stderr, "                        test tokenization\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  -c N, --context N     context / KV cache size (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore EOS token during generation\n");
    fprintf(stderr, "  -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
}
|
| 136 |
+
|
| 137 |
+
// Return one of ten fixed prompt openers, selected by a single draw from
// `rng` (the generator is advanced by exactly one call).
std::string gpt_random_prompt(std::mt19937 & rng) {
    static const char * k_prompts[10] = {
        "So",
        "Once upon a time",
        "When",
        "The",
        "After",
        "If",
        "import",
        "He",
        "She",
        "They",
    };

    return k_prompts[rng() % 10];
}
|
| 154 |
+
|
| 155 |
+
// Strip leading and trailing ASCII whitespace from `s` (equivalent to
// removing regex \s+ at both ends).
std::string trim(const std::string & s) {
    static const char * k_ws = " \t\n\r\f\v";

    const size_t first = s.find_first_not_of(k_ws);
    if (first == std::string::npos) {
        return "";   // all whitespace (or empty)
    }
    const size_t last = s.find_last_not_of(k_ws);

    return s.substr(first, last - first + 1);
}
|
| 159 |
+
|
| 160 |
+
// Return a copy of `s` with every occurrence of `from` replaced by `to`.
// Scans left-to-right and skips over each inserted replacement, so text in
// `to` is never re-matched.
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    // guard: an empty needle matches at every position and would make the
    // loop below spin forever when `to` is also empty
    if (from.empty()) {
        return s;
    }

    std::string result = s;
    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length();
    }
    return result;
}
|
| 169 |
+
|
| 170 |
+
// Register `token` as a special token; gpt_tokenize() splits these out of
// the input verbatim before applying the regular word-splitting rules.
void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.push_back(token);
}
|
| 173 |
+
|
| 174 |
+
// Minimal, special-purpose parser for a flat JSON object of the form
// {"token": id, ...} (a GPT-2 style vocab file). Returns token -> id.
// Exits the process if the file cannot be opened; returns an empty map if
// the file does not start with '{'.
// NOTE(review): this is a hand-rolled character scanner, not a general
// JSON parser — nested objects/arrays and most escapes are not handled.
std::map<std::string, int32_t> json_parse(const std::string & fname) {
    std::map<std::string, int32_t> result;

    // read file into string
    std::string json;
    {
        std::ifstream ifs(fname);
        if (!ifs) {
            fprintf(stderr, "Failed to open %s\n", fname.c_str());
            exit(1);
        }

        json = std::string((std::istreambuf_iterator<char>(ifs)),
                (std::istreambuf_iterator<char>()));
    }

    // must look like a JSON object
    if (json[0] != '{') {
        return result;
    }

    // parse json
    {
        // state machine: has_key == "currently accumulating the value",
        // in_token == "currently inside a double-quoted string"
        bool has_key  = false;
        bool in_token = false;

        std::string str_key = "";
        std::string str_val = "";

        int n = json.size();
        for (int i = 1; i < n; ++i) {
            if (!in_token) {
                // between tokens: skip spaces, a quote opens a string
                if (json[i] == ' ') continue;
                if (json[i] == '"') {
                    in_token = true;
                    continue;
                }
            } else {
                if (json[i] == '\\' && i+1 < n) {
                    // keep the backslash and the escaped char as-is; the
                    // common escapes are normalized after the key closes
                    if (has_key == false) {
                        str_key += json[i];
                    } else {
                        str_val += json[i];
                    }
                    ++i;
                } else if (json[i] == '"') {
                    if (has_key == false) {
                        // key string just closed: skip " : " and decide
                        // whether the value is quoted or a bare number
                        has_key = true;
                        ++i;
                        while (json[i] == ' ') ++i;
                        ++i; // :
                        while (json[i] == ' ') ++i;
                        if (json[i] != '\"') {
                            // unquoted (numeric) value: read up to , or }
                            while (json[i] != ',' && json[i] != '}') {
                                str_val += json[i++];
                            }
                            has_key = false;
                        } else {
                            // quoted value: fall back into string mode
                            in_token = true;
                            continue;
                        }
                    } else {
                        has_key = false;
                    }

                    // normalize the GPT-2 byte-level markers and escaped quotes
                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
                    str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "

                    try {
                        result[str_key] = std::stoi(str_val);
                    } catch (...) {
                        // non-integer value: entry is silently dropped
                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());

                    }
                    str_key = "";
                    str_val = "";
                    in_token = false;
                    continue;
                }
                // ordinary character inside a string: append to key or value
                if (has_key == false) {
                    str_key += json[i];
                } else {
                    str_val += json[i];
                }
            }
        }
    }

    return result;
}
|
| 264 |
+
|
| 265 |
+
// Narrow a wide string to a UTF-8 encoded std::string.
// Uses std::wstring_convert/std::codecvt_utf8 (deprecated in C++17 but
// still available).
std::string convert_to_utf8(const std::wstring & input) {
    return std::wstring_convert<std::codecvt_utf8<wchar_t>>().to_bytes(input);
}
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
// Widen a UTF-8 encoded std::string to a std::wstring.
// Uses std::wstring_convert/std::codecvt_utf8 (deprecated in C++17 but
// still available).
std::wstring convert_to_wstring(const std::string & input) {
    return std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(input);
}
|
| 275 |
+
|
| 276 |
+
// Append GPT-2 style pre-tokenization fragments of `str` to `words`:
// contractions ('s, 't, 're, ...), optionally space-prefixed letter runs,
// digit runs, punctuation runs, and remaining whitespace.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    static const std::regex re(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");

    // the pattern has no capture groups, so each match contributes exactly
    // its full matched text
    const std::sregex_iterator end;
    for (std::sregex_iterator it(str.begin(), str.end(), re); it != end; ++it) {
        words.push_back(it->str());
    }
}
|
| 288 |
+
|
| 289 |
+
// Tokenize `text` into vocabulary ids.
// Special tokens registered on `vocab` are matched verbatim first; the text
// in-between is split with gpt_split_words(), and each resulting word is
// encoded by greedily taking the longest substring present in
// vocab.token_to_id. Characters with no matching token are reported on
// stderr and skipped.
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // first split the text into words
    {
        std::string str = text;

        // Generate the subpattern from the special_tokens vector if it's not empty
        if (!vocab.special_tokens.empty()) {
            // escape regex metacharacters so tokens are matched literally
            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
            std::string special_tokens_subpattern;
            for (const auto & token : vocab.special_tokens) {
                if (!special_tokens_subpattern.empty()) {
                    special_tokens_subpattern += "|";
                }
                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
            }

            std::regex re(special_tokens_subpattern);
            std::smatch m;
            // Split the text by special tokens.
            while (std::regex_search(str, m, re)) {
                // Split the substrings in-between special tokens into words.
                gpt_split_words(m.prefix(), words);
                // Add matched special tokens as words.
                for (auto x : m) {
                    words.push_back(x);
                }
                str = m.suffix();
            }
            // Remaining text without special tokens will be handled below.
        }

        gpt_split_words(str, words);
    }

    // find the longest token that forms each word in words:
    // try the longest candidate substring first, shrinking from the right;
    // on a match, continue after it; if even a single char fails, skip it
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        for (int i = 0; i < (int) word.size(); ){
            for (int j = word.size() - 1; j >= i; j--){
                auto cand = word.substr(i, j-i+1);
                auto it = vocab.token_to_id.find(cand);
                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
                    tokens.push_back(it->second);
                    i = j + 1;
                    break;
                }
                else if (j == i){ // word.substr(i, 1) has no matching
                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
                    i++;
                }
            }
        }
    }

    return tokens;
}
|
| 347 |
+
|
| 348 |
+
static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
|
| 349 |
+
std::vector<gpt_vocab::id> output;
|
| 350 |
+
std::stringstream ss(input);
|
| 351 |
+
std::string token;
|
| 352 |
+
|
| 353 |
+
while (std::getline(ss, token, delimiter)) {
|
| 354 |
+
output.push_back(std::stoi(token));
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
return output;
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
// Load tokenizer reference cases from `fpath_test`.
// Each line has the form "<text> => <id>,<id>,..."; lines without the
// " => " separator are skipped. Returns text -> expected token ids.
static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
    std::map<std::string, std::vector<gpt_vocab::id>> tests;

    if (fpath_test.empty()){
        fprintf(stderr, "%s : No test file found.\n", __func__);
        return tests;
    }

    std::ifstream fin(fpath_test, std::ios_base::in);

    const std::string delimeter = " => ";

    for (std::string line; std::getline(fin, line); ) {
        const size_t pos = line.find(delimeter);
        if (pos == std::string::npos) {
            continue;
        }
        const std::string text     = line.substr(0, pos);
        const std::string s_tokens = line.substr(pos + delimeter.size());
        tests[text] = parse_tokens_from_string(s_tokens, ',');
    }

    return tests;
}
|
| 382 |
+
|
| 383 |
+
// Run the tokenizer against the reference cases in `fpath_test` and report
// every mismatch (hf reference ids vs our ids) plus a summary on stderr.
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
    const auto tests = extract_tests_from_file(fpath_test);

    size_t n_fails = 0;

    for (const auto & tc : tests) {
        const std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, tc.first);

        if (tokens == tc.second) {
            continue;
        }
        n_fails++;

        // print out failure cases
        fprintf(stderr, "%s : failed test: '%s'\n", __func__, tc.first.c_str());
        fprintf(stderr, "%s : tokens in hf: ", __func__);
        for (const auto & t : tc.second) {
            fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
        }
        fprintf(stderr, "\n");
        fprintf(stderr, "%s : tokens in ggml: ", __func__);
        for (const auto & t : tokens) {
            fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
        }
        fprintf(stderr, "\n");
    }

    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
}
|
| 411 |
+
|
| 412 |
+
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
|
| 413 |
+
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
|
| 414 |
+
|
| 415 |
+
vocab.token_to_id = ::json_parse(fname);
|
| 416 |
+
|
| 417 |
+
for (const auto & kv : vocab.token_to_id) {
|
| 418 |
+
vocab.id_to_token[kv.second] = kv.first;
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
|
| 422 |
+
|
| 423 |
+
// print the vocabulary
|
| 424 |
+
//for (auto kv : vocab.token_to_id) {
|
| 425 |
+
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
|
| 426 |
+
//}
|
| 427 |
+
|
| 428 |
+
return true;
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
// Sample a token id from `logits` using top-k followed by top-p (nucleus)
// filtering, at temperature `temp` (logits are scaled by 1/temp).
// Returns the sampled id, drawn from the filtered, renormalized
// distribution via `rng`.
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int top_k,
        double top_p,
        double temp,
        std::mt19937 & rng) {
    int n_logits = vocab.id_to_token.size();

    // clamp top_k to [1, n_logits]: std::partial_sort with a middle
    // iterator past end() (top_k > n_logits) or before begin() (top_k <= 0)
    // is undefined behavior
    if (top_k <= 0 || top_k > n_logits) {
        top_k = n_logits;
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        // apply temperature scaling
        const double scale = 1.0/temp;
        for (int i = 0; i < n_logits; ++i) {
            logits_id.push_back(std::make_pair(logits[i]*scale, i));
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    // softmax over the kept logits, shifted by the max for numerical stability
    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    // nucleus filtering: keep the smallest prefix of tokens whose cumulative
    // probability reaches top_p, then renormalize the remainder
    if (top_p < 1.0f) {
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}
|
| 510 |
+
|
| 511 |
+
// Sample a token id from `logits` using top-k / top-p filtering at
// temperature `temp`, with a CTRL-style repetition penalty applied to the
// last `repeat_last_n` tokens of `last_n_tokens_data`.
// temp <= 0 selects the argmax token deterministically (greedy decoding).
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng) {

    int n_logits = vocab.id_to_token.size();

    const auto * plogits = logits;

    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }

    // clamp top_k to [1, n_logits]: std::partial_sort with a middle
    // iterator outside [begin, end] is undefined behavior
    if (top_k <= 0 || top_k > n_logits) {
        top_k = n_logits;
    }

    // clamp repeat_last_n so that last_n_tokens.end() - repeat_last_n
    // cannot run before last_n_tokens.begin() (undefined behavior)
    if (repeat_last_n > (int) last_n_tokens.size()) {
        repeat_last_n = (int) last_n_tokens.size();
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    // softmax over the kept logits, shifted by the max for numerical stability
    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    // nucleus filtering: keep the smallest prefix of tokens whose cumulative
    // probability reaches top_p, then renormalize the remainder
    if (top_p < 1.0f) {
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;

}
|
| 626 |
+
|
| 627 |
+
// Return true if `buf` looks like a complete in-memory RIFF/WAVE file:
// "RIFF" magic, "WAVE" form type, and a RIFF chunk size consistent with
// the buffer length.
bool is_wav_buffer(const std::string buf) {
    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
    if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
        return false;
    }

    // read the chunk size with memcpy: casting buf.data()+4 to uint32_t*
    // relies on alignment and violates strict aliasing (undefined behavior)
    uint32_t chunk_size = 0;
    std::memcpy(&chunk_size, buf.data() + 4, sizeof(chunk_size));
    if (chunk_size + 8 != buf.size()) {
        return false;
    }

    return true;
}
|
| 641 |
+
|
| 642 |
+
// Read 16-bit PCM audio into mono float samples in [-1, 1).
// `fname` may be "-" (read raw bytes from stdin), an in-memory WAV buffer
// (detected via is_wav_buffer), or a file path. With WHISPER_FFMPEG, a
// non-WAV file is transcoded through ffmpeg_decode_audio first.
// On success: pcmf32 holds the mono mix; when `stereo` is true, pcmf32s
// additionally holds the left/right channels. Requires mono or stereo
// input at COMMON_SAMPLE_RATE with 16 bits per sample; returns false (with
// a message on stderr) otherwise.
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output

    if (fname == "-") {
        {
            #ifdef _WIN32
            // stdin defaults to text mode on Windows; switch to binary
            _setmode(_fileno(stdin), _O_BINARY);
            #endif

            // slurp all of stdin into wav_data
            uint8_t buf[1024];
            while (true)
            {
                const size_t n = fread(buf, 1, sizeof(buf), stdin);
                if (n == 0) {
                    break;
                }
                wav_data.insert(wav_data.end(), buf, buf + n);
            }
        }

        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from stdin\n");
            return false;
        }

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
    else if (is_wav_buffer(fname)) {
        // `fname` is itself a WAV file held in memory
        if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
            return false;
        }
    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
#if defined(WHISPER_FFMPEG)
        // not a WAV file: try to transcode it with ffmpeg, then parse the result
        if (ffmpeg_decode_audio(fname, wav_data) != 0) {
            fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
            return false;
        }
        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to read wav data as wav \n");
            return false;
        }
#else
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
#endif
    }

    if (wav.channels != 1 && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    if (stereo && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
        drwav_uninit(&wav);
        return false;
    }

    if (wav.bitsPerSample != 16) {
        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    // frame count: from the header when reading a file, otherwise estimated
    // from the raw byte count of the piped/decoded data
    // NOTE(review): the wav_data-based estimate includes header bytes, so it
    // can slightly overestimate the frame count — verify against dr_wav
    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);

    std::vector<int16_t> pcm16;
    pcm16.resize(n*wav.channels);
    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
    drwav_uninit(&wav);

    // convert to mono, float
    pcmf32.resize(n);
    if (wav.channels == 1) {
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[i])/32768.0f;
        }
    } else {
        // average the interleaved L/R pair into one sample
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
        }
    }

    if (stereo) {
        // convert to stereo, float
        pcmf32s.resize(2);

        pcmf32s[0].resize(n);
        pcmf32s[1].resize(n);
        for (uint64_t i = 0; i < n; i++) {
            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
        }
    }

    return true;
}
|
| 749 |
+
|
| 750 |
+
// Apply a single-pole high-pass filter to PCM audio, in place.
// Suppresses frequencies below `cutoff` Hz for a signal sampled at
// `sample_rate` Hz. Note that each output sample feeds the next step
// (data[i - 1] has already been filtered when it is read).
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        return; // nothing to filter; also avoids reading data[0] (was UB)
    }

    const float rc    = 1.0f / (2.0f * M_PI * cutoff); // RC time constant of the filter
    const float dt    = 1.0f / sample_rate;            // sample period
    const float alpha = dt / (rc + dt);                // smoothing factor

    float y = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
}
|
| 762 |
+
|
| 763 |
+
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
| 764 |
+
const int n_samples = pcmf32.size();
|
| 765 |
+
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
| 766 |
+
|
| 767 |
+
if (n_samples_last >= n_samples) {
|
| 768 |
+
// not enough samples - assume no speech
|
| 769 |
+
return false;
|
| 770 |
+
}
|
| 771 |
+
|
| 772 |
+
if (freq_thold > 0.0f) {
|
| 773 |
+
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
| 774 |
+
}
|
| 775 |
+
|
| 776 |
+
float energy_all = 0.0f;
|
| 777 |
+
float energy_last = 0.0f;
|
| 778 |
+
|
| 779 |
+
for (int i = 0; i < n_samples; i++) {
|
| 780 |
+
energy_all += fabsf(pcmf32[i]);
|
| 781 |
+
|
| 782 |
+
if (i >= n_samples - n_samples_last) {
|
| 783 |
+
energy_last += fabsf(pcmf32[i]);
|
| 784 |
+
}
|
| 785 |
+
}
|
| 786 |
+
|
| 787 |
+
energy_all /= n_samples;
|
| 788 |
+
energy_last /= n_samples_last;
|
| 789 |
+
|
| 790 |
+
if (verbose) {
|
| 791 |
+
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
| 792 |
+
}
|
| 793 |
+
|
| 794 |
+
if (energy_last > vad_thold*energy_all) {
|
| 795 |
+
return false;
|
| 796 |
+
}
|
| 797 |
+
|
| 798 |
+
return true;
|
| 799 |
+
}
|
| 800 |
+
|
| 801 |
+
// Compute a normalized similarity score in [0, 1] between two strings based
// on the Levenshtein edit distance (1.0 = identical, 0.0 = entirely different).
float similarity(const std::string & s0, const std::string & s1) {
    // both empty: identical by definition; also avoids a 0/0 (NaN) below
    if (s0.empty() && s1.empty()) {
        return 1.0f;
    }

    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    // rolling two-column dynamic programming for the edit distance
    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    for (size_t i = 0; i < len1; i++) {
        prevCol[i] = i;
    }

    for (size_t i = 0; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            // min of: insertion, deletion, substitution (free when chars match)
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
        }
        col.swap(prevCol);
    }

    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
|
| 824 |
+
|
| 825 |
+
bool sam_params_parse(int argc, char ** argv, sam_params & params) {
|
| 826 |
+
for (int i = 1; i < argc; i++) {
|
| 827 |
+
std::string arg = argv[i];
|
| 828 |
+
|
| 829 |
+
if (arg == "-s" || arg == "--seed") {
|
| 830 |
+
params.seed = std::stoi(argv[++i]);
|
| 831 |
+
} else if (arg == "-t" || arg == "--threads") {
|
| 832 |
+
params.n_threads = std::stoi(argv[++i]);
|
| 833 |
+
} else if (arg == "-m" || arg == "--model") {
|
| 834 |
+
params.model = argv[++i];
|
| 835 |
+
} else if (arg == "-i" || arg == "--inp") {
|
| 836 |
+
params.fname_inp = argv[++i];
|
| 837 |
+
} else if (arg == "-o" || arg == "--out") {
|
| 838 |
+
params.fname_out = argv[++i];
|
| 839 |
+
} else if (arg == "-h" || arg == "--help") {
|
| 840 |
+
sam_print_usage(argc, argv, params);
|
| 841 |
+
exit(0);
|
| 842 |
+
} else {
|
| 843 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 844 |
+
sam_print_usage(argc, argv, params);
|
| 845 |
+
exit(0);
|
| 846 |
+
}
|
| 847 |
+
}
|
| 848 |
+
|
| 849 |
+
return true;
|
| 850 |
+
}
|
| 851 |
+
|
| 852 |
+
// Print the SAM example's command-line help to stderr.
void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
    FILE * const out = stderr;

    fprintf(out, "usage: %s [options]\n", argv[0]);
    fprintf(out, "\n");
    fprintf(out, "options:\n");
    fprintf(out, "  -h, --help            show this help message and exit\n");
    fprintf(out, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
    fprintf(out, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(out, "  -m FNAME, --model FNAME\n");
    fprintf(out, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(out, "  -i FNAME, --inp FNAME\n");
    fprintf(out, "                        input file (default: %s)\n", params.fname_inp.c_str());
    fprintf(out, "  -o FNAME, --out FNAME\n");
    fprintf(out, "                        output file (default: %s)\n", params.fname_out.c_str());
    fprintf(out, "\n");
}
|
| 867 |
+
|
| 868 |
+
// 500  -> 00:00:05.000
|
| 869 |
+
// 6000 -> 00:01:00.000
|
| 870 |
+
// Render a timestamp given in units of 10 ms as "HH:MM:SS.mmm"
// (or "HH:MM:SS,mmm" when comma == true), e.g. 500 -> "00:00:05.000".
std::string to_timestamp(int64_t t, bool comma) {
    int64_t ms = t * 10; // convert 10-ms ticks to milliseconds

    const int64_t hr = ms / (1000 * 60 * 60);
    ms -= hr * (1000 * 60 * 60);

    const int64_t min = ms / (1000 * 60);
    ms -= min * (1000 * 60);

    const int64_t sec = ms / 1000;
    ms -= sec * 1000;

    char buf[32];
    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) ms);

    return std::string(buf);
}
|
| 884 |
+
|
| 885 |
+
// Map a timestamp (in 10-ms units) to a sample index for audio sampled at
// `whisper_sample_rate` Hz, clamped into the valid range [0, n_samples - 1].
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) {
    int idx = (int) ((t*whisper_sample_rate)/100);
    if (idx > n_samples - 1) {
        idx = n_samples - 1;
    }
    if (idx < 0) {
        idx = 0;
    }
    return idx;
}
|
| 888 |
+
|
| 889 |
+
// Return true if `fileName` exists and can be opened for reading.
bool is_file_exist(const char *fileName)
{
    return std::ifstream(fileName).good();
}
|
| 894 |
+
|
| 895 |
+
// Write `text` to the file at `path`, then run "command voice_id path" via
// system() to speak it aloud. Returns false if the file cannot be written or
// the command exits with a non-zero status.
// NOTE(review): command/path are passed to system() unescaped - the caller
// must trust these inputs.
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id)
{
    std::ofstream speak_file(path.c_str());
    if (speak_file.fail()) {
        fprintf(stderr, "%s: failed to open speak_file\n", __func__);
        return false;
    }

    speak_file.write(text.c_str(), text.size());
    speak_file.close();

    const std::string cmd = command + " " + std::to_string(voice_id) + " " + path;
    if (system(cmd.c_str()) != 0) {
        fprintf(stderr, "%s: failed to speak\n", __func__);
        return false;
    }

    return true;
}
|
common.h
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Various helper functions and utilities
|
| 2 |
+
|
| 3 |
+
#pragma once
|
| 4 |
+
|
| 5 |
+
#include <string>
|
| 6 |
+
#include <map>
|
| 7 |
+
#include <vector>
|
| 8 |
+
#include <random>
|
| 9 |
+
#include <thread>
|
| 10 |
+
#include <ctime>
|
| 11 |
+
#include <fstream>
|
| 12 |
+
#include <sstream>
|
| 13 |
+
|
| 14 |
+
#define COMMON_SAMPLE_RATE 16000
|
| 15 |
+
|
| 16 |
+
//
|
| 17 |
+
// GPT CLI argument parsing
|
| 18 |
+
//
|
| 19 |
+
|
| 20 |
+
// Command-line / generation parameters for the GPT example programs.
struct gpt_params {
    int32_t seed = -1; // RNG seed
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); // number of compute threads
    int32_t n_predict = 200; // new tokens to predict
    int32_t n_parallel = 1; // number of parallel streams
    int32_t n_batch = 32; // batch size for prompt processing
    int32_t n_ctx = 2048; // context size (this is the KV cache max size)
    int32_t n_gpu_layers = 0; // number of layers to offload to the GPU

    bool ignore_eos = false; // ignore EOS token when generating text

    // sampling parameters
    int32_t top_k = 40;            // keep only the k most likely tokens
    float   top_p = 0.9f;          // nucleus sampling: cumulative probability cutoff
    float   temp  = 0.9f;          // sampling temperature
    int32_t repeat_last_n = 64;    // window of recent tokens considered for the repeat penalty
    float   repeat_penalty = 1.00f; // penalty for repeated tokens (1.0 = disabled)

    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
    std::string prompt = "";       // initial prompt text (empty = random prompt)
    std::string token_test = "";   // tokenizer test file - presumably consumed by test_gpt_tokenizer; confirm

    bool    interactive = false;      // run in interactive mode
    int32_t interactive_port = -1;    // port for interactive mode (-1 = disabled) - TODO confirm semantics
};
|
| 45 |
+
|
| 46 |
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
| 47 |
+
|
| 48 |
+
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
| 49 |
+
|
| 50 |
+
std::string gpt_random_prompt(std::mt19937 & rng);
|
| 51 |
+
|
| 52 |
+
//
|
| 53 |
+
// Vocab utils
|
| 54 |
+
//
|
| 55 |
+
|
| 56 |
+
std::string trim(const std::string & s);
|
| 57 |
+
|
| 58 |
+
std::string replace(
|
| 59 |
+
const std::string & s,
|
| 60 |
+
const std::string & from,
|
| 61 |
+
const std::string & to);
|
| 62 |
+
|
| 63 |
+
// Token vocabulary: bidirectional mapping between token strings and ids,
// plus a list of special tokens matched verbatim during tokenization.
struct gpt_vocab {
    using id    = int32_t;     // token id
    using token = std::string; // token text

    std::map<token, id> token_to_id; // token text -> id
    std::map<id, token> id_to_token; // id -> token text
    std::vector<std::string> special_tokens; // tokens to be matched as whole units

    // Register a token that should be treated as special (matched whole).
    void add_special_token(const std::string & token);
};
|
| 73 |
+
|
| 74 |
+
// poor-man's JSON parsing
|
| 75 |
+
std::map<std::string, int32_t> json_parse(const std::string & fname);
|
| 76 |
+
|
| 77 |
+
std::string convert_to_utf8(const std::wstring & input);
|
| 78 |
+
|
| 79 |
+
std::wstring convert_to_wstring(const std::string & input);
|
| 80 |
+
|
| 81 |
+
void gpt_split_words(std::string str, std::vector<std::string>& words);
|
| 82 |
+
|
| 83 |
+
// split text into tokens
|
| 84 |
+
//
|
| 85 |
+
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
| 86 |
+
//
|
| 87 |
+
// Regex (Python):
|
| 88 |
+
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
|
| 89 |
+
//
|
| 90 |
+
// Regex (C++):
|
| 91 |
+
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
|
| 92 |
+
//
|
| 93 |
+
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
|
| 94 |
+
|
| 95 |
+
// test outputs of gpt_tokenize
|
| 96 |
+
//
|
| 97 |
+
// - compare with tokens generated by the huggingface tokenizer
|
| 98 |
+
// - test cases are chosen based on the model's main language (under 'prompt' directory)
|
| 99 |
+
// - if all sentences are tokenized identically, print 'All tests passed.'
|
| 100 |
+
// - otherwise, print sentence, huggingface tokens, ggml tokens
|
| 101 |
+
//
|
| 102 |
+
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
|
| 103 |
+
|
| 104 |
+
// load the tokens from encoder.json
|
| 105 |
+
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
| 106 |
+
|
| 107 |
+
// sample next token given probabilities for each embedding
|
| 108 |
+
//
|
| 109 |
+
// - consider only the top K tokens
|
| 110 |
+
// - from them, consider only the top tokens with cumulative probability > P
|
| 111 |
+
//
|
| 112 |
+
// TODO: not sure if this implementation is correct
|
| 113 |
+
// TODO: temperature is not implemented
|
| 114 |
+
//
|
| 115 |
+
gpt_vocab::id gpt_sample_top_k_top_p(
|
| 116 |
+
const gpt_vocab & vocab,
|
| 117 |
+
const float * logits,
|
| 118 |
+
int top_k,
|
| 119 |
+
double top_p,
|
| 120 |
+
double temp,
|
| 121 |
+
std::mt19937 & rng);
|
| 122 |
+
|
| 123 |
+
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
|
| 124 |
+
const gpt_vocab & vocab,
|
| 125 |
+
const float * logits,
|
| 126 |
+
const int32_t * last_n_tokens_data,
|
| 127 |
+
size_t last_n_tokens_data_size,
|
| 128 |
+
int top_k,
|
| 129 |
+
double top_p,
|
| 130 |
+
double temp,
|
| 131 |
+
int repeat_last_n,
|
| 132 |
+
float repeat_penalty,
|
| 133 |
+
std::mt19937 & rng);
|
| 134 |
+
|
| 135 |
+
//
|
| 136 |
+
// Audio utils
|
| 137 |
+
//
|
| 138 |
+
|
| 139 |
+
// Check if a buffer is a WAV audio file
|
| 140 |
+
bool is_wav_buffer(const std::string buf);
|
| 141 |
+
|
| 142 |
+
// Read WAV audio file and store the PCM data into pcmf32
|
| 143 |
+
// fname can be a buffer of WAV data instead of a filename
|
| 144 |
+
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
|
| 145 |
+
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
|
| 146 |
+
bool read_wav(
|
| 147 |
+
const std::string & fname,
|
| 148 |
+
std::vector<float> & pcmf32,
|
| 149 |
+
std::vector<std::vector<float>> & pcmf32s,
|
| 150 |
+
bool stereo);
|
| 151 |
+
|
| 152 |
+
// Write PCM data into WAV audio file.
// Incrementally writes normalized float samples as 16-bit PCM:
// open(...) once, then write(...) any number of times, then close().
class wav_writer {
private:
    std::ofstream file;       // output stream of the currently open WAV file
    uint32_t dataSize = 0;    // bytes of PCM written to the "data" chunk so far
    std::string wav_filename; // path of the currently open file

    // Write the canonical 44-byte RIFF/WAVE header.
    // The RIFF chunk size (offset 4) and data chunk size (offset 40) are
    // written as zero placeholders here and back-patched by write_audio().
    // NOTE(review): multi-byte fields are written in host byte order; WAV
    // requires little-endian, so this assumes a little-endian machine.
    bool write_header(const uint32_t sample_rate,
                      const uint16_t bits_per_sample,
                      const uint16_t channels) {

        file.write("RIFF", 4);
        file.write("\0\0\0\0", 4); // Placeholder for file size
        file.write("WAVE", 4);
        file.write("fmt ", 4);

        const uint32_t sub_chunk_size = 16; // size of the "fmt " chunk for PCM
        const uint16_t audio_format = 1; // PCM format
        const uint32_t byte_rate = sample_rate * channels * bits_per_sample / 8;
        const uint16_t block_align = channels * bits_per_sample / 8;

        file.write(reinterpret_cast<const char *>(&sub_chunk_size), 4);
        file.write(reinterpret_cast<const char *>(&audio_format), 2);
        file.write(reinterpret_cast<const char *>(&channels), 2);
        file.write(reinterpret_cast<const char *>(&sample_rate), 4);
        file.write(reinterpret_cast<const char *>(&byte_rate), 4);
        file.write(reinterpret_cast<const char *>(&block_align), 2);
        file.write(reinterpret_cast<const char *>(&bits_per_sample), 2);
        file.write("data", 4);
        file.write("\0\0\0\0", 4); // Placeholder for data size

        return true;
    }

    // Append float samples as 16-bit PCM and back-patch the header size fields.
    // It is assumed that PCM data is normalized to a range from -1 to 1
    // (values outside that range overflow the int16_t conversion).
    bool write_audio(const float * data, size_t length) {
        for (size_t i = 0; i < length; ++i) {
            const int16_t intSample = int16_t(data[i] * 32767);
            file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
            dataSize += sizeof(int16_t);
        }
        if (file.is_open()) {
            // patch RIFF size (offset 4) and data size (offset 40) so the file
            // stays valid after every write; offsets match the 44-byte header above
            file.seekp(4, std::ios::beg);
            uint32_t fileSize = 36 + dataSize; // header bytes after the 8-byte RIFF preamble
            file.write(reinterpret_cast<char *>(&fileSize), 4);
            file.seekp(40, std::ios::beg);
            file.write(reinterpret_cast<char *>(&dataSize), 4);
            file.seekp(0, std::ios::end); // return to the end for the next append
        }
        return true;
    }

    // Open `filename` for writing unless it is already the open file.
    // Switching to a different filename closes the previous file first
    // and resets the data-size counter.
    bool open_wav(const std::string & filename) {
        if (filename != wav_filename) {
            if (file.is_open()) {
                file.close();
            }
        }
        if (!file.is_open()) {
            file.open(filename, std::ios::binary);
            wav_filename = filename;
            dataSize = 0;
        }
        return file.is_open();
    }

public:
    // Open (or re-open) a WAV file and write its header.
    // Returns false if the file could not be opened.
    bool open(const std::string & filename,
              const uint32_t sample_rate,
              const uint16_t bits_per_sample,
              const uint16_t channels) {

        if (open_wav(filename)) {
            write_header(sample_rate, bits_per_sample, channels);
        } else {
            return false;
        }

        return true;
    }

    // Close the underlying stream. Always returns true.
    bool close() {
        file.close();
        return true;
    }

    // Append `length` float samples (normalized to [-1, 1]) to the file.
    bool write(const float * data, size_t length) {
        return write_audio(data, length);
    }

    ~wav_writer() {
        if (file.is_open()) {
            file.close();
        }
    }
};
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
// Apply a high-pass frequency filter to PCM audio
|
| 251 |
+
// Suppresses frequencies below cutoff Hz
|
| 252 |
+
void high_pass_filter(
|
| 253 |
+
std::vector<float> & data,
|
| 254 |
+
float cutoff,
|
| 255 |
+
float sample_rate);
|
| 256 |
+
|
| 257 |
+
// Basic voice activity detection (VAD) using audio energy adaptive threshold
|
| 258 |
+
bool vad_simple(
|
| 259 |
+
std::vector<float> & pcmf32,
|
| 260 |
+
int sample_rate,
|
| 261 |
+
int last_ms,
|
| 262 |
+
float vad_thold,
|
| 263 |
+
float freq_thold,
|
| 264 |
+
bool verbose);
|
| 265 |
+
|
| 266 |
+
// compute similarity between two strings using Levenshtein distance
|
| 267 |
+
float similarity(const std::string & s0, const std::string & s1);
|
| 268 |
+
|
| 269 |
+
//
|
| 270 |
+
// SAM argument parsing
|
| 271 |
+
//
|
| 272 |
+
|
| 273 |
+
// Command-line parameters for the SAM example.
struct sam_params {
    int32_t seed = -1; // RNG seed
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); // number of compute threads

    std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path
    std::string fname_inp = "img.jpg"; // input image path
    std::string fname_out = "img.out"; // output file path
};
|
| 281 |
+
|
| 282 |
+
bool sam_params_parse(int argc, char ** argv, sam_params & params);
|
| 283 |
+
|
| 284 |
+
void sam_print_usage(int argc, char ** argv, const sam_params & params);
|
| 285 |
+
|
| 286 |
+
//
|
| 287 |
+
// Terminal utils
|
| 288 |
+
//
|
| 289 |
+
|
| 290 |
+
#define SQR(X) ((X) * (X))
|
| 291 |
+
#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
|
| 292 |
+
|
| 293 |
+
/**
|
| 294 |
+
* Quantizes 24-bit RGB to xterm256 code range [16,256).
|
| 295 |
+
*/
|
| 296 |
+
/**
 * Quantizes 24-bit RGB to xterm256 code range [16,256).
 * Picks whichever of the 6x6x6 color cube (codes 16..231) or the 24-step
 * grayscale ramp (codes 232..255) is closer in squared RGB distance.
 */
static int rgb2xterm256(int r, int g, int b) {
    const unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};

    // nearest entry on the grayscale ramp: quantize the luma to 24 levels
    const int av = (int) (r * .299 + g * .587 + b * .114 + .5);
    const int il = av > 238 ? 23 : (av - 3) / 10; // ramp index
    const int ql = il * 10 + 8;                   // ramp gray level

    // nearest entry in the color cube, per channel
    const int ir = r < 48 ? 0 : r < 115 ? 1 : (r - 35) / 40;
    const int ig = g < 48 ? 0 : g < 115 ? 1 : (g - 35) / 40;
    const int ib = b < 48 ? 0 : b < 115 ? 1 : (b - 35) / 40;
    const int qr = cube[ir];
    const int qg = cube[ig];
    const int qb = cube[ib];

    const int d_cube = (qr - r) * (qr - r) + (qg - g) * (qg - g) + (qb - b) * (qb - b);
    const int d_gray = (ql - r) * (ql - r) + (ql - g) * (ql - g) + (ql - b) * (ql - b);

    return d_cube <= d_gray ? ir * 36 + ig * 6 + ib + 16 : il + 232;
}
|
| 309 |
+
|
| 310 |
+
static std::string set_xterm256_foreground(int r, int g, int b) {
|
| 311 |
+
int x = rgb2xterm256(r, g, b);
|
| 312 |
+
std::ostringstream oss;
|
| 313 |
+
oss << "\033[38;5;" << x << "m";
|
| 314 |
+
return oss.str();
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
// Lowest is red, middle is yellow, highest is green. Color scheme from
|
| 318 |
+
// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
|
| 319 |
+
const std::vector<std::string> k_colors = {
|
| 320 |
+
set_xterm256_foreground(220, 5, 12),
|
| 321 |
+
set_xterm256_foreground(232, 96, 28),
|
| 322 |
+
set_xterm256_foreground(241, 147, 45),
|
| 323 |
+
set_xterm256_foreground(246, 193, 65),
|
| 324 |
+
set_xterm256_foreground(247, 240, 86),
|
| 325 |
+
set_xterm256_foreground(144, 201, 135),
|
| 326 |
+
set_xterm256_foreground( 78, 178, 101),
|
| 327 |
+
};
|
| 328 |
+
|
| 329 |
+
//
|
| 330 |
+
// Other utils
|
| 331 |
+
//
|
| 332 |
+
|
| 333 |
+
// convert timestamp to string, e.g. 6000 -> 00:01:00.000
|
| 334 |
+
std::string to_timestamp(int64_t t, bool comma = false);
|
| 335 |
+
|
| 336 |
+
// given a timestamp get the sample
|
| 337 |
+
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate);
|
| 338 |
+
|
| 339 |
+
// check if file exists using ifstream
|
| 340 |
+
bool is_file_exist(const char *fileName);
|
| 341 |
+
|
| 342 |
+
// write text to file, and call system("command voice_id file")
|
| 343 |
+
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);
|
dr_wav.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml-aarch64.c
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml-aarch64.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
| 2 |
+
#pragma once
|
| 3 |
+
|
| 4 |
+
#define GGML_COMMON_DECL_C
|
| 5 |
+
#include "ggml-common.h"
|
| 6 |
+
|
| 7 |
+
#include "ggml.h"
|
| 8 |
+
|
| 9 |
+
// GGML internal header
|
| 10 |
+
|
| 11 |
+
#ifdef __cplusplus
|
| 12 |
+
extern "C" {
|
| 13 |
+
#endif
|
| 14 |
+
|
| 15 |
+
// Quantization
|
| 16 |
+
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 17 |
+
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 18 |
+
|
| 19 |
+
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
| 20 |
+
|
| 21 |
+
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
| 22 |
+
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 23 |
+
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 24 |
+
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 25 |
+
|
| 26 |
+
// GEMV
|
| 27 |
+
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
| 28 |
+
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
| 29 |
+
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
| 30 |
+
|
| 31 |
+
// GEMM
|
| 32 |
+
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
| 33 |
+
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
| 34 |
+
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
| 35 |
+
|
| 36 |
+
#ifdef __cplusplus
|
| 37 |
+
}
|
| 38 |
+
#endif
|
| 39 |
+
|
ggml-common.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml-cpu-impl.h
ADDED
|
@@ -0,0 +1,614 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// GGML CPU internal header
|
| 4 |
+
|
| 5 |
+
#include "ggml.h"
|
| 6 |
+
#include "ggml-impl.h"
|
| 7 |
+
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
| 8 |
+
//#include <stddef.h>
|
| 9 |
+
#include <stdbool.h>
|
| 10 |
+
#include <string.h> // memcpy
|
| 11 |
+
#include <math.h> // fabsf
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
#ifdef __cplusplus
|
| 15 |
+
extern "C" {
|
| 16 |
+
#endif
|
| 17 |
+
|
| 18 |
+
#if defined(_MSC_VER)
|
| 19 |
+
|
| 20 |
+
#define m512bh(p) p
|
| 21 |
+
#define m512i(p) p
|
| 22 |
+
|
| 23 |
+
#else
|
| 24 |
+
|
| 25 |
+
#define m512bh(p) (__m512bh)(p)
|
| 26 |
+
#define m512i(p) (__m512i)(p)
|
| 27 |
+
|
| 28 |
+
#endif
|
| 29 |
+
|
| 30 |
+
/**
|
| 31 |
+
* Converts brain16 to float32.
|
| 32 |
+
*
|
| 33 |
+
* The bfloat16 floating point format has the following structure:
|
| 34 |
+
*
|
| 35 |
+
* ┌sign
|
| 36 |
+
* │
|
| 37 |
+
* │ ┌exponent
|
| 38 |
+
* │ │
|
| 39 |
+
* │ │ ┌mantissa
|
| 40 |
+
* │ │ │
|
| 41 |
+
* │┌──┴───┐┌─┴───┐
|
| 42 |
+
* 0b0000000000000000 brain16
|
| 43 |
+
*
|
| 44 |
+
* Since bf16 has the same number of exponent bits as a 32bit float,
|
| 45 |
+
* encoding and decoding numbers becomes relatively straightforward.
|
| 46 |
+
*
|
| 47 |
+
* ┌sign
|
| 48 |
+
* │
|
| 49 |
+
* │ ┌exponent
|
| 50 |
+
* │ │
|
| 51 |
+
* │ │ ┌mantissa
|
| 52 |
+
* │ │ │
|
| 53 |
+
* │┌──┴───┐┌─┴───────────────────┐
|
| 54 |
+
* 0b00000000000000000000000000000000 IEEE binary32
|
| 55 |
+
*
|
| 56 |
+
* For comparison, the standard fp16 format has fewer exponent bits.
|
| 57 |
+
*
|
| 58 |
+
* ┌sign
|
| 59 |
+
* │
|
| 60 |
+
* │ ┌exponent
|
| 61 |
+
* │ │
|
| 62 |
+
* │ │ ┌mantissa
|
| 63 |
+
* │ │ │
|
| 64 |
+
* │┌─┴─┐┌─┴──────┐
|
| 65 |
+
* 0b0000000000000000 IEEE binary16
|
| 66 |
+
*
|
| 67 |
+
* @see IEEE 754-2008
|
| 68 |
+
*/
|
| 69 |
+
/* Reassemble an IEEE binary32 by placing the bf16 bits in the upper half
 * (bf16 shares sign/exponent layout with float32; low mantissa bits are zero). */
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    const uint32_t bits = (uint32_t) h.bits << 16;
    float result;
    memcpy(&result, &bits, sizeof(result));
    return result;
}
|
| 77 |
+
|
| 78 |
+
/**
|
| 79 |
+
* Converts float32 to brain16.
|
| 80 |
+
*
|
| 81 |
+
* This is binary identical with Google Brain float conversion.
|
| 82 |
+
* Floats shall round to nearest even, and NANs shall be quiet.
|
| 83 |
+
* Subnormals aren't flushed to zero, except perhaps when used.
|
| 84 |
+
* This code should vectorize nicely if using modern compilers.
|
| 85 |
+
*/
|
| 86 |
+
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
    ggml_bf16_t h;
    union {
        float f;
        uint32_t i;
    } u;
    u.f = s;
    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan: exponent all ones, mantissa non-zero */
        h.bits = (u.i >> 16) | 64; /* force to quiet by setting the top mantissa bit */
        return h;
    }
    /* round to nearest even: add 0x7fff plus the lowest kept bit, then truncate */
    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
    return h;
}
|
| 100 |
+
|
| 101 |
+
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
| 102 |
+
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
| 103 |
+
|
| 104 |
+
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
| 105 |
+
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
| 106 |
+
#ifndef __FMA__
|
| 107 |
+
#define __FMA__
|
| 108 |
+
#endif
|
| 109 |
+
#ifndef __F16C__
|
| 110 |
+
#define __F16C__
|
| 111 |
+
#endif
|
| 112 |
+
#endif
|
| 113 |
+
|
| 114 |
+
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
| 115 |
+
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
| 116 |
+
#ifndef __SSE3__
|
| 117 |
+
#define __SSE3__
|
| 118 |
+
#endif
|
| 119 |
+
#ifndef __SSSE3__
|
| 120 |
+
#define __SSSE3__
|
| 121 |
+
#endif
|
| 122 |
+
#endif
|
| 123 |
+
|
| 124 |
+
#if defined(__ARM_FEATURE_SVE)
|
| 125 |
+
#include <arm_sve.h>
|
| 126 |
+
#include <sys/prctl.h>
|
| 127 |
+
#endif
|
| 128 |
+
|
| 129 |
+
// 16-bit float
|
| 130 |
+
// on Arm, we use __fp16
|
| 131 |
+
// on x86, we use uint16_t
|
| 132 |
+
#if defined(__ARM_NEON)
|
| 133 |
+
|
| 134 |
+
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
| 135 |
+
//
|
| 136 |
+
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
| 137 |
+
//
|
| 138 |
+
#include <arm_neon.h>
|
| 139 |
+
|
| 140 |
+
#ifdef _MSC_VER
|
| 141 |
+
|
| 142 |
+
typedef uint16_t ggml_fp16_internal_t;
|
| 143 |
+
|
| 144 |
+
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
| 145 |
+
|
| 146 |
+
#else
|
| 147 |
+
|
| 148 |
+
typedef __fp16 ggml_fp16_internal_t;
|
| 149 |
+
|
| 150 |
+
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
| 151 |
+
|
| 152 |
+
#endif // _MSC_VER
|
| 153 |
+
|
| 154 |
+
#if !defined(__aarch64__)
|
| 155 |
+
|
| 156 |
+
// 32-bit ARM compatibility
|
| 157 |
+
|
| 158 |
+
// vaddlvq_s16
|
| 159 |
+
// vpaddq_s16
|
| 160 |
+
// vpaddq_s32
|
| 161 |
+
// vaddvq_s32
|
| 162 |
+
// vaddvq_f32
|
| 163 |
+
// vmaxvq_f32
|
| 164 |
+
// vcvtnq_s32_f32
|
| 165 |
+
// vzip1_u8
|
| 166 |
+
// vzip2_u8
|
| 167 |
+
|
| 168 |
+
// Widening horizontal add of all eight int16 lanes (AArch64 vaddlvq_s16 equivalent).
inline static int32_t vaddlvq_s16(int16x8_t v) {
    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
}

// Pairwise add of two int16x8 vectors (AArch64 vpaddq_s16 equivalent).
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
    return vcombine_s16(a0, b0);
}

// Pairwise add of two int32x4 vectors (AArch64 vpaddq_s32 equivalent).
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
    return vcombine_s32(a0, b0);
}

// Horizontal add of all four int32 lanes (AArch64 vaddvq_s32 equivalent).
inline static int32_t vaddvq_s32(int32x4_t v) {
    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}

// Horizontal add of all four float lanes (AArch64 vaddvq_f32 equivalent).
inline static float vaddvq_f32(float32x4_t v) {
    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}

// Horizontal max of all four float lanes (AArch64 vmaxvq_f32 equivalent).
// Relies on the MAX macro being defined by the including file.
inline static float vmaxvq_f32(float32x4_t v) {
    return
        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}

// Per-lane float -> int32 round-to-nearest (AArch64 vcvtnq_s32_f32 equivalent).
// NOTE(review): roundf rounds halfway cases away from zero, whereas the
// AArch64 instruction rounds ties to even — results differ only for exact
// .5 inputs; confirm this is acceptable for callers.
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
    int32x4_t res;

    res[0] = roundf(vgetq_lane_f32(v, 0));
    res[1] = roundf(vgetq_lane_f32(v, 1));
    res[2] = roundf(vgetq_lane_f32(v, 2));
    res[3] = roundf(vgetq_lane_f32(v, 3));

    return res;
}

// Interleave the low halves of a and b (AArch64 vzip1_u8 equivalent).
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    res[0] = a[0]; res[1] = b[0];
    res[2] = a[1]; res[3] = b[1];
    res[4] = a[2]; res[5] = b[2];
    res[6] = a[3]; res[7] = b[3];

    return res;
}

// Interleave the high halves of a and b (AArch64 vzip2_u8 equivalent).
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    res[0] = a[4]; res[1] = b[4];
    res[2] = a[5]; res[3] = b[5];
    res[4] = a[6]; res[5] = b[6];
    res[6] = a[7]; res[7] = b[7];

    return res;
}

// Multi-register load replacements for 32-bit ARM, emulated as separate
// single-register loads into a small struct:
// vld1q_s16_x2
// vld1q_u8_x2
// vld1q_u8_x4
// vld1q_s8_x2
// vld1q_s8_x4
// TODO: double-check these work correctly

typedef struct ggml_int16x8x2_t {
    int16x8_t val[2];
} ggml_int16x8x2_t;

// Loads 16 consecutive int16 values as two q registers (vld1q_s16_x2 equivalent).
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
    ggml_int16x8x2_t res;

    res.val[0] = vld1q_s16(ptr + 0);
    res.val[1] = vld1q_s16(ptr + 8);

    return res;
}

typedef struct ggml_uint8x16x2_t {
    uint8x16_t val[2];
} ggml_uint8x16x2_t;

// Loads 32 consecutive uint8 values as two q registers (vld1q_u8_x2 equivalent).
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
    ggml_uint8x16x2_t res;

    res.val[0] = vld1q_u8(ptr + 0);
    res.val[1] = vld1q_u8(ptr + 16);

    return res;
}

typedef struct ggml_uint8x16x4_t {
    uint8x16_t val[4];
} ggml_uint8x16x4_t;

// Loads 64 consecutive uint8 values as four q registers (vld1q_u8_x4 equivalent).
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
    ggml_uint8x16x4_t res;

    res.val[0] = vld1q_u8(ptr + 0);
    res.val[1] = vld1q_u8(ptr + 16);
    res.val[2] = vld1q_u8(ptr + 32);
    res.val[3] = vld1q_u8(ptr + 48);

    return res;
}

typedef struct ggml_int8x16x2_t {
    int8x16_t val[2];
} ggml_int8x16x2_t;

// Loads 32 consecutive int8 values as two q registers (vld1q_s8_x2 equivalent).
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
    ggml_int8x16x2_t res;

    res.val[0] = vld1q_s8(ptr + 0);
    res.val[1] = vld1q_s8(ptr + 16);

    return res;
}

typedef struct ggml_int8x16x4_t {
    int8x16_t val[4];
} ggml_int8x16x4_t;

// Loads 64 consecutive int8 values as four q registers (vld1q_s8_x4 equivalent).
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
    ggml_int8x16x4_t res;

    res.val[0] = vld1q_s8(ptr + 0);
    res.val[1] = vld1q_s8(ptr + 16);
    res.val[2] = vld1q_s8(ptr + 32);
    res.val[3] = vld1q_s8(ptr + 48);

    return res;
}

// Byte table lookup (AArch64 vqtbl1q_s8 equivalent).
// NOTE: not tested
// NOTE(review): unlike the real vqtbl1q instruction, indices >= 16 are not
// mapped to 0 here — they index out of range. Callers must keep all
// indices in [0, 15]; confirm against usage sites.
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
    int8x16_t res;

    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
    res[ 2] = a[b[ 2]];
    res[ 3] = a[b[ 3]];
    res[ 4] = a[b[ 4]];
    res[ 5] = a[b[ 5]];
    res[ 6] = a[b[ 6]];
    res[ 7] = a[b[ 7]];
    res[ 8] = a[b[ 8]];
    res[ 9] = a[b[ 9]];
    res[10] = a[b[10]];
    res[11] = a[b[11]];
    res[12] = a[b[12]];
    res[13] = a[b[13]];
    res[14] = a[b[14]];
    res[15] = a[b[15]];

    return res;
}

// Byte table lookup (AArch64 vqtbl1q_u8 equivalent).
// NOTE: not tested
// NOTE(review): same out-of-range caveat as ggml_vqtbl1q_s8 above.
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16_t res;

    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
    res[ 2] = a[b[ 2]];
    res[ 3] = a[b[ 3]];
    res[ 4] = a[b[ 4]];
    res[ 5] = a[b[ 5]];
    res[ 6] = a[b[ 6]];
    res[ 7] = a[b[ 7]];
    res[ 8] = a[b[ 8]];
    res[ 9] = a[b[ 9]];
    res[10] = a[b[10]];
    res[11] = a[b[11]];
    res[12] = a[b[12]];
    res[13] = a[b[13]];
    res[14] = a[b[14]];
    res[15] = a[b[15]];

    return res;
}
|
| 355 |
+
|
| 356 |
+
#else
|
| 357 |
+
|
| 358 |
+
#define ggml_int16x8x2_t int16x8x2_t
|
| 359 |
+
#define ggml_uint8x16x2_t uint8x16x2_t
|
| 360 |
+
#define ggml_uint8x16x4_t uint8x16x4_t
|
| 361 |
+
#define ggml_int8x16x2_t int8x16x2_t
|
| 362 |
+
#define ggml_int8x16x4_t int8x16x4_t
|
| 363 |
+
|
| 364 |
+
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
| 365 |
+
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
| 366 |
+
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
| 367 |
+
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
| 368 |
+
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
| 369 |
+
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
| 370 |
+
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
| 371 |
+
|
| 372 |
+
#endif // !defined(__aarch64__)
|
| 373 |
+
|
| 374 |
+
#if !defined(__ARM_FEATURE_DOTPROD)
|
| 375 |
+
|
| 376 |
+
// Fallback for vdotq_s32 on CPUs without the ARM dot-product extension:
// widening-multiply the low and high int8 halves, then pairwise-add the
// int16 products down to int32 lanes and accumulate into acc.
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));

    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
|
| 382 |
+
|
| 383 |
+
#else
|
| 384 |
+
|
| 385 |
+
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
| 386 |
+
|
| 387 |
+
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
| 388 |
+
|
| 389 |
+
#endif // defined(__ARM_NEON)
|
| 390 |
+
|
| 391 |
+
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
| 392 |
+
|
| 393 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
| 394 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
| 395 |
+
|
| 396 |
+
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
| 397 |
+
|
| 398 |
+
// fp16 -> fp32 using the native __fp16 type (hardware convert on cast).
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    ggml_fp16_internal_t tmp;
    // memcpy reinterprets the stored bits without strict-aliasing issues
    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
    return (float)tmp;
}

// fp32 -> fp16 using the native __fp16 type.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    ggml_fp16_t res;
    ggml_fp16_internal_t tmp = f; // implicit convert to half precision
    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
    return res;
}
|
| 410 |
+
|
| 411 |
+
#else
|
| 412 |
+
|
| 413 |
+
#ifdef __wasm_simd128__
|
| 414 |
+
#include <wasm_simd128.h>
|
| 415 |
+
#else
|
| 416 |
+
#ifdef __POWER9_VECTOR__
|
| 417 |
+
#include <altivec.h>
|
| 418 |
+
#undef bool
|
| 419 |
+
#define bool _Bool
|
| 420 |
+
#else
|
| 421 |
+
#if defined(_MSC_VER) || defined(__MINGW32__)
|
| 422 |
+
#include <intrin.h>
|
| 423 |
+
#else
|
| 424 |
+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
| 425 |
+
#if !defined(__riscv)
|
| 426 |
+
#include <immintrin.h>
|
| 427 |
+
#endif
|
| 428 |
+
#endif
|
| 429 |
+
#endif
|
| 430 |
+
#endif
|
| 431 |
+
#endif
|
| 432 |
+
|
| 433 |
+
#ifdef __riscv_v_intrinsic
|
| 434 |
+
#include <riscv_vector.h>
|
| 435 |
+
#endif
|
| 436 |
+
|
| 437 |
+
#if defined(__loongarch64)
|
| 438 |
+
#if defined(__loongarch_asx)
|
| 439 |
+
#include <lasxintrin.h>
|
| 440 |
+
#endif
|
| 441 |
+
#if defined(__loongarch_sx)
|
| 442 |
+
#include <lsxintrin.h>
|
| 443 |
+
#endif
|
| 444 |
+
#endif
|
| 445 |
+
|
| 446 |
+
#if defined(__loongarch_asx)
|
| 447 |
+
|
| 448 |
+
// Union for reinterpreting between float and int32 bit patterns.
typedef union {
    int32_t i;
    float f;
} ft_union;

/* float type data load instructions */
// Broadcasts a float scalar to every lane of a 128-bit LSX vector.
static __m128 __lsx_vreplfr2vr_s(float val) {
    ft_union fi_tmpval = {.f = val};
    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}

// Broadcasts a float scalar to every lane of a 256-bit LASX vector.
static __m256 __lasx_xvreplfr2vr_s(float val) {
    ft_union fi_tmpval = {.f = val};
    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
}
|
| 463 |
+
#endif
|
| 464 |
+
|
| 465 |
+
#ifdef __F16C__
|
| 466 |
+
|
| 467 |
+
#ifdef _MSC_VER
|
| 468 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
| 469 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
| 470 |
+
#else
|
| 471 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
| 472 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
| 473 |
+
#endif
|
| 474 |
+
|
| 475 |
+
#elif defined(__POWER9_VECTOR__)
|
| 476 |
+
|
| 477 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
| 478 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
| 479 |
+
/* the inline asm below is about 12% faster than the lookup method */
|
| 480 |
+
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
| 481 |
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
| 482 |
+
|
| 483 |
+
// fp16 -> fp32 using POWER9 VSX scalar converts.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    register float f;
    register double d;
    __asm__(
        "mtfprd %0,%2\n"   /* move the raw fp16 bits from a GPR into a VSX register */
        "xscvhpdp %0,%0\n" /* convert half precision -> double precision */
        "frsp %1,%0\n" :   /* round double -> single precision */
        /* temp */ "=d"(d),
        /* out */ "=f"(f):
        /* in */ "r"(h));
    return f;
}

// fp32 -> fp16 using POWER9 VSX scalar converts.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    register double d;
    register ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n" /* convert to half precision */
        "mffprd %1,%0\n" : /* move the fp16 bits back into a GPR */
        /* temp */ "=d"(d),
        /* out */ "=r"(r):
        /* in */ "f"(f));
    return r;
}
|
| 507 |
+
|
| 508 |
+
#else
|
| 509 |
+
|
| 510 |
+
// FP16 <-> FP32
|
| 511 |
+
// ref: https://github.com/Maratyszcza/FP16
|
| 512 |
+
|
| 513 |
+
// Reinterprets a 32-bit pattern as an IEEE binary32 value.
// The union is used for type punning, which is well defined in C.
static inline float fp32_from_bits(uint32_t w) {
    union {
        float    value;
        uint32_t bits;
    } pun;
    pun.bits = w;
    return pun.value;
}
|
| 521 |
+
|
| 522 |
+
// Returns the raw IEEE binary32 bit pattern of f.
// The union is used for type punning, which is well defined in C.
static inline uint32_t fp32_to_bits(float f) {
    union {
        uint32_t bits;
        float    value;
    } pun;
    pun.value = f;
    return pun.bits;
}
|
| 530 |
+
|
| 531 |
+
/*
 * Scalar IEEE fp16 -> fp32 conversion (no F16C/NEON hardware support needed).
 * ref: https://github.com/Maratyszcza/FP16
 */
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    // move the 16 stored bits to the top of a 32-bit word
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    // two_w shifts out the sign bit: exponent and mantissa, left-aligned
    const uint32_t two_w = w + w;

    // normalized path: rebias the exponent from 15 to 127 by adding
    // 0xE0 << 23 to the bit pattern, then multiply by 2^-112 to cancel
    // the over-adjustment
    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

    // denormal (and zero) path: reconstruct the value as an exact float
    // subtraction using the "magic number" trick
    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

    // inputs whose fp16 exponent field is zero take the denormal path
    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
}
|
| 553 |
+
|
| 554 |
+
/*
 * Scalar IEEE fp32 -> fp16 conversion with round-to-nearest-even
 * (no F16C/NEON hardware support needed).
 * ref: https://github.com/Maratyszcza/FP16
 */
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    // scale up so values that overflow fp16 saturate to infinity, then
    // back down so the mantissa is pre-rounded at fp16 precision
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    const uint32_t shl1_w = w + w; // bit pattern with the sign shifted out
    const uint32_t sign = w & UINT32_C(0x80000000);
    uint32_t bias = shl1_w & UINT32_C(0xFF000000); // exponent field, shifted left by one
    if (bias < UINT32_C(0x71000000)) {
        // clamp so subnormal fp16 outputs still round correctly
        bias = UINT32_C(0x71000000);
    }

    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    // extract the fp16 exponent and mantissa from the rounded float
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    // NaN inputs (shl1_w > 0xFF000000) map to the canonical quiet NaN 0x7E00
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
|
| 579 |
+
|
| 580 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
| 581 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
| 582 |
+
|
| 583 |
+
#endif // __F16C__
|
| 584 |
+
|
| 585 |
+
#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
|
| 586 |
+
|
| 587 |
+
#ifdef __ARM_FEATURE_SVE
|
| 588 |
+
#include <arm_sve.h>
|
| 589 |
+
#endif // __ARM_FEATURE_SVE
|
| 590 |
+
|
| 591 |
+
// precomputed f32 table for f16 (256 KB)
|
| 592 |
+
// defined in ggml.c, initialized in ggml_init()
|
| 593 |
+
extern float ggml_table_f32_f16[1 << 16];
|
| 594 |
+
|
| 595 |
+
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
| 596 |
+
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
| 597 |
+
// This is also true for POWER9.
|
| 598 |
+
#if !defined(GGML_FP16_TO_FP32)
|
| 599 |
+
// fp16 -> fp32 via the precomputed 64K-entry table ggml_table_f32_f16
// (declared above; filled in by ggml_init()).
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    // reinterpret the fp16 bits as the table index
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}
|
| 604 |
+
|
| 605 |
+
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
| 606 |
+
#endif
|
| 607 |
+
|
| 608 |
+
#if !defined(GGML_FP32_TO_FP16)
|
| 609 |
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
| 610 |
+
#endif
|
| 611 |
+
|
| 612 |
+
#ifdef __cplusplus
|
| 613 |
+
}
|
| 614 |
+
#endif
|
ggml-impl.h
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// GGML internal header
|
| 4 |
+
|
| 5 |
+
#include "ggml.h"
|
| 6 |
+
|
| 7 |
+
#include <assert.h>
|
| 8 |
+
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
| 9 |
+
#include <stdbool.h>
|
| 10 |
+
#include <stdint.h>
|
| 11 |
+
|
| 12 |
+
#ifdef __cplusplus
|
| 13 |
+
extern "C" {
|
| 14 |
+
#endif
|
| 15 |
+
|
| 16 |
+
#undef MIN
|
| 17 |
+
#undef MAX
|
| 18 |
+
|
| 19 |
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
| 20 |
+
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
| 21 |
+
|
| 22 |
+
// required for mmap as gguf only guarantees 32-byte alignment
|
| 23 |
+
#define TENSOR_ALIGNMENT 32
|
| 24 |
+
|
| 25 |
+
// static_assert should be a #define, but if it's not,
|
| 26 |
+
// fall back to the _Static_assert C11 keyword.
|
| 27 |
+
// if C99 - static_assert is noop
|
| 28 |
+
// ref: https://stackoverflow.com/a/53923785/4039976
|
| 29 |
+
#ifndef __cplusplus
|
| 30 |
+
#ifndef static_assert
|
| 31 |
+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
| 32 |
+
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
| 33 |
+
#else
|
| 34 |
+
#define static_assert(cond, msg) struct global_scope_noop_trick
|
| 35 |
+
#endif
|
| 36 |
+
#endif
|
| 37 |
+
#endif
|
| 38 |
+
|
| 39 |
+
//
|
| 40 |
+
// logging
|
| 41 |
+
//
|
| 42 |
+
|
| 43 |
+
GGML_ATTRIBUTE_FORMAT(2, 3)
|
| 44 |
+
void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
|
| 45 |
+
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
|
| 46 |
+
|
| 47 |
+
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
| 48 |
+
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
| 49 |
+
#define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
| 50 |
+
#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
| 51 |
+
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
| 52 |
+
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
| 53 |
+
|
| 54 |
+
// bitset
|
| 55 |
+
|
| 56 |
+
// One word of the bitset; all index math below assumes 32 bits per word.
typedef uint32_t ggml_bitset_t;

static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)

// number of ggml_bitset_t words needed to hold n bits
// (marked `inline` for consistency with the accessors below, so translation
// units that include this header but never call it do not warn about an
// unused static function)
static inline size_t ggml_bitset_size(size_t n) {
    return (n + BITSET_MASK) >> BITSET_SHR;
}

// returns whether bit i is set
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
    return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
}

// sets bit i
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
    bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
}

// clears bit i
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
    bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
}
|
| 77 |
+
|
| 78 |
+
// hash set
|
| 79 |
+
|
| 80 |
+
#define GGML_HASHSET_FULL ((size_t)-1)
|
| 81 |
+
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
| 82 |
+
|
| 83 |
+
struct ggml_hash_set {
|
| 84 |
+
size_t size;
|
| 85 |
+
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
| 86 |
+
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
| 87 |
+
};
|
| 88 |
+
|
| 89 |
+
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
| 90 |
+
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
| 91 |
+
|
| 92 |
+
// returns the minimum size for a hash set that can hold min_sz elements
|
| 93 |
+
size_t ggml_hash_size(size_t min_sz);
|
| 94 |
+
|
| 95 |
+
// remove all elements from the hash set
|
| 96 |
+
void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
|
| 97 |
+
|
| 98 |
+
// returns true if key is in the hash set
|
| 99 |
+
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
| 100 |
+
|
| 101 |
+
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
| 102 |
+
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
| 103 |
+
|
| 104 |
+
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
| 105 |
+
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
| 106 |
+
|
| 107 |
+
// return index, asserts if table is full
|
| 108 |
+
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
| 109 |
+
|
| 110 |
+
// hash function for ggml_tensor
|
| 111 |
+
// pointer-identity hash: tensors are hashed by address, not by contents
static inline size_t ggml_hash(const struct ggml_tensor * p) {
    // the last 4 bits are always zero due to alignment
    return (size_t)(uintptr_t)p >> 4;
}
|
| 115 |
+
|
| 116 |
+
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
| 117 |
+
size_t h = ggml_hash(key) % hash_set->size;
|
| 118 |
+
|
| 119 |
+
// linear probing
|
| 120 |
+
size_t i = h;
|
| 121 |
+
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
|
| 122 |
+
i = (i + 1) % hash_set->size;
|
| 123 |
+
if (i == h) {
|
| 124 |
+
// visited all hash table entries -> not found
|
| 125 |
+
return GGML_HASHSET_FULL;
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
return i;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
// returns true if `key` is present in the hash set
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
    size_t i = ggml_hash_find(hash_set, key);
    // ggml_hash_find may return a free slot: that means the key was never inserted
    return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
}
|
| 135 |
+
|
| 136 |
+
// Inserts `key` into the set by linear probing. Returns the claimed slot
// index, or GGML_HASHSET_ALREADY_EXISTS if the key is already present.
// Aborts if the table is full.
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
    size_t h = ggml_hash(key) % hash_set->size;

    // linear probing
    size_t i = h;
    do {
        if (!ggml_bitset_get(hash_set->used, i)) {
            // free slot: claim it for the key
            ggml_bitset_set(hash_set->used, i);
            hash_set->keys[i] = key;
            return i;
        }
        if (hash_set->keys[i] == key) {
            return GGML_HASHSET_ALREADY_EXISTS;
        }
        i = (i + 1) % hash_set->size;
    } while (i != h);

    // visited all hash table entries -> not found
    GGML_ABORT("fatal error");
}
|
| 156 |
+
|
| 157 |
+
// Returns the slot index of `key`, inserting it first if it is not already
// present. Aborts if the table is full.
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
    size_t h = ggml_hash(key) % hash_set->size;

    // linear probing
    size_t i = h;
    do {
        if (!ggml_bitset_get(hash_set->used, i)) {
            // free slot: claim it for the key
            ggml_bitset_set(hash_set->used, i);
            hash_set->keys[i] = key;
            return i;
        }
        if (hash_set->keys[i] == key) {
            // already present: return the existing slot
            return i;
        }
        i = (i + 1) % hash_set->size;
    } while (i != h);

    // visited all hash table entries -> not found
    GGML_ABORT("fatal error");
}
|
| 177 |
+
|
| 178 |
+
// computation graph
|
| 179 |
+
|
| 180 |
+
enum ggml_cgraph_eval_order {
|
| 181 |
+
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
| 182 |
+
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
| 183 |
+
GGML_CGRAPH_EVAL_ORDER_COUNT
|
| 184 |
+
};
|
| 185 |
+
|
| 186 |
+
struct ggml_cgraph {
|
| 187 |
+
int size;
|
| 188 |
+
int n_nodes;
|
| 189 |
+
int n_leafs;
|
| 190 |
+
|
| 191 |
+
struct ggml_tensor ** nodes;
|
| 192 |
+
struct ggml_tensor ** grads;
|
| 193 |
+
struct ggml_tensor ** leafs;
|
| 194 |
+
|
| 195 |
+
struct ggml_hash_set visited_hash_set;
|
| 196 |
+
|
| 197 |
+
enum ggml_cgraph_eval_order order;
|
| 198 |
+
};
|
| 199 |
+
|
| 200 |
+
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
| 201 |
+
|
| 202 |
+
// Memory allocation
|
| 203 |
+
|
| 204 |
+
void * ggml_aligned_malloc(size_t size);
|
| 205 |
+
void ggml_aligned_free(void * ptr, size_t size);
|
| 206 |
+
|
| 207 |
+
#ifdef __cplusplus
|
| 208 |
+
}
|
| 209 |
+
#endif
|
ggml-model-gpt-2-774M.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44b54a6ab261de692b791d6492940de6e606182158e60d59a630c26a38e3ccf8
|
| 3 |
+
size 1552422809
|
ggml-quants.c
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml-quants.h
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#define GGML_COMMON_DECL_C
|
| 4 |
+
#include "ggml-common.h"
|
| 5 |
+
|
| 6 |
+
#include "ggml.h"
|
| 7 |
+
|
| 8 |
+
// GGML internal header
|
| 9 |
+
|
| 10 |
+
#ifdef __cplusplus
|
| 11 |
+
extern "C" {
|
| 12 |
+
#endif
|
| 13 |
+
|
| 14 |
+
// Quantization
|
| 15 |
+
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
| 16 |
+
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
| 17 |
+
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
| 18 |
+
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
| 19 |
+
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
| 20 |
+
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
| 21 |
+
|
| 22 |
+
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
| 23 |
+
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
| 24 |
+
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
| 25 |
+
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
| 26 |
+
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
| 27 |
+
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
| 28 |
+
|
| 29 |
+
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
| 30 |
+
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
| 31 |
+
|
| 32 |
+
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
| 33 |
+
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
| 34 |
+
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
| 35 |
+
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
| 36 |
+
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
| 37 |
+
|
| 38 |
+
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 39 |
+
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 40 |
+
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 41 |
+
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 42 |
+
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 43 |
+
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 44 |
+
|
| 45 |
+
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 46 |
+
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 47 |
+
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 48 |
+
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 49 |
+
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 50 |
+
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 51 |
+
|
| 52 |
+
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 53 |
+
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 54 |
+
|
| 55 |
+
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 56 |
+
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 57 |
+
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 58 |
+
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 59 |
+
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 60 |
+
|
| 61 |
+
// Dequantization
|
| 62 |
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 63 |
+
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 64 |
+
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 65 |
+
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 66 |
+
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 67 |
+
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 68 |
+
|
| 69 |
+
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 70 |
+
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 71 |
+
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 72 |
+
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 73 |
+
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 74 |
+
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 75 |
+
|
| 76 |
+
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 77 |
+
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 78 |
+
|
| 79 |
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 80 |
+
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 81 |
+
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 82 |
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 83 |
+
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 84 |
+
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 85 |
+
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 86 |
+
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 87 |
+
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 88 |
+
|
| 89 |
+
// Dot product
|
| 90 |
+
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 91 |
+
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 92 |
+
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 93 |
+
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 94 |
+
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 95 |
+
|
| 96 |
+
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 97 |
+
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 98 |
+
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 99 |
+
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 100 |
+
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 101 |
+
|
| 102 |
+
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 103 |
+
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 104 |
+
|
| 105 |
+
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 106 |
+
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 107 |
+
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 108 |
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 109 |
+
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 110 |
+
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 111 |
+
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 112 |
+
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 113 |
+
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 114 |
+
|
| 115 |
+
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
| 116 |
+
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 117 |
+
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 118 |
+
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 119 |
+
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 120 |
+
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 121 |
+
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 122 |
+
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 123 |
+
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 124 |
+
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 125 |
+
|
| 126 |
+
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 127 |
+
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 128 |
+
|
| 129 |
+
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 130 |
+
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 131 |
+
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 132 |
+
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 133 |
+
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 134 |
+
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 135 |
+
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 136 |
+
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 137 |
+
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 138 |
+
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 139 |
+
|
| 140 |
+
void iq2xs_init_impl(enum ggml_type type);
|
| 141 |
+
void iq2xs_free_impl(enum ggml_type type);
|
| 142 |
+
void iq3xs_init_impl(int grid_size);
|
| 143 |
+
void iq3xs_free_impl(int grid_size);
|
| 144 |
+
|
| 145 |
+
#ifdef __cplusplus
|
| 146 |
+
}
|
| 147 |
+
#endif
|
ggml.c
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
main-ctx.cpp
ADDED
|
@@ -0,0 +1,841 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//#include "ggml.h"
|
| 2 |
+
//
|
| 3 |
+
//#include "common.h"
|
| 4 |
+
//#include "common-ggml.h"
|
| 5 |
+
//
|
| 6 |
+
//#include <cassert>
|
| 7 |
+
//#include <cmath>
|
| 8 |
+
//#include <cstdio>
|
| 9 |
+
//#include <cstring>
|
| 10 |
+
//#include <fstream>
|
| 11 |
+
//#include <map>
|
| 12 |
+
//#include <string>
|
| 13 |
+
//#include <vector>
|
| 14 |
+
//
|
| 15 |
+
//#if defined(_MSC_VER)
|
| 16 |
+
//#pragma warning(disable: 4244 4267) // possible loss of data
|
| 17 |
+
//#endif
|
| 18 |
+
//
|
| 19 |
+
//// default hparams (GPT-2 117M)
|
| 20 |
+
//struct gpt2_hparams {
|
| 21 |
+
// int32_t n_vocab = 50257; // Vocabulary size remains the same
|
| 22 |
+
// int32_t n_ctx = 1024; // Maximum context length (sequence length)
|
| 23 |
+
// int32_t n_embd = 1024; // Embedding dimensionality
|
| 24 |
+
// int32_t n_head = 16; // Number of attention heads
|
| 25 |
+
// int32_t n_layer = 24; // Number of transformer layers
|
| 26 |
+
// int32_t ftype = 1; // Set to 1 for FP16 precision (optional)
|
| 27 |
+
// float eps = 1e-5f; // Small constant for numerical stability
|
| 28 |
+
//};
|
| 29 |
+
//
|
| 30 |
+
//struct gpt2_layer {
|
| 31 |
+
// // normalization
|
| 32 |
+
// struct ggml_tensor * ln_1_g;
|
| 33 |
+
// struct ggml_tensor * ln_1_b;
|
| 34 |
+
//
|
| 35 |
+
// struct ggml_tensor * ln_2_g;
|
| 36 |
+
// struct ggml_tensor * ln_2_b;
|
| 37 |
+
//
|
| 38 |
+
// // attention
|
| 39 |
+
// struct ggml_tensor * c_attn_attn_w;
|
| 40 |
+
// struct ggml_tensor * c_attn_attn_b;
|
| 41 |
+
//
|
| 42 |
+
// struct ggml_tensor * c_attn_proj_w;
|
| 43 |
+
// struct ggml_tensor * c_attn_proj_b;
|
| 44 |
+
//
|
| 45 |
+
// // mlp
|
| 46 |
+
// struct ggml_tensor * c_mlp_fc_w;
|
| 47 |
+
// struct ggml_tensor * c_mlp_fc_b;
|
| 48 |
+
//
|
| 49 |
+
// struct ggml_tensor * c_mlp_proj_w;
|
| 50 |
+
// struct ggml_tensor * c_mlp_proj_b;
|
| 51 |
+
//};
|
| 52 |
+
//
|
| 53 |
+
//struct gpt2_model {
|
| 54 |
+
// gpt2_hparams hparams;
|
| 55 |
+
//
|
| 56 |
+
// // normalization
|
| 57 |
+
// struct ggml_tensor * ln_f_g;
|
| 58 |
+
// struct ggml_tensor * ln_f_b;
|
| 59 |
+
//
|
| 60 |
+
// struct ggml_tensor * wte; // position embedding
|
| 61 |
+
// struct ggml_tensor * wpe; // token embedding
|
| 62 |
+
// struct ggml_tensor * lm_head; // language model head
|
| 63 |
+
//
|
| 64 |
+
// std::vector<gpt2_layer> layers;
|
| 65 |
+
//
|
| 66 |
+
// // key + value memory
|
| 67 |
+
// struct ggml_tensor * memory_k;
|
| 68 |
+
// struct ggml_tensor * memory_v;
|
| 69 |
+
//
|
| 70 |
+
// //
|
| 71 |
+
// struct ggml_context * ctx_w;
|
| 72 |
+
// std::map<std::string, struct ggml_tensor *> tensors;
|
| 73 |
+
//};
|
| 74 |
+
//
|
| 75 |
+
//// load the model's weights from a file
|
| 76 |
+
//bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
|
| 77 |
+
// printf("%s: loading model from '%s'\n", __func__, fname.c_str());
|
| 78 |
+
//
|
| 79 |
+
// auto fin = std::ifstream(fname, std::ios::binary);
|
| 80 |
+
// if (!fin) {
|
| 81 |
+
// fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
| 82 |
+
// return false;
|
| 83 |
+
// }
|
| 84 |
+
//
|
| 85 |
+
// // verify magic
|
| 86 |
+
// {
|
| 87 |
+
// uint32_t magic;
|
| 88 |
+
// fin.read((char *) &magic, sizeof(magic));
|
| 89 |
+
// if (magic != GGML_FILE_MAGIC) {
|
| 90 |
+
// fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
| 91 |
+
// return false;
|
| 92 |
+
// }
|
| 93 |
+
// }
|
| 94 |
+
//
|
| 95 |
+
// // load hparams
|
| 96 |
+
// {
|
| 97 |
+
// auto & hparams = model.hparams;
|
| 98 |
+
//
|
| 99 |
+
// fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
| 100 |
+
// fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
| 101 |
+
// fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
| 102 |
+
// fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
| 103 |
+
// fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
| 104 |
+
// fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
| 105 |
+
//
|
| 106 |
+
// const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
| 107 |
+
//
|
| 108 |
+
// printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
| 109 |
+
// printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
| 110 |
+
// printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
| 111 |
+
// printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
| 112 |
+
// printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
| 113 |
+
// printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
| 114 |
+
// printf("%s: qntvr = %d\n", __func__, qntvr);
|
| 115 |
+
//
|
| 116 |
+
// hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
| 117 |
+
// }
|
| 118 |
+
//
|
| 119 |
+
// // load vocab
|
| 120 |
+
// {
|
| 121 |
+
// int32_t n_vocab = 0;
|
| 122 |
+
// fin.read((char *) &n_vocab, sizeof(n_vocab));
|
| 123 |
+
//
|
| 124 |
+
// if (n_vocab != model.hparams.n_vocab) {
|
| 125 |
+
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
| 126 |
+
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
| 127 |
+
// return false;
|
| 128 |
+
// }
|
| 129 |
+
//
|
| 130 |
+
// std::string word;
|
| 131 |
+
// std::vector<char> buf(128);
|
| 132 |
+
//
|
| 133 |
+
// for (int i = 0; i < n_vocab; i++) {
|
| 134 |
+
// uint32_t len;
|
| 135 |
+
// fin.read((char *) &len, sizeof(len));
|
| 136 |
+
//
|
| 137 |
+
// buf.resize(len);
|
| 138 |
+
// fin.read((char *) buf.data(), len);
|
| 139 |
+
// word.assign(buf.data(), len);
|
| 140 |
+
//
|
| 141 |
+
// vocab.token_to_id[word] = i;
|
| 142 |
+
// vocab.id_to_token[i] = word;
|
| 143 |
+
// }
|
| 144 |
+
// }
|
| 145 |
+
//
|
| 146 |
+
// // for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
| 147 |
+
// // in order to save memory and also to speed up the computation
|
| 148 |
+
// ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
| 149 |
+
// if (wtype == GGML_TYPE_COUNT) {
|
| 150 |
+
// fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
| 151 |
+
// __func__, fname.c_str(), model.hparams.ftype);
|
| 152 |
+
// return false;
|
| 153 |
+
// }
|
| 154 |
+
//
|
| 155 |
+
// auto & ctx = model.ctx_w;
|
| 156 |
+
//
|
| 157 |
+
// size_t ctx_size = 0;
|
| 158 |
+
//
|
| 159 |
+
// {
|
| 160 |
+
// const auto & hparams = model.hparams;
|
| 161 |
+
//
|
| 162 |
+
// const int n_embd = hparams.n_embd;
|
| 163 |
+
// const int n_layer = hparams.n_layer;
|
| 164 |
+
// const int n_ctx = hparams.n_ctx;
|
| 165 |
+
// const int n_vocab = hparams.n_vocab;
|
| 166 |
+
//
|
| 167 |
+
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
|
| 168 |
+
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
|
| 169 |
+
//
|
| 170 |
+
// ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte
|
| 171 |
+
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe
|
| 172 |
+
// ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head
|
| 173 |
+
//
|
| 174 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
|
| 175 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
|
| 176 |
+
//
|
| 177 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
|
| 178 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
|
| 179 |
+
//
|
| 180 |
+
// ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
|
| 181 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
|
| 182 |
+
//
|
| 183 |
+
// ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
|
| 184 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
|
| 185 |
+
//
|
| 186 |
+
// ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
|
| 187 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
|
| 188 |
+
//
|
| 189 |
+
// ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
|
| 190 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b
|
| 191 |
+
//
|
| 192 |
+
// ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
|
| 193 |
+
// ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
|
| 194 |
+
//
|
| 195 |
+
// ctx_size += (6 + 12*n_layer)*512; // object overhead
|
| 196 |
+
//
|
| 197 |
+
// printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
|
| 198 |
+
// printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
| 199 |
+
// }
|
| 200 |
+
//
|
| 201 |
+
// // create the ggml context
|
| 202 |
+
// {
|
| 203 |
+
// struct ggml_init_params params = {
|
| 204 |
+
// /*.mem_size =*/ ctx_size,
|
| 205 |
+
// /*.mem_buffer =*/ NULL,
|
| 206 |
+
// /*.no_alloc =*/ false,
|
| 207 |
+
// };
|
| 208 |
+
//
|
| 209 |
+
// model.ctx_w = ggml_init(params);
|
| 210 |
+
// if (!model.ctx_w) {
|
| 211 |
+
// fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
| 212 |
+
// return false;
|
| 213 |
+
// }
|
| 214 |
+
// }
|
| 215 |
+
//
|
| 216 |
+
// // prepare memory for the weights
|
| 217 |
+
// {
|
| 218 |
+
// const auto & hparams = model.hparams;
|
| 219 |
+
//
|
| 220 |
+
// const int n_embd = hparams.n_embd;
|
| 221 |
+
// const int n_layer = hparams.n_layer;
|
| 222 |
+
// const int n_ctx = hparams.n_ctx;
|
| 223 |
+
// const int n_vocab = hparams.n_vocab;
|
| 224 |
+
//
|
| 225 |
+
// model.layers.resize(n_layer);
|
| 226 |
+
//
|
| 227 |
+
// model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 228 |
+
// model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 229 |
+
//
|
| 230 |
+
// model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
| 231 |
+
// model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
| 232 |
+
// model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
| 233 |
+
//
|
| 234 |
+
// // map by name
|
| 235 |
+
// model.tensors["model/ln_f/g"] = model.ln_f_g;
|
| 236 |
+
// model.tensors["model/ln_f/b"] = model.ln_f_b;
|
| 237 |
+
//
|
| 238 |
+
// model.tensors["model/wte"] = model.wte;
|
| 239 |
+
// model.tensors["model/wpe"] = model.wpe;
|
| 240 |
+
// model.tensors["model/lm_head"] = model.lm_head;
|
| 241 |
+
//
|
| 242 |
+
// for (int i = 0; i < n_layer; ++i) {
|
| 243 |
+
// auto & layer = model.layers[i];
|
| 244 |
+
//
|
| 245 |
+
// layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 246 |
+
// layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 247 |
+
//
|
| 248 |
+
// layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 249 |
+
// layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 250 |
+
//
|
| 251 |
+
// layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
| 252 |
+
// layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
| 253 |
+
//
|
| 254 |
+
// layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
| 255 |
+
// layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 256 |
+
//
|
| 257 |
+
// layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
| 258 |
+
// layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
| 259 |
+
//
|
| 260 |
+
// layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
| 261 |
+
// layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 262 |
+
//
|
| 263 |
+
// // map by name
|
| 264 |
+
// model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
|
| 265 |
+
// model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
|
| 266 |
+
//
|
| 267 |
+
// model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
|
| 268 |
+
// model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
|
| 269 |
+
//
|
| 270 |
+
// model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
|
| 271 |
+
// model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
|
| 272 |
+
//
|
| 273 |
+
// model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
|
| 274 |
+
// model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
|
| 275 |
+
//
|
| 276 |
+
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
| 277 |
+
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
| 278 |
+
//
|
| 279 |
+
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
|
| 280 |
+
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
| 281 |
+
// }
|
| 282 |
+
// }
|
| 283 |
+
//
|
| 284 |
+
// // key + value memory
|
| 285 |
+
// {
|
| 286 |
+
// const auto & hparams = model.hparams;
|
| 287 |
+
//
|
| 288 |
+
// const int n_embd = hparams.n_embd;
|
| 289 |
+
// const int n_layer = hparams.n_layer;
|
| 290 |
+
// const int n_ctx = hparams.n_ctx;
|
| 291 |
+
//
|
| 292 |
+
// const int n_mem = n_layer*n_ctx;
|
| 293 |
+
// const int n_elements = n_embd*n_mem;
|
| 294 |
+
//
|
| 295 |
+
// model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
| 296 |
+
// model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
| 297 |
+
//
|
| 298 |
+
// const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
| 299 |
+
//
|
| 300 |
+
// printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
| 301 |
+
// }
|
| 302 |
+
//
|
| 303 |
+
// // load weights
|
| 304 |
+
// {
|
| 305 |
+
// size_t total_size = 0;
|
| 306 |
+
//
|
| 307 |
+
// bool has_lm_head = false;
|
| 308 |
+
//
|
| 309 |
+
// while (true) {
|
| 310 |
+
// int32_t n_dims;
|
| 311 |
+
// int32_t length;
|
| 312 |
+
// int32_t ttype;
|
| 313 |
+
//
|
| 314 |
+
// fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
| 315 |
+
// fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
| 316 |
+
// fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
| 317 |
+
//
|
| 318 |
+
// if (fin.eof()) {
|
| 319 |
+
// break;
|
| 320 |
+
// }
|
| 321 |
+
//
|
| 322 |
+
// int32_t nelements = 1;
|
| 323 |
+
// int32_t ne[2] = { 1, 1 };
|
| 324 |
+
// for (int i = 0; i < n_dims; ++i) {
|
| 325 |
+
// fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
| 326 |
+
// nelements *= ne[i];
|
| 327 |
+
// }
|
| 328 |
+
//
|
| 329 |
+
// std::string name(length, 0);
|
| 330 |
+
// fin.read(&name[0], length);
|
| 331 |
+
//
|
| 332 |
+
// if (model.tensors.find(name) == model.tensors.end()) {
|
| 333 |
+
// fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
|
| 334 |
+
// return false;
|
| 335 |
+
// }
|
| 336 |
+
//
|
| 337 |
+
// auto tensor = model.tensors[name];
|
| 338 |
+
// if (ggml_nelements(tensor) != nelements) {
|
| 339 |
+
// fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
|
| 340 |
+
// return false;
|
| 341 |
+
// }
|
| 342 |
+
//
|
| 343 |
+
// if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
| 344 |
+
// fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
| 345 |
+
// __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
| 346 |
+
// return false;
|
| 347 |
+
// }
|
| 348 |
+
//
|
| 349 |
+
// // for debugging
|
| 350 |
+
// if (0) {
|
| 351 |
+
// printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
| 352 |
+
// }
|
| 353 |
+
//
|
| 354 |
+
// const size_t bpe = ggml_type_size(ggml_type(ttype));
|
| 355 |
+
//
|
| 356 |
+
// if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
| 357 |
+
// fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
| 358 |
+
// __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
|
| 359 |
+
// return false;
|
| 360 |
+
// }
|
| 361 |
+
//
|
| 362 |
+
// fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
| 363 |
+
//
|
| 364 |
+
// // GPT-2 models share the WTE tensor as the LM head
|
| 365 |
+
// if (name == "model/wte" && has_lm_head == false) {
|
| 366 |
+
// memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
|
| 367 |
+
// }
|
| 368 |
+
//
|
| 369 |
+
// if (name == "model/lm_head") {
|
| 370 |
+
// has_lm_head = true;
|
| 371 |
+
// }
|
| 372 |
+
//
|
| 373 |
+
// total_size += ggml_nbytes(tensor);
|
| 374 |
+
// }
|
| 375 |
+
//
|
| 376 |
+
// printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
|
| 377 |
+
// }
|
| 378 |
+
//
|
| 379 |
+
// fin.close();
|
| 380 |
+
//
|
| 381 |
+
// return true;
|
| 382 |
+
//}
|
| 383 |
+
//
|
| 384 |
+
//// evaluate the transformer
|
| 385 |
+
////
|
| 386 |
+
//// - model: the model
|
| 387 |
+
//// - n_threads: number of threads to use
|
| 388 |
+
//// - n_past: the context size so far
|
| 389 |
+
//// - embd_inp: the embeddings of the tokens in the context
|
| 390 |
+
//// - embd_w: the predicted logits for the next token
|
| 391 |
+
////
|
| 392 |
+
//bool gpt2_eval(
|
| 393 |
+
// const gpt2_model & model,
|
| 394 |
+
// const int n_threads,
|
| 395 |
+
// const int n_past,
|
| 396 |
+
// const std::vector<gpt_vocab::id> & embd_inp,
|
| 397 |
+
// std::vector<float> & embd_w,
|
| 398 |
+
// size_t & mem_per_token) {
|
| 399 |
+
// const int N = embd_inp.size();
|
| 400 |
+
//
|
| 401 |
+
// const auto & hparams = model.hparams;
|
| 402 |
+
//
|
| 403 |
+
// const int n_embd = hparams.n_embd;
|
| 404 |
+
// const int n_layer = hparams.n_layer;
|
| 405 |
+
// const int n_ctx = hparams.n_ctx;
|
| 406 |
+
// const int n_head = hparams.n_head;
|
| 407 |
+
// const int n_vocab = hparams.n_vocab;
|
| 408 |
+
//
|
| 409 |
+
// static size_t buf_size = 256u*1024*1024;
|
| 410 |
+
// static void * buf = malloc(buf_size);
|
| 411 |
+
//
|
| 412 |
+
// if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
| 413 |
+
// const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
| 414 |
+
// //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
| 415 |
+
//
|
| 416 |
+
// // reallocate
|
| 417 |
+
// buf_size = buf_size_new;
|
| 418 |
+
// buf = realloc(buf, buf_size);
|
| 419 |
+
// if (buf == nullptr) {
|
| 420 |
+
// fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
| 421 |
+
// return false;
|
| 422 |
+
// }
|
| 423 |
+
// }
|
| 424 |
+
//
|
| 425 |
+
// struct ggml_init_params params = {
|
| 426 |
+
// /*.mem_size =*/ buf_size,
|
| 427 |
+
// /*.mem_buffer =*/ buf,
|
| 428 |
+
// /*.no_alloc =*/ false,
|
| 429 |
+
// };
|
| 430 |
+
//
|
| 431 |
+
// struct ggml_context * ctx0 = ggml_init(params);
|
| 432 |
+
// struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
| 433 |
+
//
|
| 434 |
+
// struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
| 435 |
+
// memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
| 436 |
+
//
|
| 437 |
+
// struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
| 438 |
+
// for (int i = 0; i < N; ++i) {
|
| 439 |
+
// ((int32_t *) position->data)[i] = n_past + i;
|
| 440 |
+
// }
|
| 441 |
+
//
|
| 442 |
+
// // wte + wpe
|
| 443 |
+
// struct ggml_tensor * inpL =
|
| 444 |
+
// ggml_add(ctx0,
|
| 445 |
+
// ggml_get_rows(ctx0, model.wte, embd),
|
| 446 |
+
// ggml_get_rows(ctx0, model.wpe, position));
|
| 447 |
+
//
|
| 448 |
+
// for (int il = 0; il < n_layer; ++il) {
|
| 449 |
+
// struct ggml_tensor * cur;
|
| 450 |
+
//
|
| 451 |
+
// // norm
|
| 452 |
+
// {
|
| 453 |
+
// // [ 768, N]
|
| 454 |
+
// cur = ggml_norm(ctx0, inpL, hparams.eps);
|
| 455 |
+
//
|
| 456 |
+
// // cur = ln_1_g*cur + ln_1_b
|
| 457 |
+
// // [ 768, N]
|
| 458 |
+
// cur = ggml_add(ctx0,
|
| 459 |
+
// ggml_mul(ctx0,
|
| 460 |
+
// ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
| 461 |
+
// cur),
|
| 462 |
+
// ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
| 463 |
+
// }
|
| 464 |
+
//
|
| 465 |
+
// // attn
|
| 466 |
+
// // [2304, 768] - model.layers[il].c_attn_attn_w
|
| 467 |
+
// // [2304, 1] - model.layers[il].c_attn_attn_b
|
| 468 |
+
// // [ 768, N] - cur (in)
|
| 469 |
+
// // [2304, N] - cur (out)
|
| 470 |
+
// //
|
| 471 |
+
// // cur = attn_w*cur + attn_b
|
| 472 |
+
// // [2304, N]
|
| 473 |
+
// {
|
| 474 |
+
// cur = ggml_mul_mat(ctx0,
|
| 475 |
+
// model.layers[il].c_attn_attn_w,
|
| 476 |
+
// cur);
|
| 477 |
+
//
|
| 478 |
+
// cur = ggml_add(ctx0,
|
| 479 |
+
// ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
|
| 480 |
+
// cur);
|
| 481 |
+
// }
|
| 482 |
+
//
|
| 483 |
+
// // self-attention
|
| 484 |
+
// {
|
| 485 |
+
// struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
| 486 |
+
// struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
|
| 487 |
+
// struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
|
| 488 |
+
//
|
| 489 |
+
// // store key and value to memory
|
| 490 |
+
// if (N >= 1) {
|
| 491 |
+
// struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
| 492 |
+
// struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
|
| 493 |
+
//
|
| 494 |
+
// ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
| 495 |
+
// ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
| 496 |
+
// }
|
| 497 |
+
//
|
| 498 |
+
// // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
| 499 |
+
// // [64, N, 12]
|
| 500 |
+
// struct ggml_tensor * Q =
|
| 501 |
+
// ggml_permute(ctx0,
|
| 502 |
+
// ggml_cpy(ctx0,
|
| 503 |
+
// Qcur,
|
| 504 |
+
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
| 505 |
+
// 0, 2, 1, 3);
|
| 506 |
+
//
|
| 507 |
+
// // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
| 508 |
+
// // [64, n_past + N, 12]
|
| 509 |
+
// struct ggml_tensor * K =
|
| 510 |
+
// ggml_permute(ctx0,
|
| 511 |
+
// ggml_reshape_3d(ctx0,
|
| 512 |
+
// ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
| 513 |
+
// n_embd/n_head, n_head, n_past + N),
|
| 514 |
+
// 0, 2, 1, 3);
|
| 515 |
+
//
|
| 516 |
+
// // GG: flash attention
|
| 517 |
+
// //struct ggml_tensor * V =
|
| 518 |
+
// // ggml_cpy(ctx0,
|
| 519 |
+
// // ggml_permute(ctx0,
|
| 520 |
+
// // ggml_reshape_3d(ctx0,
|
| 521 |
+
// // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
| 522 |
+
// // n_embd/n_head, n_head, n_past + N),
|
| 523 |
+
// // 1, 2, 0, 3),
|
| 524 |
+
// // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
| 525 |
+
//
|
| 526 |
+
// //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
|
| 527 |
+
//
|
| 528 |
+
// // K * Q
|
| 529 |
+
// // [n_past + N, N, 12]
|
| 530 |
+
// struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 531 |
+
//
|
| 532 |
+
// // KQ_scaled = KQ / sqrt(n_embd/n_head)
|
| 533 |
+
// // [n_past + N, N, 12]
|
| 534 |
+
// struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
|
| 535 |
+
//
|
| 536 |
+
// // KQ_masked = mask_past(KQ_scaled)
|
| 537 |
+
// // [n_past + N, N, 12]
|
| 538 |
+
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
| 539 |
+
//
|
| 540 |
+
// // KQ = soft_max(KQ_masked)
|
| 541 |
+
// // [n_past + N, N, 12]
|
| 542 |
+
// struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
| 543 |
+
//
|
| 544 |
+
// // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
| 545 |
+
// // [n_past + N, 64, 12]
|
| 546 |
+
// struct ggml_tensor * V_trans =
|
| 547 |
+
// ggml_cpy(ctx0,
|
| 548 |
+
// ggml_permute(ctx0,
|
| 549 |
+
// ggml_reshape_3d(ctx0,
|
| 550 |
+
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
| 551 |
+
// n_embd/n_head, n_head, n_past + N),
|
| 552 |
+
// 1, 2, 0, 3),
|
| 553 |
+
// ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
|
| 554 |
+
//
|
| 555 |
+
// // KQV = transpose(V) * KQ_soft_max
|
| 556 |
+
// // [64, N, 12]
|
| 557 |
+
// struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
| 558 |
+
//
|
| 559 |
+
// // KQV_merged = KQV.permute(0, 2, 1, 3)
|
| 560 |
+
// // [64, 12, N]
|
| 561 |
+
// struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
| 562 |
+
//
|
| 563 |
+
// // cur = KQV_merged.contiguous().view(n_embd, N)
|
| 564 |
+
// // [768, N]
|
| 565 |
+
// cur = ggml_cpy(ctx0,
|
| 566 |
+
// KQV_merged,
|
| 567 |
+
// ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
| 568 |
+
// }
|
| 569 |
+
//
|
| 570 |
+
// // projection
|
| 571 |
+
// // [ 768, 768] - model.layers[il].c_attn_proj_w
|
| 572 |
+
// // [ 768, 1] - model.layers[il].c_attn_proj_b
|
| 573 |
+
// // [ 768, N] - cur (in)
|
| 574 |
+
// // [ 768, N] - cur (out)
|
| 575 |
+
// //
|
| 576 |
+
// // cur = proj_w*cur + proj_b
|
| 577 |
+
// // [768, N]
|
| 578 |
+
// {
|
| 579 |
+
// cur = ggml_mul_mat(ctx0,
|
| 580 |
+
// model.layers[il].c_attn_proj_w,
|
| 581 |
+
// cur);
|
| 582 |
+
//
|
| 583 |
+
// cur = ggml_add(ctx0,
|
| 584 |
+
// ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
|
| 585 |
+
// cur);
|
| 586 |
+
// }
|
| 587 |
+
//
|
| 588 |
+
// // add the input
|
| 589 |
+
// cur = ggml_add(ctx0, cur, inpL);
|
| 590 |
+
//
|
| 591 |
+
// struct ggml_tensor * inpFF = cur;
|
| 592 |
+
//
|
| 593 |
+
// // feed-forward network
|
| 594 |
+
// {
|
| 595 |
+
// // norm
|
| 596 |
+
// {
|
| 597 |
+
// cur = ggml_norm(ctx0, inpFF, hparams.eps);
|
| 598 |
+
//
|
| 599 |
+
// // cur = ln_2_g*cur + ln_2_b
|
| 600 |
+
// // [ 768, N]
|
| 601 |
+
// cur = ggml_add(ctx0,
|
| 602 |
+
// ggml_mul(ctx0,
|
| 603 |
+
// ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
|
| 604 |
+
// cur),
|
| 605 |
+
// ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
| 606 |
+
// }
|
| 607 |
+
//
|
| 608 |
+
// // fully connected
|
| 609 |
+
// // [3072, 768] - model.layers[il].c_mlp_fc_w
|
| 610 |
+
// // [3072, 1] - model.layers[il].c_mlp_fc_b
|
| 611 |
+
// // [ 768, N] - cur (in)
|
| 612 |
+
// // [3072, N] - cur (out)
|
| 613 |
+
// //
|
| 614 |
+
// // cur = fc_w*cur + fc_b
|
| 615 |
+
// // [3072, N]
|
| 616 |
+
// cur = ggml_mul_mat(ctx0,
|
| 617 |
+
// model.layers[il].c_mlp_fc_w,
|
| 618 |
+
// cur);
|
| 619 |
+
//
|
| 620 |
+
// cur = ggml_add(ctx0,
|
| 621 |
+
// ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
| 622 |
+
// cur);
|
| 623 |
+
//
|
| 624 |
+
// // GELU activation
|
| 625 |
+
// // [3072, N]
|
| 626 |
+
// cur = ggml_gelu(ctx0, cur);
|
| 627 |
+
//
|
| 628 |
+
// // projection
|
| 629 |
+
// // [ 768, 3072] - model.layers[il].c_mlp_proj_w
|
| 630 |
+
// // [ 768, 1] - model.layers[il].c_mlp_proj_b
|
| 631 |
+
// // [3072, N] - cur (in)
|
| 632 |
+
// // [ 768, N] - cur (out)
|
| 633 |
+
// //
|
| 634 |
+
// // cur = proj_w*cur + proj_b
|
| 635 |
+
// // [768, N]
|
| 636 |
+
// cur = ggml_mul_mat(ctx0,
|
| 637 |
+
// model.layers[il].c_mlp_proj_w,
|
| 638 |
+
// cur);
|
| 639 |
+
//
|
| 640 |
+
// cur = ggml_add(ctx0,
|
| 641 |
+
// ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
| 642 |
+
// cur);
|
| 643 |
+
// }
|
| 644 |
+
//
|
| 645 |
+
// // input for next layer
|
| 646 |
+
// inpL = ggml_add(ctx0, cur, inpFF);
|
| 647 |
+
// }
|
| 648 |
+
//
|
| 649 |
+
// // norm
|
| 650 |
+
// {
|
| 651 |
+
// // [ 768, N]
|
| 652 |
+
// inpL = ggml_norm(ctx0, inpL, hparams.eps);
|
| 653 |
+
//
|
| 654 |
+
// // inpL = ln_f_g*inpL + ln_f_b
|
| 655 |
+
// // [ 768, N]
|
| 656 |
+
// inpL = ggml_add(ctx0,
|
| 657 |
+
// ggml_mul(ctx0,
|
| 658 |
+
// ggml_repeat(ctx0, model.ln_f_g, inpL),
|
| 659 |
+
// inpL),
|
| 660 |
+
// ggml_repeat(ctx0, model.ln_f_b, inpL));
|
| 661 |
+
// }
|
| 662 |
+
//
|
| 663 |
+
// // inpL = WTE * inpL
|
| 664 |
+
// // [ 768, 50257] - model.lm_head
|
| 665 |
+
// // [ 768, N] - inpL
|
| 666 |
+
// inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
|
| 667 |
+
//
|
| 668 |
+
// // logits -> probs
|
| 669 |
+
// //inpL = ggml_soft_max_inplace(ctx0, inpL);
|
| 670 |
+
//
|
| 671 |
+
// // run the computation
|
| 672 |
+
// ggml_build_forward_expand(gf, inpL);
|
| 673 |
+
// ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
|
| 674 |
+
//
|
| 675 |
+
// //if (n_past%100 == 0) {
|
| 676 |
+
// // ggml_graph_print (&gf);
|
| 677 |
+
// // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
| 678 |
+
// //}
|
| 679 |
+
//
|
| 680 |
+
// //embd_w.resize(n_vocab*N);
|
| 681 |
+
// //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
| 682 |
+
//
|
| 683 |
+
// // return result just for the last token
|
| 684 |
+
// embd_w.resize(n_vocab);
|
| 685 |
+
// memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
| 686 |
+
//
|
| 687 |
+
// if (mem_per_token == 0) {
|
| 688 |
+
// mem_per_token = ggml_used_mem(ctx0)/N;
|
| 689 |
+
// }
|
| 690 |
+
// //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
| 691 |
+
//
|
| 692 |
+
// ggml_free(ctx0);
|
| 693 |
+
//
|
| 694 |
+
// return true;
|
| 695 |
+
//}
|
| 696 |
+
//
|
| 697 |
+
//int main(int argc, char ** argv) {
|
| 698 |
+
// ggml_time_init();
|
| 699 |
+
//
|
| 700 |
+
// const int64_t t_main_start_us = ggml_time_us();
|
| 701 |
+
//
|
| 702 |
+
// gpt_params params;
|
| 703 |
+
// params.model = "ggml-model-gpt-2-774M.bin";
|
| 704 |
+
//
|
| 705 |
+
// if (gpt_params_parse(argc, argv, params) == false) {
|
| 706 |
+
// return 1;
|
| 707 |
+
// }
|
| 708 |
+
//
|
| 709 |
+
// if (params.seed < 0) {
|
| 710 |
+
// params.seed = time(NULL);
|
| 711 |
+
// }
|
| 712 |
+
//
|
| 713 |
+
// printf("%s: seed = %d\n", __func__, params.seed);
|
| 714 |
+
//
|
| 715 |
+
// std::mt19937 rng(params.seed);
|
| 716 |
+
// if (params.prompt.empty()) {
|
| 717 |
+
// params.prompt = gpt_random_prompt(rng);
|
| 718 |
+
// }
|
| 719 |
+
//
|
| 720 |
+
// int64_t t_load_us = 0;
|
| 721 |
+
//
|
| 722 |
+
// gpt_vocab vocab;
|
| 723 |
+
// gpt2_model model;
|
| 724 |
+
//
|
| 725 |
+
// // load the model
|
| 726 |
+
// {
|
| 727 |
+
// const int64_t t_start_us = ggml_time_us();
|
| 728 |
+
//
|
| 729 |
+
// if (!gpt2_model_load(params.model, model, vocab)) {
|
| 730 |
+
// fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
| 731 |
+
// return 1;
|
| 732 |
+
// }
|
| 733 |
+
//
|
| 734 |
+
// t_load_us = ggml_time_us() - t_start_us;
|
| 735 |
+
//
|
| 736 |
+
// test_gpt_tokenizer(vocab, params.token_test);
|
| 737 |
+
// }
|
| 738 |
+
//
|
| 739 |
+
// while(true) {
|
| 740 |
+
// int n_past = 0;
|
| 741 |
+
//
|
| 742 |
+
// int64_t t_sample_us = 0;
|
| 743 |
+
// int64_t t_predict_us = 0;
|
| 744 |
+
//
|
| 745 |
+
// std::vector<float> logits;
|
| 746 |
+
//
|
| 747 |
+
// // tokenize the prompt
|
| 748 |
+
// std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
|
| 749 |
+
//
|
| 750 |
+
// params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
|
| 751 |
+
//
|
| 752 |
+
// printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
| 753 |
+
// printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
|
| 754 |
+
// for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
|
| 755 |
+
// printf("%d ", embd_inp[i]);
|
| 756 |
+
// }
|
| 757 |
+
// printf("\n\n");
|
| 758 |
+
//
|
| 759 |
+
// // submit the input prompt token-by-token
|
| 760 |
+
// // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
|
| 761 |
+
// std::vector<gpt_vocab::id> embd;
|
| 762 |
+
//
|
| 763 |
+
// // determine the required inference memory per token:
|
| 764 |
+
// size_t mem_per_token = 0;
|
| 765 |
+
// gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
| 766 |
+
//
|
| 767 |
+
// for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
|
| 768 |
+
// // predict
|
| 769 |
+
// if (embd.size() > 0) {
|
| 770 |
+
// const int64_t t_start_us = ggml_time_us();
|
| 771 |
+
//
|
| 772 |
+
// if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
|
| 773 |
+
// printf("Failed to predict\n");
|
| 774 |
+
// return 1;
|
| 775 |
+
// }
|
| 776 |
+
//
|
| 777 |
+
// t_predict_us += ggml_time_us() - t_start_us;
|
| 778 |
+
// }
|
| 779 |
+
//
|
| 780 |
+
// n_past += embd.size();
|
| 781 |
+
// embd.clear();
|
| 782 |
+
//
|
| 783 |
+
// if (i >= embd_inp.size()) {
|
| 784 |
+
// // sample next token
|
| 785 |
+
// const int top_k = params.top_k;
|
| 786 |
+
// const float top_p = params.top_p;
|
| 787 |
+
// const float temp = params.temp;
|
| 788 |
+
//
|
| 789 |
+
// const int n_vocab = model.hparams.n_vocab;
|
| 790 |
+
//
|
| 791 |
+
// gpt_vocab::id id = 0;
|
| 792 |
+
//
|
| 793 |
+
// {
|
| 794 |
+
// const int64_t t_start_sample_us = ggml_time_us();
|
| 795 |
+
//
|
| 796 |
+
// id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
|
| 797 |
+
//
|
| 798 |
+
// t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 799 |
+
// }
|
| 800 |
+
//
|
| 801 |
+
// // add it to the context
|
| 802 |
+
// embd.push_back(id);
|
| 803 |
+
// } else {
|
| 804 |
+
// // if here, it means we are still processing the input prompt
|
| 805 |
+
// for (size_t k = i; k < embd_inp.size(); k++) {
|
| 806 |
+
// embd.push_back(embd_inp[k]);
|
| 807 |
+
// if (int32_t(embd.size()) >= params.n_batch) {
|
| 808 |
+
// break;
|
| 809 |
+
// }
|
| 810 |
+
// }
|
| 811 |
+
// i += embd.size() - 1;
|
| 812 |
+
// }
|
| 813 |
+
//
|
| 814 |
+
// // display text
|
| 815 |
+
// for (auto id : embd) {
|
| 816 |
+
// printf("%s", vocab.id_to_token[id].c_str());
|
| 817 |
+
// }
|
| 818 |
+
// fflush(stdout);
|
| 819 |
+
//
|
| 820 |
+
// // end of text token
|
| 821 |
+
// if (embd.back() == 50256) {
|
| 822 |
+
// // report timing
|
| 823 |
+
// {
|
| 824 |
+
// const int64_t t_main_end_us = ggml_time_us();
|
| 825 |
+
//
|
| 826 |
+
// printf("\n\n");
|
| 827 |
+
// printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
|
| 828 |
+
// printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
|
| 829 |
+
// printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
|
| 830 |
+
// printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
|
| 831 |
+
// printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
|
| 832 |
+
// }
|
| 833 |
+
// break;
|
| 834 |
+
// }
|
| 835 |
+
// }
|
| 836 |
+
// }
|
| 837 |
+
//
|
| 838 |
+
// ggml_free(model.ctx_w);
|
| 839 |
+
//
|
| 840 |
+
// return 0;
|
| 841 |
+
//}
|
quantize.cpp
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "ggml.h"
|
| 2 |
+
|
| 3 |
+
#include "common.h"
|
| 4 |
+
#include "common-ggml.h"
|
| 5 |
+
|
| 6 |
+
#include <cassert>
|
| 7 |
+
#include <cmath>
|
| 8 |
+
#include <cstdio>
|
| 9 |
+
#include <cstring>
|
| 10 |
+
#include <fstream>
|
| 11 |
+
#include <map>
|
| 12 |
+
#include <string>
|
| 13 |
+
#include <vector>
|
| 14 |
+
#include <regex>
|
| 15 |
+
|
| 16 |
+
// default hparams (GPT-2 117M)
// Hyperparameters read from / written to the ggml model file header.
// Defaults correspond to the smallest (117M) GPT-2 checkpoint.
struct gpt2_hparams {
    int32_t n_vocab = 50257;  // vocabulary size (BPE tokens)
    int32_t n_ctx   = 1024;   // maximum context length (positions)
    int32_t n_embd  = 768;    // embedding / hidden dimension
    int32_t n_head  = 12;     // number of attention heads
    int32_t n_layer = 12;     // number of transformer layers
    int32_t ftype   = 1;      // tensor data type id; also encodes quantization version (see GGML_QNT_VERSION_FACTOR)
};
|
| 25 |
+
|
| 26 |
+
// quantize a model
|
| 27 |
+
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
|
| 28 |
+
gpt_vocab vocab;
|
| 29 |
+
|
| 30 |
+
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
|
| 31 |
+
|
| 32 |
+
auto finp = std::ifstream(fname_inp, std::ios::binary);
|
| 33 |
+
if (!finp) {
|
| 34 |
+
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
|
| 35 |
+
return false;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
auto fout = std::ofstream(fname_out, std::ios::binary);
|
| 39 |
+
if (!fout) {
|
| 40 |
+
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
|
| 41 |
+
return false;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
// verify magic
|
| 45 |
+
{
|
| 46 |
+
uint32_t magic;
|
| 47 |
+
finp.read((char *) &magic, sizeof(magic));
|
| 48 |
+
if (magic != GGML_FILE_MAGIC) {
|
| 49 |
+
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
|
| 50 |
+
return false;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
fout.write((char *) &magic, sizeof(magic));
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
gpt2_hparams hparams;
|
| 57 |
+
|
| 58 |
+
// load hparams
|
| 59 |
+
{
|
| 60 |
+
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
| 61 |
+
finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
| 62 |
+
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
| 63 |
+
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
| 64 |
+
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
| 65 |
+
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
| 66 |
+
|
| 67 |
+
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
| 68 |
+
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
|
| 69 |
+
|
| 70 |
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
| 71 |
+
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
| 72 |
+
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
| 73 |
+
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
| 74 |
+
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
| 75 |
+
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
|
| 76 |
+
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
|
| 77 |
+
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
| 78 |
+
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
| 79 |
+
|
| 80 |
+
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
| 81 |
+
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
| 82 |
+
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
| 83 |
+
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
|
| 84 |
+
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
| 85 |
+
fout.write((char *) &ftype_dst, sizeof(ftype_dst));
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
// load vocab
|
| 89 |
+
{
|
| 90 |
+
int32_t n_vocab = 0;
|
| 91 |
+
finp.read ((char *) &n_vocab, sizeof(n_vocab));
|
| 92 |
+
fout.write((char *) &n_vocab, sizeof(n_vocab));
|
| 93 |
+
|
| 94 |
+
if (n_vocab != hparams.n_vocab) {
|
| 95 |
+
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
| 96 |
+
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
|
| 97 |
+
return false;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
std::string word;
|
| 101 |
+
for (int i = 0; i < n_vocab; i++) {
|
| 102 |
+
uint32_t len;
|
| 103 |
+
finp.read ((char *) &len, sizeof(len));
|
| 104 |
+
fout.write((char *) &len, sizeof(len));
|
| 105 |
+
|
| 106 |
+
word.resize(len);
|
| 107 |
+
finp.read ((char *) word.data(), len);
|
| 108 |
+
fout.write((char *) word.data(), len);
|
| 109 |
+
|
| 110 |
+
vocab.token_to_id[word] = i;
|
| 111 |
+
vocab.id_to_token[i] = word;
|
| 112 |
+
}
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// regexes of tensor names to be quantized
|
| 116 |
+
const std::vector<std::string> to_quant = {
|
| 117 |
+
"model/wte",
|
| 118 |
+
"model/lm_head",
|
| 119 |
+
"model/h.*/attn/c_attn/w",
|
| 120 |
+
"model/h.*/attn/c_proj/w",
|
| 121 |
+
"model/h.*/mlp/c_fc/w",
|
| 122 |
+
"model/h.*/mlp/c_proj/w",
|
| 123 |
+
};
|
| 124 |
+
|
| 125 |
+
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
| 126 |
+
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
|
| 127 |
+
return false;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
finp.close();
|
| 131 |
+
fout.close();
|
| 132 |
+
|
| 133 |
+
return true;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
// usage:
|
| 137 |
+
// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
|
| 138 |
+
//
|
| 139 |
+
int main(int argc, char ** argv) {
|
| 140 |
+
if (argc != 4) {
|
| 141 |
+
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
| 142 |
+
ggml_print_ftypes(stderr);
|
| 143 |
+
return 1;
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
// needed to initialize f16 tables
|
| 147 |
+
{
|
| 148 |
+
struct ggml_init_params params = { 0, NULL, false };
|
| 149 |
+
struct ggml_context * ctx = ggml_init(params);
|
| 150 |
+
ggml_free(ctx);
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
const std::string fname_inp = argv[1];
|
| 154 |
+
const std::string fname_out = argv[2];
|
| 155 |
+
|
| 156 |
+
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
|
| 157 |
+
|
| 158 |
+
const int64_t t_main_start_us = ggml_time_us();
|
| 159 |
+
|
| 160 |
+
int64_t t_quantize_us = 0;
|
| 161 |
+
|
| 162 |
+
// load the model
|
| 163 |
+
{
|
| 164 |
+
const int64_t t_start_us = ggml_time_us();
|
| 165 |
+
|
| 166 |
+
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
|
| 167 |
+
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
| 168 |
+
return 1;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
t_quantize_us = ggml_time_us() - t_start_us;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
// report timing
|
| 175 |
+
{
|
| 176 |
+
const int64_t t_main_end_us = ggml_time_us();
|
| 177 |
+
|
| 178 |
+
printf("\n");
|
| 179 |
+
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
|
| 180 |
+
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
return 0;
|
| 184 |
+
}
|