CrashOverrideX commited on
Commit
67c993e
·
verified ·
1 Parent(s): 0a7f9e7

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. llama.cpp/.devops/cann.Dockerfile +130 -0
  2. llama.cpp/.devops/cpu.Dockerfile +88 -0
  3. llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  4. llama.cpp/.devops/cuda.Dockerfile +94 -0
  5. llama.cpp/.devops/intel.Dockerfile +95 -0
  6. llama.cpp/.devops/llama-cli-cann.Dockerfile +45 -0
  7. llama.cpp/.devops/llama-cpp-cuda.srpm.spec +85 -0
  8. llama.cpp/.devops/llama-cpp.srpm.spec +87 -0
  9. llama.cpp/.devops/musa.Dockerfile +101 -0
  10. llama.cpp/.devops/rocm.Dockerfile +113 -0
  11. llama.cpp/.devops/s390x.Dockerfile +126 -0
  12. llama.cpp/.devops/tools.sh +53 -0
  13. llama.cpp/.devops/vulkan.Dockerfile +90 -0
  14. llama.cpp/.gemini/settings.json +1 -0
  15. llama.cpp/.github/labeler.yml +106 -0
  16. llama.cpp/.github/pull_request_template.md +1 -0
  17. llama.cpp/build/CMakeCache.txt +91 -0
  18. llama.cpp/ci/README-MUSA.md +35 -0
  19. llama.cpp/ci/README.md +33 -0
  20. llama.cpp/ci/run.sh +709 -0
  21. llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  22. llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  23. llama.cpp/cmake/build-info.cmake +48 -0
  24. llama.cpp/cmake/common.cmake +58 -0
  25. llama.cpp/cmake/download-models.cmake +21 -0
  26. llama.cpp/cmake/git-vars.cmake +22 -0
  27. llama.cpp/cmake/license.cmake +40 -0
  28. llama.cpp/cmake/llama-config.cmake.in +30 -0
  29. llama.cpp/cmake/llama.pc.in +10 -0
  30. llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  31. llama.cpp/cmake/x64-windows-llvm.cmake +5 -0
  32. llama.cpp/common/CMakeLists.txt +149 -0
  33. llama.cpp/common/arg.cpp +0 -0
  34. llama.cpp/common/arg.h +131 -0
  35. llama.cpp/common/base64.hpp +392 -0
  36. llama.cpp/common/build-info.cpp.in +4 -0
  37. llama.cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  38. llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  39. llama.cpp/common/chat-parser.cpp +1649 -0
  40. llama.cpp/common/chat-parser.h +133 -0
  41. llama.cpp/common/chat-peg-parser.cpp +124 -0
  42. llama.cpp/common/chat-peg-parser.h +105 -0
  43. llama.cpp/common/chat.cpp +0 -0
  44. llama.cpp/common/chat.h +252 -0
  45. llama.cpp/common/common.cpp +1824 -0
  46. llama.cpp/common/common.h +931 -0
  47. llama.cpp/common/console.cpp +1137 -0
  48. llama.cpp/common/console.h +41 -0
  49. llama.cpp/common/debug.cpp +167 -0
  50. llama.cpp/common/debug.h +43 -0
llama.cpp/.devops/cann.Dockerfile ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # ARGUMENTS
3
+ # ==============================================================================
4
+
5
+ # Define the CANN base image for easier version updates later
6
+ ARG CHIP_TYPE=910b
7
+ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
8
+
9
+ # ==============================================================================
10
+ # BUILD STAGE
11
+ # Compile all binary files and libraries
12
+ # ==============================================================================
13
+ FROM ${CANN_BASE_IMAGE} AS build
14
+
15
+ # -- Install build dependencies --
16
+ RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
17
+ yum clean all && \
18
+ rm -rf /var/cache/yum
19
+
20
+ # -- Set the working directory --
21
+ WORKDIR /app
22
+
23
+ # -- Copy project files --
24
+ COPY . .
25
+
26
+ # -- Set CANN environment variables (required for compilation) --
27
+ # Using ENV instead of `source` allows environment variables to persist across the entire image layer
28
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
29
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
30
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
31
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
32
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
33
+ # ... You can add other environment variables from the original file as needed ...
34
+ # For brevity, only core variables are listed here. You can paste the original ENV list here.
35
+
36
+ # -- Build llama.cpp --
37
+ # Use the passed CHIP_TYPE argument and add general build options
38
+ ARG CHIP_TYPE
39
+ RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
40
+ && \
41
+ cmake -B build \
42
+ -DGGML_CANN=ON \
43
+ -DCMAKE_BUILD_TYPE=Release \
44
+ -DSOC_TYPE=ascend${CHIP_TYPE} \
45
+ -DUSE_ACL_GRAPH=ON \
46
+ . && \
47
+ cmake --build build --config Release -j$(nproc)
48
+
49
+ # -- Organize build artifacts for copying in later stages --
50
+ # Create a lib directory to store all .so files
51
+ RUN mkdir -p /app/lib && \
52
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
53
+
54
+ # Create a full directory to store all executables and Python scripts
55
+ RUN mkdir -p /app/full && \
56
+ cp build/bin/* /app/full/ && \
57
+ cp *.py /app/full/ && \
58
+ cp -r gguf-py /app/full/ && \
59
+ cp -r requirements /app/full/ && \
60
+ cp requirements.txt /app/full/
61
+ # If you have a tools.sh script, make sure it is copied here
62
+ # cp .devops/tools.sh /app/full/tools.sh
63
+
64
+ # ==============================================================================
65
+ # BASE STAGE
66
+ # Create a minimal base image with CANN runtime and common libraries
67
+ # ==============================================================================
68
+ FROM ${CANN_BASE_IMAGE} AS base
69
+
70
+ # -- Install runtime dependencies --
71
+ RUN yum install -y libgomp curl && \
72
+ yum clean all && \
73
+ rm -rf /var/cache/yum
74
+
75
+ # -- Set CANN environment variables (required for runtime) --
76
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
77
+ ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
78
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
79
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
80
+ # ... You can add other environment variables from the original file as needed ...
81
+
82
+ WORKDIR /app
83
+
84
+ # Copy compiled .so files from the build stage
85
+ COPY --from=build /app/lib/ /app
86
+
87
+ # ==============================================================================
88
+ # FINAL STAGES (TARGETS)
89
+ # ==============================================================================
90
+
91
+ ### Target: full
92
+ # Complete image with all tools, Python bindings, and dependencies
93
+ # ==============================================================================
94
+ FROM base AS full
95
+
96
+ COPY --from=build /app/full /app
97
+
98
+ # Install Python dependencies
99
+ RUN yum install -y git python3 python3-pip && \
100
+ pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
101
+ pip3 install --no-cache-dir -r requirements.txt && \
102
+ yum clean all && \
103
+ rm -rf /var/cache/yum
104
+
105
+ # You need to provide a tools.sh script as the entrypoint
106
+ ENTRYPOINT ["/app/tools.sh"]
107
+ # If there is no tools.sh, you can set the default to start the server
108
+ # ENTRYPOINT ["/app/llama-server"]
109
+
110
+ ### Target: light
111
+ # Lightweight image containing only llama-cli and llama-completion
112
+ # ==============================================================================
113
+ FROM base AS light
114
+
115
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
116
+
117
+ ENTRYPOINT [ "/app/llama-cli" ]
118
+
119
+ ### Target: server
120
+ # Dedicated server image containing only llama-server
121
+ # ==============================================================================
122
+ FROM base AS server
123
+
124
+ ENV LLAMA_ARG_HOST=0.0.0.0
125
+
126
+ COPY --from=build /app/full/llama-server /app
127
+
128
+ HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
129
+
130
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/cpu.Dockerfile ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=22.04
2
+
3
+ FROM ubuntu:$UBUNTU_VERSION AS build
4
+
5
+ ARG TARGETARCH
6
+
7
+ RUN apt-get update && \
8
+ apt-get install -y build-essential git cmake libssl-dev
9
+
10
+ WORKDIR /app
11
+
12
+ COPY . .
13
+
14
+ RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
15
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
16
+ else \
17
+ echo "Unsupported architecture"; \
18
+ exit 1; \
19
+ fi && \
20
+ cmake --build build -j $(nproc)
21
+
22
+ RUN mkdir -p /app/lib && \
23
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
24
+
25
+ RUN mkdir -p /app/full \
26
+ && cp build/bin/* /app/full \
27
+ && cp *.py /app/full \
28
+ && cp -r gguf-py /app/full \
29
+ && cp -r requirements /app/full \
30
+ && cp requirements.txt /app/full \
31
+ && cp .devops/tools.sh /app/full/tools.sh
32
+
33
+ ## Base image
34
+ FROM ubuntu:$UBUNTU_VERSION AS base
35
+
36
+ RUN apt-get update \
37
+ && apt-get install -y libgomp1 curl\
38
+ && apt autoremove -y \
39
+ && apt clean -y \
40
+ && rm -rf /tmp/* /var/tmp/* \
41
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
42
+ && find /var/cache -type f -delete
43
+
44
+ COPY --from=build /app/lib/ /app
45
+
46
+ ### Full
47
+ FROM base AS full
48
+
49
+ COPY --from=build /app/full /app
50
+
51
+ WORKDIR /app
52
+
53
+ RUN apt-get update \
54
+ && apt-get install -y \
55
+ git \
56
+ python3 \
57
+ python3-pip \
58
+ && pip install --upgrade pip setuptools wheel \
59
+ && pip install -r requirements.txt \
60
+ && apt autoremove -y \
61
+ && apt clean -y \
62
+ && rm -rf /tmp/* /var/tmp/* \
63
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
64
+ && find /var/cache -type f -delete
65
+
66
+ ENTRYPOINT ["/app/tools.sh"]
67
+
68
+ ### Light, CLI only
69
+ FROM base AS light
70
+
71
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
72
+
73
+ WORKDIR /app
74
+
75
+ ENTRYPOINT [ "/app/llama-cli" ]
76
+
77
+ ### Server, Server only
78
+ FROM base AS server
79
+
80
+ ENV LLAMA_ARG_HOST=0.0.0.0
81
+
82
+ COPY --from=build /app/full/llama-server /app
83
+
84
+ WORKDIR /app
85
+
86
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
87
+
88
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/cuda-new.Dockerfile ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=24.04
2
+ # This needs to generally match the container host's environment.
3
+ ARG CUDA_VERSION=13.1.0
4
+ # Target the CUDA build image
5
+ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
6
+
7
+ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
8
+
9
+ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
10
+
11
+ # CUDA architecture to build for (defaults to all supported archs)
12
+ ARG CUDA_DOCKER_ARCH=default
13
+
14
+ RUN apt-get update && \
15
+ apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
16
+
17
+ WORKDIR /app
18
+
19
+ COPY . .
20
+
21
+ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
22
+ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
23
+ fi && \
24
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
25
+ cmake --build build --config Release -j$(nproc)
26
+
27
+ RUN mkdir -p /app/lib && \
28
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
29
+
30
+ RUN mkdir -p /app/full \
31
+ && cp build/bin/* /app/full \
32
+ && cp *.py /app/full \
33
+ && cp -r gguf-py /app/full \
34
+ && cp -r requirements /app/full \
35
+ && cp requirements.txt /app/full \
36
+ && cp .devops/tools.sh /app/full/tools.sh
37
+
38
+ ## Base image
39
+ FROM ${BASE_CUDA_RUN_CONTAINER} AS base
40
+
41
+ RUN apt-get update \
42
+ && apt-get install -y libgomp1 curl\
43
+ && apt autoremove -y \
44
+ && apt clean -y \
45
+ && rm -rf /tmp/* /var/tmp/* \
46
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
47
+ && find /var/cache -type f -delete
48
+
49
+ COPY --from=build /app/lib/ /app
50
+
51
+ ### Full
52
+ FROM base AS full
53
+
54
+ COPY --from=build /app/full /app
55
+
56
+ WORKDIR /app
57
+
58
+ RUN apt-get update \
59
+ && apt-get install -y \
60
+ git \
61
+ python3 \
62
+ python3-pip \
63
+ python3-wheel \
64
+ && pip install --break-system-packages --upgrade setuptools \
65
+ && pip install --break-system-packages -r requirements.txt \
66
+ && apt autoremove -y \
67
+ && apt clean -y \
68
+ && rm -rf /tmp/* /var/tmp/* \
69
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
70
+ && find /var/cache -type f -delete
71
+
72
+
73
+ ENTRYPOINT ["/app/tools.sh"]
74
+
75
+ ### Light, CLI only
76
+ FROM base AS light
77
+
78
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
79
+
80
+ WORKDIR /app
81
+
82
+ ENTRYPOINT [ "/app/llama-cli" ]
83
+
84
+ ### Server, Server only
85
+ FROM base AS server
86
+
87
+ ENV LLAMA_ARG_HOST=0.0.0.0
88
+
89
+ COPY --from=build /app/full/llama-server /app
90
+
91
+ WORKDIR /app
92
+
93
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
94
+
95
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/cuda.Dockerfile ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=22.04
2
+ # This needs to generally match the container host's environment.
3
+ ARG CUDA_VERSION=12.4.0
4
+ # Target the CUDA build image
5
+ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
6
+
7
+ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
8
+
9
+ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
10
+
11
+ # CUDA architecture to build for (defaults to all supported archs)
12
+ ARG CUDA_DOCKER_ARCH=default
13
+
14
+ RUN apt-get update && \
15
+ apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
16
+
17
+ WORKDIR /app
18
+
19
+ COPY . .
20
+
21
+ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
22
+ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
23
+ fi && \
24
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
25
+ cmake --build build --config Release -j$(nproc)
26
+
27
+ RUN mkdir -p /app/lib && \
28
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
29
+
30
+ RUN mkdir -p /app/full \
31
+ && cp build/bin/* /app/full \
32
+ && cp *.py /app/full \
33
+ && cp -r gguf-py /app/full \
34
+ && cp -r requirements /app/full \
35
+ && cp requirements.txt /app/full \
36
+ && cp .devops/tools.sh /app/full/tools.sh
37
+
38
+ ## Base image
39
+ FROM ${BASE_CUDA_RUN_CONTAINER} AS base
40
+
41
+ RUN apt-get update \
42
+ && apt-get install -y libgomp1 curl\
43
+ && apt autoremove -y \
44
+ && apt clean -y \
45
+ && rm -rf /tmp/* /var/tmp/* \
46
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
47
+ && find /var/cache -type f -delete
48
+
49
+ COPY --from=build /app/lib/ /app
50
+
51
+ ### Full
52
+ FROM base AS full
53
+
54
+ COPY --from=build /app/full /app
55
+
56
+ WORKDIR /app
57
+
58
+ RUN apt-get update \
59
+ && apt-get install -y \
60
+ git \
61
+ python3 \
62
+ python3-pip \
63
+ && pip install --upgrade pip setuptools wheel \
64
+ && pip install --break-system-packages -r requirements.txt \
65
+ && apt autoremove -y \
66
+ && apt clean -y \
67
+ && rm -rf /tmp/* /var/tmp/* \
68
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
69
+ && find /var/cache -type f -delete
70
+
71
+
72
+ ENTRYPOINT ["/app/tools.sh"]
73
+
74
+ ### Light, CLI only
75
+ FROM base AS light
76
+
77
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
78
+
79
+ WORKDIR /app
80
+
81
+ ENTRYPOINT [ "/app/llama-cli" ]
82
+
83
+ ### Server, Server only
84
+ FROM base AS server
85
+
86
+ ENV LLAMA_ARG_HOST=0.0.0.0
87
+
88
+ COPY --from=build /app/full/llama-server /app
89
+
90
+ WORKDIR /app
91
+
92
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
93
+
94
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/intel.Dockerfile ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04
2
+
3
+ ## Build Image
4
+
5
+ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
6
+
7
+ ARG GGML_SYCL_F16=OFF
8
+ RUN apt-get update && \
9
+ apt-get install -y git libssl-dev
10
+
11
+ WORKDIR /app
12
+
13
+ COPY . .
14
+
15
+ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
16
+ echo "GGML_SYCL_F16 is set" \
17
+ && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
18
+ fi && \
19
+ echo "Building with dynamic libs" && \
20
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
21
+ cmake --build build --config Release -j$(nproc)
22
+
23
+ RUN mkdir -p /app/lib && \
24
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
25
+
26
+ RUN mkdir -p /app/full \
27
+ && cp build/bin/* /app/full \
28
+ && cp *.py /app/full \
29
+ && cp -r gguf-py /app/full \
30
+ && cp -r requirements /app/full \
31
+ && cp requirements.txt /app/full \
32
+ && cp .devops/tools.sh /app/full/tools.sh
33
+
34
+ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
35
+
36
+ RUN apt-get update \
37
+ && apt-get install -y libgomp1 curl\
38
+ && apt autoremove -y \
39
+ && apt clean -y \
40
+ && rm -rf /tmp/* /var/tmp/* \
41
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
42
+ && find /var/cache -type f -delete
43
+
44
+ ### Full
45
+ FROM base AS full
46
+
47
+ COPY --from=build /app/lib/ /app
48
+ COPY --from=build /app/full /app
49
+
50
+ WORKDIR /app
51
+
52
+ RUN apt-get update && \
53
+ apt-get install -y \
54
+ git \
55
+ python3 \
56
+ python3-pip \
57
+ python3-venv && \
58
+ python3 -m venv /opt/venv && \
59
+ . /opt/venv/bin/activate && \
60
+ pip install --upgrade pip setuptools wheel && \
61
+ pip install -r requirements.txt && \
62
+ apt autoremove -y && \
63
+ apt clean -y && \
64
+ rm -rf /tmp/* /var/tmp/* && \
65
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
66
+ find /var/cache -type f -delete
67
+
68
+ ENV PATH="/opt/venv/bin:$PATH"
69
+
70
+ ENTRYPOINT ["/app/tools.sh"]
71
+
72
+ ### Light, CLI only
73
+ FROM base AS light
74
+
75
+ COPY --from=build /app/lib/ /app
76
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
77
+
78
+ WORKDIR /app
79
+
80
+ ENTRYPOINT [ "/app/llama-cli" ]
81
+
82
+ ### Server, Server only
83
+ FROM base AS server
84
+
85
+ ENV LLAMA_ARG_HOST=0.0.0.0
86
+
87
+ COPY --from=build /app/lib/ /app
88
+ COPY --from=build /app/full/llama-server /app
89
+
90
+ WORKDIR /app
91
+
92
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
93
+
94
+ ENTRYPOINT [ "/app/llama-server" ]
95
+
llama.cpp/.devops/llama-cli-cann.Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
2
+
3
+ FROM ascendai/cann:$ASCEND_VERSION AS build
4
+
5
+ WORKDIR /app
6
+
7
+ COPY . .
8
+
9
+ RUN yum install -y gcc g++ cmake make openssl-devel
10
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
11
+ ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
12
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
13
+ ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
14
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
15
+ ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
16
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
17
+ ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
18
+ ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
19
+
20
+ # find libascend_hal.so, because the drive hasn`t been mounted.
21
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
22
+
23
+ RUN echo "Building with static libs" && \
24
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
25
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
26
+ cmake --build build --config Release --target llama-cli && \
27
+ cmake --build build --config Release --target llama-completion
28
+
29
+ # TODO: use image with NNRT
30
+ FROM ascendai/cann:$ASCEND_VERSION AS runtime
31
+ COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
32
+
33
+ ENV LC_ALL=C.utf8
34
+
35
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
36
+ ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
37
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
38
+ ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
39
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
40
+ ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
41
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
42
+ ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
43
+ ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
44
+
45
+ ENTRYPOINT ["/llama-cli" ]
llama.cpp/.devops/llama-cpp-cuda.srpm.spec ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
2
+ # https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
3
+ # Built and maintained by John Boero - boeroboy@gmail.com
4
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
5
+
6
+ # Notes for llama.cpp:
7
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
8
+ # We need to declare standard versioning if people want to sort latest releases.
9
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
10
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
11
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
12
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
13
+ # It is up to the user to install the correct vendor-specific support.
14
+
15
+ Name: llama.cpp-cuda
16
+ Version: %( date "+%%Y%%m%%d" )
17
+ Release: 1%{?dist}
18
+ Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
19
+ License: MIT
20
+ Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
21
+ BuildRequires: coreutils make gcc-c++ git cuda-toolkit
22
+ Requires: cuda-toolkit
23
+ URL: https://github.com/ggml-org/llama.cpp
24
+
25
+ %define debug_package %{nil}
26
+ %define source_date_epoch_from_changelog 0
27
+
28
+ %description
29
+ CPU inference for Meta's Lllama2 models using default options.
30
+
31
+ %prep
32
+ %setup -n llama.cpp-master
33
+
34
+ %build
35
+ make -j GGML_CUDA=1
36
+
37
+ %install
38
+ mkdir -p %{buildroot}%{_bindir}/
39
+ cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
40
+ cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
41
+ cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
42
+ cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
43
+
44
+ mkdir -p %{buildroot}/usr/lib/systemd/system
45
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
46
+ [Unit]
47
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
48
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
49
+
50
+ [Service]
51
+ Type=simple
52
+ EnvironmentFile=/etc/sysconfig/llama
53
+ ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
54
+ ExecReload=/bin/kill -s HUP $MAINPID
55
+ Restart=never
56
+
57
+ [Install]
58
+ WantedBy=default.target
59
+ EOF
60
+
61
+ mkdir -p %{buildroot}/etc/sysconfig
62
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
63
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
64
+ EOF
65
+
66
+ %clean
67
+ rm -rf %{buildroot}
68
+ rm -rf %{_builddir}/*
69
+
70
+ %files
71
+ %{_bindir}/llama-cuda-cli
72
+ %{_bindir}/llama-cuda-completion
73
+ %{_bindir}/llama-cuda-server
74
+ %{_bindir}/llama-cuda-simple
75
+ /usr/lib/systemd/system/llamacuda.service
76
+ %config /etc/sysconfig/llama
77
+
78
+ %pre
79
+
80
+ %post
81
+
82
+ %preun
83
+ %postun
84
+
85
+ %changelog
llama.cpp/.devops/llama-cpp.srpm.spec ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
2
+ # https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
3
+ # Built and maintained by John Boero - boeroboy@gmail.com
4
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
5
+
6
+ # Notes for llama.cpp:
7
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
8
+ # We need to declare standard versioning if people want to sort latest releases.
9
+ # In the meantime, YYYYMMDD format will be used.
10
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
11
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
12
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
13
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
14
+ # It is up to the user to install the correct vendor-specific support.
15
+
16
+ Name: llama.cpp
17
+ Version: %( date "+%%Y%%m%%d" )
18
+ Release: 1%{?dist}
19
+ Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
20
+ License: MIT
21
+ Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
22
+ BuildRequires: coreutils make gcc-c++ git libstdc++-devel
23
+ Requires: libstdc++
24
+ URL: https://github.com/ggml-org/llama.cpp
25
+
26
+ %define debug_package %{nil}
27
+ %define source_date_epoch_from_changelog 0
28
+
29
+ %description
30
+ CPU inference for Meta's Lllama2 models using default options.
31
+ Models are not included in this package and must be downloaded separately.
32
+
33
+ %prep
34
+ %setup -n llama.cpp-master
35
+
36
+ %build
37
+ make -j
38
+
39
+ %install
40
+ mkdir -p %{buildroot}%{_bindir}/
41
+ cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
42
+ cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
43
+ cp -p llama-server %{buildroot}%{_bindir}/llama-server
44
+ cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
45
+
46
+ mkdir -p %{buildroot}/usr/lib/systemd/system
47
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
48
+ [Unit]
49
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
50
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
51
+
52
+ [Service]
53
+ Type=simple
54
+ EnvironmentFile=/etc/sysconfig/llama
55
+ ExecStart=/usr/bin/llama-server $LLAMA_ARGS
56
+ ExecReload=/bin/kill -s HUP $MAINPID
57
+ Restart=never
58
+
59
+ [Install]
60
+ WantedBy=default.target
61
+ EOF
62
+
63
+ mkdir -p %{buildroot}/etc/sysconfig
64
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
65
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
66
+ EOF
67
+
68
+ %clean
69
+ rm -rf %{buildroot}
70
+ rm -rf %{_builddir}/*
71
+
72
+ %files
73
+ %{_bindir}/llama-cli
74
+ %{_bindir}/llama-completion
75
+ %{_bindir}/llama-server
76
+ %{_bindir}/llama-simple
77
+ /usr/lib/systemd/system/llama.service
78
+ %config /etc/sysconfig/llama
79
+
80
+ %pre
81
+
82
+ %post
83
+
84
+ %preun
85
+ %postun
86
+
87
+ %changelog
llama.cpp/.devops/musa.Dockerfile ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=22.04
2
+ # This needs to generally match the container host's environment.
3
+ ARG MUSA_VERSION=rc4.3.0
4
+ # Target the MUSA build image
5
+ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
6
+
7
+ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
8
+
9
+ FROM ${BASE_MUSA_DEV_CONTAINER} AS build
10
+
11
+ # MUSA architecture to build for (defaults to all supported archs)
12
+ ARG MUSA_DOCKER_ARCH=default
13
+
14
+ RUN apt-get update && \
15
+ apt-get install -y \
16
+ build-essential \
17
+ cmake \
18
+ python3 \
19
+ python3-pip \
20
+ git \
21
+ libssl-dev \
22
+ libgomp1
23
+
24
+ WORKDIR /app
25
+
26
+ COPY . .
27
+
28
+ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
29
+ export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
30
+ fi && \
31
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
32
+ cmake --build build --config Release -j$(nproc)
33
+
34
+ RUN mkdir -p /app/lib && \
35
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
36
+
37
+ RUN mkdir -p /app/full \
38
+ && cp build/bin/* /app/full \
39
+ && cp *.py /app/full \
40
+ && cp -r gguf-py /app/full \
41
+ && cp -r requirements /app/full \
42
+ && cp requirements.txt /app/full \
43
+ && cp .devops/tools.sh /app/full/tools.sh
44
+
45
+ ## Base image
46
+ FROM ${BASE_MUSA_RUN_CONTAINER} AS base
47
+
48
+ RUN apt-get update \
49
+ && apt-get install -y libgomp1 curl\
50
+ && apt autoremove -y \
51
+ && apt clean -y \
52
+ && rm -rf /tmp/* /var/tmp/* \
53
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
54
+ && find /var/cache -type f -delete
55
+
56
+ COPY --from=build /app/lib/ /app
57
+
58
+ ### Full
59
+ FROM base AS full
60
+
61
+ COPY --from=build /app/full /app
62
+
63
+ WORKDIR /app
64
+
65
+ RUN apt-get update \
66
+ && apt-get install -y \
67
+ git \
68
+ python3 \
69
+ python3-pip \
70
+ && pip install --upgrade pip setuptools wheel \
71
+ && pip install -r requirements.txt \
72
+ && apt autoremove -y \
73
+ && apt clean -y \
74
+ && rm -rf /tmp/* /var/tmp/* \
75
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
76
+ && find /var/cache -type f -delete
77
+
78
+
79
+ ENTRYPOINT ["/app/tools.sh"]
80
+
81
+ ### Light, CLI only
82
+ FROM base AS light
83
+
84
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
85
+
86
+ WORKDIR /app
87
+
88
+ ENTRYPOINT [ "/app/llama-cli" ]
89
+
90
+ ### Server, Server only
91
+ FROM base AS server
92
+
93
+ ENV LLAMA_ARG_HOST=0.0.0.0
94
+
95
+ COPY --from=build /app/full/llama-server /app
96
+
97
+ WORKDIR /app
98
+
99
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
100
+
101
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/rocm.Dockerfile ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=24.04
2
+
3
+ # This needs to generally match the container host's environment.
4
+ ARG ROCM_VERSION=7.2
5
+ ARG AMDGPU_VERSION=7.2
6
+
7
+ # Target the ROCm build image
8
+ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
9
+
10
+ ### Build image
11
+ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
12
+
13
+ # Unless otherwise specified, we make a fat build.
14
+ # This is mostly tied to rocBLAS supported archs.
15
+ # check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
16
+ # check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
17
+ # check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
18
+
19
+ ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'
20
+
21
+ # Set ROCm architectures
22
+ ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
23
+
24
+ RUN apt-get update \
25
+ && apt-get install -y \
26
+ build-essential \
27
+ cmake \
28
+ git \
29
+ libssl-dev \
30
+ curl \
31
+ libgomp1
32
+
33
+ WORKDIR /app
34
+
35
+ COPY . .
36
+
37
+ RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
38
+ cmake -S . -B build \
39
+ -DGGML_HIP=ON \
40
+ -DGGML_HIP_ROCWMMA_FATTN=ON \
41
+ -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
42
+ -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
43
+ -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
44
+ && cmake --build build --config Release -j$(nproc)
45
+
46
+ RUN mkdir -p /app/lib \
47
+ && find build -name "*.so*" -exec cp -P {} /app/lib \;
48
+
49
+ RUN mkdir -p /app/full \
50
+ && cp build/bin/* /app/full \
51
+ && cp *.py /app/full \
52
+ && cp -r gguf-py /app/full \
53
+ && cp -r requirements /app/full \
54
+ && cp requirements.txt /app/full \
55
+ && cp .devops/tools.sh /app/full/tools.sh
56
+
57
+ ## Base image
58
+ FROM ${BASE_ROCM_DEV_CONTAINER} AS base
59
+
60
+ RUN apt-get update \
61
+ && apt-get install -y libgomp1 curl\
62
+ && apt autoremove -y \
63
+ && apt clean -y \
64
+ && rm -rf /tmp/* /var/tmp/* \
65
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
66
+ && find /var/cache -type f -delete
67
+
68
+ COPY --from=build /app/lib/ /app
69
+
70
+ ### Full
71
+ FROM base AS full
72
+
73
+ COPY --from=build /app/full /app
74
+
75
+ WORKDIR /app
76
+
77
+ RUN apt-get update \
78
+ && apt-get install -y \
79
+ git \
80
+ python3-pip \
81
+ python3 \
82
+ python3-wheel\
83
+ && pip install --break-system-packages --upgrade setuptools \
84
+ && pip install --break-system-packages -r requirements.txt \
85
+ && apt autoremove -y \
86
+ && apt clean -y \
87
+ && rm -rf /tmp/* /var/tmp/* \
88
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
89
+ && find /var/cache -type f -delete
90
+
91
+ ENTRYPOINT ["/app/tools.sh"]
92
+
93
+ ### Light, CLI only
94
+ FROM base AS light
95
+
96
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
97
+
98
+ WORKDIR /app
99
+
100
+ ENTRYPOINT [ "/app/llama-cli" ]
101
+
102
+ ### Server, Server only
103
+ FROM base AS server
104
+
105
+ ENV LLAMA_ARG_HOST=0.0.0.0
106
+
107
+ COPY --from=build /app/full/llama-server /app
108
+
109
+ WORKDIR /app
110
+
111
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
112
+
113
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/s390x.Dockerfile ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG GCC_VERSION=15.2.0
2
+ ARG UBUNTU_VERSION=24.04
3
+
4
+ ### Build Llama.cpp stage
5
+ FROM gcc:${GCC_VERSION} AS build
6
+
7
+ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
8
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
9
+ apt update -y && \
10
+ apt upgrade -y && \
11
+ apt install -y --no-install-recommends \
12
+ git cmake ccache ninja-build \
13
+ # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
14
+ libopenblas-dev libssl-dev && \
15
+ rm -rf /var/lib/apt/lists/*
16
+
17
+ WORKDIR /app
18
+ COPY . .
19
+
20
+ RUN --mount=type=cache,target=/root/.ccache \
21
+ --mount=type=cache,target=/app/build \
22
+ cmake -S . -B build -G Ninja \
23
+ -DCMAKE_BUILD_TYPE=Release \
24
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
25
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
26
+ -DLLAMA_BUILD_TESTS=OFF \
27
+ -DGGML_NATIVE=OFF \
28
+ -DGGML_BACKEND_DL=ON \
29
+ -DGGML_CPU_ALL_VARIANTS=ON \
30
+ -DGGML_BLAS=ON \
31
+ -DGGML_BLAS_VENDOR=OpenBLAS && \
32
+ cmake --build build --config Release -j $(nproc) && \
33
+ cmake --install build --prefix /opt/llama.cpp
34
+
35
+ COPY *.py /opt/llama.cpp/bin
36
+ COPY .devops/tools.sh /opt/llama.cpp/bin
37
+
38
+ COPY gguf-py /opt/llama.cpp/gguf-py
39
+ COPY requirements.txt /opt/llama.cpp/gguf-py
40
+ COPY requirements /opt/llama.cpp/gguf-py/requirements
41
+
42
+
43
+ ### Collect all llama.cpp binaries, libraries and distro libraries
44
+ FROM scratch AS collector
45
+
46
+ # Copy llama.cpp binaries and libraries
47
+ COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
48
+ COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
49
+ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
50
+
51
+
52
+ ### Base image
53
+ FROM ubuntu:${UBUNTU_VERSION} AS base
54
+
55
+ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
56
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
57
+ apt update -y && \
58
+ apt install -y --no-install-recommends \
59
+ # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
60
+ # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
61
+ curl libgomp1 libopenblas-dev && \
62
+ apt autoremove -y && \
63
+ apt clean -y && \
64
+ rm -rf /tmp/* /var/tmp/* && \
65
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
66
+ find /var/cache -type f -delete
67
+
68
+ # Copy llama.cpp libraries
69
+ COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
70
+
71
+
72
+ ### Full
73
+ FROM base AS full
74
+
75
+ ENV PATH="/root/.cargo/bin:${PATH}"
76
+ WORKDIR /app
77
+
78
+ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
79
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
80
+ apt update -y && \
81
+ apt install -y \
82
+ git cmake libjpeg-dev \
83
+ python3 python3-pip python3-dev && \
84
+ apt autoremove -y && \
85
+ apt clean -y && \
86
+ rm -rf /tmp/* /var/tmp/* && \
87
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
88
+ find /var/cache -type f -delete
89
+
90
+ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
91
+
92
+ COPY --from=collector /llama.cpp/bin /app
93
+ COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
94
+
95
+ RUN pip install --no-cache-dir --break-system-packages \
96
+ -r /app/gguf-py/requirements.txt
97
+
98
+ ENTRYPOINT [ "/app/tools.sh" ]
99
+
100
+
101
+ ### CLI Only
102
+ FROM base AS light
103
+
104
+ WORKDIR /llama.cpp/bin
105
+
106
+ # Copy llama.cpp binaries and libraries
107
+ COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
108
+ COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
109
+
110
+ ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
111
+
112
+
113
+ ### Server
114
+ FROM base AS server
115
+
116
+ ENV LLAMA_ARG_HOST=0.0.0.0
117
+
118
+ WORKDIR /llama.cpp/bin
119
+
120
+ # Copy llama.cpp binaries and libraries
121
+ COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
122
+ COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
123
+
124
+ EXPOSE 8080
125
+
126
+ ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
llama.cpp/.devops/tools.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -e
3
+
4
+ # Read the first argument into a variable
5
+ arg1="$1"
6
+
7
+ # Shift the arguments to remove the first one
8
+ shift
9
+
10
+ if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
11
+ exec python3 ./convert_hf_to_gguf.py "$@"
12
+ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
13
+ exec ./llama-quantize "$@"
14
+ elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
15
+ exec ./llama-cli "$@"
16
+ elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
17
+ exec ./llama-completion "$@"
18
+ elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
19
+ exec ./llama-bench "$@"
20
+ elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
21
+ exec ./llama-perplexity "$@"
22
+ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
23
+ echo "Converting PTH to GGML..."
24
+ for i in $(ls $1/$2/ggml-model-f16.bin*); do
25
+ if [ -f "${i/f16/q4_0}" ]; then
26
+ echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
27
+ else
28
+ echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
29
+ exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
30
+ fi
31
+ done
32
+ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
33
+ exec ./llama-server "$@"
34
+ else
35
+ echo "Unknown command: $arg1"
36
+ echo "Available commands: "
37
+ echo " --run (-r): Run a model (chat) previously converted into ggml"
38
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin"
39
+ echo " --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
40
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
41
+ echo " --bench (-b): Benchmark the performance of the inference for various parameters."
42
+ echo " ex: -m model.gguf"
43
+ echo " --perplexity (-p): Measure the perplexity of a model over a given text."
44
+ echo " ex: -m model.gguf -f file.txt"
45
+ echo " --convert (-c): Convert a llama model into ggml"
46
+ echo " ex: --outtype f16 \"/models/7B/\" "
47
+ echo " --quantize (-q): Optimize with quantization process ggml"
48
+ echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
49
+ echo " --all-in-one (-a): Execute --convert & --quantize"
50
+ echo " ex: \"/models/\" 7B"
51
+ echo " --server (-s): Run a model on the server"
52
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
53
+ fi
llama.cpp/.devops/vulkan.Dockerfile ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=26.04
2
+
3
+ FROM ubuntu:$UBUNTU_VERSION AS build
4
+
5
+ # Install build tools
6
+ RUN apt update && apt install -y git build-essential cmake wget xz-utils
7
+
8
+ # Install SSL and Vulkan SDK dependencies
9
+ RUN apt install -y libssl-dev curl \
10
+ libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
11
+
12
+ # Build it
13
+ WORKDIR /app
14
+
15
+ COPY . .
16
+
17
+ RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
18
+ cmake --build build --config Release -j$(nproc)
19
+
20
+ RUN mkdir -p /app/lib && \
21
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
22
+
23
+ RUN mkdir -p /app/full \
24
+ && cp build/bin/* /app/full \
25
+ && cp *.py /app/full \
26
+ && cp -r gguf-py /app/full \
27
+ && cp -r requirements /app/full \
28
+ && cp requirements.txt /app/full \
29
+ && cp .devops/tools.sh /app/full/tools.sh
30
+
31
+ ## Base image
32
+ FROM ubuntu:$UBUNTU_VERSION AS base
33
+
34
+ RUN apt-get update \
35
+ && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
36
+ libglvnd0 libgl1 libglx0 libegl1 libgles2 \
37
+ && apt autoremove -y \
38
+ && apt clean -y \
39
+ && rm -rf /tmp/* /var/tmp/* \
40
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
41
+ && find /var/cache -type f -delete
42
+
43
+ COPY --from=build /app/lib/ /app
44
+
45
+ ### Full
46
+ FROM base AS full
47
+
48
+ COPY --from=build /app/full /app
49
+
50
+ WORKDIR /app
51
+
52
+ RUN apt-get update \
53
+ && apt-get install -y \
54
+ build-essential \
55
+ git \
56
+ python3 \
57
+ python3-dev \
58
+ python3-pip \
59
+ python3-wheel \
60
+ && pip install --break-system-packages --upgrade setuptools \
61
+ && pip install --break-system-packages -r requirements.txt \
62
+ && apt autoremove -y \
63
+ && apt clean -y \
64
+ && rm -rf /tmp/* /var/tmp/* \
65
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
66
+ && find /var/cache -type f -delete
67
+
68
+ ENTRYPOINT ["/app/tools.sh"]
69
+
70
+ ### Light, CLI only
71
+ FROM base AS light
72
+
73
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
74
+
75
+ WORKDIR /app
76
+
77
+ ENTRYPOINT [ "/app/llama-cli" ]
78
+
79
+ ### Server, Server only
80
+ FROM base AS server
81
+
82
+ ENV LLAMA_ARG_HOST=0.0.0.0
83
+
84
+ COPY --from=build /app/full/llama-server /app
85
+
86
+ WORKDIR /app
87
+
88
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
89
+
90
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.gemini/settings.json ADDED
@@ -0,0 +1 @@
 
 
1
+ { "contextFileName": "AGENTS.md" }
llama.cpp/.github/labeler.yml ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/actions/labeler
2
+ Apple Metal:
3
+ - changed-files:
4
+ - any-glob-to-any-file:
5
+ - ggml/include/ggml-metal.h
6
+ - ggml/src/ggml-metal/**
7
+ - README-metal.md
8
+ SYCL:
9
+ - changed-files:
10
+ - any-glob-to-any-file:
11
+ - ggml/include/ggml-sycl.h
12
+ - ggml/src/ggml-sycl/**
13
+ - docs/backend/SYCL.md
14
+ - examples/sycl/**
15
+ Nvidia GPU:
16
+ - changed-files:
17
+ - any-glob-to-any-file:
18
+ - ggml/include/ggml-cuda.h
19
+ - ggml/src/ggml-cuda/**
20
+ Vulkan:
21
+ - changed-files:
22
+ - any-glob-to-any-file:
23
+ - ggml/include/ggml-vulkan.h
24
+ - ggml/src/ggml-vulkan/**
25
+ IBM zDNN:
26
+ - changed-files:
27
+ - any-glob-to-any-file:
28
+ - ggml/include/ggml-zdnn.h
29
+ - ggml/src/ggml-zdnn/**
30
+ documentation:
31
+ - changed-files:
32
+ - any-glob-to-any-file:
33
+ - docs/**
34
+ - media/**
35
+ testing:
36
+ - changed-files:
37
+ - any-glob-to-any-file:
38
+ - tests/**
39
+ build:
40
+ - changed-files:
41
+ - any-glob-to-any-file:
42
+ - cmake/**
43
+ - CMakeLists.txt
44
+ - CMakePresets.json
45
+ examples:
46
+ - changed-files:
47
+ - any-glob-to-any-file:
48
+ - examples/**
49
+ - tools/**
50
+ devops:
51
+ - changed-files:
52
+ - any-glob-to-any-file:
53
+ - .devops/**
54
+ - .github/**
55
+ - ci/**
56
+ python:
57
+ - changed-files:
58
+ - any-glob-to-any-file:
59
+ - "**/*.py"
60
+ - requirements/**
61
+ - gguf-py/**
62
+ - .flake8
63
+ script:
64
+ - changed-files:
65
+ - any-glob-to-any-file:
66
+ - scripts/**
67
+ android:
68
+ - changed-files:
69
+ - any-glob-to-any-file:
70
+ - examples/llama.android/**
71
+ server:
72
+ - changed-files:
73
+ - any-glob-to-any-file:
74
+ - tools/server/**
75
+ ggml:
76
+ - changed-files:
77
+ - any-glob-to-any-file:
78
+ - ggml/**
79
+ model:
80
+ - changed-files:
81
+ - any-glob-to-any-file:
82
+ - src/models/**
83
+ nix:
84
+ - changed-files:
85
+ - any-glob-to-any-file:
86
+ - "**/*.nix"
87
+ - .github/workflows/nix-*.yml
88
+ - .devops/nix/nixpkgs-instances.nix
89
+ embedding:
90
+ - changed-files:
91
+ - any-glob-to-any-file: examples/embedding/
92
+ jinja parser:
93
+ - changed-files:
94
+ - any-glob-to-any-file:
95
+ - common/jinja/**
96
+ Ascend NPU:
97
+ - changed-files:
98
+ - any-glob-to-any-file:
99
+ - ggml/include/ggml-cann.h
100
+ - ggml/src/ggml-cann/**
101
+ - docs/backend/CANN.md
102
+ OpenCL:
103
+ - changed-files:
104
+ - any-glob-to-any-file:
105
+ - ggml/include/ggml-opencl.h
106
+ - ggml/src/ggml-opencl/**
llama.cpp/.github/pull_request_template.md ADDED
@@ -0,0 +1 @@
 
 
1
+ *Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
llama.cpp/build/CMakeCache.txt ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the CMakeCache file.
2
+ # For build in directory: r:/Quillan/Quillan-v4.2-model/llama.cpp/build
3
+ # It was generated by CMake: C:/Program Files/CMake/bin/cmake.exe
4
+ # You can edit this file to change values found and used by cmake.
5
+ # If you do not want to change any of the values, simply exit the editor.
6
+ # If you do want to change a value, simply edit, save, and exit the editor.
7
+ # The syntax for the file is as follows:
8
+ # KEY:TYPE=VALUE
9
+ # KEY is the name of a variable in the cache.
10
+ # TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
11
+ # VALUE is the current value for the KEY.
12
+
13
+ ########################
14
+ # EXTERNAL cache entries
15
+ ########################
16
+
17
+ //Value Computed by CMake.
18
+ CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=R:/Quillan/Quillan-v4.2-model/llama.cpp/build/CMakeFiles/pkgRedirects
19
+
20
+ //Program used to build from makefiles.
21
+ CMAKE_MAKE_PROGRAM:STRING=nmake
22
+
23
+ //Value Computed by CMake
24
+ CMAKE_PROJECT_COMPAT_VERSION:STATIC=
25
+
26
+ //Value Computed by CMake
27
+ CMAKE_PROJECT_DESCRIPTION:STATIC=
28
+
29
+ //Value Computed by CMake
30
+ CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
31
+
32
+ //Value Computed by CMake
33
+ CMAKE_PROJECT_NAME:STATIC=llama.cpp
34
+
35
+ //Value Computed by CMake
36
+ CMAKE_PROJECT_SPDX_LICENSE:STATIC=
37
+
38
+ //Value Computed by CMake
39
+ llama.cpp_BINARY_DIR:STATIC=R:/Quillan/Quillan-v4.2-model/llama.cpp/build
40
+
41
+ //Value Computed by CMake
42
+ llama.cpp_IS_TOP_LEVEL:STATIC=ON
43
+
44
+ //Value Computed by CMake
45
+ llama.cpp_SOURCE_DIR:STATIC=R:/Quillan/Quillan-v4.2-model/llama.cpp
46
+
47
+
48
+ ########################
49
+ # INTERNAL cache entries
50
+ ########################
51
+
52
+ //This is the directory where this CMakeCache.txt was created
53
+ CMAKE_CACHEFILE_DIR:INTERNAL=r:/Quillan/Quillan-v4.2-model/llama.cpp/build
54
+ //Major version of cmake used to create the current loaded cache
55
+ CMAKE_CACHE_MAJOR_VERSION:INTERNAL=4
56
+ //Minor version of cmake used to create the current loaded cache
57
+ CMAKE_CACHE_MINOR_VERSION:INTERNAL=2
58
+ //Patch version of cmake used to create the current loaded cache
59
+ CMAKE_CACHE_PATCH_VERSION:INTERNAL=3
60
+ //Path to CMake executable.
61
+ CMAKE_COMMAND:INTERNAL=C:/Program Files/CMake/bin/cmake.exe
62
+ //Path to cpack program executable.
63
+ CMAKE_CPACK_COMMAND:INTERNAL=C:/Program Files/CMake/bin/cpack.exe
64
+ //Path to ctest program executable.
65
+ CMAKE_CTEST_COMMAND:INTERNAL=C:/Program Files/CMake/bin/ctest.exe
66
+ //Path to cache edit program executable.
67
+ CMAKE_EDIT_COMMAND:INTERNAL=C:/Program Files/CMake/bin/cmake-gui.exe
68
+ //Name of external makefile project generator.
69
+ CMAKE_EXTRA_GENERATOR:INTERNAL=
70
+ //Name of generator.
71
+ CMAKE_GENERATOR:INTERNAL=NMake Makefiles
72
+ //Generator instance identifier.
73
+ CMAKE_GENERATOR_INSTANCE:INTERNAL=
74
+ //Name of generator platform.
75
+ CMAKE_GENERATOR_PLATFORM:INTERNAL=
76
+ //Name of generator toolset.
77
+ CMAKE_GENERATOR_TOOLSET:INTERNAL=
78
+ //Source directory with the top level CMakeLists.txt file for this
79
+ // project
80
+ CMAKE_HOME_DIRECTORY:INTERNAL=R:/Quillan/Quillan-v4.2-model/llama.cpp
81
+ //Name of CMakeLists files to read
82
+ CMAKE_LIST_FILE_NAME:INTERNAL=CMakeLists.txt
83
+ //ADVANCED property for variable: CMAKE_MAKE_PROGRAM
84
+ CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
85
+ //number of local generators
86
+ CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
87
+ //Platform information initialized
88
+ CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
89
+ //Path to CMake installation.
90
+ CMAKE_ROOT:INTERNAL=C:/Program Files/CMake/share/cmake-4.2
91
+
llama.cpp/ci/README-MUSA.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Running MUSA CI in a Docker Container
2
+
3
+ Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
4
+
5
+ ### 1. Create a local directory to store cached models, configuration files and venv:
6
+
7
+ ```bash
8
+ mkdir -p $HOME/llama.cpp/ci-cache
9
+ ```
10
+
11
+ ### 2. Create a local directory to store CI run results:
12
+
13
+ ```bash
14
+ mkdir -p $HOME/llama.cpp/ci-results
15
+ ```
16
+
17
+ ### 3. Start a Docker container and run the CI:
18
+
19
+ ```bash
20
+ docker run --privileged -it \
21
+ -v $HOME/llama.cpp/ci-cache:/ci-cache \
22
+ -v $HOME/llama.cpp/ci-results:/ci-results \
23
+ -v $PWD:/ws -w /ws \
24
+ mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
25
+ ```
26
+
27
+ Inside the container, execute the following commands:
28
+
29
+ ```bash
30
+ apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
31
+ git config --global --add safe.directory /ws
32
+ GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
33
+ ```
34
+
35
+ This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
llama.cpp/ci/README.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CI
2
+
3
+ This CI implements heavy-duty workflows that run on self-hosted runners. Typically the purpose of these workflows is to
4
+ cover hardware configurations that are not available from Github-hosted runners and/or require more computational
5
+ resource than normally available.
6
+
7
+ It is a good practice, before publishing changes to execute the full CI locally on your machine. For example:
8
+
9
+ ```bash
10
+ mkdir tmp
11
+
12
+ # CPU-only build
13
+ bash ./ci/run.sh ./tmp/results ./tmp/mnt
14
+
15
+ # with CUDA support
16
+ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
17
+
18
+ # with SYCL support
19
+ source /opt/intel/oneapi/setvars.sh
20
+ GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
21
+
22
+ # with MUSA support
23
+ GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
24
+
25
+ # etc.
26
+ ```
27
+
28
+ # Adding self-hosted runners
29
+
30
+ - Add a self-hosted `ggml-ci` workflow to [[.github/workflows/build.yml]] with an appropriate label
31
+ - Request a runner token from `ggml-org` (for example, via a comment in the PR or email)
32
+ - Set-up a machine using the received token ([docs](https://docs.github.com/en/actions/how-tos/manage-runners/self-hosted-runners/add-runners))
33
+ - Optionally update [ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) to build and run on the target platform by gating the implementation with a `GG_BUILD_...` env
llama.cpp/ci/run.sh ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # sample usage:
4
+ #
5
+ # mkdir tmp
6
+ #
7
+ # # CPU-only build
8
+ # bash ./ci/run.sh ./tmp/results ./tmp/mnt
9
+ #
10
+ # # with CUDA support
11
+ # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
12
+ #
13
+ # # with SYCL support
14
+ # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
15
+ #
16
+ # # with VULKAN support
17
+ # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
18
+ #
19
+ # # with WebGPU support
20
+ # GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
21
+ #
22
+ # # with MUSA support
23
+ # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
24
+ #
25
+ # # with KLEIDIAI support
26
+ # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
27
+ #
28
+
29
+ if [ -z "$2" ]; then
30
+ echo "usage: $0 <output-dir> <mnt-dir>"
31
+ exit 1
32
+ fi
33
+
34
+ mkdir -p "$1"
35
+ mkdir -p "$2"
36
+
37
+ OUT=$(realpath "$1")
38
+ MNT=$(realpath "$2")
39
+
40
+ rm -f $OUT/*.log
41
+ rm -f $OUT/*.exit
42
+ rm -f $OUT/*.md
43
+
44
+ sd=`dirname $0`
45
+ cd $sd/../
46
+ SRC=`pwd`
47
+
48
+ CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
49
+
50
+ if [ ! -z ${GG_BUILD_METAL} ]; then
51
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
52
+ fi
53
+
54
+ if [ ! -z ${GG_BUILD_CUDA} ]; then
55
+ # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
56
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DGGML_CUDA_CUB_3DOT2=ON"
57
+
58
+ if command -v nvidia-smi >/dev/null 2>&1; then
59
+ CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
60
+ if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
61
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
62
+ else
63
+ echo "Warning: Using fallback CUDA architectures"
64
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
65
+ fi
66
+ else
67
+ echo "Error: nvidia-smi not found, cannot build with CUDA"
68
+ exit 1
69
+ fi
70
+ fi
71
+
72
+ if [ ! -z ${GG_BUILD_ROCM} ]; then
73
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
74
+ if [ -z ${GG_BUILD_AMDGPU_TARGETS} ]; then
75
+ echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
76
+ exit 1
77
+ fi
78
+
79
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
80
+ fi
81
+
82
+ if [ ! -z ${GG_BUILD_SYCL} ]; then
83
+ if [ -z ${ONEAPI_ROOT} ]; then
84
+ echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
85
+ echo "source /opt/intel/oneapi/setvars.sh"
86
+ exit 1
87
+ fi
88
+ # Use only main GPU
89
+ export ONEAPI_DEVICE_SELECTOR="level_zero:0"
90
+ # Enable sysman for correct memory reporting
91
+ export ZES_ENABLE_SYSMAN=1
92
+ # to circumvent precision issues on CPY operations
93
+ export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
94
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
95
+ fi
96
+
97
+ if [ ! -z ${GG_BUILD_VULKAN} ]; then
98
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
99
+
100
+ # if on Mac, disable METAL
101
+ if [[ "$OSTYPE" == "darwin"* ]]; then
102
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
103
+ fi
104
+
105
+ fi
106
+
107
+ if [ ! -z ${GG_BUILD_WEBGPU} ]; then
108
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
109
+
110
+ if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
111
+ if [ -z "${CMAKE_PREFIX_PATH}" ]; then
112
+ export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
113
+ else
114
+ export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
115
+ fi
116
+ fi
117
+
118
+ # For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
119
+ if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
120
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
121
+ fi
122
+ fi
123
+
124
+ if [ ! -z ${GG_BUILD_MUSA} ]; then
125
+ # Use qy1 by default (MTT S80)
126
+ MUSA_ARCH=${MUSA_ARCH:-21}
127
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
128
+ fi
129
+
130
+ if [ ! -z ${GG_BUILD_NO_SVE} ]; then
131
+ # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
132
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
133
+ fi
134
+
135
+ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
136
+ echo ">>===== Enabling KleidiAI support"
137
+
138
+ CANDIDATES=(
139
+ "armv9-a+dotprod+i8mm+sve2"
140
+ "armv9-a+dotprod+i8mm"
141
+ "armv8.6-a+dotprod+i8mm"
142
+ "armv8.2-a+dotprod"
143
+ )
144
+ CPU=""
145
+
146
+ for cpu in "${CANDIDATES[@]}"; do
147
+ if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
148
+ CPU="$cpu"
149
+ break
150
+ fi
151
+ done
152
+
153
+ if [ -z "$CPU" ]; then
154
+ echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
155
+ exit 1
156
+ fi
157
+
158
+ echo ">>===== Using ARM baseline: ${CPU}"
159
+
160
+ CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
161
+ -DGGML_NATIVE=OFF \
162
+ -DGGML_CPU_KLEIDIAI=ON \
163
+ -DGGML_CPU_AARCH64=ON \
164
+ -DGGML_CPU_ARM_ARCH=${CPU} \
165
+ -DBUILD_SHARED_LIBS=OFF"
166
+ fi
167
+
168
+ ## helpers
169
+
170
+ # download a file if it does not exist or if it is outdated
171
+ function gg_wget {
172
+ local out=$1
173
+ local url=$2
174
+
175
+ local cwd=`pwd`
176
+
177
+ mkdir -p $out
178
+ cd $out
179
+
180
+ # should not re-download if file is the same
181
+ wget -nv -c -N $url
182
+
183
+ cd $cwd
184
+ }
185
+
186
+ function gg_printf {
187
+ printf -- "$@" >> $OUT/README.md
188
+ }
189
+
190
+ function gg_run {
191
+ ci=$1
192
+
193
+ set -o pipefail
194
+ set -x
195
+
196
+ gg_run_$ci | tee $OUT/$ci.log
197
+ cur=$?
198
+ echo "$cur" > $OUT/$ci.exit
199
+
200
+ set +x
201
+ set +o pipefail
202
+
203
+ gg_sum_$ci
204
+
205
+ ret=$((ret | cur))
206
+ }
207
+
208
+ ## ci
209
+
210
+ # ctest_debug
211
+
212
+ function gg_run_ctest_debug {
213
+ cd ${SRC}
214
+
215
+ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
216
+
217
+ set -e
218
+
219
+ # Check cmake, make and ctest are installed
220
+ gg_check_build_requirements
221
+
222
+ (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
223
+ (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
224
+
225
+ (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
226
+
227
+ set +e
228
+ }
229
+
230
# Report section for the ctest_debug stage: stage status followed by the
# full ctest log in a fenced code block.
function gg_sum_ctest_debug {
    gg_printf '### %s\n\nRuns ctest in debug mode\n- status: %s\n```\n%s\n```\n\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-ctest.log)"
}
240
+
241
+ # ctest_release
242
+
243
# ctest_release stage: configure + build in Release and run ctest.
# On full-performance runners the python-labelled tests run too; low-perf
# runners skip them and the slow test-opt test.
function gg_run_ctest_release {
    cd ${SRC}

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e                      # abort the stage on the first failing command

    # Check cmake, make and ctest are installed
    gg_check_build_requirements

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
}
264
+
265
# Report section for the ctest_release stage: stage status followed by the
# full ctest log in a fenced code block.
function gg_sum_ctest_release {
    gg_printf '### %s\n\nRuns ctest in release mode\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-ctest.log)"
}
274
+
275
+ # test_scripts
276
+
277
# test_scripts stage: run the shell test suites of the gguf-split and
# quantize tools against the release build and the shared model cache.
function gg_run_test_scripts {
    cd ${SRC}

    set -e                      # abort the stage on the first failing command

    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
    (cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
}
287
+
288
# Report section for the test_scripts stage: stage status followed by the
# combined script log in a fenced code block.
function gg_sum_test_scripts {
    gg_printf '### %s\n\nRuns test scripts\n- status: %s\n```\n%s\n```\n\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-scripts.log)"
}
298
+
299
# Print (without trailing newline) the path of the cached CI model file;
# abort the script if it has not been produced by an earlier stage.
function gg_get_model {
    #local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"

    # guard: the model must exist and be non-empty
    if [[ ! -s $gguf_0 ]]; then
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
    fi

    echo -n "$gguf_0"
}
309
+
310
# ctest_with_model (debug) stage: run the model-labelled ctest tests against
# the debug build, pointing them at the cached model via LLAMACPP_TEST_MODELFILE.
function gg_run_ctest_with_model_debug {
    cd ${SRC}

    local model; model=$(gg_get_model)   # exits the script if no model is cached
    cd build-ci-debug
    set -e

    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
    cd ..
}
322
+
323
# ctest_with_model (release) stage: run the model-labelled ctest tests against
# the release build, pointing them at the cached model via LLAMACPP_TEST_MODELFILE.
function gg_run_ctest_with_model_release {
    cd ${SRC}

    local model; model=$(gg_get_model)   # exits the script if no model is cached
    cd build-ci-release
    set -e

    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    # test memory leaks
    #if [[ ! -z ${GG_BUILD_METAL} ]]; then
    #    # TODO: this hangs for some reason ...
    #    (time leaks -quiet -atExit -- ./bin/test-thread-safety -m $model --parallel 2 -t 2 -p "hello") 2>&1 | tee -a $OUT/${ci}-leaks.log
    #fi

    set +e
    cd ..
}
341
+
342
# Report section for the model-based debug ctest stage.
function gg_sum_ctest_with_model_debug {
    gg_printf '### %s\n\nRuns ctest with model files in debug mode\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-ctest.log)"
}
351
+
352
# Report section for the model-based release ctest stage.
function gg_sum_ctest_with_model_release {
    gg_printf '### %s\n\nRuns ctest with model files in release mode\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-ctest.log)"
}
361
+
362
+ # qwen3_0_6b
363
+
364
# qwen3_0_6b stage: end-to-end smoke test of the full pipeline on
# Qwen3-0.6B-Base — download, HF->GGUF conversion, quantization to every
# legacy/k-quant format, generation, perplexity on wikitext-2, imatrix
# computation, and the save/load-state test. Perplexity results are gated
# by check_ppl (ppl must stay below 20.0).
function gg_run_qwen3_0_6b {
    cd ${SRC}

    # fetch tokenizer/config/weights from Hugging Face (cached by gg_wget)
    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/config.json
    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer.json
    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer_config.json
    #gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/special_tokens_map.json
    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/resolve/main/model.safetensors


    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    # paths are relative to the build directory entered below
    path_models="../models-mnt/qwen3/0.6B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    # convert the HF checkpoint to GGUF in both f16 and bf16
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_bf16="${path_models}/ggml-model-bf16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    # quantize the bf16 model into each supported format
    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    # short deterministic (-s 1234) text generation for every format
    (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
    (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    # perplexity on two wikitext-2 chunks for every format
    # (bf16 is skipped when the backend does not support it)
    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    if [ -z ${GG_BUILD_NO_BF16} ]; then
        (time ./bin/llama-perplexity --model ${model_bf16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
    fi
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    # save/load-state with flash attention on/off, partial and full offload
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    # check_ppl NAME LOG_EXCERPT
    # Extract the last decimal number from LOG_EXCERPT (the chunk perplexity)
    # and fail (return 20) if it exceeds the 20.0 sanity threshold.
    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    if [ -z ${GG_BUILD_NO_BF16} ]; then
        check_ppl "bf16" "$(cat $OUT/${ci}-tg-bf16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    fi
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    # keep only the final imatrix summary line for the report
    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}
485
+
486
# Report section for the qwen3_0_6b stage: perplexity table, imatrix summary,
# and the generation log of every precision that was exercised.
function gg_sum_qwen3_0_6b {
    gg_printf '### %s\n\nQwen3 0.6B:\n- status: %s\n' "${ci}" "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    if [ -z ${GG_BUILD_NO_BF16} ]; then
        gg_printf '- bf16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-bf16.log)"
    fi
    # one fenced log section per quantization format
    local qnt
    for qnt in q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
        gg_printf '- %s:\n```\n%s\n```\n' "$qnt" "$(cat $OUT/${ci}-tg-$qnt.log)"
    done
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
509
+
510
+ # bge-small
511
+
512
# embd_bge_small stage: download BAAI/bge-small-en-v1.5 (a BERT embedding
# model), convert it to GGUF, quantize to q8_0, and run llama-embedding on
# both variants as a smoke test of the embedding path.
function gg_run_embd_bge_small {
    cd ${SRC}

    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json

    # pooling configuration lives in a subdirectory of the HF repo
    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json

    path_models="../models-mnt/bge-small"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    # -c 0 lets the context size be taken from the model
    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
}
550
+
551
# Report section for the embd_bge_small stage: status plus the f16 and q8_0
# embedding logs in fenced code blocks.
function gg_sum_embd_bge_small {
    gg_printf '### %s\n\nBGE Small (BERT):\n- status: %s\n- f16: \n```\n%s\n```\n- q8_0:\n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-tg-f16.log)" "$(cat $OUT/${ci}-tg-q8_0.log)"
}
559
+
560
+ # rerank_tiny
561
+
562
# rerank_tiny stage: download jina-reranker-v1-tiny-en, convert to GGUF, and
# run llama-embedding in reranking mode (--pooling rank) on one query with
# three candidate documents; the scores are then range-checked.
function gg_run_rerank_tiny {
    cd ${SRC}

    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.json

    path_models="../models-mnt/rerank-tiny"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"

    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    # for this model, the SEP token is "</s>"
    # query\tdocument pairs, one candidate per line
    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

    # sample output
    # rerank score 0:    0.029
    # rerank score 1:    0.029
    # rerank score 2:    0.135

    # check that the score is in the range [$3, $4]
    function check_score {
        qnt="$1"
        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$score"
        return 0
    }

    # the two irrelevant documents must score low, the relevant one higher
    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log

    set +e
}
615
+
616
# Report section for the rerank_tiny stage: status plus the rerank log.
function gg_sum_rerank_tiny {
    gg_printf '### %s\n\nRerank Tiny (Jina):\n- status: %s\n- f16: \n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-rk-f16.log)"
}
623
+
624
# Note in the report any missing build prerequisite (cmake, make, ctest).
function gg_check_build_requirements {
    local tool
    for tool in cmake make ctest; do
        if ! command -v "$tool" &> /dev/null; then
            gg_printf "$tool not found, please install"
        fi
    done
}
637
+
638
# test_backend_ops_cpu stage: run the ggml operator test suite against the
# CPU backend, using the release build produced by an earlier stage.
function gg_run_test_backend_ops_cpu {
    cd ${SRC}

    cd build-ci-release

    set -e

    (time ./bin/test-backend-ops -b CPU ) 2>&1 | tee -a $OUT/${ci}-test-backend-ops-cpu.log

    set +e
}
649
+
650
# Report section for the test_backend_ops_cpu stage.
function gg_sum_test_backend_ops_cpu {
    gg_printf '### %s\n\nRuns test-backend-ops for CPU backend\n- status: %s\n```\n%s\n```\n\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-test-backend-ops-cpu.log)"
}
660
+
661
## main
# Stage sequencing. Behavior is controlled by environment variables:
#   GG_BUILD_LOW_PERF     - skip model downloads and model-based stages
#   GG_BUILD_HIGH_PERF    - additionally run the backend-ops suite
#   GG_BUILD_CLOUD / GG_BUILD_EXTRA_TESTS_0 - gate the tool test scripts
# Each "test $ret -eq 0 &&" guard stops scheduling further stages once one
# has failed; $ret is maintained by gg_run.

export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a fresh python3 venv and enter it
    if ! python3 -m venv "$MNT/venv"; then
        echo "Error: Failed to create Python virtual environment at $MNT/venv."
        exit 1
    fi
    source "$MNT/venv/bin/activate"

    # conversion scripts need the repo's python deps and the gguf package
    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
    pip install --editable gguf-py --disable-pip-version-check
fi

ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
    test $ret -eq 0 && gg_run test_backend_ops_cpu
fi

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
        test $ret -eq 0 && gg_run test_scripts
    fi

    test $ret -eq 0 && gg_run qwen3_0_6b

    test $ret -eq 0 && gg_run ctest_with_model_debug
    test $ret -eq 0 && gg_run ctest_with_model_release
fi

# print the accumulated report and propagate the overall status
cat $OUT/README.md

exit $ret
llama.cpp/cmake/arm64-apple-clang.cmake ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CMake toolchain file: cross-compile for arm64 macOS with clang/clang++.
set( CMAKE_SYSTEM_NAME Darwin )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

# target triple passed to the compilers via --target
set( target arm64-apple-darwin-macho )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

# armv8.4-a baseline; fast FP model, but -fno-finite-math-only keeps
# inf/nan semantics intact
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )

# *_INIT variables seed the flags at the first configure, so user-supplied
# CMAKE_C_FLAGS/CMAKE_CXX_FLAGS still apply on top
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
llama.cpp/cmake/arm64-windows-llvm.cmake ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CMake toolchain file: cross-compile for arm64 Windows with LLVM clang.
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

# target triple passed to the compilers via --target
set( target arm64-pc-windows-msvc )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

# armv8.7-a baseline; fast FP model, but -fno-finite-math-only keeps
# inf/nan semantics intact
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

# *_INIT variables seed the flags at the first configure, so user-supplied
# CMAKE_C_FLAGS/CMAKE_CXX_FLAGS still apply on top
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
llama.cpp/cmake/build-info.cmake ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Derive build metadata (commit hash, commit count, compiler, target) for
# embedding into the binaries. All values fall back to safe defaults when
# git or the repository is unavailable.
set(BUILD_NUMBER 0)
set(BUILD_COMMIT "unknown")
set(BUILD_COMPILER "unknown")
set(BUILD_TARGET "unknown")

# Look for git
find_package(Git)
if(NOT Git_FOUND)
    # fall back to a plain PATH search when the CMake Git module finds nothing
    find_program(GIT_EXECUTABLE NAMES git git.exe)
    if(GIT_EXECUTABLE)
        set(Git_FOUND TRUE)
        message(STATUS "Found Git: ${GIT_EXECUTABLE}")
    else()
        message(WARNING "Git not found. Build info will not be accurate.")
    endif()
endif()

# Get the commit count and hash
if(Git_FOUND)
    execute_process(
        COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE HEAD
        OUTPUT_STRIP_TRAILING_WHITESPACE
        RESULT_VARIABLE RES
    )
    if (RES EQUAL 0)
        set(BUILD_COMMIT ${HEAD})
    endif()
    # commit count serves as a monotonically increasing build number
    execute_process(
        COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE COUNT
        OUTPUT_STRIP_TRAILING_WHITESPACE
        RESULT_VARIABLE RES
    )
    if (RES EQUAL 0)
        set(BUILD_NUMBER ${COUNT})
    endif()
endif()

set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")

# Visual Studio generators expose the platform name; otherwise compose one
# from the system name and processor
if(CMAKE_VS_PLATFORM_NAME)
    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
    set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
endif()
llama.cpp/cmake/common.cmake ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
include("ggml/cmake/common.cmake")

# llama_add_compile_flags
# Apply the project-wide warning and sanitizer settings to the current
# directory scope, honoring LLAMA_FATAL_WARNINGS, LLAMA_ALL_WARNINGS and
# the LLAMA_SANITIZE_* options.
function(llama_add_compile_flags)
    if (LLAMA_FATAL_WARNINGS)
        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
            list(APPEND C_FLAGS -Werror)
            list(APPEND CXX_FLAGS -Werror)
        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
            add_compile_options(/WX)
        endif()
    endif()

    if (LLAMA_ALL_WARNINGS)
        if (NOT MSVC)
            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
                                -Werror=implicit-int -Werror=implicit-function-declaration)

            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)

            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)

            list(APPEND C_FLAGS   ${WARNING_FLAGS})
            list(APPEND CXX_FLAGS ${WARNING_FLAGS})

            # populates GF_C_FLAGS / GF_CXX_FLAGS with compiler-specific flags
            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})

            # apply per-language via generator expressions
            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
        else()
            # todo : msvc
            set(C_FLAGS   "" PARENT_SCOPE)
            set(CXX_FLAGS "" PARENT_SCOPE)
        endif()
    endif()

    if (NOT MSVC)
        # sanitizer flags must be passed to both compile and link steps
        if (LLAMA_SANITIZE_THREAD)
            message(STATUS "Using -fsanitize=thread")

            add_compile_options(-fsanitize=thread)
            link_libraries     (-fsanitize=thread)
        endif()

        if (LLAMA_SANITIZE_ADDRESS)
            message(STATUS "Using -fsanitize=address")

            add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
            link_libraries     (-fsanitize=address)
        endif()

        if (LLAMA_SANITIZE_UNDEFINED)
            message(STATUS "Using -fsanitize=undefined")

            add_compile_options(-fsanitize=undefined)
            link_libraries     (-fsanitize=undefined)
        endif()
    endif()
endfunction()
llama.cpp/cmake/download-models.cmake ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Script-mode helper (cmake -DNAME=... -DDEST=... -DHASH=... -P ...):
# download model file NAME from the ggml-org/models Hugging Face repo into
# DEST, verifying it against HASH (e.g. SHA256=...).
get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
file(MAKE_DIRECTORY "${DEST_DIR}")

if(NOT EXISTS "${DEST}")
    message(STATUS "Downloading ${NAME} from ggml-org/models...")
endif()

# EXPECTED_HASH makes this a no-op when DEST already matches
file(DOWNLOAD
    "https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
    "${DEST}"
    TLS_VERIFY ON
    EXPECTED_HASH ${HASH}
    STATUS status
)

# status is a list: <numeric code>;<message> — 0 means success
list(GET status 0 code)

if(NOT code EQUAL 0)
    list(GET status 1 msg)
    message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
endif()
llama.cpp/cmake/git-vars.cmake ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Populate GIT_SHA1, GIT_DATE and GIT_COMMIT_SUBJECT from the current HEAD.
# ERROR_QUIET leaves the variables empty when not inside a git repository.
find_package(Git)

# the commit's SHA1
# (--match=NeVeRmAtCh forces "describe" to fall back to the bare hash)
execute_process(COMMAND
    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_SHA1
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the date of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_DATE
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the subject of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%s
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
llama.cpp/cmake/license.cmake ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Aggregate third-party license texts into a generated C++ file so they can
# be embedded into a binary and shown at runtime.
define_property(GLOBAL PROPERTY LICENSE_TEXT
    BRIEF_DOCS "Embedded licenses"
    FULL_DOCS "Global string containing all aggregated licenses"
)

# license_add_file NAME FILE
# Append FILE's contents (titled "License for NAME") to the global
# LICENSE_TEXT property, wrapped as a C++ raw string literal.
function(license_add_file NAME FILE)
    if(NOT IS_ABSOLUTE "${FILE}")
        set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
    endif()
    if(EXISTS "${FILE}")
        set(TITLE "License for ${NAME}")
        # underline the title with one "=" per character
        string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
        file(READ "${FILE}" TEXT)
        get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
        # R"=L=(...)=L=" — custom raw-string delimiter so quotes/parens in
        # license texts need no escaping
        string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
        set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
    else()
        message(WARNING "License file '${FILE}' not found")
    endif()
endfunction()

# license_generate TARGET_NAME
# Write the accumulated licenses to ${CMAKE_BINARY_DIR}/license.cpp as a
# nullptr-terminated "const char* LICENSES[]" and attach it to TARGET_NAME.
function(license_generate TARGET_NAME)
    message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
    get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)

    set(CPP_CONTENT "// Generated by CMake\n\n")
    string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
    string(APPEND CPP_CONTENT "${TEXT}")
    string(APPEND CPP_CONTENT "nullptr\n")
    string(APPEND CPP_CONTENT "};\n")

    set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
    file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")

    if(TARGET ${TARGET_NAME})
        target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
    else()
        message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
    endif()
endfunction()
llama.cpp/cmake/llama-config.cmake.in ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Package configuration template for find_package(Llama); @...@ values are
# substituted by configure_package_config_file() at install time.
set(LLAMA_VERSION      @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

@PACKAGE_INIT@

# set_and_check errors out if the installed path no longer exists
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

# llama links against ggml, which installs its own package config
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

find_library(llama_LIBRARY llama
    REQUIRED
    HINTS ${LLAMA_LIB_DIR}
    NO_CMAKE_FIND_ROOT_PATH
)

# expose the installed library as an imported "llama" target
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
        INTERFACE_COMPILE_FEATURES c_std_90
        POSITION_INDEPENDENT_CODE ON)

check_required_components(Llama)
llama.cpp/cmake/llama.pc.in ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ prefix=@CMAKE_INSTALL_PREFIX@
2
+ exec_prefix=@CMAKE_INSTALL_PREFIX@
3
+ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
4
+ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
5
+
6
+ Name: llama
7
+ Description: Port of Facebook's LLaMA model in C/C++
8
+ Version: @LLAMA_INSTALL_VERSION@
9
+ Libs: -L${libdir} -lggml -lggml-base -lllama
10
+ Cflags: -I${includedir}
llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set(CMAKE_SYSTEM_NAME Linux)
2
+ set(CMAKE_SYSTEM_PROCESSOR riscv64)
3
+ set(CMAKE_SYSTEM_VERSION 1)
4
+
5
+ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
6
+ message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
7
+ else()
8
+ set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
9
+ if (DEFINED ENV{RISCV_ROOT_PATH})
10
+ file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
11
+ else()
12
+ message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
13
+ endif()
14
+
15
+ set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
16
+ set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
17
+ set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
18
+ set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
19
+ set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
20
+ set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
21
+ endif()
22
+
23
+ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
24
+ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
25
+ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
26
+ set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
27
+ set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
28
+ set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
29
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
llama.cpp/cmake/x64-windows-llvm.cmake ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ set( CMAKE_SYSTEM_NAME Windows )
2
+ set( CMAKE_SYSTEM_PROCESSOR x86_64 )
3
+
4
+ set( CMAKE_C_COMPILER clang )
5
+ set( CMAKE_CXX_COMPILER clang++ )
llama.cpp/common/CMakeLists.txt ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # common
2
+
3
+ find_package(Threads REQUIRED)
4
+
5
+ llama_add_compile_flags()
6
+
7
+ # Build info header
8
+
9
+ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
10
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
11
+
12
+ # Is git submodule
13
+ if(NOT IS_DIRECTORY "${GIT_DIR}")
14
+ file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
15
+ string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
16
+ string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
17
+ if (SLASH_POS EQUAL 0)
18
+ set(GIT_DIR "${REAL_GIT_DIR}")
19
+ else()
20
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
21
+ endif()
22
+ endif()
23
+
24
+ if(EXISTS "${GIT_DIR}/index")
25
+ # For build-info.cpp below
26
+ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
27
+ else()
28
+ message(WARNING "Git index not found in git repository.")
29
+ endif()
30
+ else()
31
+ message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
32
+ endif()
33
+
34
+ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
35
+ set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
36
+ configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
37
+
38
+ set(TARGET build_info)
39
+ add_library(${TARGET} OBJECT ${OUTPUT_FILE})
40
+ if (BUILD_SHARED_LIBS)
41
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
42
+ endif()
43
+
44
+ set(TARGET common)
45
+
46
+ add_library(${TARGET} STATIC
47
+ arg.cpp
48
+ arg.h
49
+ base64.hpp
50
+ chat-parser.cpp
51
+ chat-parser.h
52
+ chat-parser-xml-toolcall.h
53
+ chat-parser-xml-toolcall.cpp
54
+ chat-peg-parser.cpp
55
+ chat-peg-parser.h
56
+ chat.cpp
57
+ chat.h
58
+ common.cpp
59
+ common.h
60
+ console.cpp
61
+ console.h
62
+ debug.cpp
63
+ debug.h
64
+ download.cpp
65
+ download.h
66
+ http.h
67
+ json-partial.cpp
68
+ json-partial.h
69
+ json-schema-to-grammar.cpp
70
+ llguidance.cpp
71
+ log.cpp
72
+ log.h
73
+ ngram-cache.cpp
74
+ ngram-cache.h
75
+ ngram-map.cpp
76
+ ngram-map.h
77
+ ngram-mod.cpp
78
+ ngram-mod.h
79
+ peg-parser.cpp
80
+ peg-parser.h
81
+ preset.cpp
82
+ preset.h
83
+ regex-partial.cpp
84
+ regex-partial.h
85
+ sampling.cpp
86
+ sampling.h
87
+ speculative.cpp
88
+ speculative.h
89
+ unicode.cpp
90
+ unicode.h
91
+ jinja/lexer.cpp
92
+ jinja/lexer.h
93
+ jinja/parser.cpp
94
+ jinja/parser.h
95
+ jinja/runtime.cpp
96
+ jinja/runtime.h
97
+ jinja/value.cpp
98
+ jinja/value.h
99
+ jinja/string.cpp
100
+ jinja/string.h
101
+ jinja/caps.cpp
102
+ jinja/caps.h
103
+ )
104
+
105
+ target_include_directories(${TARGET} PUBLIC . ../vendor)
106
+ target_compile_features (${TARGET} PUBLIC cxx_std_17)
107
+
108
+ if (BUILD_SHARED_LIBS)
109
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
110
+ endif()
111
+
112
+ target_link_libraries(${TARGET} PRIVATE
113
+ build_info
114
+ cpp-httplib
115
+ )
116
+
117
+ if (LLAMA_LLGUIDANCE)
118
+ include(ExternalProject)
119
+ set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
120
+ set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
121
+ set(LLGUIDANCE_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}llguidance${CMAKE_STATIC_LIBRARY_SUFFIX}")
122
+
123
+ ExternalProject_Add(llguidance_ext
124
+ GIT_REPOSITORY https://github.com/guidance-ai/llguidance
125
+ # v1.0.1:
126
+ GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
127
+ PREFIX ${CMAKE_BINARY_DIR}/llguidance
128
+ SOURCE_DIR ${LLGUIDANCE_SRC}
129
+ BUILD_IN_SOURCE TRUE
130
+ CONFIGURE_COMMAND ""
131
+ BUILD_COMMAND cargo build --release --package llguidance
132
+ INSTALL_COMMAND ""
133
+ BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
134
+ UPDATE_COMMAND ""
135
+ )
136
+ target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
137
+
138
+ add_library(llguidance STATIC IMPORTED)
139
+ set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
140
+ add_dependencies(llguidance llguidance_ext)
141
+
142
+ target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
143
+ target_link_libraries(${TARGET} PRIVATE llguidance)
144
+ if (WIN32)
145
+ target_link_libraries(${TARGET} PRIVATE ws2_32 userenv ntdll bcrypt)
146
+ endif()
147
+ endif()
148
+
149
+ target_link_libraries(${TARGET} PUBLIC llama Threads::Threads)
llama.cpp/common/arg.cpp ADDED
The diff for this file is too large to render. See raw diff
 
llama.cpp/common/arg.h ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "common.h"
4
+
5
+ #include <set>
6
+ #include <map>
7
+ #include <string>
8
+ #include <vector>
9
+ #include <cstring>
10
+
11
+ // pseudo-env variable to identify preset-only arguments
12
+ #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
13
+ #define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
14
+
15
+ //
16
+ // CLI argument parsing
17
+ //
18
+
19
+ struct common_arg {
20
+ std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
21
+ std::set<enum llama_example> excludes = {};
22
+ std::vector<const char *> args;
23
+ std::vector<const char *> args_neg; // for negated args like --no-xxx
24
+ const char * value_hint = nullptr; // help text or example for arg value
25
+ const char * value_hint_2 = nullptr; // for second arg value
26
+ const char * env = nullptr;
27
+ std::string help;
28
+ bool is_sparam = false; // is current arg a sampling param?
29
+ bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
30
+ void (*handler_void) (common_params & params) = nullptr;
31
+ void (*handler_string) (common_params & params, const std::string &) = nullptr;
32
+ void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
33
+ void (*handler_int) (common_params & params, int) = nullptr;
34
+ void (*handler_bool) (common_params & params, bool) = nullptr;
35
+
36
+ common_arg() = default;
37
+
38
+ common_arg(
39
+ const std::initializer_list<const char *> & args,
40
+ const char * value_hint,
41
+ const std::string & help,
42
+ void (*handler)(common_params & params, const std::string &)
43
+ ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
44
+
45
+ common_arg(
46
+ const std::initializer_list<const char *> & args,
47
+ const char * value_hint,
48
+ const std::string & help,
49
+ void (*handler)(common_params & params, int)
50
+ ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
51
+
52
+ common_arg(
53
+ const std::initializer_list<const char *> & args,
54
+ const std::string & help,
55
+ void (*handler)(common_params & params)
56
+ ) : args(args), help(help), handler_void(handler) {}
57
+
58
+ common_arg(
59
+ const std::initializer_list<const char *> & args,
60
+ const std::initializer_list<const char *> & args_neg,
61
+ const std::string & help,
62
+ void (*handler)(common_params & params, bool)
63
+ ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
64
+
65
+ // support 2 values for arg
66
+ common_arg(
67
+ const std::initializer_list<const char *> & args,
68
+ const char * value_hint,
69
+ const char * value_hint_2,
70
+ const std::string & help,
71
+ void (*handler)(common_params & params, const std::string &, const std::string &)
72
+ ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
73
+
74
+ common_arg & set_examples(std::initializer_list<enum llama_example> examples);
75
+ common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
76
+ common_arg & set_env(const char * env);
77
+ common_arg & set_sparam();
78
+ common_arg & set_preset_only();
79
+ bool in_example(enum llama_example ex);
80
+ bool is_exclude(enum llama_example ex);
81
+ bool get_value_from_env(std::string & output) const;
82
+ bool has_value_from_env() const;
83
+ std::string to_string() const;
84
+
85
+ // for using as key in std::map
86
+ bool operator<(const common_arg& other) const {
87
+ if (args.empty() || other.args.empty()) {
88
+ return false;
89
+ }
90
+ return strcmp(args[0], other.args[0]) < 0;
91
+ }
92
+ bool operator==(const common_arg& other) const {
93
+ if (args.empty() || other.args.empty()) {
94
+ return false;
95
+ }
96
+ return strcmp(args[0], other.args[0]) == 0;
97
+ }
98
+
99
+ // get all args and env vars (including negated args/env)
100
+ std::vector<std::string> get_args() const;
101
+ std::vector<std::string> get_env() const;
102
+ };
103
+
104
+ namespace common_arg_utils {
105
+ bool is_truthy(const std::string & value);
106
+ bool is_falsey(const std::string & value);
107
+ bool is_autoy(const std::string & value);
108
+ }
109
+
110
+ struct common_params_context {
111
+ enum llama_example ex = LLAMA_EXAMPLE_COMMON;
112
+ common_params & params;
113
+ std::vector<common_arg> options;
114
+ void(*print_usage)(int, char **) = nullptr;
115
+ common_params_context(common_params & params) : params(params) {}
116
+ };
117
+
118
+ // parse input arguments from CLI
119
+ // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
120
+ bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
121
+
122
+ // parse input arguments from CLI into a map
123
+ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
124
+
125
+ // populate preset-only arguments
126
+ // these arguments are not treated as command line arguments
127
+ // see: https://github.com/ggml-org/llama.cpp/issues/18163
128
+ void common_params_add_preset_options(std::vector<common_arg> & args);
129
+
130
+ // initialize argument parser context - used by test-arg-parser and preset
131
+ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
llama.cpp/common/base64.hpp ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ This is free and unencumbered software released into the public domain.
3
+
4
+ Anyone is free to copy, modify, publish, use, compile, sell, or
5
+ distribute this software, either in source code form or as a compiled
6
+ binary, for any purpose, commercial or non-commercial, and by any
7
+ means.
8
+
9
+ In jurisdictions that recognize copyright laws, the author or authors
10
+ of this software dedicate any and all copyright interest in the
11
+ software to the public domain. We make this dedication for the benefit
12
+ of the public at large and to the detriment of our heirs and
13
+ successors. We intend this dedication to be an overt act of
14
+ relinquishment in perpetuity of all present and future rights to this
15
+ software under copyright law.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20
+ IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23
+ OTHER DEALINGS IN THE SOFTWARE.
24
+
25
+ For more information, please refer to <http://unlicense.org>
26
+ */
27
+
28
+ #ifndef PUBLIC_DOMAIN_BASE64_HPP_
29
+ #define PUBLIC_DOMAIN_BASE64_HPP_
30
+
31
+ #include <cstdint>
32
+ #include <iterator>
33
+ #include <stdexcept>
34
+ #include <string>
35
+
36
+ class base64_error : public std::runtime_error
37
+ {
38
+ public:
39
+ using std::runtime_error::runtime_error;
40
+ };
41
+
42
+ class base64
43
+ {
44
+ public:
45
+ enum class alphabet
46
+ {
47
+ /** the alphabet is detected automatically */
48
+ auto_,
49
+ /** the standard base64 alphabet is used */
50
+ standard,
51
+ /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
52
+ url_filename_safe
53
+ };
54
+
55
+ enum class decoding_behavior
56
+ {
57
+ /** if the input is not padded, the remaining bits are ignored */
58
+ moderate,
59
+ /** if a padding character is encounter decoding is finished */
60
+ loose
61
+ };
62
+
63
+ /**
64
+ Encodes all the elements from `in_begin` to `in_end` to `out`.
65
+
66
+ @warning The source and destination cannot overlap. The destination must be able to hold at least
67
+ `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
68
+
69
+ @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
70
+ 8 bits
71
+ @tparam Output_iterator the destination; the elements written to it are from the type `char`
72
+ @param in_begin the beginning of the source
73
+ @param in_end the ending of the source
74
+ @param out the destination iterator
75
+ @param alphabet which alphabet should be used
76
+ @returns the iterator to the next element past the last element copied
77
+ @throws see `Input_iterator` and `Output_iterator`
78
+ */
79
+ template<typename Input_iterator, typename Output_iterator>
80
+ static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
81
+ alphabet alphabet = alphabet::standard)
82
+ {
83
+ constexpr auto pad = '=';
84
+ const char* alpha = alphabet == alphabet::url_filename_safe
85
+ ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
86
+ : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
87
+
88
+ while (in_begin != in_end) {
89
+ std::uint8_t i0 = 0, i1 = 0, i2 = 0;
90
+
91
+ // first character
92
+ i0 = static_cast<std::uint8_t>(*in_begin);
93
+ ++in_begin;
94
+
95
+ *out = alpha[i0 >> 2 & 0x3f];
96
+ ++out;
97
+
98
+ // part of first character and second
99
+ if (in_begin != in_end) {
100
+ i1 = static_cast<std::uint8_t>(*in_begin);
101
+ ++in_begin;
102
+
103
+ *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
104
+ ++out;
105
+ } else {
106
+ *out = alpha[(i0 & 0x3) << 4];
107
+ ++out;
108
+
109
+ // last padding
110
+ *out = pad;
111
+ ++out;
112
+
113
+ // last padding
114
+ *out = pad;
115
+ ++out;
116
+
117
+ break;
118
+ }
119
+
120
+ // part of second character and third
121
+ if (in_begin != in_end) {
122
+ i2 = static_cast<std::uint8_t>(*in_begin);
123
+ ++in_begin;
124
+
125
+ *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
126
+ ++out;
127
+ } else {
128
+ *out = alpha[(i1 & 0xf) << 2];
129
+ ++out;
130
+
131
+ // last padding
132
+ *out = pad;
133
+ ++out;
134
+
135
+ break;
136
+ }
137
+
138
+ // rest of third
139
+ *out = alpha[i2 & 0x3f];
140
+ ++out;
141
+ }
142
+
143
+ return out;
144
+ }
145
+ /**
146
+ Encodes a string.
147
+
148
+ @param str the string that should be encoded
149
+ @param alphabet which alphabet should be used
150
+ @returns the encoded base64 string
151
+ @throws see base64::encode()
152
+ */
153
+ static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
154
+ {
155
+ std::string result;
156
+
157
+ result.reserve(required_encode_size(str.length()) + 1);
158
+
159
+ encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
160
+
161
+ return result;
162
+ }
163
+ /**
164
+ Encodes a char array.
165
+
166
+ @param buffer the char array
167
+ @param size the size of the array
168
+ @param alphabet which alphabet should be used
169
+ @returns the encoded string
170
+ */
171
+ static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
172
+ {
173
+ std::string result;
174
+
175
+ result.reserve(required_encode_size(size) + 1);
176
+
177
+ encode(buffer, buffer + size, std::back_inserter(result), alphabet);
178
+
179
+ return result;
180
+ }
181
+ /**
182
+ Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
183
+ in other words: inplace decoding is possible.
184
+
185
+ @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
186
+ otherwise the behavior depends on the output iterator.
187
+
188
+ @tparam Input_iterator the source; the returned elements are cast to `char`
189
+ @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
190
+ @param in_begin the beginning of the source
191
+ @param in_end the ending of the source
192
+ @param out the destination iterator
193
+ @param alphabet which alphabet should be used
194
+ @param behavior the behavior when an error was detected
195
+ @returns the iterator to the next element past the last element copied
196
+ @throws base64_error depending on the set behavior
197
+ @throws see `Input_iterator` and `Output_iterator`
198
+ */
199
+ template<typename Input_iterator, typename Output_iterator>
200
+ static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
201
+ alphabet alphabet = alphabet::auto_,
202
+ decoding_behavior behavior = decoding_behavior::moderate)
203
+ {
204
+ //constexpr auto pad = '=';
205
+ std::uint8_t last = 0;
206
+ auto bits = 0;
207
+
208
+ while (in_begin != in_end) {
209
+ auto c = *in_begin;
210
+ ++in_begin;
211
+
212
+ if (c == '=') {
213
+ break;
214
+ }
215
+
216
+ auto part = _base64_value(alphabet, c);
217
+
218
+ // enough bits for one byte
219
+ if (bits + 6 >= 8) {
220
+ *out = (last << (8 - bits)) | (part >> (bits - 2));
221
+ ++out;
222
+
223
+ bits -= 2;
224
+ } else {
225
+ bits += 6;
226
+ }
227
+
228
+ last = part;
229
+ }
230
+
231
+ // check padding
232
+ if (behavior != decoding_behavior::loose) {
233
+ while (in_begin != in_end) {
234
+ auto c = *in_begin;
235
+ ++in_begin;
236
+
237
+ if (c != '=') {
238
+ throw base64_error("invalid base64 character.");
239
+ }
240
+ }
241
+ }
242
+
243
+ return out;
244
+ }
245
+ /**
246
+ Decodes a string.
247
+
248
+ @param str the base64 encoded string
249
+ @param alphabet which alphabet should be used
250
+ @param behavior the behavior when an error was detected
251
+ @returns the decoded string
252
+ @throws see base64::decode()
253
+ */
254
+ static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
255
+ decoding_behavior behavior = decoding_behavior::moderate)
256
+ {
257
+ std::string result;
258
+
259
+ result.reserve(max_decode_size(str.length()));
260
+
261
+ decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
262
+
263
+ return result;
264
+ }
265
+ /**
266
+ Decodes a string.
267
+
268
+ @param buffer the base64 encoded buffer
269
+ @param size the size of the buffer
270
+ @param alphabet which alphabet should be used
271
+ @param behavior the behavior when an error was detected
272
+ @returns the decoded string
273
+ @throws see base64::decode()
274
+ */
275
+ static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
276
+ decoding_behavior behavior = decoding_behavior::moderate)
277
+ {
278
+ std::string result;
279
+
280
+ result.reserve(max_decode_size(size));
281
+
282
+ decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
283
+
284
+ return result;
285
+ }
286
+ /**
287
+ Decodes a string inplace.
288
+
289
+ @param[in,out] str the base64 encoded string
290
+ @param alphabet which alphabet should be used
291
+ @param behavior the behavior when an error was detected
292
+ @throws base64::decode_inplace()
293
+ */
294
+ static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
295
+ decoding_behavior behavior = decoding_behavior::moderate)
296
+ {
297
+ str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
298
+ }
299
+ /**
300
+ Decodes a char array inplace.
301
+
302
+ @param[in,out] str the string array
303
+ @param size the length of the array
304
+ @param alphabet which alphabet should be used
305
+ @param behavior the behavior when an error was detected
306
+ @returns the pointer to the next element past the last element decoded
307
+ @throws base64::decode_inplace()
308
+ */
309
+ static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
310
+ decoding_behavior behavior = decoding_behavior::moderate)
311
+ {
312
+ return decode(str, str + size, str, alphabet, behavior);
313
+ }
314
+ /**
315
+ Returns the required decoding size for a given size. The value is calculated with the following formula:
316
+
317
+ $$
318
+ \lceil \frac{size}{4} \rceil \cdot 3
319
+ $$
320
+
321
+ @param size the size of the encoded input
322
+ @returns the size of the resulting decoded buffer; this the absolute maximum
323
+ */
324
+ static std::size_t max_decode_size(std::size_t size) noexcept
325
+ {
326
+ return (size / 4 + (size % 4 ? 1 : 0)) * 3;
327
+ }
328
+ /**
329
+ Returns the required encoding size for a given size. The value is calculated with the following formula:
330
+
331
+ $$
332
+ \lceil \frac{size}{3} \rceil \cdot 4
333
+ $$
334
+
335
+ @param size the size of the decoded input
336
+ @returns the size of the resulting encoded buffer
337
+ */
338
+ static std::size_t required_encode_size(std::size_t size) noexcept
339
+ {
340
+ return (size / 3 + (size % 3 ? 1 : 0)) * 4;
341
+ }
342
+
343
+ private:
344
+ static std::uint8_t _base64_value(alphabet& alphabet, char c)
345
+ {
346
+ if (c >= 'A' && c <= 'Z') {
347
+ return c - 'A';
348
+ } else if (c >= 'a' && c <= 'z') {
349
+ return c - 'a' + 26;
350
+ } else if (c >= '0' && c <= '9') {
351
+ return c - '0' + 52;
352
+ }
353
+
354
+ // comes down to alphabet
355
+ if (alphabet == alphabet::standard) {
356
+ if (c == '+') {
357
+ return 62;
358
+ } else if (c == '/') {
359
+ return 63;
360
+ }
361
+ } else if (alphabet == alphabet::url_filename_safe) {
362
+ if (c == '-') {
363
+ return 62;
364
+ } else if (c == '_') {
365
+ return 63;
366
+ }
367
+ } // auto detect
368
+ else {
369
+ if (c == '+') {
370
+ alphabet = alphabet::standard;
371
+
372
+ return 62;
373
+ } else if (c == '/') {
374
+ alphabet = alphabet::standard;
375
+
376
+ return 63;
377
+ } else if (c == '-') {
378
+ alphabet = alphabet::url_filename_safe;
379
+
380
+ return 62;
381
+ } else if (c == '_') {
382
+ alphabet = alphabet::url_filename_safe;
383
+
384
+ return 63;
385
+ }
386
+ }
387
+
388
+ throw base64_error("invalid base64 character.");
389
+ }
390
+ };
391
+
392
+ #endif // !PUBLIC_DOMAIN_BASE64_HPP_
llama.cpp/common/build-info.cpp.in ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
2
+ char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
3
+ char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
4
+ char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
llama.cpp/common/chat-parser-xml-toolcall.cpp ADDED
@@ -0,0 +1,879 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "chat.h"
2
+ #include "chat-parser.h"
3
+ #include "common.h"
4
+ #include "json-partial.h"
5
+ #include "json-schema-to-grammar.h"
6
+ #include "log.h"
7
+ #include "regex-partial.h"
8
+
9
+ using json = nlohmann::ordered_json;
10
+
11
+ class xml_toolcall_syntax_exception : public std::runtime_error {
12
+ public:
13
+ xml_toolcall_syntax_exception(const std::string & message) : std::runtime_error(message) {}
14
+ };
15
+
16
+ template<typename T>
17
+ inline void sort_uniq(std::vector<T> &vec) {
18
+ std::sort(vec.begin(), vec.end());
19
+ vec.erase(std::unique(vec.begin(), vec.end()), vec.end());
20
+ }
21
+
22
+ template<typename T>
23
+ inline bool all_space(const T &str) {
24
+ return std::all_of(str.begin(), str.end(), [](unsigned char ch) { return std::isspace(ch); });
25
+ }
26
+
27
+ static size_t utf8_truncate_safe(const std::string_view s) {
28
+ size_t len = s.size();
29
+ if (len == 0) return 0;
30
+ size_t i = len;
31
+ for (size_t back = 0; back < 4 && i > 0; ++back) {
32
+ --i;
33
+ unsigned char c = s[i];
34
+ if ((c & 0x80) == 0) {
35
+ return len;
36
+ } else if ((c & 0xC0) == 0xC0) {
37
+ size_t expected_len = 0;
38
+ if ((c & 0xE0) == 0xC0) expected_len = 2;
39
+ else if ((c & 0xF0) == 0xE0) expected_len = 3;
40
+ else if ((c & 0xF8) == 0xF0) expected_len = 4;
41
+ else return i;
42
+ if (len - i >= expected_len) {
43
+ return len;
44
+ } else {
45
+ return i;
46
+ }
47
+ }
48
+ }
49
+ return len - std::min(len, size_t(3));
50
+ }
51
+
52
+ inline void utf8_truncate_safe_resize(std::string &s) {
53
+ s.resize(utf8_truncate_safe(s));
54
+ }
55
+
56
+ inline std::string_view utf8_truncate_safe_view(const std::string_view s) {
57
+ return s.substr(0, utf8_truncate_safe(s));
58
+ }
59
+
60
+ static std::optional<common_chat_msg_parser::find_regex_result> try_find_2_literal_splited_by_spaces(common_chat_msg_parser & builder, const std::string & literal1, const std::string & literal2) {
61
+ if (literal1.size() == 0) return builder.try_find_literal(literal2);
62
+ const auto saved_pos = builder.pos();
63
+ while (auto res = builder.try_find_literal(literal1)) {
64
+ builder.consume_spaces();
65
+ const auto match_len = std::min(literal2.size(), builder.input().size() - builder.pos());
66
+ if (builder.input().compare(builder.pos(), match_len, literal2, 0, match_len) == 0) {
67
+ if (res->prelude.size() != res->groups[0].begin - saved_pos) {
68
+ res->prelude = builder.str({saved_pos, res->groups[0].begin});
69
+ }
70
+ builder.move_to(builder.pos() + match_len);
71
+ res->groups[0].end = builder.pos();
72
+ GGML_ASSERT(res->groups[0].begin != res->groups[0].end);
73
+ return res;
74
+ }
75
+ builder.move_to(res->groups[0].begin + 1);
76
+ }
77
+ builder.move_to(saved_pos);
78
+ return std::nullopt;
79
+ }
80
+
81
/**
 * make a GBNF that accept any strings except those containing any of the forbidden strings.
 *
 * The forbidden strings are organized as a trie (by recursion depth); the resulting expression
 * is a starred alternation where each branch either consumes a byte that cannot start/continue
 * a forbidden string, or consumes a forbidden-string prefix followed by a byte that breaks it.
 */
std::string make_gbnf_excluding(std::vector<std::string> forbids) {
    // Escape a single byte so it is valid inside a GBNF character class [...].
    const auto esc_in_class = [](unsigned char ch) -> std::string {
        switch (ch) {
            case '\\':
            case ']':
            case '^':
            case '-': {
                std::string out = "\\";
                out.push_back((char) ch);
                return out;
            }
            default:
                break;
        }
        if (isprint(ch)) {
            return std::string(1, (char) ch);
        }
        char tmp[16];
        snprintf(tmp, 15, "\\x%02X", ch);
        return std::string(tmp);
    };
    // Recursively build the alternation for forbids[lo, hi) at byte position `depth`.
    // A word whose length equals `depth` has been fully matched, so it contributes no branch.
    const auto build = [esc_in_class](auto self, const std::vector<std::string> & words, int lo, int hi, int depth) -> std::string {
        // Group the (sorted) words by their byte at `depth`.
        std::vector<std::pair<unsigned char, std::pair<int, int>>> groups;
        int i = lo;
        while (i < hi) {
            const std::string & w = words[i];
            if ((int) w.size() == depth) {
                ++i;
                continue;
            }
            const unsigned char c = (unsigned char) w[depth];
            int j = i;
            while (j < hi && (int) words[j].size() > depth &&
                   (unsigned char) words[j][depth] == c) {
                ++j;
            }
            groups.push_back({c, {i, j}});
            i = j;
        }
        std::vector<std::string> branches;
        if (!groups.empty()) {
            // One branch accepts any byte that does not begin a forbidden continuation.
            std::string cls;
            for (const auto & g : groups) cls += esc_in_class(g.first);
            branches.push_back(std::string("[^") + cls + "]");
        }
        for (const auto & g : groups) {
            const std::string sub = self(self, words, g.second.first, g.second.second, depth + 1);
            if (sub.empty()) {
                continue;
            }
            // Quote the branching byte as a GBNF string literal.
            std::string lit = "\"";
            if (g.first == '\\') {
                lit += "\\\\";
            } else if (g.first == '"') {
                lit += "\\\"";
            } else if (isprint(g.first)) {
                lit.push_back((char) g.first);
            } else {
                char tmp[16];
                snprintf(tmp, 15, "\\x%02X", g.first);
                lit += tmp;
            }
            lit += "\"";
            branches.push_back(lit + std::string(" ") + sub);
        }
        if (branches.empty()) {
            return "";
        }
        std::string out = "( ";
        for (size_t k = 0; k < branches.size(); ++k) {
            if (k) out += " | ";
            out += branches[k];
        }
        out += " )";
        return out;
    };
    if (forbids.empty()) return "( . )*";
    std::sort(forbids.begin(), forbids.end());
    std::string expr = build(build, forbids, 0, forbids.size(), 0);
    if (expr.empty()) {
        // Fallback when no expression could be built: forbid the first byte of each word.
        std::string cls;
        for (const auto & w : forbids) if (!w.empty()) cls += esc_in_class((unsigned char) w[0]);
        expr = std::string("( [^") + cls + "] )";
    }
    if (forbids.size() == 1)
        return expr + "*";
    else
        return std::string("( ") + expr + " )*";
}
162
+
163
+ /**
164
+ * Build grammar for xml-style tool call
165
+ * form.scope_start and form.scope_end can be empty.
166
+ * Requires data.format for model-specific hacks.
167
+ */
168
+ void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, const struct xml_tool_call_format & form) {
169
+ GGML_ASSERT(!form.tool_start.empty());
170
+ GGML_ASSERT(!form.tool_sep.empty());
171
+ GGML_ASSERT(!form.key_start.empty());
172
+ GGML_ASSERT(!form.val_end.empty());
173
+ GGML_ASSERT(!form.tool_end.empty());
174
+
175
+ std::string key_val_sep = form.key_val_sep;
176
+ if (form.key_val_sep2) {
177
+ key_val_sep += "\n";
178
+ key_val_sep += *form.key_val_sep2;
179
+ }
180
+ GGML_ASSERT(!key_val_sep.empty());
181
+
182
+ if (tools.is_array() && !tools.empty()) {
183
+ data.grammar = build_grammar([&](const common_grammar_builder &builder) {
184
+ auto string_arg_val = form.last_val_end ?
185
+ builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end, *form.last_val_end})) :
186
+ builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end}));
187
+
188
+ std::vector<std::string> tool_rules;
189
+ for (const auto & tool : tools) {
190
+ if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
191
+ LOG_WRN("Skipping tool without function: %s", tool.dump(2).c_str());
192
+ continue;
193
+ }
194
+ const auto & function = tool.at("function");
195
+ if (!function.contains("name") || !function.at("name").is_string()) {
196
+ LOG_WRN("Skipping invalid function (invalid name): %s", function.dump(2).c_str());
197
+ continue;
198
+ }
199
+ if (!function.contains("parameters") || !function.at("parameters").is_object()) {
200
+ LOG_WRN("Skipping invalid function (invalid parameters): %s", function.dump(2).c_str());
201
+ continue;
202
+ }
203
+ std::string name = function.at("name");
204
+ auto parameters = function.at("parameters");
205
+ builder.resolve_refs(parameters);
206
+
207
+ struct parameter_rule {
208
+ std::string symbol_name;
209
+ bool is_required;
210
+ };
211
+ std::vector<parameter_rule> arg_rules;
212
+ if (!parameters.contains("properties") || !parameters.at("properties").is_object()) {
213
+ LOG_WRN("Skipping invalid function (invalid properties): %s", function.dump(2).c_str());
214
+ continue;
215
+ } else {
216
+ std::vector<std::string> requiredParameters;
217
+ if (parameters.contains("required")) {
218
+ try { parameters.at("required").get_to(requiredParameters); }
219
+ catch (const std::runtime_error&) {
220
+ LOG_WRN("Invalid function required parameters, ignoring: %s", function.at("required").dump(2).c_str());
221
+ }
222
+ }
223
+ sort_uniq(requiredParameters);
224
+ for (const auto & [key, value] : parameters.at("properties").items()) {
225
+ std::string quoted_key = key;
226
+ bool required = std::binary_search(requiredParameters.begin(), requiredParameters.end(), key);
227
+ if (form.key_start.back() == '"' && key_val_sep[0] == '"') {
228
+ quoted_key = gbnf_format_literal(key);
229
+ quoted_key = quoted_key.substr(1, quoted_key.size() - 2);
230
+ }
231
+ arg_rules.push_back(parameter_rule {builder.add_rule("func-" + name + "-kv-" + key,
232
+ gbnf_format_literal(form.key_start) + " " +
233
+ gbnf_format_literal(quoted_key) + " " +
234
+ gbnf_format_literal(key_val_sep) + " " +
235
+ ((value.contains("type") && value["type"].is_string() && value["type"] == "string" && (!form.raw_argval || *form.raw_argval)) ?
236
+ (form.raw_argval ?
237
+ string_arg_val :
238
+ "( " + string_arg_val + " | " + builder.add_schema(name + "-arg-" + key, value) + " )"
239
+ ) :
240
+ builder.add_schema(name + "-arg-" + key, value)
241
+ )
242
+ ), required});
243
+ }
244
+ }
245
+
246
+ auto next_arg_with_sep = builder.add_rule(name + "-last-arg-end", form.last_val_end ? gbnf_format_literal(*form.last_val_end) : gbnf_format_literal(form.val_end));
247
+ decltype(next_arg_with_sep) next_arg = "\"\"";
248
+ for (auto i = arg_rules.size() - 1; /* i >= 0 && */ i < arg_rules.size(); --i) {
249
+ std::string include_this_arg = arg_rules[i].symbol_name + " " + next_arg_with_sep;
250
+ next_arg = builder.add_rule(name + "-arg-after-" + std::to_string(i), arg_rules[i].is_required ?
251
+ include_this_arg : "( " + include_this_arg + " ) | " + next_arg
252
+ );
253
+ include_this_arg = gbnf_format_literal(form.val_end) + " " + include_this_arg;
254
+ next_arg_with_sep = builder.add_rule(name + "-arg-after-" + std::to_string(i) + "-with-sep", arg_rules[i].is_required ?
255
+ include_this_arg : "( " + include_this_arg + " ) | " + next_arg_with_sep
256
+ );
257
+ }
258
+
259
+ std::string quoted_name = name;
260
+ if (form.tool_start.back() == '"' && form.tool_sep[0] == '"') {
261
+ quoted_name = gbnf_format_literal(name);
262
+ quoted_name = quoted_name.substr(1, quoted_name.size() - 2);
263
+ }
264
+ quoted_name = gbnf_format_literal(quoted_name);
265
+ // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
266
+ if (data.format == COMMON_CHAT_FORMAT_KIMI_K2) {
267
+ quoted_name = "\"functions.\" " + quoted_name + " \":\" [0-9]+";
268
+ }
269
+ tool_rules.push_back(builder.add_rule(name + "-call",
270
+ gbnf_format_literal(form.tool_start) + " " +
271
+ quoted_name + " " +
272
+ gbnf_format_literal(form.tool_sep) + " " +
273
+ next_arg
274
+ ));
275
+ }
276
+
277
+ auto tool_call_once = builder.add_rule("root-tool-call-once", string_join(tool_rules, " | "));
278
+ auto tool_call_more = builder.add_rule("root-tool-call-more", gbnf_format_literal(form.tool_end) + " " + tool_call_once);
279
+ auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end));
280
+ auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end);
281
+ builder.add_rule("root",
282
+ (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") +
283
+ tool_call_multiple_with_end + "?" +
284
+ (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end))
285
+ );
286
+ });
287
+
288
+ // grammar trigger for tool call
289
+ data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, form.scope_start + form.tool_start });
290
+ }
291
+ }
292
+
293
/**
 * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
 * Throws xml_toolcall_syntax_exception if there is invalid syntax and cannot recover the original status for common_chat_msg_parser.
 * form.scope_start, form.tool_sep and form.scope_end can be empty.
 *
 * Supports streaming: when the input ends mid-call, a best-effort partial tool call is
 * published via builder.add_tool_call() and common_chat_msg_partial_exception is thrown.
 */
inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form) {
    GGML_ASSERT(!form.tool_start.empty());
    GGML_ASSERT(!form.key_start.empty());
    GGML_ASSERT(!form.key_val_sep.empty());
    GGML_ASSERT(!form.val_end.empty());
    GGML_ASSERT(!form.tool_end.empty());

    // Helper to choose return false or throw error.
    // `recovery` == true means nothing was emitted yet, so rewinding to start_pos is safe.
    constexpr auto return_error = [](common_chat_msg_parser & builder, auto &start_pos, const bool &recovery) {
        LOG_DBG("Failed to parse XML-Style tool call at position: %s\n", gbnf_format_literal(builder.consume_rest().substr(0, 20)).c_str());
        if (recovery) {
            builder.move_to(start_pos);
            return false;
        } else throw xml_toolcall_syntax_exception("Tool call parsing failed with unrecoverable errors. Try using a grammar to constrain the model’s output.");
    };
    // Drop substring from needle to end from a JSON.
    // Only succeeds when everything after the needle is JSON closing syntax (quotes, braces,
    // colons, whitespace), i.e. when truncating there yields a sane partial-JSON prefix.
    constexpr auto partial_json = [](std::string &json_str, std::string_view needle = "XML_TOOL_CALL_PARTIAL_FLAG") {
        auto pos = json_str.rfind(needle);
        if (pos == std::string::npos) {
            return false;
        }
        for (auto i = pos + needle.size(); i < json_str.size(); ++i) {
            unsigned char ch = static_cast<unsigned char>(json_str[i]);
            if (ch != '\'' && ch != '"' && ch != '}' && ch != ':' && !std::isspace(ch)) {
                return false;
            }
        }
        // Also drop the opening quote of the string that contained the needle.
        if (pos != 0 && json_str[pos - 1] == '"') {
            --pos;
        }
        json_str.resize(pos);
        return true;
    };
    // Helper to generate a partial argument JSON:
    // appends the sentinel needle via set_partial_arg, dumps, then truncates at the needle.
    constexpr auto gen_partial_json = [partial_json](auto set_partial_arg, auto &arguments, auto &builder, auto &function_name) {
        auto rest = builder.consume_rest();
        utf8_truncate_safe_resize(rest);
        set_partial_arg(rest, "XML_TOOL_CALL_PARTIAL_FLAG");
        auto tool_str = arguments.dump();
        if (partial_json(tool_str)) {
            if (builder.add_tool_call(function_name, "", tool_str)) {
                return;
            }
        }
        LOG_DBG("Failed to parse partial XML-Style tool call, fallback to non-partial: %s\n", tool_str.c_str());
    };
    // Helper to find a close (because there may be form.last_val_end or form.last_tool_end).
    // Picks whichever of `end` / `alt_end` (the latter followed by end_next or alt_end_next)
    // occurs earliest, and returns {matched terminator length, find result}.
    constexpr auto try_find_close = [](
        common_chat_msg_parser & builder,
        const std::string & end,
        const std::optional<std::string> & alt_end,
        const std::string & end_next,
        const std::optional<std::string> & alt_end_next
    ) {
        auto saved_pos = builder.pos();
        auto tc = builder.try_find_literal(end);
        auto val_end_size = end.size();
        if (alt_end) {
            auto pos_1 = builder.pos();
            builder.move_to(saved_pos);
            auto tc2 = try_find_2_literal_splited_by_spaces(builder, *alt_end, end_next);
            if (alt_end_next) {
                builder.move_to(saved_pos);
                auto tc3 = try_find_2_literal_splited_by_spaces(builder, *alt_end, *alt_end_next);
                if (tc3 && (!tc2 || tc2->prelude.size() > tc3->prelude.size())) {
                    tc2 = tc3;
                }
            }
            // Prefer the alternate terminator only when it matches earlier in the input.
            if (tc2 && (!tc || tc->prelude.size() > tc2->prelude.size())) {
                tc = tc2;
                tc->groups[0].end = std::min(builder.input().size(), tc->groups[0].begin + alt_end->size());
                builder.move_to(tc->groups[0].end);
                val_end_size = alt_end->size();
            } else {
                builder.move_to(pos_1);
            }
        }
        return std::make_pair(val_end_size, tc);
    };
    // Helper to find a val_end or last_val_end, returns matched pattern size
    const auto try_find_val_end = [try_find_close, &builder, &form]() {
        return try_find_close(builder, form.val_end, form.last_val_end, form.tool_end, form.last_tool_end);
    };
    // Helper to find a tool_end or last_tool_end, returns matched pattern size
    const auto try_find_tool_end = [try_find_close, &builder, &form]() {
        return try_find_close(builder, form.tool_end, form.last_tool_end, form.scope_end, std::nullopt);
    };

    bool recovery = true;
    const auto start_pos = builder.pos();
    // Consume the opening scope marker (if it is not pure whitespace).
    if (!all_space(form.scope_start)) {
        if (auto tc = builder.try_find_literal(form.scope_start)) {
            if (all_space(tc->prelude)) {
                if (form.scope_start.size() != tc->groups[0].end - tc->groups[0].begin)
                    throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.scope_start));
            } else {
                builder.move_to(start_pos);
                return false;
            }
        } else return false;
    }
    // One iteration per tool call inside the scope.
    while (auto tc = builder.try_find_literal(form.tool_start)) {
        if (!all_space(tc->prelude)) {
            LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
                gbnf_format_literal(form.tool_start).c_str(),
                gbnf_format_literal(tc->prelude).c_str()
            );
            builder.move_to(tc->groups[0].begin - tc->prelude.size());
            break;
        }

        // Find tool name
        auto func_name = builder.try_find_literal(all_space(form.tool_sep) ? form.key_start : form.tool_sep);
        if (!func_name) {
            // No separator/key found: the call may have no arguments at all.
            auto [sz, tc] = try_find_tool_end();
            func_name = tc;
        }
        if (!func_name) {
            // Partial tool name not supported
            throw common_chat_msg_partial_exception("incomplete tool_call");
        }
        // If the model generate multiple tool call and the first tool call has no argument
        if (func_name->prelude.find(form.tool_end) != std::string::npos || (form.last_tool_end ? func_name->prelude.find(*form.last_tool_end) != std::string::npos : false)) {
            builder.move_to(func_name->groups[0].begin - func_name->prelude.size());
            auto [sz, tc] = try_find_tool_end();
            func_name = tc;
        }

        // Parse tool name
        builder.move_to(all_space(form.tool_sep) ? func_name->groups[0].begin : func_name->groups[0].end);
        std::string function_name = string_strip(func_name->prelude);
        // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
        if (builder.syntax().format == COMMON_CHAT_FORMAT_KIMI_K2) {
            if (string_starts_with(function_name, "functions.")) {
                static const std::regex re(":\\d+$");
                if (std::regex_search(function_name, re)) {
                    function_name = function_name.substr(10, function_name.rfind(":") - 10);
                }
            }
        }

        // Argument JSON
        json arguments = json::object();

        // Helper to generate a partial argument JSON
        const auto gen_partial_args = [&](auto set_partial_arg) {
            gen_partial_json(set_partial_arg, arguments, builder, function_name);
        };

        // Parse all arg_key/arg_value pairs
        while (auto tc = builder.try_find_literal(form.key_start)) {
            if (!all_space(tc->prelude)) {
                LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
                    gbnf_format_literal(form.key_start).c_str(),
                    gbnf_format_literal(tc->prelude).c_str()
                );
                builder.move_to(tc->groups[0].begin - tc->prelude.size());
                break;
            }
            // Truncated key_start at end of input: publish args so far (minus closing brace).
            if (tc->groups[0].end - tc->groups[0].begin != form.key_start.size()) {
                auto tool_call_arg = arguments.dump();
                if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
                    tool_call_arg.resize(tool_call_arg.size() - 1);
                }
                builder.add_tool_call(function_name, "", tool_call_arg);
                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_start));
            }

            // Parse arg_key
            auto key_res = builder.try_find_literal(form.key_val_sep);
            if (!key_res) {
                gen_partial_args([&](auto &rest, auto &needle) {arguments[rest + needle] = "";});
                throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.key_val_sep) + " after " + gbnf_format_literal(form.key_start));
            }
            if (key_res->groups[0].end - key_res->groups[0].begin != form.key_val_sep.size()) {
                gen_partial_args([&](auto &, auto &needle) {arguments[key_res->prelude + needle] = "";});
                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_val_sep));
            }
            auto &key = key_res->prelude;
            // From here on, a partial tool call may have been streamed out: no clean rewind.
            recovery = false;

            // Parse arg_value
            if (form.key_val_sep2) {
                if (auto tc = builder.try_find_literal(*form.key_val_sep2)) {
                    if (!all_space(tc->prelude)) {
                        LOG_DBG("Failed to parse XML-Style tool call: Unexcepted %s between %s and %s\n",
                            gbnf_format_literal(tc->prelude).c_str(),
                            gbnf_format_literal(form.key_val_sep).c_str(),
                            gbnf_format_literal(*form.key_val_sep2).c_str()
                        );
                        return return_error(builder, start_pos, false);
                    }
                    if (tc->groups[0].end - tc->groups[0].begin != form.key_val_sep2->size()) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(*form.key_val_sep2));
                    }
                } else {
                    gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                    throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(*form.key_val_sep2) + " after " + gbnf_format_literal(form.key_val_sep));
                }
            }
            auto val_start = builder.pos();

            // Test if arg_val is a partial JSON
            std::optional<common_json> value_json = std::nullopt;
            if (!form.raw_argval || !*form.raw_argval) {
                try { value_json = builder.try_consume_json(); }
                catch (const std::runtime_error&) { builder.move_to(val_start); }
                // TODO: Delete this when json_partial adds top-level support for null/true/false
                if (builder.pos() == val_start) {
                    const static std::regex number_regex(R"([0-9-][0-9]*(\.\d*)?([eE][+-]?\d*)?)");
                    builder.consume_spaces();
                    std::string_view sv = utf8_truncate_safe_view(builder.input());
                    sv.remove_prefix(builder.pos());
                    // Note reversed argument order: checks whether the (possibly truncated)
                    // remainder is a prefix of "null"/"true"/"false".
                    std::string rest = "a";
                    if (sv.size() < 6) rest = sv;
                    if (string_starts_with("null", rest) || string_starts_with("true", rest) || string_starts_with("false", rest) || std::regex_match(sv.begin(), sv.end(), number_regex)) {
                        // Dummy common_json marking "some JSON scalar was consumed".
                        value_json = {123, {"123", "123"}};
                        builder.consume_rest();
                    } else {
                        builder.move_to(val_start);
                    }
                }
            }

            // If it is a JSON and followed by </arg_value>, parse as json
            // cannot support streaming because it may be a plain text starting with JSON
            if (value_json) {
                auto json_end = builder.pos();
                builder.consume_spaces();
                if (builder.pos() == builder.input().size()) {
                    if (form.raw_argval && !*form.raw_argval && (value_json->json.is_string() || value_json->json.is_object() || value_json->json.is_array())) {
                        arguments[key] = value_json->json;
                        auto json_str = arguments.dump();
                        // Truncate the dump at the healing marker (or drop the closing brace)
                        // so the streamed arguments stay a valid JSON prefix.
                        if (!value_json->healing_marker.json_dump_marker.empty()) {
                            GGML_ASSERT(std::string::npos != json_str.rfind(value_json->healing_marker.json_dump_marker));
                            json_str.resize(json_str.rfind(value_json->healing_marker.json_dump_marker));
                        } else {
                            GGML_ASSERT(json_str.back() == '}');
                            json_str.resize(json_str.size() - 1);
                        }
                        builder.add_tool_call(function_name, "", json_str);
                    } else {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                    }
                    LOG_DBG("Possible JSON arg_value: %s\n", value_json->json.dump().c_str());
                    throw common_chat_msg_partial_exception("JSON arg_value detected. Waiting for more tokens for validations.");
                }
                builder.move_to(json_end);
                auto [val_end_size, tc] = try_find_val_end();
                // Accept the JSON value only when it is complete and directly followed by val_end.
                if (tc && all_space(tc->prelude) && value_json->healing_marker.marker.empty()) {
                    if (tc->groups[0].end - tc->groups[0].begin != val_end_size) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                        LOG_DBG("Possible terminated JSON arg_value: %s\n", value_json->json.dump().c_str());
                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.val_end) + (form.last_val_end ? gbnf_format_literal(*form.last_val_end) : ""));
                    } else arguments[key] = value_json->json;
                } else builder.move_to(val_start);
            }

            // If not, parse as plain text
            if (val_start == builder.pos()) {
                if (auto [val_end_size, value_plain] = try_find_val_end(); value_plain) {
                    auto &value_str = value_plain->prelude;
                    if (form.trim_raw_argval) value_str = string_strip(value_str);
                    if (value_plain->groups[0].end - value_plain->groups[0].begin != val_end_size) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = value_str + needle;});
                        throw common_chat_msg_partial_exception(
                            "Expected " + gbnf_format_literal(form.val_end) +
                            " after " + gbnf_format_literal(form.key_val_sep) +
                            (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
                        );
                    }
                    arguments[key] = value_str;
                } else {
                    if (form.trim_raw_argval) {
                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = string_strip(rest) + needle;});
                    } else {
                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = rest + needle;});
                    }
                    throw common_chat_msg_partial_exception(
                        "Expected " + gbnf_format_literal(form.val_end) +
                        " after " + gbnf_format_literal(form.key_val_sep) +
                        (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
                    );
                }
            }
        }

        // Consume closing tag
        if (auto [tool_end_size, tc] = try_find_tool_end(); tc) {
            if (!all_space(tc->prelude)) {
                LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
                    gbnf_format_literal(form.tool_end).c_str(),
                    gbnf_format_literal(tc->prelude).c_str()
                );
                return return_error(builder, start_pos, recovery);
            }
            if (tc->groups[0].end - tc->groups[0].begin == tool_end_size) {
                // Add the parsed tool call
                if (!builder.add_tool_call(function_name, "", arguments.dump())) {
                    throw common_chat_msg_partial_exception("Failed to add XML-Style tool call");
                }
                recovery = false;
                continue;
            }
        }

        // Truncated tool_end at end of input: publish args so far (minus closing brace).
        auto tool_call_arg = arguments.dump();
        if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
            tool_call_arg.resize(tool_call_arg.size() - 1);
        }
        builder.add_tool_call(function_name, "", tool_call_arg);
        throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.tool_end) + " after " + gbnf_format_literal(form.val_end));
    }
    // Consume the closing scope marker.
    if (auto tc = builder.try_find_literal(form.scope_end)) {
        if (!all_space(tc->prelude)) {
            LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
                gbnf_format_literal(form.scope_end).c_str(),
                gbnf_format_literal(tc->prelude).c_str()
            );
            return return_error(builder, start_pos, recovery);
        }
    } else {
        if (all_space(form.scope_end)) return true;
        builder.consume_spaces();
        if (builder.pos() == builder.input().size())
            throw common_chat_msg_partial_exception("incomplete tool calls");
        LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
            gbnf_format_literal(form.scope_end).c_str(),
            gbnf_format_literal(builder.consume_rest()).c_str()
        );
        return return_error(builder, start_pos, recovery);
    }

    return true;
}
634
+
635
+ /**
636
+ * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
637
+ * May cause std::runtime_error if there is invalid syntax because partial valid tool call is already sent out to client.
638
+ * form.scope_start, form.tool_sep and form.scope_end can be empty.
639
+ */
640
+ bool common_chat_msg_parser::try_consume_xml_tool_calls(const struct xml_tool_call_format & form) {
641
+ auto pos = pos_;
642
+ auto tsize = result_.tool_calls.size();
643
+ try { return parse_xml_tool_calls(*this, form); }
644
+ catch (const xml_toolcall_syntax_exception&) {}
645
+ move_to(pos);
646
+ result_.tool_calls.resize(tsize);
647
+ return false;
648
+ }
649
+
650
/**
 * Parse content uses reasoning and XML-Style tool call
 * Splits the input into reasoning (between start_think/end_think), plain content, and
 * XML-style tool calls, routing each part to the builder according to the reasoning syntax.
 * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
 */
inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>") {
    // Remove trailing whitespace in place.
    constexpr auto rstrip = [](std::string &s) {
        s.resize(std::distance(s.begin(), std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base()));
    };
    // Erase substring from l to r, along with additional spaces nearby.
    // Keeps at most two characters of the surrounding whitespace, rewritten as "\n\n"
    // (unless the erased span starts at the beginning of the string).
    constexpr auto erase_spaces = [](auto &str, size_t l, size_t r) {
        // size_t wrap-around makes --l stop at the string start; see the commented guard.
        while (/* l > -1 && */ --l < str.size() && std::isspace(static_cast<unsigned char>(str[l])));
        ++l;
        while (++r < str.size() && std::isspace(static_cast<unsigned char>(str[r])));
        if (l < r) str[l] = '\n';
        if (l + 1 < r) str[l + 1] = '\n';
        if (l != 0) l += 2;
        str.erase(l, r - l);
        return l;
    };
    // Remove from `content` the longest suffix that is a (possibly partial) prefix of any
    // pattern in `list` — used to avoid streaming out a half-generated keyword.
    constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) {
        auto best_match = content.size();
        for (auto pattern: list) {
            if (pattern.size() == 0) continue;
            for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
                auto match_len = content.size() - match_idx;
                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
                    best_match = match_idx;
                }
            }
        }
        if (content.size() > best_match) {
            content.erase(best_match);
        }
    };
    const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) {
        return trim_suffix(content, {
            start_think, end_think, form.scope_start, form.tool_start, form.tool_sep, form.key_start,
            form.key_val_sep, form.key_val_sep2 ? form.key_val_sep2->c_str() : "",
            form.val_end, form.last_val_end ? form.last_val_end->c_str() : "",
            form.tool_end, form.last_tool_end ? form.last_tool_end->c_str() : "",
            form.scope_end
        });
    };


    // Trim leading spaces without affecting keyword matching
    static const common_regex spaces_regex("\\s*");
    {
        auto tc = builder.consume_regex(spaces_regex);
        auto spaces = builder.str(tc.groups[0]);
        auto s1 = spaces.size();
        trim_potential_partial_word(spaces);
        auto s2 = spaces.size();
        // Re-expose whatever suffix of the whitespace might start a keyword.
        builder.move_to(builder.pos() - (s1 - s2));
    }

    // Parse content
    bool reasoning_unclosed = builder.syntax().thinking_forced_open;
    std::string unclosed_reasoning_content("");
    for (;;) {
        // Look for the next tool-call opening (scope_start + spaces + tool_start).
        auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
        std::string content;
        std::string tool_call_start;

        if (tc) {
            content = std::move(tc->prelude);
            tool_call_start = builder.str(tc->groups[0]);
            LOG_DBG("Matched tool start: %s\n", gbnf_format_literal(tool_call_start).c_str());
        } else {
            content = builder.consume_rest();
            utf8_truncate_safe_resize(content);
        }

        // Handle unclosed think block
        if (reasoning_unclosed) {
            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
                // Still inside <think>: accumulate and keep scanning (the matched tool start
                // is reasoning text too, unless tool calls are allowed inside think).
                unclosed_reasoning_content += content;
                if (!(form.allow_toolcall_in_think && tc)) {
                    unclosed_reasoning_content += tool_call_start;
                    continue;
                }
            } else {
                reasoning_unclosed = false;
                std::string reasoning_content;
                if (pos == std::string::npos) {
                    reasoning_content = std::move(content);
                } else {
                    reasoning_content = content.substr(0, pos);
                    content.erase(0, pos + end_think.size());
                }
                if (builder.pos() == builder.input().size() && all_space(content)) {
                    // End of (partial) input: avoid emitting a trailing partial keyword.
                    rstrip(reasoning_content);
                    trim_potential_partial_word(reasoning_content);
                    rstrip(reasoning_content);
                    if (reasoning_content.empty()) {
                        rstrip(unclosed_reasoning_content);
                        trim_potential_partial_word(unclosed_reasoning_content);
                        rstrip(unclosed_reasoning_content);
                        if (unclosed_reasoning_content.empty()) continue;
                    }
                }
                if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
                    // Reasoning stays inline: re-wrap it with the think markers.
                    builder.add_content(start_think);
                    builder.add_content(unclosed_reasoning_content);
                    builder.add_content(reasoning_content);
                    if (builder.pos() != builder.input().size() || !all_space(content))
                        builder.add_content(end_think);
                } else {
                    builder.add_reasoning_content(unclosed_reasoning_content);
                    builder.add_reasoning_content(reasoning_content);
                }
                unclosed_reasoning_content.clear();
            }
        }

        // Handle multiple think block
        bool toolcall_in_think = false;
        for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
            if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
                if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
                    auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
                    builder.add_reasoning_content(reasoning_content);
                    think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
                } else {
                    think_start = think_end + end_think.size() - 1;
                }
            } else {
                // The matched tool-call start sits inside an unterminated thinking block.
                if (form.allow_toolcall_in_think) {
                    unclosed_reasoning_content = content.substr(think_start + start_think.size());
                } else {
                    unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
                }
                reasoning_unclosed = true;
                content.resize(think_start);
                toolcall_in_think = true;
            }
        }

        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
            rstrip(content);
            // Handle unclosed </think> token from content: delete all </think> token
            if (auto pos = content.rfind(end_think); pos != std::string::npos) {
                while (pos != std::string::npos) {
                    pos = erase_spaces(content, pos, pos + end_think.size() - 1);
                    pos = content.rfind(end_think, pos);
                }
            }
            // Strip if needed
            if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
                content = string_strip(content);
            }
        }

        // remove potential partial suffix
        if (builder.pos() == builder.input().size() && builder.is_partial()) {
            if (unclosed_reasoning_content.empty()) {
                rstrip(content);
                trim_potential_partial_word(content);
                rstrip(content);
            } else {
                rstrip(unclosed_reasoning_content);
                trim_potential_partial_word(unclosed_reasoning_content);
                rstrip(unclosed_reasoning_content);
            }
        }

        // consume unclosed_reasoning_content if allow_toolcall_in_think is set
        if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
                builder.add_reasoning_content(unclosed_reasoning_content);
            } else {
                if (content.empty()) {
                    content = start_think + unclosed_reasoning_content;
                } else {
                    content += "\n\n" + start_think;
                    content += unclosed_reasoning_content;
                }
            }
            unclosed_reasoning_content.clear();
        }

        // Add content
        if (!content.empty()) {
            // If there are multiple content blocks
            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
                builder.add_content("\n\n");
            }
            builder.add_content(content);
        }

        // This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
        if (toolcall_in_think && !form.allow_toolcall_in_think) {
            continue;
        }

        // There is no tool call and all content is parsed
        if (!tc) {
            GGML_ASSERT(builder.pos() == builder.input().size());
            GGML_ASSERT(unclosed_reasoning_content.empty());
            if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
            break;
        }

        // Rewind to the tool-call start and attempt a full parse.
        builder.move_to(tc->groups[0].begin);
        if (builder.try_consume_xml_tool_calls(form)) {
            auto end_of_tool = builder.pos();
            builder.consume_spaces();
            if (builder.pos() != builder.input().size()) {
                builder.move_to(end_of_tool);
                if (!builder.result().content.empty()) {
                    builder.add_content("\n\n");
                }
            }
        } else {
            // Not a valid tool call: emit one character as content and keep scanning.
            static const common_regex next_char_regex(".");
            auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
            rstrip(c);
            builder.add_content(c);
        }
    }
}
873
+
874
/**
 * Parse content uses reasoning and XML-Style tool call
 * Thin public entry point delegating to parse_msg_with_xml_tool_calls().
 */
void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
}
llama.cpp/common/chat-parser-xml-toolcall.h ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "chat.h"
4
+
5
+ #include <nlohmann/json.hpp>
6
+
7
+ #include <optional>
8
+ #include <string>
9
+ #include <vector>
10
+
11
+
12
// Describes the literal delimiters of an XML-style tool-call wire format.
//
// Sample config:
// MiniMax-M2 (left): <minimax:tool_call>\n<invoke name="tool-name">\n<parameter name="key">value</parameter>\n...</invoke>\n...</minimax:tool_call>
// GLM 4.5 (right): <tool_call>function_name\n<arg_key>key</arg_key>\n<arg_value>value</arg_value>\n</tool_call>
struct xml_tool_call_format {
    std::string scope_start;  // <minimax:tool_call>\n  // \n                       // can be empty
    std::string tool_start;   // <invoke name=\"        // <tool_call>
    std::string tool_sep;     // \">\n                  // \n                       // can be empty only for parse_xml_tool_calls
    std::string key_start;    // <parameter name=\"     // <arg_key>
    std::string key_val_sep;  // \">                    // </arg_key>\n<arg_value>
    std::string val_end;      // </parameter>\n         // </arg_value>\n
    std::string tool_end;     // </invoke>\n            // </tool_call>\n
    std::string scope_end;    // </minimax:tool_call>   //                          // can be empty
    // Set this if there can be dynamic spaces inside key_val_sep.
    // e.g. key_val_sep=</arg_key> key_val_sep2=<arg_value> for GLM4.5
    std::optional<std::string> key_val_sep2 = std::nullopt;
    // Set true if argval should only be a raw string, e.g. Hello "world" hi
    // Set false if argval should only be a JSON string, e.g. "Hello \"world\" hi"
    // Defaults to std::nullopt: both forms are allowed.
    std::optional<bool> raw_argval = std::nullopt;
    // Optional delimiter overrides for the final value / final tool of a call
    // (presumably for templates that omit trailing separators — TODO confirm).
    std::optional<std::string> last_val_end = std::nullopt;
    std::optional<std::string> last_tool_end = std::nullopt;
    // Strip surrounding whitespace from raw (non-JSON) argument values.
    bool trim_raw_argval = false;
    // Permit tool calls to appear inside a reasoning/thinking block.
    bool allow_toolcall_in_think = false;
};

// Make a GBNF grammar that accepts any string except those containing any of the forbidden strings.
std::string make_gbnf_excluding(std::vector<std::string> forbids);

/**
 * Build grammar for an XML-style tool call.
 * form.scope_start and form.scope_end can be empty.
 * Requires data.format for model-specific hacks.
 */
void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
llama.cpp/common/chat-parser.cpp ADDED
@@ -0,0 +1,1649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "chat-parser.h"
2
+ #include "chat-peg-parser.h"
3
+ #include "common.h"
4
+ #include "log.h"
5
+ #include "peg-parser.h"
6
+ #include "regex-partial.h"
7
+
8
+ #include <algorithm>
9
+ #include <cctype>
10
+ #include <optional>
11
+ #include <stdexcept>
12
+ #include <string>
13
+ #include <string_view>
14
+ #include <vector>
15
+
16
+ using json = nlohmann::ordered_json;
17
+
18
// Parse a JSON array of tool calls introduced by `prefix` (e.g. "[TOOL_CALLS]").
// If the prefix is absent, the whole remainder becomes plain content.
// rstrip_prefix: number of characters to rewind after the prefix match before
// parsing the JSON (for prefixes that overlap the array's opening bracket).
// Throws common_chat_msg_partial_exception when the array is incomplete.
static void parse_prefixed_json_tool_call_array(common_chat_msg_parser & builder,
                                                const common_regex & prefix,
                                                size_t rstrip_prefix = 0) {
    // Each tool-call object's "arguments" value is re-dumped as a JSON string.
    static const std::vector<std::vector<std::string>> args_paths = { { "arguments" } };
    if (auto res = builder.try_find_regex(prefix)) {
        builder.move_back(rstrip_prefix);
        auto tool_calls = builder.consume_json_with_dumped_args(args_paths);
        if (!builder.add_tool_calls(tool_calls.value) || tool_calls.is_partial) {
            throw common_chat_msg_partial_exception("incomplete tool call array");
        }
    } else {
        builder.add_content(builder.consume_rest());
    }
}
32
+
33
+ static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
34
+ std::string arguments;
35
+ if (builder.is_partial()) {
36
+ arguments = (json{
37
+ { "code", code + builder.healing_marker() }
38
+ })
39
+ .dump();
40
+ auto idx = arguments.find(builder.healing_marker());
41
+ if (idx != std::string::npos) {
42
+ arguments.resize(idx);
43
+ }
44
+ } else {
45
+ arguments = (json{
46
+ { "code", code }
47
+ })
48
+ .dump();
49
+ }
50
+ return arguments;
51
+ }
52
+
53
/**
 * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
 * Aggregates the prefix, suffix and in-between text into the content.
 *
 * block_open / block_close: optional delimiters wrapping the whole tool-call block.
 * function_regex_start_only: anchored variant used only for the first call.
 * allow_raw_python: accept a raw (non-JSON) code body for the "python" tool.
 * get_function_name: optional override to extract the name from a match;
 *   returning "" skips the match and treats it as content.
 */
static void parse_json_tool_calls(
    common_chat_msg_parser & builder,
    const std::optional<common_regex> & block_open,
    const std::optional<common_regex> & function_regex_start_only,
    const std::optional<common_regex> & function_regex,
    const common_regex & close_regex,
    const std::optional<common_regex> & block_close,
    bool allow_raw_python = false,
    const std::function<std::string(const common_chat_msg_parser::find_regex_result & fres)> & get_function_name =
        nullptr) {
    auto parse_tool_calls = [&]() {
        size_t from = std::string::npos;  // search offset when a match was skipped
        auto first = true;
        while (true) {
            auto start_pos = builder.pos();
            // First call may use the anchored regex; later calls scan forward.
            auto res = function_regex_start_only && first ? builder.try_consume_regex(*function_regex_start_only) :
                       function_regex ? builder.try_find_regex(*function_regex, from) :
                       std::nullopt;

            if (res) {
                std::string name;
                if (get_function_name) {
                    name = get_function_name(*res);
                } else {
                    GGML_ASSERT(res->groups.size() == 2);
                    name = builder.str(res->groups[1]);
                }
                first = false;
                if (name.empty()) {
                    // get_function_name signalled us that we should skip this match and treat it as content.
                    from = res->groups[0].begin + 1;
                    continue;
                }
                from = std::string::npos;

                auto maybe_raw_python = name == "python" && allow_raw_python;
                if (builder.input()[builder.pos()] == '{' || !maybe_raw_python) {
                    // JSON arguments follow the function name.
                    if (auto arguments = builder.try_consume_json_with_dumped_args({ {} })) {
                        if (!builder.add_tool_call(name, "", arguments->value) || arguments->is_partial) {
                            throw common_chat_msg_partial_exception("incomplete tool call");
                        }
                        builder.consume_regex(close_regex);
                    }
                    continue;
                }
                if (maybe_raw_python) {
                    // Raw python body: the rest of the input is the code.
                    auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
                    if (!builder.add_tool_call(name, "", arguments)) {
                        throw common_chat_msg_partial_exception("incomplete tool call");
                    }
                    return;
                }
                throw common_chat_msg_partial_exception("incomplete tool call");
            } else {
                builder.move_to(start_pos);
            }
            break;
        }
        if (block_close) {
            builder.consume_regex(*block_close);
        }
        builder.consume_spaces();
        builder.add_content(builder.consume_rest());
    };
    if (block_open) {
        if (auto res = builder.try_find_regex(*block_open)) {
            parse_tool_calls();
        } else {
            builder.add_content(builder.consume_rest());
        }
    } else {
        parse_tool_calls();
    }
}
131
+
132
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax)
    : input_(input), is_partial_(is_partial), syntax_(syntax)
{
    result_.role = "assistant";

    // Pick a "healing marker": a random digit string that does not occur in
    // the input. It is used to close truncated JSON so the cut point can be
    // located again after dumping. Uniqueness vs. the input is what matters,
    // so unseeded std::rand() suffices here.
    while (true) {
        std::string id = std::to_string(std::rand());
        if (input.find(id) == std::string::npos) {
            healing_marker_ = id;
            break;
        }
    }
}
145
+
146
// Return the substring of the input covered by `rng` (half-open [begin, end)).
std::string common_chat_msg_parser::str(const common_string_range & rng) const {
    GGML_ASSERT(rng.begin <= rng.end);
    return input_.substr(rng.begin, rng.end - rng.begin);
}
150
+
151
// Append text to the accumulated message content.
void common_chat_msg_parser::add_content(const std::string &content) {
    result_.content += content;
}

// Append text to the accumulated reasoning (thinking) content.
void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
    result_.reasoning_content += reasoning_content;
}
158
+
159
+ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
160
+ if (name.empty()) {
161
+ return false;
162
+ }
163
+
164
+ common_chat_tool_call tool_call;
165
+ tool_call.name = name;
166
+ tool_call.arguments = arguments;
167
+ tool_call.id = id;
168
+
169
+ // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
170
+ result_.tool_calls.emplace_back(tool_call);
171
+
172
+ return true;
173
+ }
174
+ bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
175
+ std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
176
+ std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
177
+ std::string arguments = "";
178
+ if (tool_call.contains("arguments")) {
179
+ if (tool_call.at("arguments").is_object()) {
180
+ arguments = tool_call.at("arguments").dump();
181
+ } else {
182
+ arguments = tool_call.at("arguments");
183
+ }
184
+ }
185
+
186
+ return add_tool_call(name, id, arguments);
187
+ }
188
+
189
+ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
190
+ for (const auto & item : arr) {
191
+ if (!add_tool_call(item)) {
192
+ return false;
193
+ }
194
+ }
195
+ return true;
196
+ }
197
+
198
+ bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
199
+ if (!tool_call.is_object() || tool_call.size() != 1) {
200
+ return false;
201
+ }
202
+
203
+ // Get the tool name (the single key in the object)
204
+ auto it = tool_call.begin();
205
+ std::string name = it.key();
206
+
207
+ if (name.empty()) {
208
+ return false;
209
+ }
210
+
211
+ // Get the arguments (the nested object)
212
+ const json & args_json = it.value();
213
+ std::string arguments = "";
214
+
215
+ if (args_json.is_object()) {
216
+ arguments = args_json.dump();
217
+ } else if (args_json.is_string()) {
218
+ arguments = args_json;
219
+ } else if (!args_json.is_null()) {
220
+ // For other types, convert to string representation
221
+ arguments = args_json.dump();
222
+ }
223
+
224
+ return add_tool_call(name, "", arguments);
225
+ }
226
// Assert that the entire input was consumed. Only enforced for complete
// (non-partial) inputs: leftover text then indicates malformed output or a
// parser bug.
void common_chat_msg_parser::finish() {
    if (!is_partial_ && pos_ != input_.size()) {
        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
    }
}
231
+
232
+ bool common_chat_msg_parser::consume_spaces() {
233
+ const auto length = input_.size();
234
+ auto consumed = false;
235
+ while (pos_ < length && std::isspace(input_[pos_])) {
236
+ ++pos_;
237
+ consumed = true;
238
+ }
239
+ return consumed;
240
+ }
241
+
242
+ bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
243
+ auto pos = pos_;
244
+ for (auto i = 0u; i < literal.size(); ++i) {
245
+ if (pos >= input_.size()) {
246
+ return false;
247
+ }
248
+ if (input_[pos] != literal[i]) {
249
+ return false;
250
+ }
251
+ ++pos;
252
+ }
253
+ pos_ = pos;
254
+ return true;
255
+ }
256
+
257
// Find `literal` at or after the current position. On a full match, returns
// the prelude (text between pos_ and the match) plus the match range and
// advances past it. In partial mode, a trailing prefix of the literal at the
// end of the input is also accepted; the match range then extends to the end.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
    auto idx = input_.find(literal, pos_);
    if (idx != std::string::npos) {
        find_regex_result res;
        res.prelude = input_.substr(pos_, idx - pos_);
        auto end = idx + literal.size();
        res.groups.emplace_back(common_string_range{idx, end});
        move_to(end);
        return res;
    }
    if (is_partial_) {
        // The input may have been cut mid-literal: look for a partial stop.
        idx = string_find_partial_stop(input_, literal);
        if (idx != std::string::npos && idx >= pos_) {
            find_regex_result res;
            res.prelude = input_.substr(pos_, idx - pos_);
            auto end = input_.size();
            res.groups.emplace_back(common_string_range{idx, end});
            move_to(end);
            return res;
        }
    }
    return std::nullopt;
}
280
+
281
+ void common_chat_msg_parser::consume_literal(const std::string & literal) {
282
+ if (!try_consume_literal(literal)) {
283
+ throw common_chat_msg_partial_exception(literal);
284
+ }
285
+ }
286
+
287
// Parse one or more reasoning sections delimited by start_think/end_think
// (e.g. "<think>"/"</think>") starting at the current position.
// Returns true if reasoning was handled and pos_ advanced; false if there is
// no reasoning here (state is rolled back). Behavior is shaped by syntax_:
// reasoning_in_content re-emits the tags into content; thinking_forced_open
// treats the input as already inside an open think block.
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
    std::string pending_reasoning_prefix;

    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
        return false;
    }

    auto set_reasoning_prefix = [&](size_t prefix_pos) {
        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
            return;
        }
        if (prefix_pos + start_think.size() > input_.size()) {
            pending_reasoning_prefix.clear();
            return;
        }
        // Capture the exact literal that opened the reasoning section so we can
        // surface it back to callers. This ensures formats that force the
        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
        // instead of dropping it during parsing.
        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
    };

    // Emit one stripped reasoning chunk either into content (with tags) or
    // into reasoning_content, depending on syntax_.reasoning_in_content.
    auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
        auto stripped_reasoning = string_strip(reasoning);
        if (stripped_reasoning.empty()) {
            return;
        }
        if (syntax_.reasoning_in_content) {
            add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
            add_content(stripped_reasoning);
            if (closed) {
                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
            }
        } else {
            if (!pending_reasoning_prefix.empty()) {
                add_reasoning_content(pending_reasoning_prefix);
                pending_reasoning_prefix.clear();
            }
            add_reasoning_content(stripped_reasoning);
        }
    };

    // Snapshot so we can roll back if no reasoning is found.
    const size_t saved_pos = pos_;
    const size_t saved_content_size = result_.content.size();
    const size_t saved_reasoning_size = result_.reasoning_content.size();

    auto restore_state = [&]() {
        move_to(saved_pos);
        result_.content.resize(saved_content_size);
        result_.reasoning_content.resize(saved_reasoning_size);
    };

    // Allow leading whitespace to be preserved as content when reasoning is present at the start
    size_t cursor = pos_;
    size_t whitespace_end = cursor;
    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
        ++whitespace_end;
    }

    if (whitespace_end >= input_.size()) {
        // Input is all whitespace. Under a forced-open think block, treat it
        // as (empty-ish) reasoning; otherwise there is nothing to parse.
        restore_state();
        if (syntax_.thinking_forced_open) {
            auto rest = input_.substr(saved_pos);
            if (!rest.empty()) {
                handle_reasoning(rest, /* closed */ !is_partial());
            }
            move_to(input_.size());
            return true;
        }
        return false;
    }

    cursor = whitespace_end;
    const size_t remaining = input_.size() - cursor;
    const size_t start_prefix = std::min(start_think.size(), remaining);
    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;

    if (has_start_tag && start_prefix < start_think.size()) {
        // Input ends inside a partial start tag: consume everything, wait for more.
        move_to(input_.size());
        return true;
    }

    if (has_start_tag) {
        if (whitespace_end > pos_) {
            add_content(input_.substr(pos_, whitespace_end - pos_));
        }
        set_reasoning_prefix(cursor);
        cursor += start_think.size();
    } else if (syntax_.thinking_forced_open) {
        cursor = whitespace_end;
    } else {
        restore_state();
        return false;
    }
    // Consume reasoning sections, possibly several in a row.
    while (true) {
        if (cursor >= input_.size()) {
            move_to(input_.size());
            return true;
        }

        size_t end_pos = input_.find(end_think, cursor);
        if (end_pos == std::string::npos) {
            // No closing tag; check whether the input ends in a partial one.
            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
            if (reasoning_end > cursor) {
                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
            }
            move_to(input_.size());
            return true;
        }

        if (end_pos > cursor) {
            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
        } else {
            handle_reasoning("", /* closed */ true);
        }

        cursor = end_pos + end_think.size();

        // Skip whitespace between a closing tag and a possible next opening tag.
        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
            ++cursor;
        }

        const size_t next_remaining = input_.size() - cursor;
        if (next_remaining == 0) {
            move_to(cursor);
            return true;
        }

        const size_t next_prefix = std::min(start_think.size(), next_remaining);
        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
            if (next_prefix < start_think.size()) {
                // Partial next start tag at end of input.
                move_to(input_.size());
                return true;
            }
            set_reasoning_prefix(cursor);
            cursor += start_think.size();
            continue;
        }

        move_to(cursor);
        return true;
    }
}
432
+
433
+ std::string common_chat_msg_parser::consume_rest() {
434
+ auto rest = input_.substr(pos_);
435
+ pos_ = input_.size();
436
+ return rest;
437
+ }
438
+
439
// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
// `from` overrides the search start (defaults to pos_). A partial match throws
// when more input may arrive (is_partial), otherwise counts as no match.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
    auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
        return std::nullopt;
    }
    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
    pos_ = m.groups[0].end;

    if (add_prelude_to_content) {
        add_content(prelude);
    }
    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
        if (is_partial()) {
            throw common_chat_msg_partial_exception(regex.str());
        }
        return std::nullopt;
    }
    return find_regex_result{prelude, m.groups};
}
459
+
460
// Consume `regex` anchored at the current position, or signal a partial parse.
common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
    if (auto result = try_consume_regex(regex)) {
        return *result;
    }
    throw common_chat_msg_partial_exception(regex.str());
}

// Try to consume `regex` anchored exactly at the current position.
// Returns std::nullopt on no match (or a match elsewhere); throws on a
// partial match when more input may still arrive.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
    auto m = regex.search(input_, pos_);
    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
        return std::nullopt;
    }
    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
        if (is_partial()) {
            throw common_chat_msg_partial_exception(regex.str());
        }
        return std::nullopt;
    }
    if (m.groups[0].begin != pos_) {
        // Didn't match at the current position.
        return std::nullopt;
    }
    pos_ = m.groups[0].end;

    return find_regex_result {
        /* .prelude = */ "",
        m.groups,
    };
}
489
+
490
// Parse one JSON value starting at the current position. Truncated JSON is
// "healed" by common_json_parse using healing_marker_; the result records
// where healing occurred. Returns std::nullopt when no JSON value starts
// here; throws when healing was needed but the input is supposedly complete.
std::optional<common_json> common_chat_msg_parser::try_consume_json() {
    auto it = input_.cbegin() + pos_;
    const auto end = input_.cend();
    common_json result;
    if (!common_json_parse(it, end, healing_marker_, result)) {
        return std::nullopt;
    }
    pos_ = std::distance(input_.cbegin(), it);
    if (result.healing_marker.marker.empty()) {
        // No healing marker, just return the parsed json
        return result;
    }
    if (!is_partial()) {
        throw common_chat_msg_partial_exception("JSON");
    }
    return result;
}
507
+
508
+ common_json common_chat_msg_parser::consume_json() {
509
+ if (auto result = try_consume_json()) {
510
+ return *result;
511
+ }
512
+ throw common_chat_msg_partial_exception("JSON");
513
+ }
514
+
515
+ common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
516
+ const std::vector<std::vector<std::string>> & args_paths,
517
+ const std::vector<std::vector<std::string>> & content_paths
518
+ ) {
519
+ if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
520
+ return *result;
521
+ }
522
+ throw common_chat_msg_partial_exception("JSON");
523
+ }
524
+
525
// Parse a JSON value and re-dump the sub-values at `args_paths` as strings
// (tool-call arguments travel as serialized JSON text). `content_paths`
// marks string fields whose healed tail must be stripped. The walk removes
// any sub-value touched by the healing marker that is not at one of those
// paths. Returns std::nullopt if no JSON starts here; the result's
// is_partial flag reports whether a healing marker was found.
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
    const std::vector<std::vector<std::string>> & args_paths,
    const std::vector<std::vector<std::string>> & content_paths
) {
    auto partial = try_consume_json();
    if (!partial) {
        return std::nullopt;
    }
    auto is_arguments_path = [&](const std::vector<std::string> & path) {
        return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
    };
    auto is_content_path = [&](const std::vector<std::string> & path) {
        return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
    };

    if (partial->healing_marker.marker.empty()) {
        if (args_paths.empty()) {
            // No arguments to dump, and JSON was parsed fully.
            return consume_json_result {
                partial->json,
                /* .is_partial = */ false,
            };
        }
        if (is_arguments_path({})) {
            // Entire JSON is the arguments and was parsed fully.
            return consume_json_result {
                partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
                /* .is_partial = */ false,
            };
        }
    }

    LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());

    auto found_healing_marker = false;
    std::vector<std::string> path;  // current path during the recursive walk
    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
        if (is_arguments_path(path)) {
            // Arguments subtree: dump as a string, truncated at the marker.
            auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
            if (is_partial() && !partial->healing_marker.marker.empty()) {
                auto idx = arguments.find(partial->healing_marker.json_dump_marker);
                if (idx != std::string::npos) {
                    arguments.resize(idx);
                    found_healing_marker = true;
                }
                if (arguments == "\"") {
                    // This happens because of completing `:"$magic` after `"arguments"`
                    arguments = "";
                }
            }
            return arguments;
        }
        if (is_content_path(path)) {
            if (!j.is_string()) {
                throw std::runtime_error("Content path must be a string");
            }
            std::string str = j;
            auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
            if (idx != std::string::npos) {
                str.resize(idx);
                found_healing_marker = true;
            }
            return str;
        }
        if (j.is_object()) {
            auto obj = json::object();
            for (const auto & p : j.items()) {
                const auto & key = p.key();
                const auto & value = p.value();
                const std::string key_str = key; // NOLINT
                auto idx = key_str.find(healing_marker_);
                if (idx != std::string::npos) {
                    // Healing hit a key: drop this and all later entries.
                    found_healing_marker = true;
                    break;
                }
                path.push_back(key_str);
                if (value.is_string()) {
                    const std::string value_str = value;
                    if (value_str.find(healing_marker_) != std::string::npos) {
                        found_healing_marker = true;
                        if (is_content_path(path)) {
                            if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
                                // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
                                obj[key] = remove_unsupported_healings_and_dump_args(value);
                            }
                        }
                        break;
                    }
                    obj[key] = value;
                } else {
                    obj[key] = remove_unsupported_healings_and_dump_args(value);
                }
                path.pop_back();
            }
            return obj;
        }
        if (j.is_array()) {
            auto arr = json::array();
            for (const auto & value : j) {
                if (value.is_string()) {
                    std::string str = value;
                    auto idx = str.find(healing_marker_);
                    if (idx != std::string::npos) {
                        // Don't heal array values that aren't in the arguments.
                        found_healing_marker = true;
                        break;
                    }
                }
                arr.push_back(remove_unsupported_healings_and_dump_args(value));
            }
            return arr;
        }
        return j;
    };

    auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
    LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
    return consume_json_result {
        cleaned,
        /* .is_partial = */ found_healing_marker,
    };
}
647
+
648
// Remove all tool calls collected in the result so far.
void common_chat_msg_parser::clear_tools() {
    result_.tool_calls.clear();
}
651
+
652
+ /**
653
+ * All common_chat_parse_* moved from chat.cpp to chat-parser.cpp below
654
+ * to reduce incremental compile time for parser changes.
655
+ */
656
// Generic format: the whole message is one JSON object containing either
// "tool_calls" (array), "tool_call" (single object) or "response" (content).
static void common_chat_parse_generic(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }
    static const std::vector<std::vector<std::string>> content_paths = {
        {"response"},
    };
    static const std::vector<std::vector<std::string>> args_paths = {
        {"tool_call", "arguments"},
        {"tool_calls", "arguments"},
    };
    auto data = builder.consume_json_with_dumped_args(args_paths, content_paths);
    if (data.value.contains("tool_calls")) {
        if (!builder.add_tool_calls(data.value.at("tool_calls")) || data.is_partial) {
            throw common_chat_msg_partial_exception("incomplete tool calls");
        }
    } else if (data.value.contains("tool_call")) {
        if (!builder.add_tool_call(data.value.at("tool_call")) || data.is_partial) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
    } else if (data.value.contains("response")) {
        const auto & response = data.value.at("response");
        // Non-string responses are pretty-printed as JSON content.
        builder.add_content(response.is_string() ? response.template get<std::string>() : response.dump(2));
        if (data.is_partial) {
            throw common_chat_msg_partial_exception("incomplete response");
        }
    } else {
        throw common_chat_msg_partial_exception("Expected 'tool_call', 'tool_calls' or 'response' in JSON");
    }
}
687
+
688
// Mistral Nemo: tool calls are a JSON array prefixed by the literal "[TOOL_CALLS]".
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
    parse_prefixed_json_tool_call_array(builder, prefix);
}
697
+
698
// Magistral: optional [THINK]...[/THINK] reasoning block, then tool calls as
// a JSON array prefixed by "[TOOL_CALLS]".
static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("[THINK]", "[/THINK]");

    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
    parse_prefixed_json_tool_call_array(builder, prefix);
}
709
+
710
// Command R7B: reasoning in <|START_THINKING|>...<|END_THINKING|>, tool calls
// as a JSON array in <|START_ACTION|>...<|END_ACTION|> (objects with
// "tool_name" / "tool_call_id" / "parameters"), or plain content in
// <|START_RESPONSE|>...<|END_RESPONSE|>.
static void common_chat_parse_command_r7b(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>");

    static const common_regex start_action_regex("<\\|START_ACTION\\|>");
    static const common_regex end_action_regex("<\\|END_ACTION\\|>");
    static const common_regex start_response_regex("<\\|START_RESPONSE\\|>");
    static const common_regex end_response_regex("<\\|END_RESPONSE\\|>");

    if (auto res = builder.try_find_regex(start_action_regex)) {
        // If we didn't extract thoughts, prelude includes them.
        auto tool_calls = builder.consume_json_with_dumped_args({{"parameters"}});
        for (const auto & tool_call : tool_calls.value) {
            std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : "";
            std::string id = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : "";
            std::string arguments = tool_call.contains("parameters") ? tool_call.at("parameters") : "";
            if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) {
                throw common_chat_msg_partial_exception("incomplete tool call");
            }
        }
        if (tool_calls.is_partial) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
        builder.consume_regex(end_action_regex);
    } else if (auto res = builder.try_find_regex(start_response_regex)) {
        if (!builder.try_find_regex(end_response_regex)) {
            builder.add_content(builder.consume_rest());
            throw common_chat_msg_partial_exception(end_response_regex.str());
        }
    } else {
        builder.add_content(builder.consume_rest());
    }
}
742
+
743
// Llama 3.x: optional <think> reasoning, then JSON tool calls of the form
// {"name": "...", "parameters": {...}}. With builtin tools enabled, also accepts
// the <|python_tag|>NAME.call(arg=value, ...) builtin-tool syntax.
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
    builder.try_parse_reasoning("<think>", "</think>");

    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Matches the JSON tool-call header up to (and including) the "parameters" key,
    // capturing the function name in group 1; the JSON value follows.
    static const common_regex function_regex(
        "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
    static const common_regex close_regex("\\}\\s*");

    // Builtin-tool syntax: NAME.call( followed by arg=value pairs.
    static const common_regex function_name_regex("\\s*(\\w+)\\s*\\.\\s*call\\(");
    static const common_regex arg_name_regex("\\s*(\\w+)\\s*=\\s*");

    if (with_builtin_tools) {
        static const common_regex builtin_call_regex("<\\|python_tag\\|>");
        if (auto res = builder.try_find_regex(builtin_call_regex)) {
            auto fun_res = builder.consume_regex(function_name_regex);
            auto function_name = builder.str(fun_res.groups[1]);

            common_healing_marker healing_marker;
            json args = json::object();
            // Collect comma-separated "name=<json>" argument pairs until no more match.
            while (true) {
                if (auto arg_res = builder.try_consume_regex(arg_name_regex)) {
                    auto arg_name = builder.str(arg_res->groups[1]);
                    auto partial = builder.consume_json();
                    args[arg_name] = partial.json;
                    // Track the healing marker of the most recent (possibly partial) JSON value.
                    healing_marker.marker = partial.healing_marker.marker;
                    healing_marker.json_dump_marker = partial.healing_marker.json_dump_marker;
                    builder.consume_spaces();
                    if (!builder.try_consume_literal(",")) {
                        break;
                    }
                } else {
                    break;
                }
            }
            builder.consume_literal(")");
            builder.consume_spaces();

            auto arguments = args.dump();
            if (!builder.add_tool_call(function_name, "", arguments)) {
                throw common_chat_msg_partial_exception("Incomplete tool call");
            }
            return;
        }
    }
    // Plain JSON tool-call form; only matched at the start of the remaining input.
    parse_json_tool_calls(
        builder,
        /* block_open= */ std::nullopt,
        /* function_regex_start_only= */ function_regex,
        /* function_regex= */ std::nullopt,
        close_regex,
        std::nullopt);

}
800
+
801
// DeepSeek R1: <think>...</think> reasoning, then tool calls wrapped in
// tool-calls begin/end tokens with a ```json fenced argument body.
static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<think>", "</think>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // NOTE(review): the "<|" / "|>" below are unescaped in the regex, so ASCII '|'
    // acts as alternation rather than a literal. The actual DeepSeek special tokens
    // use fullwidth '｜' (U+FF5C) — verify these literals survived copy/encoding intact.
    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
    static const common_regex tool_calls_end("<|tool▁calls▁end|>");
    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?function<|tool▁sep|>([^\n]+)\n```json\n");
    static const common_regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");

    parse_json_tool_calls(
        builder,
        /* block_open= */ tool_calls_begin,
        /* function_regex_start_only= */ std::nullopt,
        function_regex,
        close_regex,
        tool_calls_end);
}
821
+
822
// DeepSeek V3.1 content section: tool calls use NAME<|tool▁sep|>JSON delimited by
// tool-call begin/end tokens (no ```json fence, unlike R1).
static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
    // NOTE(review): as in the R1 parser above, ASCII '|' inside "<|...|>" is unescaped
    // and acts as regex alternation; upstream tokens use fullwidth '｜' — confirm.
    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");

    static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
    static const common_regex tool_calls_end("<|tool▁calls▁end|>");

    if (!builder.syntax().parse_tool_calls) {
        LOG_DBG("%s: not parse_tool_calls\n", __func__);
        builder.add_content(builder.consume_rest());
        return;
    }

    LOG_DBG("%s: parse_tool_calls\n", __func__);

    parse_json_tool_calls(
        builder,
        /* block_open= */ tool_calls_begin,
        /* function_regex_start_only= */ std::nullopt,
        function_regex,
        close_regex,
        tool_calls_end);
}
845
+
846
static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
    // First try to parse using the standard reasoning parsing method
    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());

    // Probe for a closing </think> without consuming input, then rewind.
    auto start_pos = builder.pos();
    auto found_end_think = builder.try_find_literal("</think>");
    builder.move_to(start_pos);

    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
        // Complete message with forced-open thinking but no </think>: treat as content.
        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
        common_chat_parse_deepseek_v3_1_content(builder);
    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
        // If reasoning was parsed successfully, the remaining content is regular content
        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
        // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
        common_chat_parse_deepseek_v3_1_content(builder);
    } else {
        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
            common_chat_parse_deepseek_v3_1_content(builder);
            return;
        }
        // If no reasoning tags found, check if we should treat everything as reasoning
        if (builder.syntax().thinking_forced_open) {
            // If thinking is forced open but no tags found, treat everything as reasoning
            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
            builder.add_reasoning_content(builder.consume_rest());
        } else {
            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
            // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
            common_chat_parse_deepseek_v3_1_content(builder);
        }
    }
}
881
+
882
+ static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
883
+ static const xml_tool_call_format form {
884
+ /* form.scope_start = */ "<minimax:tool_call>",
885
+ /* form.tool_start = */ "<invoke name=\"",
886
+ /* form.tool_sep = */ "\">",
887
+ /* form.key_start = */ "<parameter name=\"",
888
+ /* form.key_val_sep = */ "\">",
889
+ /* form.val_end = */ "</parameter>",
890
+ /* form.tool_end = */ "</invoke>",
891
+ /* form.scope_end = */ "</minimax:tool_call>",
892
+ };
893
+ builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
894
+ }
895
+
896
// Kimi K2: tool-call sections use special tokens and a JSON-ish object body,
//   <|tool_calls_section_begin|><|tool_call_begin|>NAME<|tool_call_argument_begin|>{"k":v,...}<|tool_call_end|><|tool_calls_section_end|>
// Argument values are JSON (raw_argval=false); tool calls may appear inside <think>.
static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
    static const xml_tool_call_format form = ([]() {
        xml_tool_call_format form {};
        form.scope_start = "<|tool_calls_section_begin|>";
        form.tool_start = "<|tool_call_begin|>";
        form.tool_sep = "<|tool_call_argument_begin|>{";
        form.key_start = "\"";
        form.key_val_sep = "\":";
        form.val_end = ",";
        form.tool_end = "}<|tool_call_end|>";
        form.scope_end = "<|tool_calls_section_end|>";
        form.raw_argval = false;
        form.last_val_end = "";
        form.allow_toolcall_in_think = true;
        return form;
    })();
    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
}
914
+
915
// Apriel 1.5: tool calls as a JSON-array-shaped text format,
//   <tool_calls>[{"name": "NAME", "arguments": {"k": v, ...}}, ...]</tool_calls>
// mapped onto the generic XML tool-call scanner; reasoning uses <thinking> tags.
static void common_chat_parse_apriel_1_5(common_chat_msg_parser & builder) {
    static const xml_tool_call_format form = ([]() {
        xml_tool_call_format form {};
        form.scope_start = "<tool_calls>[";
        form.tool_start = "{\"name\": \"";
        form.tool_sep = "\", \"arguments\": {";
        form.key_start = "\"";
        form.key_val_sep = "\": ";
        form.val_end = ", ";
        form.tool_end = "}, ";
        form.scope_end = "]</tool_calls>";
        form.raw_argval = false;
        form.last_val_end = "";
        // The final tool object closes with "}" only (no trailing ", ").
        form.last_tool_end = "}";
        return form;
    })();
    builder.consume_reasoning_with_xml_tool_calls(form, "<thinking>", "</thinking>");
}
933
+
934
// Xiaomi MiMo: per-call <tool_call> blocks containing a JSON object,
//   <tool_call>\n{"name": "NAME", "arguments": {"k": v, ...}}\n</tool_call>
// with no outer scope markers and no reasoning tags.
static void common_chat_parse_xiaomi_mimo(common_chat_msg_parser & builder) {
    static const xml_tool_call_format form = ([]() {
        xml_tool_call_format form {};
        form.scope_start = "";
        form.tool_start = "<tool_call>\n{\"name\": \"";
        form.tool_sep = "\", \"arguments\": {";
        form.key_start = "\"";
        form.key_val_sep = "\": ";
        form.val_end = ", ";
        form.tool_end = "}\n</tool_call>";
        form.scope_end = "";
        form.raw_argval = false;
        form.last_val_end = "";
        return form;
    })();
    builder.consume_reasoning_with_xml_tool_calls(form);
}
951
+
952
// GPT-OSS "harmony" format: a stream of <|channel|>HEADER<|message|>BODY<|end|>
// messages. "analysis" maps to reasoning, "final"/"commentary" to content, and
// headers carrying "to=functions.NAME" to tool calls with JSON arguments.
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
    static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
    static const std::string recipient("(?: to=functions\\.([^<\\s]+))");

    static const common_regex start_regex("<\\|start\\|>assistant");
    static const common_regex analysis_regex("<\\|channel\\|>analysis");
    static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
    static const common_regex preamble_regex("<\\|channel\\|>commentary");
    static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
    static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");

    // Consume everything up to the next <|end|> (or to end of input if absent);
    // when include_end is set, the matched tag itself is kept in the returned text.
    auto consume_end = [&](bool include_end = false) {
        if (auto res = builder.try_find_literal("<|end|>")) {
            return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
        }
        return builder.consume_rest();
    };

    // Parse the JSON arguments following a tool-call header and register the call.
    auto handle_tool_call = [&](const std::string & name) {
        if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
            if (builder.syntax().parse_tool_calls) {
                if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
                    throw common_chat_msg_partial_exception("incomplete tool call");
                }
            } else if (args->is_partial) {
                throw common_chat_msg_partial_exception("incomplete tool call");
            }
        }
    };

    // Full (anchored) match of `regex` against `input`, or nullopt.
    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
        auto match = regex.search(input, 0, true);
        if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
            return match;
        }
        return std::nullopt;
    };

    do {
        auto header_start_pos = builder.pos();
        auto content_start = builder.try_find_literal("<|message|>");
        if (!content_start) {
            throw common_chat_msg_partial_exception("incomplete header");
        }

        auto header = content_start->prelude;

        // Header of the form " to=functions.NAME<|channel|>...": group 1 is the name.
        if (auto match = regex_match(tool_call1_regex, header)) {
            auto group = match->groups[1];
            auto name = header.substr(group.begin, group.end - group.begin);
            handle_tool_call(name);
            continue;
        }

        // Header of the form "<|channel|>... to=functions.NAME": group 2 is the name.
        if (auto match = regex_match(tool_call2_regex, header)) {
            auto group = match->groups[2];
            auto name = header.substr(group.begin, group.end - group.begin);
            handle_tool_call(name);
            continue;
        }

        if (regex_match(analysis_regex, header)) {
            // Rewind so try_parse_reasoning() sees the full header again.
            builder.move_to(header_start_pos);
            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
                builder.add_content(consume_end(true));
            } else {
                builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
            }
            continue;
        }

        if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
            builder.add_content(consume_end());
            continue;
        }

        // Possibly a malformed message, attempt to recover by rolling
        // back to pick up the next <|start|>
        LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
        builder.move_to(header_start_pos);
    } while (builder.try_find_regex(start_regex, std::string::npos, false));

    auto remaining = builder.consume_rest();
    if (!remaining.empty()) {
        LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
    }
}
1039
+
1040
+ static void common_chat_parse_glm_4_5(common_chat_msg_parser & builder) {
1041
+ static const xml_tool_call_format form {
1042
+ /* form.scope_start = */ "",
1043
+ /* form.tool_start = */ "<tool_call>",
1044
+ /* form.tool_sep = */ "",
1045
+ /* form.key_start = */ "<arg_key>",
1046
+ /* form.key_val_sep = */ "</arg_key>",
1047
+ /* form.val_end = */ "</arg_value>",
1048
+ /* form.tool_end = */ "</tool_call>",
1049
+ /* form.scope_end = */ "",
1050
+ /* form.key_val_sep2 = */ "<arg_value>",
1051
+ };
1052
+ builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
1053
+ }
1054
+
1055
+ static void common_chat_parse_firefunction_v2(common_chat_msg_parser & builder) {
1056
+ if (!builder.syntax().parse_tool_calls) {
1057
+ builder.add_content(builder.consume_rest());
1058
+ return;
1059
+ }
1060
+ static const common_regex prefix(regex_escape(" functools["));
1061
+ parse_prefixed_json_tool_call_array(builder, prefix, /* rstrip_prefix= */ 1);
1062
+ }
1063
+
1064
// Functionary v3.2: tool calls as "NAME\n{json}" blocks, subsequent calls prefixed
// by ">>>". "all" at the start of the message denotes plain content, and "python"
// may be followed by raw (non-JSON) code.
static void common_chat_parse_functionary_v3_2(common_chat_msg_parser & builder) {
    static const common_regex function_regex_start_only(R"((\w+\n\{|python\n|all\n))");
    static const common_regex function_regex(R"(>>>(\w+\n\{|python\n|all\n))");
    static const common_regex close_regex(R"(\s*)");

    parse_json_tool_calls(
        builder,
        std::nullopt,
        function_regex_start_only,
        function_regex,
        close_regex,
        std::nullopt,
        /* allow_raw_python= */ true,
        /* get_function_name= */ [&](const auto & res) -> std::string {
            auto at_start = res.groups[0].begin == 0;
            auto name = builder.str(res.groups[1]);
            if (!name.empty() && name.back() == '{') {
                // Unconsume the opening brace '{' to ensure the JSON parsing goes well.
                builder.move_back(1);
            }
            // Strip the trailing "\n{" / "\n" captured together with the name.
            auto idx = name.find_last_not_of("\n{");
            name = name.substr(0, idx + 1);
            if (at_start && name == "all") {
                // Leading "all" block is regular content, not a tool call.
                return "";
            }
            return name;
        });
}
1092
+
1093
// Functionary v3.1 (Llama 3.1 style): tool calls as <function=NAME>{json}</function>,
// plus the legacy <|python_tag|>CODE form where CODE becomes the "python" tool's argument.
static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }
    // This version of Functionary still supports the llama 3.1 tool call format for the python tool.
    static const common_regex python_tag_regex(regex_escape("<|python_tag|>"));

    static const common_regex function_regex(R"(<function=(\w+)>)");
    static const common_regex close_regex(R"(</function>)");

    parse_json_tool_calls(
        builder,
        /* block_open= */ std::nullopt,
        /* function_regex_start_only= */ std::nullopt,
        function_regex,
        close_regex,
        std::nullopt);

    // After the JSON-style calls, everything following <|python_tag|> is raw code
    // wrapped as the "python" tool's arguments.
    if (auto res = builder.try_find_regex(python_tag_regex)) {
        auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
        builder.add_tool_call("python", "", arguments);
        return;
    }
}
1118
+
1119
// Hermes 2 Pro: tolerant tool-call parser accepting many tag variants
// (<tool_call>, <function_call>, <tool>, ... optionally inside ```/```json/```xml
// fences) around either a {"name": ..., "arguments": ...} object, or
// <function=NAME>{args}</function> / <function name="NAME">{args}</function>.
static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<think>", "</think>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    static const common_regex open_regex(
        "(?:"
            "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
            "(" // match 2 (open_tag)
                "<tool_call>"
                "|<function_call>"
                "|<tool>"
                "|<tools>"
                "|<response>"
                "|<json>"
                "|<xml>"
                "|<JSON>"
            ")?"
            "(\\s*\\{\\s*\"name\")" // match 3 (named tool call)
        ")"
        "|<function=([^>]+)>" // match 4 (function name)
        "|<function name=\"([^\"]+)\">" // match 5 (function name again)
    );

    while (auto res = builder.try_find_regex(open_regex)) {
        const auto & block_start = res->groups[1];
        // A fenced block must be closed with a matching ``` after the call.
        std::string block_end = block_start.empty() ? "" : "```";

        const auto & open_tag = res->groups[2];
        std::string close_tag;

        if (!res->groups[3].empty()) {
            // JSON-object form: rewind to the '{' and parse the whole object,
            // keeping "arguments" as a dumped string.
            builder.move_to(res->groups[3].begin);
            // Derive "</tag>" from "<tag>".
            close_tag = open_tag.empty() ? "" : "</" + builder.str(open_tag).substr(1);

            if (auto tool_call = builder.try_consume_json_with_dumped_args({{"arguments"}})) {
                if (!builder.add_tool_call(tool_call->value) || tool_call->is_partial) {
                    throw common_chat_msg_partial_exception("incomplete tool call");
                }
                builder.consume_spaces();
                builder.consume_literal(close_tag);
                builder.consume_spaces();
                if (!block_end.empty()) {
                    builder.consume_literal(block_end);
                    builder.consume_spaces();
                }
            } else {
                throw common_chat_msg_partial_exception("failed to parse tool call");
            }
        } else {
            // <function=...> / <function name="..."> form: the name came from the tag.
            auto function_name = builder.str(res->groups[4]);
            if (function_name.empty()) {
                function_name = builder.str(res->groups[5]);
            }
            GGML_ASSERT(!function_name.empty());

            close_tag = "</function>";

            if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) {
                if (!builder.add_tool_call(function_name, "", arguments->value) || arguments->is_partial) {
                    throw common_chat_msg_partial_exception("incomplete tool call");
                }
                builder.consume_spaces();
                builder.consume_literal(close_tag);
                builder.consume_spaces();
                if (!block_end.empty()) {
                    builder.consume_literal(block_end);
                    builder.consume_spaces();
                }
            }
        }
    }

    builder.add_content(builder.consume_rest());
}
1196
+
1197
// Granite: optional <think>/<response> tag pairs, then a <|tool_call|> marker
// followed by a JSON array of {"name", "arguments"} objects.
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
    // Parse thinking tags
    static const common_regex start_think_regex(regex_escape("<think>"));
    static const common_regex end_think_regex(regex_escape("</think>"));
    // Granite models output partial tokens such as "<" and "<think".
    // By leveraging try_consume_regex()/try_find_regex() throwing
    // common_chat_msg_partial_exception for these partial tokens,
    // processing is interrupted and the tokens are not passed to add_content().
    if (auto res = builder.try_consume_regex(start_think_regex)) {
        // Restore position for try_parse_reasoning()
        builder.move_to(res->groups[0].begin);
        builder.try_find_regex(end_think_regex, std::string::npos, false);
        // Restore position for try_parse_reasoning()
        builder.move_to(res->groups[0].begin);
    }
    builder.try_parse_reasoning("<think>", "</think>");

    // Parse response tags
    static const common_regex start_response_regex(regex_escape("<response>"));
    static const common_regex end_response_regex(regex_escape("</response>"));
    // Granite models output partial tokens such as "<" and "<response".
    // Same hack as reasoning parsing.
    if (builder.try_consume_regex(start_response_regex)) {
        builder.try_find_regex(end_response_regex);
    }

    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Look for tool calls
    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
    if (auto res = builder.try_find_regex(tool_call_regex)) {
        builder.move_to(res->groups[0].end);

        // Expect JSON array of tool calls
        if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
            if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
                throw common_chat_msg_partial_exception("incomplete tool call");
            }
        }
    } else {
        builder.add_content(builder.consume_rest());
    }
}
1243
+
1244
// Nemotron v2: <think>...</think> reasoning, then an optional
// <TOOLCALL>[{...}, ...]</TOOLCALL> JSON array of tool calls.
static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
    // Parse thinking tags
    builder.try_parse_reasoning("<think>", "</think>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Look for tool calls
    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
    if (auto res = builder.try_find_regex(tool_call_regex)) {
        builder.move_to(res->groups[0].end);

        // Expect JSON array of tool calls
        auto tool_calls_data = builder.consume_json();
        if (tool_calls_data.json.is_array()) {
            // Without the closing tag the array may be truncated: signal partial.
            if (!builder.try_consume_literal("</TOOLCALL>")) {
                throw common_chat_msg_partial_exception("Incomplete tool call");
            }
            builder.add_tool_calls(tool_calls_data.json);
        } else {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
    }
    builder.add_content(builder.consume_rest());
}
1270
+
1271
// Apertus: <|inner_prefix|>...<|inner_suffix|> reasoning, then an optional
// <|tools_prefix|>[...]<|tools_suffix|> JSON array of short-form tool calls.
static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
    // Parse thinking tags
    builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Look for tool calls
    static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
    if (auto res = builder.try_find_regex(tool_call_regex)) {
        builder.move_to(res->groups[0].end);

        auto tool_calls_data = builder.consume_json();
        if (tool_calls_data.json.is_array()) {
            builder.consume_spaces();
            if (!builder.try_consume_literal("<|tools_suffix|>")) {
                throw common_chat_msg_partial_exception("Incomplete tool call");
            }
            // Each array element is a short-form call object; non-objects are skipped.
            for (const auto & value : tool_calls_data.json) {
                if (value.is_object()) {
                    builder.add_tool_call_short_form(value);
                }
            }
        } else {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
    }
    builder.add_content(builder.consume_rest());
}
1301
+
1302
+
1303
// LFM2: zero or more <|tool_call_start|>[{...}]<|tool_call_end|> blocks, each
// holding a JSON array of {"name", "arguments"} objects; remaining text is content.
static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
    static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
    static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));

    // Loop through all tool calls
    while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
        builder.move_to(res->groups[0].end);

        // Parse JSON array format: [{"name": "...", "arguments": {...}}]
        auto tool_calls_data = builder.consume_json();

        // Consume end marker
        builder.consume_spaces();
        if (!builder.try_consume_regex(tool_call_end_regex)) {
            throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
        }

        // Process each tool call in the array
        if (tool_calls_data.json.is_array()) {
            for (const auto & tool_call : tool_calls_data.json) {
                if (!tool_call.is_object()) {
                    throw common_chat_msg_partial_exception("Tool call must be an object");
                }

                if (!tool_call.contains("name")) {
                    throw common_chat_msg_partial_exception("Tool call missing 'name' field");
                }

                std::string function_name = tool_call.at("name");
                std::string arguments = "{}";

                // "arguments" may be a JSON object (dumped) or already a string.
                if (tool_call.contains("arguments")) {
                    if (tool_call.at("arguments").is_object()) {
                        arguments = tool_call.at("arguments").dump();
                    } else if (tool_call.at("arguments").is_string()) {
                        arguments = tool_call.at("arguments");
                    }
                }

                if (!builder.add_tool_call(function_name, "", arguments)) {
                    throw common_chat_msg_partial_exception("Incomplete tool call");
                }
            }
        } else {
            throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
        }

        // Consume any trailing whitespace after this tool call
        builder.consume_spaces();
    }

    // Consume any remaining content after all tool calls
    auto remaining = builder.consume_rest();
    if (!string_strip(remaining).empty()) {
        builder.add_content(remaining);
    }
}
1366
+
1367
+ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
1368
+ static const xml_tool_call_format form {
1369
+ /* form.scope_start = */ "<seed:tool_call>",
1370
+ /* form.tool_start = */ "<function=",
1371
+ /* form.tool_sep = */ ">",
1372
+ /* form.key_start = */ "<parameter=",
1373
+ /* form.key_val_sep = */ ">",
1374
+ /* form.val_end = */ "</parameter>",
1375
+ /* form.tool_end = */ "</function>",
1376
+ /* form.scope_end = */ "</seed:tool_call>",
1377
+ };
1378
+ builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
1379
+ }
1380
+
1381
+ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
1382
+ builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
1383
+
1384
+ // TODO: Tool calling
1385
+
1386
+ builder.add_content(builder.consume_rest());
1387
+ }
1388
+
1389
static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
    // Accepted <tool_call> payloads (optionally wrapped in ```/```json fences):
    // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
    // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
    static const common_regex tool_call_open(R"(<tool_call[^>]*>)");

    if (!builder.syntax().parse_tool_calls) {
        LOG_DBG("%s: not parse_tool_calls\n", __func__);
        builder.add_content(builder.consume_rest());
        return;
    }

    LOG_DBG("%s: parse_tool_calls\n", __func__);

    // Find all <tool_call></tool_call> blocks
    while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
        builder.move_to(first->groups[0].end);
        builder.consume_spaces();

        // Tolerate optional markdown code fences around the JSON body.
        builder.try_consume_literal("```json");
        builder.try_consume_literal("```");
        builder.consume_spaces();

        // Consume JSON object
        auto data = builder.consume_json();

        builder.consume_spaces();
        builder.try_consume_literal("```");
        builder.consume_spaces();

        if (!builder.try_consume_literal("</tool_call>")) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
        builder.consume_spaces();

        // Extract name and arguments
        std::string name;
        std::string id;
        nlohmann::ordered_json arguments;

        // Pull name/arguments (and optional id) from a call object; false if absent.
        const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
            if (!obj.contains("name") || !obj.contains("arguments")) {
                return false;
            }
            name = obj.at("name").get<std::string>();
            arguments = obj.at("arguments");
            if (obj.contains("id") && obj.at("id").is_string()) {
                id = obj.at("id").get<std::string>();
            }
            return true;
        };

        if (!extract_args(data.json)) {
            // OpenAI-style wrapper: fields nested under "function", id at top level.
            if (data.json.contains("function") && data.json.at("function").is_object()) {
                auto fn = data.json.at("function");
                extract_args(fn);
                if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
                    id = data.json.at("id").get<std::string>();
                }
            }
        }

        // If name is empty, treat the JSON object as content
        if (name.empty()) {
            LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
            builder.add_content(data.json.dump());
            continue;
        }

        std::string args_str = arguments.dump();
        if (!builder.add_tool_call(name, id, args_str)) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
    }

    builder.add_content(builder.consume_rest());
}
1465
+
1466
static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
    LOG_DBG("%s: parsing exaone_moe\n", __func__);
    // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
    // First try to parse using the standard reasoning parsing method
    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());

    // Probe for </think> without consuming input, then rewind.
    auto start_pos = builder.pos();
    auto found_end_think = builder.try_find_literal("</think>");
    builder.move_to(start_pos);

    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
        // Complete message, forced-open thinking, no </think>: treat as content.
        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
        // If reasoning was parsed successfully, the remaining content is regular content
        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
    } else {
        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
            common_chat_parse_exaone_moe_content(builder);
            return;
        }
        // If no reasoning tags found, check if we should treat everything as reasoning
        if (builder.syntax().thinking_forced_open) {
            // If thinking is forced open but no tags found, treat everything as reasoning
            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
            builder.add_reasoning_content(builder.consume_rest());
        } else {
            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
            common_chat_parse_exaone_moe_content(builder);
        }
    }
}
1500
+
1501
+ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
1502
+ builder.try_parse_reasoning("<think>", "</think>");
1503
+ builder.add_content(builder.consume_rest());
1504
+ }
1505
+
1506
// Dispatch to the per-format parser for the builder's configured chat format.
// Every non-PEG format must have a case here; PEG formats are routed to the
// PEG engine before this function is reached (see the string overload below).
static void common_chat_parse(common_chat_msg_parser & builder) {
    LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format), builder.input().c_str());

    switch (builder.syntax().format) {
        case COMMON_CHAT_FORMAT_CONTENT_ONLY:
            common_chat_parse_content_only(builder);
            break;
        case COMMON_CHAT_FORMAT_GENERIC:
            common_chat_parse_generic(builder);
            break;
        case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
            common_chat_parse_mistral_nemo(builder);
            break;
        case COMMON_CHAT_FORMAT_MAGISTRAL:
            common_chat_parse_magistral(builder);
            break;
        case COMMON_CHAT_FORMAT_LLAMA_3_X:
            common_chat_parse_llama_3_1(builder);
            break;
        case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
            common_chat_parse_llama_3_1(builder, /* with_builtin_tools= */ true);
            break;
        case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
            common_chat_parse_deepseek_r1(builder);
            break;
        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
            common_chat_parse_deepseek_v3_1(builder);
            break;
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
            common_chat_parse_functionary_v3_2(builder);
            break;
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
            common_chat_parse_functionary_v3_1_llama_3_1(builder);
            break;
        case COMMON_CHAT_FORMAT_HERMES_2_PRO:
            common_chat_parse_hermes_2_pro(builder);
            break;
        case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
            common_chat_parse_firefunction_v2(builder);
            break;
        case COMMON_CHAT_FORMAT_COMMAND_R7B:
            common_chat_parse_command_r7b(builder);
            break;
        case COMMON_CHAT_FORMAT_GRANITE:
            common_chat_parse_granite(builder);
            break;
        case COMMON_CHAT_FORMAT_GPT_OSS:
            common_chat_parse_gpt_oss(builder);
            break;
        case COMMON_CHAT_FORMAT_SEED_OSS:
            common_chat_parse_seed_oss(builder);
            break;
        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
            common_chat_parse_nemotron_v2(builder);
            break;
        case COMMON_CHAT_FORMAT_APERTUS:
            common_chat_parse_apertus(builder);
            break;
        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
            common_chat_parse_lfm2(builder);
            break;
        case COMMON_CHAT_FORMAT_MINIMAX_M2:
            common_chat_parse_minimax_m2(builder);
            break;
        case COMMON_CHAT_FORMAT_GLM_4_5:
            common_chat_parse_glm_4_5(builder);
            break;
        case COMMON_CHAT_FORMAT_KIMI_K2:
            common_chat_parse_kimi_k2(builder);
            break;
        case COMMON_CHAT_FORMAT_APRIEL_1_5:
            common_chat_parse_apriel_1_5(builder);
            break;
        case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
            common_chat_parse_xiaomi_mimo(builder);
            break;
        case COMMON_CHAT_FORMAT_SOLAR_OPEN:
            common_chat_parse_solar_open(builder);
            break;
        case COMMON_CHAT_FORMAT_EXAONE_MOE:
            common_chat_parse_exaone_moe(builder);
            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
    // Finalize the accumulated message (e.g. flush pending partial state).
    builder.finish();
}
1593
+
1594
+ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
1595
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
1596
+ syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
1597
+ syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
1598
+ return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
1599
+ }
1600
+ common_chat_msg_parser builder(input, is_partial, syntax);
1601
+ try {
1602
+ common_chat_parse(builder);
1603
+ } catch (const common_chat_msg_partial_exception & ex) {
1604
+ LOG_DBG("Partial parse: %s\n", ex.what());
1605
+ if (!is_partial) {
1606
+ builder.clear_tools();
1607
+ builder.move_to(0);
1608
+ common_chat_parse_content_only(builder);
1609
+ }
1610
+ }
1611
+ auto msg = builder.result();
1612
+ if (!is_partial) {
1613
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
1614
+ }
1615
+ return msg;
1616
+ }
1617
+
1618
+ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
1619
+ if (parser.empty()) {
1620
+ throw std::runtime_error("Failed to parse due to missing parser definition.");
1621
+ }
1622
+
1623
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
1624
+
1625
+ common_peg_parse_context ctx(input, is_partial);
1626
+ auto result = parser.parse(ctx);
1627
+ if (result.fail()) {
1628
+ throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
1629
+ }
1630
+
1631
+ common_chat_msg msg;
1632
+ msg.role = "assistant";
1633
+
1634
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
1635
+ auto mapper = common_chat_peg_native_mapper(msg);
1636
+ mapper.from_ast(ctx.ast, result);
1637
+ } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
1638
+ auto mapper = common_chat_peg_constructed_mapper(msg);
1639
+ mapper.from_ast(ctx.ast, result);
1640
+ } else {
1641
+ // Generic mapper
1642
+ auto mapper = common_chat_peg_mapper(msg);
1643
+ mapper.from_ast(ctx.ast, result);
1644
+ }
1645
+ if (!is_partial) {
1646
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
1647
+ }
1648
+ return msg;
1649
+ }
llama.cpp/common/chat-parser.h ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "chat.h"
4
+ #include "chat-parser-xml-toolcall.h"
5
+ #include "json-partial.h"
6
+ #include "regex-partial.h"
7
+
8
+ #include <nlohmann/json_fwd.hpp>
9
+
10
+ #include <optional>
11
+ #include <string>
12
+ #include <vector>
13
+
14
// Thrown by the parsing helpers when the input ends in the middle of a
// construct (e.g. a truncated tool call in a streamed response); the caller
// decides whether the partial state is acceptable.
class common_chat_msg_partial_exception : public std::runtime_error {
  public:
    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
};
18
+
19
// Incremental parser over a (possibly partial) model response string.
// Tracks a cursor (pos_) into input_ and accumulates the parsed message
// (content, reasoning, tool calls) into result_.
class common_chat_msg_parser {
    std::string input_;                 // full raw input being parsed
    bool is_partial_;                   // true while streaming (input may be truncated)
    common_chat_parser_params syntax_;  // TODO: rename to params
    std::string healing_marker_;        // marker used to close truncated JSON during partial parses

    size_t pos_ = 0;                    // current parse position within input_
    common_chat_msg result_;            // message being built up

  public:
    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
    const std::string & input() const { return input_; }
    size_t pos() const { return pos_; }
    const std::string & healing_marker() const { return healing_marker_; }
    const bool & is_partial() const { return is_partial_; }
    const common_chat_msg & result() const { return result_; }
    const common_chat_parser_params & syntax() const { return syntax_; }

    // Move the cursor to an absolute position (throws if past the end).
    void move_to(size_t pos) {
        if (pos > input_.size()) {
            throw std::runtime_error("Invalid position!");
        }
        pos_ = pos;
    }
    // Move the cursor back by n characters (throws if that would underflow).
    void move_back(size_t n) {
        if (pos_ < n) {
            throw std::runtime_error("Can't move back that far!");
        }
        pos_ -= n;
    }

    // Get the substring of the input at the given range
    std::string str(const common_string_range & rng) const;

    // Appends to the result.content field
    void add_content(const std::string & content);

    // Appends to the result.reasoning_content field
    void add_reasoning_content(const std::string & reasoning_content);

    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);

    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
    bool add_tool_call(const nlohmann::ordered_json & tool_call);

    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);

    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);

    // Finalize parsing; must be called once after all consume/add operations.
    void finish();

    // Skip over whitespace at the cursor; returns whether any was consumed.
    bool consume_spaces();

    // Consume an exact literal at the cursor (throws on mismatch).
    void consume_literal(const std::string & literal);

    // Try to consume a reasoning block delimited by the given markers,
    // appending its body to result.reasoning_content.
    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);

    // Consume and return everything from the cursor to the end of the input.
    std::string consume_rest();

    struct find_regex_result {
        std::string prelude;                      // text between the cursor and the match
        std::vector<common_string_range> groups;  // capture group ranges within input_
    };

    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);

    bool try_consume_literal(const std::string & literal);

    std::optional<find_regex_result> try_find_literal(const std::string & literal);

    find_regex_result consume_regex(const common_regex & regex);

    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);

    std::optional<common_json> try_consume_json();
    common_json consume_json();

    struct consume_json_result {
        nlohmann::ordered_json value;
        bool is_partial;  // true if the JSON was truncated and healed
    };

    /*
        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.

        By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`

        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
    */
    consume_json_result consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    std::optional<consume_json_result> try_consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );

    /**
     * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
     * form.scope_start, form.tool_sep and form.scope_end can be empty.
     */
    bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);

    // Parse content uses reasoning and XML-Style tool call
    void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>");

    // Remove all tool calls accumulated so far from the result.
    void clear_tools();
};
llama.cpp/common/chat-peg-parser.cpp ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "chat-peg-parser.h"
2
+
3
+ #include <nlohmann/json.hpp>
4
+
5
+ using json = nlohmann::json;
6
+
7
// Return `sv` with trailing whitespace removed.
// When `max` is non-negative, strip at most `max` trailing whitespace
// characters; `max == -1` (the default) strips all of them.
// NOTE(review): the original condition (`count <= max`) broke out of the loop
// before removing anything whenever `max >= 0`, making the parameter a no-op;
// `count >= max` implements the "strip at most max characters" reading.
static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
    int count = 0;
    // cast to unsigned char: std::isspace has UB for negative char values
    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
        if (max != -1 && count >= max) {
            break;
        }
        sv.remove_suffix(1);
        count++;
    }
    return sv;
}
+ }
18
+
19
+ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
20
+ arena.visit(result, [this](const common_peg_ast_node & node) {
21
+ map(node);
22
+ });
23
+ }
24
+
25
+ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
26
+ bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
27
+ bool is_content = node.tag == common_chat_peg_builder::CONTENT;
28
+
29
+ if (is_reasoning) {
30
+ result.reasoning_content = std::string(trim_trailing_space(node.text));
31
+ }
32
+
33
+ if (is_content) {
34
+ result.content = std::string(trim_trailing_space(node.text));
35
+ }
36
+ }
37
+
38
+ void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
39
+ common_chat_peg_mapper::map(node);
40
+
41
+ bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
42
+ bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
43
+ bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
44
+ bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
45
+
46
+ if (is_tool_open) {
47
+ result.tool_calls.emplace_back();
48
+ current_tool = &result.tool_calls.back();
49
+ }
50
+
51
+ if (is_tool_id && current_tool) {
52
+ current_tool->id = std::string(trim_trailing_space(node.text));
53
+ }
54
+
55
+ if (is_tool_name && current_tool) {
56
+ current_tool->name = std::string(trim_trailing_space(node.text));
57
+ }
58
+
59
+ if (is_tool_args && current_tool) {
60
+ current_tool->arguments = std::string(trim_trailing_space(node.text));
61
+ }
62
+ }
63
+
64
+ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
65
+ common_chat_peg_mapper::map(node);
66
+
67
+ bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
68
+ bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
69
+ bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
70
+ bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
71
+ bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
72
+ bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
73
+ bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
74
+ bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
75
+
76
+ if (is_tool_open) {
77
+ result.tool_calls.emplace_back();
78
+ current_tool = &result.tool_calls.back();
79
+ arg_count = 0;
80
+ }
81
+
82
+ if (is_tool_name) {
83
+ current_tool->name = std::string(node.text);
84
+ current_tool->arguments = "{";
85
+ }
86
+
87
+ if (is_arg_open) {
88
+ needs_closing_quote = false;
89
+ }
90
+
91
+ if (is_arg_name && current_tool) {
92
+ if (arg_count > 0) {
93
+ current_tool->arguments += ",";
94
+ }
95
+ current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
96
+ ++arg_count;
97
+ }
98
+
99
+ if (is_arg_string && current_tool) {
100
+ // Serialize to JSON, but exclude the end quote
101
+ std::string dumped = json(trim_trailing_space(node.text)).dump();
102
+ current_tool->arguments += dumped.substr(0, dumped.size() - 1);
103
+ needs_closing_quote = true;
104
+ }
105
+
106
+ if (is_arg_close && current_tool) {
107
+ if (needs_closing_quote) {
108
+ current_tool->arguments += "\"";
109
+ needs_closing_quote = false;
110
+ }
111
+ }
112
+
113
+ if (is_arg_json && current_tool) {
114
+ current_tool->arguments += std::string(trim_trailing_space(node.text));
115
+ }
116
+
117
+ if (is_tool_close && current_tool) {
118
+ if (needs_closing_quote) {
119
+ current_tool->arguments += "\"";
120
+ needs_closing_quote = false;
121
+ }
122
+ current_tool->arguments += "}";
123
+ }
124
+ }
llama.cpp/common/chat-peg-parser.h ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "chat.h"
4
+ #include "peg-parser.h"
5
+
6
// PEG-parser builder with chat-specific tagged sub-parsers: tags label AST
// nodes so a mapper can later route their text into the chat message fields.
class common_chat_peg_builder : public common_peg_parser_builder {
  public:
    static constexpr const char * REASONING_BLOCK = "reasoning-block";
    static constexpr const char * REASONING = "reasoning";
    static constexpr const char * CONTENT = "content";

    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
};
16
+
17
// Convenience helper: run `fn` against a fresh builder, set the returned
// parser as the root, and compile everything into an arena.
inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
    common_chat_peg_builder builder;
    builder.set_root(fn(builder));
    return builder.build();
}
22
+
23
// Maps parsed AST nodes onto a common_chat_msg. The base class handles
// reasoning/content nodes; subclasses add tool-call handling.
class common_chat_peg_mapper {
  public:
    // Message being filled in; owned by the caller, must outlive the mapper.
    common_chat_msg & result;

    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}

    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
    virtual void map(const common_peg_ast_node & node);
};
32
+
33
// Builder for "native" tool-call formats, where the model emits name/id/args
// as distinct spans; each helper tags the span for the native mapper.
class common_chat_peg_native_builder : public common_chat_peg_builder {
  public:
    static constexpr const char * TOOL = "tool";
    static constexpr const char * TOOL_OPEN = "tool-open";
    static constexpr const char * TOOL_CLOSE = "tool-close";
    static constexpr const char * TOOL_ID = "tool-id";
    static constexpr const char * TOOL_NAME = "tool-name";
    static constexpr const char * TOOL_ARGS = "tool-args";

    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
};
49
+
50
+ class common_chat_peg_native_mapper : public common_chat_peg_mapper {
51
+ common_chat_tool_call * current_tool;
52
+
53
+ public:
54
+ common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
55
+
56
+ void map(const common_peg_ast_node & node) override;
57
+ };
58
+
59
// Convenience helper: build a native-flavoured chat PEG parser arena from `fn`.
inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
    common_chat_peg_native_builder builder;
    builder.set_root(fn(builder));
    return builder.build();
}
64
+
65
// Builder for "constructed" tool-call formats, where the arguments object is
// assembled argument-by-argument (name, then string or raw-JSON value) rather
// than emitted as a single JSON blob.
class common_chat_peg_constructed_builder : public common_chat_peg_builder {
  public:
    static constexpr const char * TOOL = "tool";
    static constexpr const char * TOOL_OPEN = "tool-open";
    static constexpr const char * TOOL_CLOSE = "tool-close";
    static constexpr const char * TOOL_NAME = "tool-name";
    static constexpr const char * TOOL_ARG = "tool-arg";
    static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
    static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
    static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";

    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
};
89
+
90
+ class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
91
+ common_chat_tool_call * current_tool;
92
+ int arg_count = 0;
93
+ bool needs_closing_quote = false;
94
+
95
+ public:
96
+ common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
97
+
98
+ void map(const common_peg_ast_node & node) override;
99
+ };
100
+
101
// Convenience helper: build a constructed-flavoured chat PEG parser arena from `fn`.
inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
    common_chat_peg_constructed_builder builder;
    builder.set_root(fn(builder));
    return builder.build();
}
llama.cpp/common/chat.cpp ADDED
The diff for this file is too large to render. See raw diff
 
llama.cpp/common/chat.h ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
2
+
3
+ #pragma once
4
+
5
+ #include "common.h"
6
+ #include "peg-parser.h"
7
+ #include <functional>
8
+ #include <chrono>
9
+ #include <string>
10
+ #include <vector>
11
+ #include <map>
12
+
13
+ #include <nlohmann/json_fwd.hpp>
14
+
15
+ struct common_chat_templates;
16
+
17
// A single tool/function call extracted from a model response.
struct common_chat_tool_call {
    std::string name;       // function name
    std::string arguments;  // argument object serialized as a JSON string
    std::string id;         // call id (may be empty)

    bool operator==(const common_chat_tool_call & other) const {
        return name == other.name && arguments == other.arguments && id == other.id;
    }
};
26
+
27
// One typed part of a multi-part message content (OpenAI-style content parts).
struct common_chat_msg_content_part {
    std::string type;  // part type, e.g. "text"
    std::string text;  // part payload

    // TODO @ngxson : no known chat templates support reasoning_content in content parts yet
    // this can be useful for models with interleaved thinking (like Kimi-K2)
    // if you see any templates explicitly support this, please ping me
    // std::string reasoning_content;

    bool operator==(const common_chat_msg_content_part & other) const {
        return type == other.type && text == other.text;
    }
};
40
+
41
// A single chat message: role plus content (flat string and/or typed parts),
// optional reasoning, and any tool calls / tool results attached to it.
struct common_chat_msg {
    std::string role;
    std::string content;
    std::vector<common_chat_msg_content_part> content_parts;
    std::vector<common_chat_tool_call> tool_calls;
    std::string reasoning_content;
    std::string tool_name;     // for role=="tool": name of the tool that produced this message
    std::string tool_call_id;  // for role=="tool": id of the call this message answers

    nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;

    // True when every field other than role is empty.
    bool empty() const {
        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
    }
    // Ensure every tool call has a stable id: reuse ids from `ids_cache` by
    // index, generating (and caching) a fresh id for calls without one.
    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
                auto id = tool_calls[i].id;
                if (id.empty()) {
                    id = gen_tool_call_id();
                }
                ids_cache.push_back(id);
            }
            tool_calls[i].id = ids_cache[i];
        }
    }
    bool operator==(const common_chat_msg & other) const {
        return role == other.role
            && content == other.content
            && content_parts == other.content_parts
            && tool_calls == other.tool_calls
            && reasoning_content == other.reasoning_content
            && tool_name == other.tool_name
            && tool_call_id == other.tool_call_id;
    }
    bool operator!=(const common_chat_msg & other) const {
        return !(*this == other);
    }
};
80
+
81
+ struct common_chat_msg_diff {
82
+ std::string reasoning_content_delta;
83
+ std::string content_delta;
84
+ size_t tool_call_index = std::string::npos;
85
+ common_chat_tool_call tool_call_delta;
86
+
87
+ static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);
88
+
89
+ bool operator==(const common_chat_msg_diff & other) const {
90
+ return content_delta == other.content_delta
91
+ && tool_call_index == other.tool_call_index
92
+ && tool_call_delta == other.tool_call_delta;
93
+ }
94
+ };
95
+
96
// Declaration of a tool the model may call (OpenAI "function" style).
struct common_chat_tool {
    std::string name;
    std::string description;
    std::string parameters;  // JSON schema of the parameters, as a string
};
101
+
102
// Tool-choice policy, mirroring the OpenAI `tool_choice` request field.
enum common_chat_tool_choice {
    COMMON_CHAT_TOOL_CHOICE_AUTO,
    COMMON_CHAT_TOOL_CHOICE_REQUIRED,
    COMMON_CHAT_TOOL_CHOICE_NONE,
};
107
+
108
// Chat output format, selecting which per-format parser handles the model's
// raw response text.
enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
    COMMON_CHAT_FORMAT_MAGISTRAL,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
    COMMON_CHAT_FORMAT_GRANITE,
    COMMON_CHAT_FORMAT_GPT_OSS,
    COMMON_CHAT_FORMAT_SEED_OSS,
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
    COMMON_CHAT_FORMAT_APERTUS,
    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
    COMMON_CHAT_FORMAT_GLM_4_5,
    COMMON_CHAT_FORMAT_MINIMAX_M2,
    COMMON_CHAT_FORMAT_KIMI_K2,
    COMMON_CHAT_FORMAT_APRIEL_1_5,
    COMMON_CHAT_FORMAT_XIAOMI_MIMO,
    COMMON_CHAT_FORMAT_SOLAR_OPEN,
    COMMON_CHAT_FORMAT_EXAONE_MOE,

    // These are intended to be parsed by the PEG parser
    COMMON_CHAT_FORMAT_PEG_SIMPLE,
    COMMON_CHAT_FORMAT_PEG_NATIVE,
    COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
143
+
144
// Inputs to common_chat_templates_apply: the conversation plus everything
// that influences prompt rendering.
struct common_chat_templates_inputs {
    std::vector<common_chat_msg> messages;
    std::string grammar;
    std::string json_schema;
    bool add_generation_prompt = true;
    bool use_jinja = true;
    // Parameters below only supported when use_jinja is true
    std::vector<common_chat_tool> tools;
    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
    bool parallel_tool_calls = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking"
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    std::map<std::string, std::string> chat_template_kwargs;
    bool add_bos = false;
    bool add_eos = false;
};
161
+
162
// Output of common_chat_templates_apply: the rendered prompt plus everything
// the sampler/parser needs for this format (grammar, triggers, stops, ...).
struct common_chat_params {
    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    std::string prompt;
    std::string grammar;
    bool grammar_lazy = false;            // when true, grammar kicks in only after a trigger fires
    bool thinking_forced_open = false;    // prompt ends inside an open thinking block
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string> preserved_tokens;
    std::vector<std::string> additional_stops;
    std::string parser;                   // serialized PEG parser definition (PEG formats only)
};
173
+
174
+ // per-message parsing syntax
175
+ // should be derived from common_chat_params
176
// per-message parsing syntax
// should be derived from common_chat_params
struct common_chat_parser_params {
    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
    bool reasoning_in_content = false;
    bool thinking_forced_open = false;
    bool parse_tool_calls = true;
    common_peg_arena parser = {};  // compiled PEG parser (PEG formats only)
    common_chat_parser_params() = default;
    // NOTE(review): this copies only format/thinking_forced_open from
    // common_chat_params; reasoning_format and the parser must be set by the
    // caller separately.
    common_chat_parser_params(const common_chat_params & chat_params) {
        format = chat_params.format;
        thinking_forced_open = chat_params.thinking_forced_open;
    }
};
190
+
191
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
192
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
193
+
194
+ void common_chat_templates_free(struct common_chat_templates * tmpls);
195
+
196
+ struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
197
+
198
+ typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
199
+
200
+ common_chat_templates_ptr common_chat_templates_init(
201
+ const struct llama_model * model,
202
+ const std::string & chat_template_override,
203
+ const std::string & bos_token_override = "",
204
+ const std::string & eos_token_override = "");
205
+
206
+ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
207
+ std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
208
+
209
+
210
+ struct common_chat_params common_chat_templates_apply(
211
+ const struct common_chat_templates * tmpls,
212
+ const struct common_chat_templates_inputs & inputs);
213
+
214
+ // Format single message, while taking into account the position of that message in chat history
215
+ std::string common_chat_format_single(
216
+ const struct common_chat_templates * tmpls,
217
+ const std::vector<common_chat_msg> & past_msg,
218
+ const common_chat_msg & new_msg,
219
+ bool add_ass,
220
+ bool use_jinja);
221
+
222
+ // Returns an example of formatted chat
223
+ std::string common_chat_format_example(
224
+ const struct common_chat_templates * tmpls,
225
+ bool use_jinja,
226
+ const std::map<std::string, std::string> & chat_template_kwargs);
227
+
228
+ const char* common_chat_format_name(common_chat_format format);
229
+ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
230
+ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
231
+
232
+ // used by arg and server
233
+ const char * common_reasoning_format_name(common_reasoning_format format);
234
+ common_reasoning_format common_reasoning_format_from_name(const std::string & format);
235
+
236
+ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
237
+
238
+ bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
239
+
240
+ // Parses a JSON array of messages in OpenAI's chat completion API format.
241
+ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
242
+
243
+ // DEPRECATED: only used in tests
244
+ nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
245
+
246
+ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
247
+ nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
248
+
249
+ nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
250
+
251
+ // get template caps, useful for reporting to server /props endpoint
252
+ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
llama.cpp/common/common.cpp ADDED
@@ -0,0 +1,1824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "ggml.h"
2
+ #include "gguf.h"
3
+
4
+ #include "common.h"
5
+ #include "log.h"
6
+ #include "llama.h"
7
+ #include "sampling.h"
8
+ #include "unicode.h"
9
+
10
+ #include <algorithm>
11
+ #include <cinttypes>
12
+ #include <climits>
13
+ #include <cmath>
14
+ #include <chrono>
15
+ #include <cstdarg>
16
+ #include <cstring>
17
+ #include <ctime>
18
+ #include <filesystem>
19
+ #include <fstream>
20
+ #include <iostream>
21
+ #include <iterator>
22
+ #include <regex>
23
+ #include <sstream>
24
+ #include <string>
25
+ #include <thread>
26
+ #include <unordered_set>
27
+ #include <vector>
28
+
29
+ #if defined(__APPLE__) && defined(__MACH__)
30
+ #include <sys/types.h>
31
+ #include <sys/sysctl.h>
32
+ #endif
33
+
34
+ #if defined(_WIN32)
35
+ #define WIN32_LEAN_AND_MEAN
36
+ #ifndef NOMINMAX
37
+ # define NOMINMAX
38
+ #endif
39
+ #include <locale>
40
+ #include <windows.h>
41
+ #include <string.h>
42
+ #include <fcntl.h>
43
+ #include <io.h>
44
+ #else
45
+ #include <sys/ioctl.h>
46
+ #include <sys/stat.h>
47
+ #include <unistd.h>
48
+ #endif
49
+
50
+ #if defined(__linux__)
51
+ #include <sys/types.h>
52
+ #include <pwd.h>
53
+ #endif
54
+
55
+ #if defined(_MSC_VER)
56
+ #pragma warning(disable: 4244 4267) // possible loss of data
57
+ #endif
58
+
59
// Scoped wall-clock timer (RAII): records the start time on construction and,
// on destruction, adds the elapsed microseconds to the accumulator `t_acc`
// that the caller passed in by reference.
// When `disable` is true, t_start_us is set to -1 and the destructor is a no-op.
common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}

common_time_meas::~common_time_meas() {
    // t_start_us == -1 means measurement was disabled at construction
    if (t_start_us >= 0) {
        t_acc += ggml_time_us() - t_start_us;
    }
}
66
+
67
+ //
68
+ // CPU utils
69
+ //
70
+
71
// Best-effort count of *physical* CPU cores (ignoring SMT/hyperthread siblings).
// Per-platform strategy:
//   - Linux:   count distinct thread-sibling sets exposed via sysfs
//   - macOS:   sysctl (performance cores first, then total physical)
//   - Windows: GetLogicalProcessorInformationEx over core relationships
// Falls back to a heuristic based on hardware_concurrency() when detection fails.
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
            + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break; // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            // threads on the same core share an identical sibling mask,
            // so each unique line corresponds to one physical core
            siblings.insert(line);
        }
    }
    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    // prefer the performance-core count on Apple Silicon hybrids
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    unsigned int n_threads_win = std::thread::hardware_concurrency();
    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;

    // first call with a null buffer just queries the required buffer size
    DWORD buffer_size = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return default_threads;
        }
    }

    std::vector<char> buffer(buffer_size);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
        return default_threads;
    }

    int32_t num_physical_cores = 0;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    // walk the variable-length record array; each record advertises its own Size
    while (buffer_size > 0) {
        if (info->Relationship == RelationProcessorCore) {
            // NOTE(review): adds one per processor group the core spans
            // (typically 1 per core) — verify on >64-logical-CPU systems
            num_physical_cores += info->Processor.GroupCount;
        }
        buffer_size -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
    }

    return num_physical_cores > 0 ? num_physical_cores : default_threads;
#endif
    // generic fallback: assume SMT-2 above 4 logical CPUs, else use them all
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
132
+
133
+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
134
+ #include <pthread.h>
135
+
136
// Thin wrapper around the x86 CPUID instruction for the given leaf/subleaf.
// rbx is saved/restored through rsi because rbx may be reserved by the
// compiler (e.g. as the PIC/GOT register), so it cannot be clobbered directly.
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__("movq\t%%rbx,%%rsi\n\t"
            "cpuid\n\t"
            "xchgq\t%%rbx,%%rsi"
            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
            : "0"(leaf), "2"(subleaf));
}
144
+
145
// Pin the calling thread to the single logical CPU `cpu`.
// Returns 0 on success, or the pthread error code on failure.
static int pin_cpu(int cpu) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}
151
+
152
// Detects a hybrid (P-core/E-core) x86 CPU via CPUID leaf 7, EDX bit 15
// (the "Hybrid" feature flag).
static bool is_hybrid_cpu(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
    return !!(edx & (1u << 15));
}
157
+
158
// True when the *calling* thread is currently scheduled on an efficiency core.
// CPUID leaf 0x1A reports the native core type in EAX[31:24]; 0x20 identifies
// an Intel Atom (E-core). Must be called after pinning to give a stable answer.
static bool is_running_on_efficiency_core(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
    int intel_atom = 0x20;
    int core_type = (eax & 0xff000000u) >> 24;
    return core_type == intel_atom;
}
165
+
166
// Counts logical CPUs that are worth using for compute: pins the thread to
// each CPU in turn, skips efficiency cores, and skips every other logical CPU
// (the hyperthread sibling). Returns -1 if pinning fails.
// NOTE(review): the extra ++cpu assumes SMT siblings are enumerated adjacently
// (cpu N and N+1 on the same core) — true on typical Linux x86 numbering.
static int cpu_count_math_cpus(int n_cpu) {
    int result = 0;
    for (int cpu = 0; cpu < n_cpu; ++cpu) {
        if (pin_cpu(cpu)) {
            return -1;
        }
        if (is_running_on_efficiency_core()) {
            continue; // efficiency cores harm lockstep threading
        }
        ++cpu; // hyperthreading isn't useful for linear algebra
        ++result;
    }
    return result;
}
180
+
181
+ #endif // __x86_64__ && __linux__
182
+
183
+ /**
184
+ * Returns number of CPUs on system that are useful for math.
185
+ */
186
// Number of CPUs useful for math workloads. On hybrid Linux/x86 systems it
// probes each CPU (excluding E-cores and SMT siblings), saving and restoring
// the thread's original affinity mask around the probe. Everywhere else it
// falls back to the physical-core count.
int32_t cpu_get_num_math() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
    if (n_cpu < 1) {
        return cpu_get_num_physical_cores();
    }
    if (is_hybrid_cpu()) {
        cpu_set_t affinity;
        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
            int result = cpu_count_math_cpus(n_cpu);
            // restore the affinity mask mutated by the per-CPU pinning probe
            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
            if (result > 0) {
                return result;
            }
        }
    }
#endif
    return cpu_get_num_physical_cores();
}
205
+
206
+ // Helper for setting process priority
207
+
208
+ #if defined(_WIN32)
209
+
210
// Windows: map the ggml scheduling priority onto a Win32 priority class and
// apply it to the current process. Returns false (and logs a warning) if the
// OS rejects the request; GGML_SCHED_PRIO_NORMAL is a no-op.
bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        return true;
    }

    DWORD p = NORMAL_PRIORITY_CLASS;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

    return true;
}
231
+
232
+ #else // MacOS and POSIX
233
+ #include <sys/types.h>
234
+ #include <sys/resource.h>
235
+
236
// POSIX/macOS: map the ggml scheduling priority onto a nice value and apply it
// to the current process (pid 0). Negative nice values usually require
// elevated privileges; failure is logged and reported via the return value.
bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        return true;
    }

    int p = 0;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:      p =   5; break;
        case GGML_SCHED_PRIO_NORMAL:   p =   0; break;
        case GGML_SCHED_PRIO_MEDIUM:   p =  -5; break;
        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
    }

    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }
    return true;
}
256
+
257
+ #endif
258
+
259
+ //
260
+ // CLI argument parsing
261
+ //
262
+
263
+
264
+ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
265
+ int32_t n_set = 0;
266
+
267
+ if (cpuparams.n_threads < 0) {
268
+ // Assuming everything about cpuparams is invalid
269
+ if (role_model != nullptr) {
270
+ cpuparams = *role_model;
271
+ } else {
272
+ cpuparams.n_threads = cpu_get_num_math();
273
+ }
274
+ }
275
+
276
+ for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
277
+ if (cpuparams.cpumask[i]) {
278
+ n_set++;
279
+ }
280
+ }
281
+
282
+ if (n_set && n_set < cpuparams.n_threads) {
283
+ // Not enough set bits, may experience performance issues.
284
+ LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
285
+ }
286
+ }
287
+
288
+ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
289
+ size_t dash_loc = range.find('-');
290
+ if (dash_loc == std::string::npos) {
291
+ LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
292
+ return false;
293
+ }
294
+
295
+ size_t start_i;
296
+ size_t end_i;
297
+
298
+ if (dash_loc == 0) {
299
+ start_i = 0;
300
+ } else {
301
+ start_i = std::stoull(range.substr(0, dash_loc));
302
+ if (start_i >= GGML_MAX_N_THREADS) {
303
+ LOG_ERR("Start index out of bounds!\n");
304
+ return false;
305
+ }
306
+ }
307
+
308
+ if (dash_loc == range.length() - 1) {
309
+ end_i = GGML_MAX_N_THREADS - 1;
310
+ } else {
311
+ end_i = std::stoull(range.substr(dash_loc + 1));
312
+ if (end_i >= GGML_MAX_N_THREADS) {
313
+ LOG_ERR("End index out of bounds!\n");
314
+ return false;
315
+ }
316
+ }
317
+
318
+ for (size_t i = start_i; i <= end_i; i++) {
319
+ boolmask[i] = true;
320
+ }
321
+
322
+ return true;
323
+ }
324
+
325
// Parse a hexadecimal CPU affinity mask (with optional "0x" prefix) into
// `boolmask`. The least-significant hex digit maps to CPUs 0..3, the next to
// 4..7, and so on; bits are OR-ed into the existing mask rather than
// overwriting it. Returns false on any non-hex character.
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    // Discard potential 0x prefix
    size_t start_i = 0;
    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
        start_i = 2;
    }

    // cap at 128 hex digits = 512 bits, matching GGML's thread mask capacity
    size_t num_digits = mask.length() - start_i;
    if (num_digits > 128) num_digits = 128;

    size_t end_i = num_digits + start_i;

    // i walks the digits left-to-right; n is the bit index of the current
    // digit's most-significant bit (highest digit owns the highest bits)
    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
        char c = mask.at(i);
        int8_t id = c;

        // decode one hex digit (accepting both cases)
        if ((c >= '0' && c <= '9')) {
            id -= '0';
        } else if (c >= 'a' && c <= 'f') {
            id -= 'a' - 10;
        } else if (c >= 'A' && c <= 'F') {
            id -= 'A' - 10;
        } else {
            LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
            return false;
        }

        // spread the digit's 4 bits over the corresponding mask slots
        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
    }

    return true;
}
360
+
361
// One-time process initialization for the common library: routes llama.cpp
// logging through the common log callback and prints the build banner.
void common_init() {
    llama_log_set(common_log_default_callback, NULL);

    // annotate non-release builds in the banner
#ifdef NDEBUG
    const char * build_type = "";
#else
    const char * build_type = " (debug)";
#endif

    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
372
+
373
// Build the one-line "system_info:" banner: configured thread counts, the
// number of logical CPUs available, and the backend feature summary reported
// by llama_print_system_info().
std::string common_params_get_system_info(const common_params & params) {
    std::ostringstream os;

    os << "system_info: n_threads = " << params.cpuparams.n_threads;
    // -1 means the batch thread count falls back to the main one; omit it then
    if (params.cpuparams_batch.n_threads != -1) {
        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
    }
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    // count CPUs across all processor groups (hardware_concurrency only sees one group)
    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
#else
    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
#endif

    return os.str();
}
390
+
391
+ //
392
+ // String utils
393
+ //
394
+
395
+ std::string string_format(const char * fmt, ...) {
396
+ va_list ap;
397
+ va_list ap2;
398
+ va_start(ap, fmt);
399
+ va_copy(ap2, ap);
400
+ int size = vsnprintf(NULL, 0, fmt, ap);
401
+ GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
402
+ std::vector<char> buf(size + 1);
403
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
404
+ GGML_ASSERT(size2 == size);
405
+ va_end(ap2);
406
+ va_end(ap);
407
+ return std::string(buf.data(), size);
408
+ }
409
+
410
// Return `str` with leading and trailing whitespace removed.
// Fix vs. original: std::isspace was called on a plain (possibly signed) char;
// for bytes >= 0x80 (e.g. UTF-8 continuation bytes) that is undefined
// behavior. Cast through unsigned char as the C standard requires.
std::string string_strip(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
        start++;
    }
    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
        end--;
    }
    return str.substr(start, end - start);
}
421
+
422
// Current local time formatted as "YYYY_MM_DD-HH_MM_SS.nnnnnnnnn" — a string
// whose lexicographic order matches chronological order, suitable for file
// names. Nanoseconds are appended from the sub-second part of the epoch time.
// NOTE(review): std::localtime returns a shared static buffer and is not
// thread-safe — fine for current callers, but worth confirming if used from
// multiple threads.
std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    // sub-second remainder of the epoch time, zero-padded to 9 digits
    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch() % 1000000000).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, 11, "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
437
+
438
// Replace every occurrence of `search` in `s` with `replace`, in one
// left-to-right pass (occurrences are non-overlapping; replaced text is not
// rescanned). An empty needle is a no-op.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string out;
    out.reserve(s.length());

    size_t cursor = 0;
    for (size_t hit = s.find(search); hit != std::string::npos; hit = s.find(search, cursor)) {
        out.append(s, cursor, hit - cursor); // copy the unmatched prefix
        out += replace;
        cursor = hit + search.length();
    }
    out.append(s, cursor, std::string::npos); // copy the tail after the last match

    s = std::move(out);
}
454
+
455
// Backslash-escape every ECMAScript regex metacharacter in `s`, so the result
// can be embedded verbatim inside a std::regex pattern.
std::string regex_escape(const std::string & s) {
    static const std::regex metachars("[.^$|()*+?\\[\\]{}\\\\]");
    // "$&" re-inserts the matched character after the backslash
    return std::regex_replace(s, metachars, "\\$&");
}
459
+
460
// Concatenate all elements of `values`, inserting `separator` between
// consecutive elements. Returns "" for an empty vector.
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
    std::ostringstream joined;
    bool first = true;
    for (const auto & value : values) {
        if (!first) {
            joined << separator;
        }
        joined << value;
        first = false;
    }
    return joined.str();
}
470
+
471
// Split `str` on every occurrence of `delimiter`.
// Always returns at least one element; adjacent delimiters yield empty parts.
// Fix vs. original: an empty delimiter made str.find("") match at position 0
// forever, causing an infinite loop — now it returns the whole string as a
// single part.
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> parts;

    if (delimiter.empty()) {
        parts.push_back(str);
        return parts;
    }

    size_t start = 0;
    size_t end = str.find(delimiter);

    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }

    parts.push_back(str.substr(start)); // tail after the last delimiter

    return parts;
}
486
+
487
// Return `str` concatenated with itself `n` times ("" when n == 0).
// Capacity is reserved up front so the loop never reallocates.
std::string string_repeat(const std::string & str, size_t n) {
    std::string out;
    out.reserve(str.length() * n);
    for (size_t rep = n; rep > 0; --rep) {
        out += str;
    }
    return out;
}
501
+
502
// Render a boolean as its lowercase literal ("true"/"false").
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
505
+
506
// Format an int vector as "[ v0, v1, ... ]" (empty input yields "[  ]"),
// matching the bracket style of the other string_from overloads.
std::string string_from(const std::vector<int> & values) {
    std::ostringstream out;

    out << "[ ";
    for (size_t i = 0; i < values.size(); ++i) {
        if (i > 0) {
            out << ", ";
        }
        out << values[i];
    }
    out << " ]";

    return out.str();
}
523
+
524
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
525
+ std::stringstream buf;
526
+
527
+ buf << "[ ";
528
+
529
+ bool first = true;
530
+ for (const auto & token : tokens) {
531
+ if (!first) {
532
+ buf << ", ";
533
+ } else {
534
+ first = false;
535
+ }
536
+
537
+ auto detokenized = common_token_to_piece(ctx, token);
538
+
539
+ buf << "'" << detokenized << "'"
540
+ << ":" << std::to_string(token);
541
+ }
542
+
543
+ buf << " ]";
544
+
545
+ return buf.str();
546
+ }
547
+
548
// Render a llama_batch for debug logging: one line per token showing its
// index, detokenized piece, position, sequence-id count, first sequence id,
// and logits flag. Only seq_id[i][0] is printed even when n_seq_id[i] > 1.
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
    std::stringstream buf;

    buf << "[ ";

    bool first = true;
    for (int i = 0; i < batch.n_tokens; ++i) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, batch.token[i]);

        buf << "\n" << std::to_string(i)
            << ", token '" << detokenized << "'"
            << ", pos " << std::to_string(batch.pos[i])
            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
            << ", seq_id " << std::to_string(batch.seq_id[i][0])
            << ", logits " << std::to_string(batch.logits[i]);
    }

    buf << " ]";

    return buf.str();
}
575
+
576
// Decode C-style escape sequences in `input`, in place:
// \n \r \t \' \" \\ and \xHH (exactly two hex digits).
// Unrecognized or incomplete sequences (including a trailing lone backslash)
// are copied through verbatim. The string can only shrink, so decoding is
// done by compacting into the same buffer and resizing at the end.
void string_process_escapes(std::string & input) {
    const std::size_t len = input.length();
    std::size_t out = 0;

    for (std::size_t in = 0; in < len; ++in) {
        if (input[in] != '\\' || in + 1 >= len) {
            // ordinary character, or a backslash at the very end
            input[out++] = input[in];
            continue;
        }

        const char esc = input[++in];
        switch (esc) {
            case 'n':  input[out++] = '\n'; break;
            case 'r':  input[out++] = '\r'; break;
            case 't':  input[out++] = '\t'; break;
            case '\'': input[out++] = '\''; break;
            case '\"': input[out++] = '\"'; break;
            case '\\': input[out++] = '\\'; break;
            case 'x': {
                // \xHH: require two more characters that parse fully as hex
                if (in + 2 < len) {
                    const char hex[3] = { input[in + 1], input[in + 2], 0 };
                    char * parse_end = nullptr;
                    const long value = std::strtol(hex, &parse_end, 16);
                    if (parse_end == hex + 2) {
                        input[out++] = char(value);
                        in += 2;
                        break;
                    }
                }
                // not a valid \xHH sequence: keep the backslash and the 'x'
                input[out++] = '\\';
                input[out++] = esc;
                break;
            }
            default:
                // unknown escape: keep it verbatim
                input[out++] = '\\';
                input[out++] = esc;
                break;
        }
    }

    input.resize(out);
}
612
+
613
// Parse a "KEY=TYPE:VALUE" metadata override (e.g. "tokenizer.ggml.bos:int:1")
// into a llama_model_kv_override appended to `overrides`.
// Supported TYPE prefixes: "int:", "float:", "bool:" (true/false only) and
// "str:" (value limited to 127 chars, the fixed size of val_str).
// The key itself is limited to 127 chars by the 128-byte kvo.key buffer.
// Returns false (with an error log) on any malformed input.
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
    const char * sep = strchr(data, '=');
    // the `sep - data >= 128` check keeps the key within kvo.key's capacity
    if (sep == nullptr || sep - data >= 128) {
        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
        return false;
    }
    llama_model_kv_override kvo;
    std::strncpy(kvo.key, data, sep - data);
    kvo.key[sep - data] = 0; // strncpy does not terminate when it hits the limit
    sep++; // now points at "TYPE:VALUE"
    if (strncmp(sep, "int:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
        kvo.val_i64 = std::atol(sep);
    } else if (strncmp(sep, "float:", 6) == 0) {
        sep += 6;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
        kvo.val_f64 = std::atof(sep);
    } else if (strncmp(sep, "bool:", 5) == 0) {
        sep += 5;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        if (std::strcmp(sep, "true") == 0) {
            kvo.val_bool = true;
        } else if (std::strcmp(sep, "false") == 0) {
            kvo.val_bool = false;
        } else {
            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
            return false;
        }
    } else if (strncmp(sep, "str:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        if (strlen(sep) > 127) {
            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
            return false;
        }
        strncpy(kvo.val_str, sep, 127);
        kvo.val_str[127] = '\0';
    } else {
        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
        return false;
    }
    overrides.emplace_back(std::move(kvo));
    return true;
}
658
+
659
+ //
660
+ // Filesystem utils
661
+ //
662
+
663
+ // Validate if a filename is safe to use
664
+ // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
665
// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
//
// Decodes the name as UTF-8 and rejects: invalid/overlong encodings, control
// characters, surrogate codepoints, BOM/replacement characters, Unicode
// lookalikes of path separators, Windows-illegal characters, names with
// leading/trailing spaces or a trailing dot, any "..", and ".".
// Path separators '/' and '\' are rejected unless `allow_subdirs` is true.
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
    if (!filename.length()) {
        // Empty filename invalid
        return false;
    }
    if (filename.length() > 255) {
        // Limit at common largest possible filename on Linux filesystems
        // to avoid unnecessary further validation
        // (On systems with smaller limits it will be caught by the OS)
        return false;
    }

    size_t offset = 0;
    while (offset < filename.size()) {
        // decode the next UTF-8 codepoint; any malformed byte sequence fails validation
        utf8_parse_result result = parse_utf8_codepoint(filename, offset);

        if (result.status != utf8_parse_result::SUCCESS) {
            return false;
        }
        uint32_t c = result.codepoint;

        // reject overlong encodings (a codepoint encoded with more bytes than
        // needed) — these can smuggle characters past naive filters
        if ((result.bytes_consumed == 2 && c < 0x80) ||
            (result.bytes_consumed == 3 && c < 0x800) ||
            (result.bytes_consumed == 4 && c < 0x10000)) {
            return false;
        }

        // Check for forbidden codepoints:
        // - Control characters
        // - Unicode equivalents of illegal characters
        // - UTF-16 surrogate pairs
        // - UTF-8 replacement character
        // - Byte order mark (BOM)
        // - Illegal characters: / \ : * ? " < > |
        if (c <= 0x1F // Control characters (C0)
            || c == 0x7F // Control characters (DEL)
            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
            || c == 0x2215 // Division Slash (forward slash equivalent)
            || c == 0x2216 // Set Minus (backslash equivalent)
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c > 0x10FFFF // Max Unicode limit
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
            || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
        if (!allow_subdirs && (c == '/' || c == '\\')) {
            // Subdirectories not allowed, reject path separators
            return false;
        }
        offset += result.bytes_consumed;
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
    // Unicode and other whitespace is not affected, only 0x20 space
    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
        return false;
    }

    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    if (filename == ".") {
        return false;
    }

    return true;
}
738
+
739
+ #include <iostream>
740
+
741
+
742
+ #ifdef _WIN32
743
// Windows helper: convert a UTF-8 std::string to UTF-16 std::wstring for use
// with the wide-character Win32 APIs. Returns an empty string on empty input
// or on conversion failure.
static std::wstring utf8_to_wstring(const std::string & str) {
    if (str.empty()) {
        return std::wstring();
    }

    // first call computes the required length in wide characters
    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);

    if (size <= 0) {
        return std::wstring();
    }

    std::wstring wstr(size, 0);
    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);

    return wstr;
}
759
+ #endif
760
+
761
+ // returns true if successful, false otherwise
762
// Create `path` and all missing parent directories (like `mkdir -p`).
// Returns true if the full path exists as a directory afterwards, false on
// any failure or if an existing component is not a directory.
// NOTE(review): the Windows branch splits on '\\' and the POSIX branch on '/',
// so callers are expected to pass paths using the native separator.
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring wpath = utf8_to_wstring(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        pos_slash += 1;

        // skip the drive letter, in some systems it can return an access denied error
        if (subpath.length() == 2 && subpath[1] == ':') {
            continue;
        }

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);

        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD attributes = GetFileAttributesW(subpath.c_str());
                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(path.c_str(), &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        struct stat info;

        // if the path already exists, ensure that it's a directory
        if (stat(subpath.c_str(), &info) == 0) {
            if (!S_ISDIR(info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(subpath.c_str(), 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
836
+
837
// True only when `path` exists and refers to a directory.
bool fs_is_directory(const std::string & path) {
    const std::filesystem::path p(path);
    return std::filesystem::exists(p) && std::filesystem::is_directory(p);
}
841
+
842
// Resolve the llama.cpp cache directory, always with a trailing separator.
// Precedence: $LLAMA_CACHE verbatim, otherwise the platform cache base
// ($XDG_CACHE_HOME / ~/.cache on Linux+BSD, ~/Library/Caches on macOS,
// %LOCALAPPDATA% on Windows) plus a "llama.cpp" subdirectory.
// Throws std::runtime_error when no home directory can be determined.
std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
        // Make sure to add trailing slash
        if (p.back() != DIRECTORY_SEPARATOR) {
            p += DIRECTORY_SEPARATOR;
        }
        return p;
    };
    if (getenv("LLAMA_CACHE")) {
        // explicit override: used as-is, no "llama.cpp" suffix appended
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || \
    defined(__OpenBSD__) || defined(__NetBSD__)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else if (std::getenv("HOME")) {
            cache_directory = std::getenv("HOME") + std::string("/.cache/");
        } else {
#if defined(__linux__)
            /* no $HOME is defined, fallback to getpwuid */
            struct passwd *pw = getpwuid(getuid());
            if ((!pw) || (!pw->pw_dir)) {
                throw std::runtime_error("Failed to find $HOME directory");
            }

            cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
            throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
        }
#elif defined(__APPLE__)
        // NOTE(review): assumes $HOME is set on macOS — getenv returning NULL
        // here would crash; verify for daemon/launchd contexts
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
#elif defined(__EMSCRIPTEN__)
        GGML_ABORT("not implemented on this platform");
#else
#  error Unknown architecture
#endif
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
    }
    return ensure_trailing_slash(cache_directory);
}
887
+
888
// Return the full path of `filename` inside the llama.cpp cache directory,
// creating the directory (and parents) on demand.
// `filename` must be a bare file name — no separators (enforced by assert).
// Throws std::runtime_error if the cache directory cannot be created.
std::string fs_get_cache_file(const std::string & filename) {
    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
    std::string cache_directory = fs_get_cache_directory();
    const bool success = fs_create_directory_with_parents(cache_directory);
    if (!success) {
        throw std::runtime_error("failed to create cache directory: " + cache_directory);
    }
    return cache_directory + filename;
}
897
+
898
+ std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
899
+ std::vector<common_file_info> files;
900
+ if (path.empty()) return files;
901
+
902
+ std::filesystem::path dir(path);
903
+ if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
904
+ return files;
905
+ }
906
+
907
+ for (const auto & entry : std::filesystem::directory_iterator(dir)) {
908
+ try {
909
+ // Only include regular files (skip directories)
910
+ const auto & p = entry.path();
911
+ if (std::filesystem::is_regular_file(p)) {
912
+ common_file_info info;
913
+ info.path = p.string();
914
+ info.name = p.filename().string();
915
+ info.is_dir = false;
916
+ try {
917
+ info.size = static_cast<size_t>(std::filesystem::file_size(p));
918
+ } catch (const std::filesystem::filesystem_error &) {
919
+ info.size = 0;
920
+ }
921
+ files.push_back(std::move(info));
922
+ } else if (include_directories && std::filesystem::is_directory(p)) {
923
+ common_file_info info;
924
+ info.path = p.string();
925
+ info.name = p.filename().string();
926
+ info.size = 0; // Directories have no size
927
+ info.is_dir = true;
928
+ files.push_back(std::move(info));
929
+ }
930
+ } catch (const std::filesystem::filesystem_error &) {
931
+ // skip entries we cannot inspect
932
+ continue;
933
+ }
934
+ }
935
+
936
+ return files;
937
+ }
938
+
939
+ //
940
+ // TTY utils
941
+ //
942
+
943
// Heuristically decide whether ANSI color output is appropriate.
// Honors the NO_COLOR convention (https://no-color.org/), rejects TERM=dumb,
// and requires at least one of stdout/stderr to be attached to a terminal
// (log messages can go to either stream).
bool tty_can_use_colors() {
    const char * no_color = std::getenv("NO_COLOR");
    if (no_color && no_color[0] != '\0') {
        return false; // any non-empty NO_COLOR disables colors
    }

    const char * term = std::getenv("TERM");
    if (term && std::strcmp(term, "dumb") == 0) {
        return false;
    }

    return isatty(fileno(stdout)) || isatty(fileno(stderr));
}
965
+
966
+ //
967
+ // Model utils
968
+ //
969
+
970
+ // TODO: move to common/sampling
971
// Seed sampling parameters from metadata keys embedded in the model (GGUF),
// without overriding anything the user set explicitly on the command line —
// explicit user choices are tracked as bits in sparams.user_sampling_config.
// TODO: move to common/sampling
static void common_init_sampler_from_model(
        const llama_model * model,
        common_params_sampling & sparams) {

    const uint64_t config = sparams.user_sampling_config;

    // read an int32 metadata value into dst, unless the user already set it
    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
        if (config & user_config) {
            return;
        }

        char buf[64] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            int32_t v = strtol(buf, &end, 10);
            if (end && end != buf) { // accept only if at least one digit was parsed
                dst = v;
            }
        }
    };

    // read a float metadata value into dst, unless the user already set it
    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
        if (config & user_config) {
            return;
        }

        char buf[128] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            float v = strtof(buf, &end);
            if (end && end != buf) {
                dst = v;
            }
        }
    };

    // Sampling sequence: stored in the model as a ';'-separated list of sampler names
    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
        char buf[512] = {0};
        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
            if (!sampler_names.empty()) {
                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
            }
        }
    }

    // scalar sampling parameters, one metadata key each
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
}
1030
+
1031
// Private state of common_init_result (pimpl): owns the model, the context,
// the loaded LoRA adapters, and the per-sequence samplers.
struct common_init_result::impl {
    impl() = default;
    ~impl() = default;

    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top

    llama_model_ptr model;
    llama_context_ptr context;

    // loaded LoRA adapters (owned here; raw pointers are handed out via common_params)
    std::vector<llama_adapter_lora_ptr> lora;

    // one sampler per sequence, plus the seq_id -> sampler mapping passed to the context
    std::vector<common_sampler_ptr> samplers;
    std::vector<llama_sampler_seq_config> samplers_seq_config;
};
1045
+
1046
+ common_init_result::common_init_result(common_params & params) :
1047
+ pimpl(new impl{}) {
1048
+ auto mparams = common_model_params_to_llama(params);
1049
+ auto cparams = common_context_params_to_llama(params);
1050
+
1051
+ if (params.fit_params) {
1052
+ LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
1053
+ llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
1054
+ params.tensor_split,
1055
+ params.tensor_buft_overrides.data(),
1056
+ params.fit_params_target.data(),
1057
+ params.fit_params_min_ctx,
1058
+ params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
1059
+ }
1060
+
1061
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
1062
+ if (model == NULL) {
1063
+ return;
1064
+ }
1065
+
1066
+ pimpl->model.reset(model);
1067
+
1068
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1069
+
1070
+ // load and optionally apply lora adapters (must be loaded before context creation)
1071
+ for (auto & la : params.lora_adapters) {
1072
+ llama_adapter_lora_ptr lora;
1073
+ lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
1074
+ if (lora == nullptr) {
1075
+ LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
1076
+ pimpl->model.reset(model);
1077
+ return;
1078
+ }
1079
+
1080
+ char buf[1024];
1081
+ la.ptr = lora.get();
1082
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
1083
+ la.task_name = buf;
1084
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
1085
+ la.prompt_prefix = buf;
1086
+ pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
1087
+ }
1088
+
1089
+ // updates params.sampling
1090
+ // TODO: fix naming
1091
+ common_init_sampler_from_model(model, params.sampling);
1092
+
1093
+ if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
1094
+ LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
1095
+ params.sampling.ignore_eos = false;
1096
+ }
1097
+
1098
+ // initialize once
1099
+ for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1100
+ if (llama_vocab_is_eog(vocab, i)) {
1101
+ LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
1102
+ params.sampling.logit_bias_eog.push_back({i, -INFINITY});
1103
+ }
1104
+ }
1105
+
1106
+ if (params.sampling.ignore_eos) {
1107
+ // add EOG biases to the active set of logit biases
1108
+ params.sampling.logit_bias.insert(
1109
+ params.sampling.logit_bias.end(),
1110
+ params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
1111
+ }
1112
+
1113
+ //if (params.sampling.penalty_last_n == -1) {
1114
+ // LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1115
+ // params.sampling.penalty_last_n = llama_n_ctx(lctx);
1116
+ //}
1117
+
1118
+ //if (params.sampling.dry_penalty_last_n == -1) {
1119
+ // LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1120
+ // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
1121
+ //}
1122
+
1123
+ // init the backend samplers as part of the context creation
1124
+ pimpl->samplers.resize(cparams.n_seq_max);
1125
+ pimpl->samplers_seq_config.resize(cparams.n_seq_max);
1126
+
1127
+ for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
1128
+ pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
1129
+ pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
1130
+ }
1131
+
1132
+ if (params.sampling.backend_sampling) {
1133
+ cparams.samplers = pimpl->samplers_seq_config.data();
1134
+ cparams.n_samplers = pimpl->samplers_seq_config.size();
1135
+ }
1136
+
1137
+ llama_context * lctx = llama_init_from_model(model, cparams);
1138
+ if (lctx == NULL) {
1139
+ LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
1140
+ return;
1141
+ }
1142
+
1143
+ pimpl->context.reset(lctx);
1144
+ }
1145
+
1146
// Non-owning accessor; NULL if model loading failed.
llama_model * common_init_result::model() {
    return pimpl->model.get();
}
1149
+
1150
// Non-owning accessor; NULL if context creation failed.
llama_context * common_init_result::context() {
    return pimpl->context.get();
}
1153
+
1154
// Per-sequence sampler accessor.
// NOTE(review): no bounds check — seq_id is assumed to be < n_seq_max; confirm callers.
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
    return pimpl->samplers[seq_id].get();
}
1157
+
1158
// Reset every per-sequence sampler (e.g. to restore the seeded RNG state after warmup).
void common_init_result::reset_samplers() {
    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
    }
}
1163
+
1164
// Mutable access to the owned LoRA adapters (lifetime tied to this object).
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
}
1167
+
1168
// High-level initialization entry point: construct a common_init_result from
// `params`, then perform post-load setup (control vectors, rerank sanity
// checks, LoRA application, warmup). Always returns a non-null pointer;
// failure is signalled by res->model() or res->context() being NULL.
// note: mutates `params` (ctx_shift, control vector layer range, ...).
common_init_result_ptr common_init_from_params(common_params & params) {
    common_init_result_ptr res(new common_init_result(params));

    llama_model * model = res->model();
    if (model == NULL) {
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    llama_context * lctx = res->context();
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }

    // apply control vectors, defaulting the layer range to [1, n_layer]
    if (!params.control_vectors.empty()) {
        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);

        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
            // load failed — return with a valid context but no control vector applied
            return res;
        }

        int err = llama_set_adapter_cvec(
                lctx,
                cvec.data.data(),
                cvec.data.size(),
                cvec.n_embd,
                params.control_vector_layer_start,
                params.control_vector_layer_end);
        if (err) {
            return res;
        }
    }

    // reranking needs BOS plus at least one of: EOS, SEP, or a "rerank" chat template
    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
        bool ok = true;

        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
            ok = false;
        }

        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

        if (!has_eos && !has_sep && !has_rerank_prompt) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
            ok = false;
        } else if (!has_eos) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
        }

        if (!ok) {
            return res;
        }
    }

    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }

    // optional warmup decode: populates internal buffers/caches, then state is wiped
    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        llama_set_warmup(lctx, true);

        std::vector<llama_token> tmp;
        llama_token bos = llama_vocab_bos(vocab);
        llama_token eos = llama_vocab_eos(vocab);

        // some models (e.g. T5) don't have a BOS token
        if (bos != LLAMA_TOKEN_NULL) {
            tmp.push_back(bos);
        }
        if (eos != LLAMA_TOKEN_NULL) {
            tmp.push_back(eos);
        }
        if (tmp.empty()) {
            tmp.push_back(0); // fall back to token id 0 so the batch is never empty
        }

        if (llama_model_has_encoder(model)) {
            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                decoder_start_token_id = bos;
            }
            tmp.clear();
            tmp.push_back(decoder_start_token_id);
        }
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
        // wipe any state produced by the warmup run
        llama_memory_clear(llama_get_memory(lctx), true);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);

        // reset samplers to reset RNG state after warmup to the seeded state
        res->reset_samplers();
    }

    return res;
}
1282
+
1283
// out-of-line destructor: required because impl is an incomplete type in the header
common_init_result::~common_init_result() = default;
1284
+
1285
// Return the base URL used for model downloads, always ending in '/'.
// Precedence: MODEL_ENDPOINT, then the legacy HF_ENDPOINT, then the
// Hugging Face default. Empty overrides are ignored.
std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
    std::string model_endpoint = "https://huggingface.co/";
    // BUGFIX: also require a non-empty value — calling .back() on an empty
    // string (env var set but empty) is undefined behavior
    if (endpoint_env && endpoint_env[0] != '\0') {
        model_endpoint = endpoint_env;
        if (model_endpoint.back() != '/') {
            model_endpoint += '/';
        }
    }
    return model_endpoint;
}
1299
+
1300
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
1301
+ std::vector<llama_adapter_lora *> loras;
1302
+ std::vector<float> scales;
1303
+
1304
+ for (auto & la: lora) {
1305
+ loras.push_back(la.ptr);
1306
+ scales.push_back(la.scale);
1307
+ }
1308
+
1309
+ llama_set_adapters_lora(ctx, loras.data(), loras.size(), scales.data());
1310
+ }
1311
+
1312
// Translate common_params into llama_model_params (model-loading options).
// Starts from llama defaults and overlays the user's settings; pointer fields
// (devices, kv_overrides, tensor_buft_overrides) borrow storage owned by `params`,
// so `params` must outlive the returned struct's use.
struct llama_model_params common_model_params_to_llama(common_params & params) {
    auto mparams = llama_model_default_params();

    if (!params.devices.empty()) {
        mparams.devices = params.devices.data();
    }

    mparams.n_gpu_layers = params.n_gpu_layers;
    mparams.main_gpu = params.main_gpu;
    mparams.split_mode = params.split_mode;
    mparams.tensor_split = params.tensor_split;
    mparams.use_mmap = params.use_mmap;
    mparams.use_direct_io = params.use_direct_io;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
    mparams.no_host = params.no_host;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
        // the overrides array is sentinel-terminated (empty key)
        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
        mparams.kv_overrides = params.kv_overrides.data();
    }

    if (params.tensor_buft_overrides.empty()) {
        mparams.tensor_buft_overrides = NULL;
    } else {
        // the overrides array is sentinel-terminated (null pattern)
        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
    }

    mparams.progress_callback = params.load_progress_callback;
    mparams.progress_callback_user_data = params.load_progress_callback_user_data;

    return mparams;
}
1349
+
1350
// Translate common_params into llama_context_params (context-creation options).
// Starts from llama defaults and overlays the user's settings.
struct llama_context_params common_context_params_to_llama(const common_params & params) {
    auto cparams = llama_context_default_params();

    cparams.n_ctx = params.n_ctx;
    cparams.n_seq_max = params.n_parallel;
    cparams.n_batch = params.n_batch;
    cparams.n_ubatch = params.n_ubatch;
    cparams.n_threads = params.cpuparams.n_threads;
    // batch threads default to the generation thread count when unset (-1)
    cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
        params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
    cparams.embeddings = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base = params.rope_freq_base;
    cparams.rope_freq_scale = params.rope_freq_scale;
    cparams.yarn_ext_factor = params.yarn_ext_factor;
    cparams.yarn_attn_factor = params.yarn_attn_factor;
    cparams.yarn_beta_fast = params.yarn_beta_fast;
    cparams.yarn_beta_slow = params.yarn_beta_slow;
    cparams.yarn_orig_ctx = params.yarn_orig_ctx;
    cparams.pooling_type = params.pooling_type;
    cparams.attention_type = params.attention_type;
    cparams.flash_attn_type = params.flash_attn_type;
    cparams.cb_eval = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;
    // note: several common_params flags are negated on the llama side
    cparams.offload_kqv = !params.no_kv_offload;
    cparams.no_perf = params.no_perf;
    cparams.op_offload = !params.no_op_offload;
    cparams.swa_full = params.swa_full;
    cparams.kv_unified = params.kv_unified;

    cparams.type_k = params.cache_type_k;
    cparams.type_v = params.cache_type_v;

    return cparams;
}
1385
+
1386
// Translate common cpu_params into ggml_threadpool_params (thread count,
// affinity mask, priority, polling, strict-CPU placement).
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
    struct ggml_threadpool_params tpp;

    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults

    // copy the CPU affinity mask only when the caller marked it valid.
    // NOTE(review): copies GGML_MAX_N_THREADS bytes — assumes both cpumask
    // fields are byte arrays of exactly that length; confirm against ggml.h
    if (params.mask_valid) {
        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
    }

    tpp.prio = params.priority;
    tpp.poll = params.poll;
    tpp.strict_cpu = params.strict_cpu;

    return tpp;
}
1401
+
1402
+ //
1403
+ // Batch utils
1404
+ //
1405
+
1406
// Reset `batch` to contain no tokens; its buffers stay allocated for reuse.
void common_batch_clear(struct llama_batch & batch) {
    batch.n_tokens = 0;
}
1409
+
1410
+ void common_batch_add(
1411
+ struct llama_batch & batch,
1412
+ llama_token id,
1413
+ llama_pos pos,
1414
+ const std::vector<llama_seq_id> & seq_ids,
1415
+ bool logits) {
1416
+ GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
1417
+
1418
+ batch.token [batch.n_tokens] = id;
1419
+ batch.pos [batch.n_tokens] = pos;
1420
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1421
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
1422
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1423
+ }
1424
+ batch.logits [batch.n_tokens] = logits;
1425
+
1426
+ batch.n_tokens++;
1427
+ }
1428
+
1429
+ //
1430
+ // Vocab utils
1431
+ //
1432
+
1433
+ std::vector<llama_token> common_tokenize(
1434
+ const struct llama_context * ctx,
1435
+ const std::string & text,
1436
+ bool add_special,
1437
+ bool parse_special) {
1438
+ const llama_model * model = llama_get_model(ctx);
1439
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1440
+ return common_tokenize(vocab, text, add_special, parse_special);
1441
+ }
1442
+
1443
// Tokenize `text` with the given vocab. Uses the standard two-pass protocol:
// try with an upper-bound-sized buffer; on a negative return (required size),
// resize and retry exactly once.
std::vector<llama_token> common_tokenize(
    const struct llama_vocab * vocab,
    const std::string & text,
    bool add_special,
    bool parse_special) {
    // upper limit for the number of tokens
    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    // INT32_MIN is the API's sentinel for "result would not fit in int32_t at all"
    if (n_tokens == std::numeric_limits<int32_t>::min()) {
        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
    }
    if (n_tokens < 0) {
        // buffer too small: -n_tokens is the required size — retry once
        result.resize(-n_tokens);
        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }
    return result;
}
1464
+
1465
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1466
+ const llama_model * model = llama_get_model(ctx);
1467
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1468
+ return common_token_to_piece(vocab, token, special);
1469
+ }
1470
+
1471
// Render a single token to its text piece. First tries the string's SSO buffer
// (typically 15 bytes); if too small, a negative return gives the required size
// and the call is retried once.
std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    if (n_chars < 0) {
        // buffer too small: -n_chars is the required size — retry once
        piece.resize(-n_chars);
        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
        GGML_ASSERT(check == -n_chars);
    }
    else {
        piece.resize(n_chars);
    }

    return piece;
}
1486
+
1487
+ std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
1488
+ const llama_model * model = llama_get_model(ctx);
1489
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1490
+ return common_detokenize(vocab, tokens, special);
1491
+ }
1492
+
1493
// Convert a token sequence back to text. Uses the two-pass resize protocol:
// a negative return is the required buffer size; retry once, then trim.
std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
    std::string text;
    text.resize(std::max(text.capacity(), tokens.size())); // initial guess: >= one byte per token
    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
    if (n_chars < 0) {
        text.resize(-n_chars);
        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
        GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
    }

    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return text;
}
1508
+
1509
+ //
1510
+ // Embedding utils
1511
+ //
1512
+
1513
// Normalize `n` values from `inp` into `out` according to `embd_norm`:
//   -1 : no normalisation (copy as-is)
//    0 : scale so the max absolute value maps to the int16 range (32760)
//    2 : euclidean (L2) norm
//  else: general p-norm with p = embd_norm
// A zero-magnitude input yields an all-zero output.
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double mag = 0.0;

    if (embd_norm == -1) {
        mag = 1.0; // no normalisation
    } else if (embd_norm == 0) {
        for (int i = 0; i < n; i++) {
            const double a = std::abs(inp[i]);
            if (a > mag) {
                mag = a;
            }
        }
        mag /= 32760.0; // make an int16 range
    } else if (embd_norm == 2) {
        for (int i = 0; i < n; i++) {
            mag += inp[i] * inp[i];
        }
        mag = std::sqrt(mag);
    } else {
        // p-norm (euclidean is p-norm p=2)
        for (int i = 0; i < n; i++) {
            mag += std::pow(std::abs(inp[i]), embd_norm);
        }
        mag = std::pow(mag, 1.0 / embd_norm);
    }

    const float scale = mag > 0.0 ? 1.0 / mag : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * scale;
    }
}
1548
+
1549
// Cosine similarity of two length-`n` embeddings.
// Zero-vector convention: both zero -> 1.0, exactly one zero -> 0.0.
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot      = 0.0;
    double norm1_sq = 0.0;
    double norm2_sq = 0.0;

    for (int i = 0; i < n; i++) {
        dot      += embd1[i] * embd2[i];
        norm1_sq += embd1[i] * embd1[i];
        norm2_sq += embd2[i] * embd2[i];
    }

    // Handle the case where one or both vectors are zero vectors
    if (norm1_sq == 0.0 || norm2_sq == 0.0) {
        return (norm1_sq == 0.0 && norm2_sq == 0.0) ? 1.0f : 0.0f;
    }

    return dot / (sqrt(norm1_sq) * sqrt(norm2_sq));
}
1570
+
1571
+ //
1572
+ // Control vector utils
1573
+ //
1574
+
1575
// Load a single control-vector GGUF file.
// Expects 1-D F32 tensors named "direction.<layer_idx>" with layer_idx >= 1;
// all tensors must have the same element count (n_embd). Directions are scaled
// by load_info.strength and accumulated into a flat buffer laid out as
// [layer-1][n_embd]. On any error, n_embd is -1 and data is empty.
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
    common_control_vector_data result = { -1, {} };

    ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false, // tensor data is needed, not just metadata
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
    if (n_tensors == 0) {
        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
        std::string name = gguf_get_tensor_name(ctx_gguf, i);

        int layer_idx = -1;

        // split on '.' — tensor names are expected to be "direction.<layer>"
        size_t dotpos = name.find('.');
        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
            try {
                layer_idx = std::stoi(name.substr(dotpos + 1));
            } catch (...) {
                layer_idx = -1;
            }
        }
        if (layer_idx < 0) {
            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
            // layer 0 (embeddings) is not a valid target for control vectors
            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
        if (tensor->type != GGML_TYPE_F32) {
            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // first tensor fixes n_embd; all subsequent tensors must match it
        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // extend if necessary - do not store data for layer 0 (it's not used)
        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);

        const float * src = (const float *) tensor->data;
        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
        for (int j = 0; j < result.n_embd; j++) {
            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
        }

    }

    if (result.n_embd == -1) {
        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
        result.data.clear();
    }

    gguf_free(ctx_gguf);
    ggml_free(ctx);

    return result;
}
1659
+
1660
// Load and sum multiple control-vector files element-wise.
// All files must agree on n_embd; any invalid file aborts the whole load
// (n_embd == -1, empty data).
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
    common_control_vector_data result = { -1, {} };

    for (const auto & info : load_infos) {
        auto cur = common_control_vector_load_one(info);

        if (cur.n_embd == -1) {
            result.n_embd = -1;
            break;
        }
        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        if (result.n_embd == -1) {
            // first successfully loaded file — take it over wholesale
            result = std::move(cur);
        } else {
            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
            for (size_t i = 0; i < cur.data.size(); i++) {
                result.data[i] += cur.data[i];
            }
        }
    }

    if (result.n_embd == -1) {
        LOG_ERR("%s: no valid control vector files passed\n", __func__);
        result.data.clear();
    }

    return result;
}
1693
+
1694
// Build a next-token-prediction dataset from a flat token stream:
// datapoint i is tokens[i*stride .. i*stride + n_ctx), its label the same
// window shifted by one token.
// NOTE(review): assumes tokens.size() > n_ctx + 1 — with fewer tokens the
// unsigned subtraction underflows and ndata becomes huge; confirm callers.
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
    const int64_t ne_datapoint = llama_n_ctx(ctx);
    const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
    ggml_opt_dataset_t result = ggml_opt_dataset_init(
        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;

    for (int64_t idata = 0; idata < ndata; ++idata) {
        // label window is the data window shifted right by one token
        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
    }

    return result;
}
1710
+
1711
+ ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
1712
+ ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
1713
+ const lr_opt & d = *(lr_opt *) userdata;
1714
+ result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
1715
+ result.sgd.wd = result.adamw.wd = d.wd;
1716
+ return result;
1717
+ }
1718
+
1719
+ // TODO make all command line args case-insensitive
1720
// Case-insensitive C-string equality, portable across MSVC and POSIX.
// TODO make all command line args case-insensitive
static inline bool eq_case_insensitive(char const* a, char const* b) {
#if defined(_MSC_VER)
    const int cmp = _stricmp(a, b);
#else
    const int cmp = strcasecmp(a, b);
#endif // defined(_MSC_VER)
    return cmp == 0;
}
1729
+
1730
+ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
1731
+ if (eq_case_insensitive("adamw", n)) {
1732
+ return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
1733
+ }
1734
+ if (eq_case_insensitive("sgd", n)) {
1735
+ return GGML_OPT_OPTIMIZER_TYPE_SGD;
1736
+ }
1737
+ return GGML_OPT_OPTIMIZER_TYPE_COUNT;
1738
+ }
1739
+
1740
+ // TODO simplify to use just log and exp
1741
+ static float const k_log_2 = std::log(2.f);
1742
+
1743
+ void lr_opt::init() {
1744
+ if (lr_min > 0 && lr_min < lr0) {
1745
+ float nhalf = std::log(lr0 / lr_min) / k_log_2;
1746
+ float e = epochs;
1747
+ if (decay_epochs > 0 && decay_epochs < e) {
1748
+ e = decay_epochs;
1749
+ } else {
1750
+ decay_epochs = e;
1751
+ }
1752
+ scale_epoch = nhalf / e;
1753
+ }
1754
+ }
1755
+
1756
+ float lr_opt::get_lr(float epoch) const {
1757
+ float r = lr_min <= 0 ? lr0 :
1758
+ epoch >= decay_epochs ? lr_min :
1759
+ lr0 * std::pow(0.5f, epoch * scale_epoch);
1760
+ LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
1761
+ return r;
1762
+ }
1763
+
1764
+ bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) {
1765
+ llama_batch batch = llama_batch_get_one(&last_token, 1);
1766
+ batch.pos = &pos;
1767
+ if (llama_decode(ctx, batch)) {
1768
+ LOG_ERR("%s: failed to replay last token\n", __func__);
1769
+ return false;
1770
+ }
1771
+ return true;
1772
+ }
1773
+
1774
// Decode a prompt, optionally saving the session state just before the last
// token so it can be restored and the last token replayed later (needed for
// recurrent/hybrid models whose memory cannot roll back tokens).
// Advances n_past by the number of tokens decoded. Returns false on failure.
// NOTE(review): state_path.data() is passed to a C API — string_view is not
// guaranteed NUL-terminated; confirm callers pass a terminated buffer.
bool common_prompt_batch_decode(
        struct llama_context * ctx,
        const std::vector<llama_token> & tokens,
        int & n_past,
        int n_batch,
        std::string_view state_path,
        bool save_state) {
    const int n_eval = tokens.size();
    if (n_eval == 0) {
        return true;
    }

    if (save_state && n_eval > 1) {
        const int n_tokens_before_last = n_eval - 1;

        GGML_ASSERT(n_eval <= n_batch);

        // Decode all but the last token so we can save the memory state before decoding the last token.
        // This is done so we can restore the session state later and replay the last token.
        // Memory implementations in recurrent/hybrid models don't support removing tokens from their
        // memory, so we can't just remove the last token from the memory and replay the last token which
        // is the reason for this logic.
        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_tokens_before_last))) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_tokens_before_last;

        llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
        LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);

        // decode the final token separately, at an explicit position
        llama_token last_token = tokens.back();
        llama_batch batch = llama_batch_get_one(&last_token, 1);
        int32_t pos = n_past;
        batch.pos = &pos;

        if (llama_decode(ctx, batch)) {
            LOG_ERR("%s : failed to eval last token\n", __func__);
            return false;
        }
        n_past++;
    } else {
        // simple path: decode the whole prompt in one batch
        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_eval))) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_eval;
    }

    return true;
}
llama.cpp/common/common.h ADDED
@@ -0,0 +1,931 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "ggml-opt.h"
6
+ #include "llama-cpp.h"
7
+
8
+ #include <set>
9
+ #include <sstream>
10
+ #include <string>
11
+ #include <string_view>
12
+ #include <vector>
13
+ #include <map>
14
+
15
+ #if defined(_WIN32) && !defined(_WIN32_WINNT)
16
+ #define _WIN32_WINNT 0x0A00
17
+ #endif
18
+
19
+ #ifdef _WIN32
20
+ #define DIRECTORY_SEPARATOR '\\'
21
+ #else
22
+ #define DIRECTORY_SEPARATOR '/'
23
+ #endif // _WIN32
24
+
25
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
26
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
27
+
28
+ #define print_build_info() do { \
29
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
30
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
31
+ } while(0)
32
+
33
// Scoped wall-clock time measurement (RAII). Presumably the destructor adds the
// elapsed time since construction to the referenced accumulator -- the member
// definitions live in the .cpp, so confirm there.
struct common_time_meas {
    // t_acc   : accumulator (microseconds) the measurement is folded into
    // disable : when true, the measurement is a no-op -- TODO confirm in the .cpp
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();

    const int64_t t_start_us; // timestamp captured at construction (microseconds)

    int64_t & t_acc; // destination accumulator, written on destruction
};
41
+
42
// Description of one LoRA adapter requested by the user: where to load it from,
// how strongly to apply it, and the runtime handle once loaded.
struct common_adapter_lora_info {
    std::string path;  // adapter file path
    float scale;       // user-defined scaling factor applied to the adapter

    std::string task_name;     // task name associated with the adapter -- presumably metadata from the adapter file
    std::string prompt_prefix; // prompt prefix associated with the adapter -- presumably metadata from the adapter file

    struct llama_adapter_lora * ptr; // loaded adapter handle (non-owning from this struct's perspective)
};
51
+
52
+ using llama_tokens = std::vector<llama_token>;
53
+
54
+ // build info
55
+ extern int LLAMA_BUILD_NUMBER;
56
+ extern const char * LLAMA_COMMIT;
57
+ extern const char * LLAMA_COMPILER;
58
+ extern const char * LLAMA_BUILD_TARGET;
59
+
60
+ const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
61
+
62
+ struct common_control_vector_load_info;
63
+
64
+ //
65
+ // CPU utils
66
+ //
67
+
68
// Per-threadpool CPU scheduling configuration (thread count, affinity, priority, polling).
struct cpu_params {
    int n_threads = -1; // number of threads (-1 = autodetect)
    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool mask_valid = false; // Default: any CPU
    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool strict_cpu = false; // Use strict CPU placement
    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
76
+
77
+ int32_t cpu_get_num_physical_cores();
78
+ int32_t cpu_get_num_math();
79
+
80
+ //
81
+ // Common params
82
+ //
83
+
84
// Identifies which example/tool is running; presumably used by the argument
// parser to decide which options are applicable -- confirm against arg.cpp.
enum llama_example {
    LLAMA_EXAMPLE_BATCHED,
    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
    LLAMA_EXAMPLE_CLI,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,

    LLAMA_EXAMPLE_COUNT, // number of entries; keep last
};
110
+
111
// Sampler stages that can be enabled and ordered in the sampling chain.
// Values are explicit; the retired slot 5 is kept reserved (see the commented
// entry below), so do not renumber existing entries.
enum common_sampler_type {
    COMMON_SAMPLER_TYPE_NONE = 0,
    COMMON_SAMPLER_TYPE_DRY = 1,
    COMMON_SAMPLER_TYPE_TOP_K = 2,
    COMMON_SAMPLER_TYPE_TOP_P = 3,
    COMMON_SAMPLER_TYPE_MIN_P = 4,
    //COMMON_SAMPLER_TYPE_TFS_Z = 5,
    COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC = 8,
    COMMON_SAMPLER_TYPE_INFILL = 9,
    COMMON_SAMPLER_TYPE_PENALTIES = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
    COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
};
126
+
127
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
    DIMRE_METHOD_PCA,  // principal component analysis
    DIMRE_METHOD_MEAN, // simple mean of the differences
};

// Whether chat/conversation mode is on, off, or decided automatically
// (presumably from the model's chat template -- confirm in common.cpp).
enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_DISABLED = 0,
    COMMON_CONVERSATION_MODE_ENABLED = 1,
    COMMON_CONVERSATION_MODE_AUTO = 2,
};
138
+
139
// How a lazy-grammar trigger is matched against the generated output.
enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,        // match a specific token id
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,         // match a literal word/string
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,      // match a regex pattern
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, // match a regex pattern against the full output -- TODO confirm exact semantics
};

// One trigger that activates a lazy grammar. `value` holds the word/pattern;
// `token` is only meaningful for the TOKEN trigger type.
struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;                    // word or pattern, depending on type
    llama_token token = LLAMA_TOKEN_NULL; // token id for TYPE_TOKEN triggers
};
151
+
152
// Bitfield flags recording which sampling parameters were explicitly set by the
// user (stored in common_params_sampling::user_sampling_config).
// The underlying type is uint64_t, so the shifts use unsigned long long literals:
// with plain `1 << n` the shift is done in (signed) int and becomes undefined
// behavior once a flag index reaches 31. Values are unchanged.
enum common_params_sampling_config : uint64_t {
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1ull << 0,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1ull << 1,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1ull << 2,
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1ull << 3,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1ull << 4,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1ull << 5,
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1ull << 6,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1ull << 7,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1ull << 8,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1ull << 9,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1ull << 10,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1ull << 11,
};
166
+
167
// Which speculative-decoding strategy to use (draft model vs. various
// self-speculative n-gram schemes).
enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
    COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD, // n-gram variant -- semantics defined by common_ngram_mod, TODO confirm
    COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
    COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};
178
+
179
+ // sampling parameters
180
// sampling parameters
// All knobs controlling token sampling: truncation samplers, penalties,
// mirostat, grammar constraints and logit biases, plus their default values.
struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev = 64; // number of previous tokens to remember
    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k = 40; // <= 0 to use vocab size
    float top_p = 0.95f; // 1.0 = disabled
    float min_p = 0.05f; // 0.0 = disabled
    float xtc_probability = 0.00f; // 0.0 = disabled
    float xtc_threshold = 0.10f; // > 0.5 disables XTC
    float typ_p = 1.00f; // typical_p, 1.0 = disabled
    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
    float dynatemp_range = 0.00f; // 0.0 = disabled
    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float penalty_repeat = 1.00f; // 1.0 = disabled
    float penalty_freq = 0.00f; // 0.0 = disabled
    float penalty_present = 0.00f; // 0.0 = disabled
    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
    float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float top_n_sigma = -1.00f; // -1.0 = disabled
    float mirostat_tau = 5.00f; // target entropy
    float mirostat_eta = 0.10f; // learning rate
    bool ignore_eos = false; // do not end generation on EOS -- presumably via the EOG logit biases below
    bool no_perf = false; // disable performance metrics
    bool timing_per_token = false; // report timing per generated token -- TODO confirm consumer

    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers

    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

    // default sampler chain, applied in this order
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_MIN_P,
        COMMON_SAMPLER_TYPE_XTC,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

    std::string grammar; // optional BNF-like grammar to constrain sampling
    bool grammar_lazy = false; // activate the grammar only after a trigger fires
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token> preserved_tokens; // tokens exempt from grammar constraints -- TODO confirm semantics

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    bool backend_sampling = false; // perform sampling on the backend -- TODO confirm consumer

    // true when any user-specified logit bias is present
    bool has_logit_bias() const {
        return !logit_bias.empty();
    }

    // print the parameters into a string
    std::string print() const;
};
246
+
247
// Where to obtain a model from: at most one of local path, direct URL, HF repo
// or Docker repo is expected to be used -- TODO confirm precedence in common.cpp.
struct common_params_model {
    std::string path = ""; // model local path // NOLINT
    std::string url = ""; // model url to download // NOLINT
    std::string hf_repo = ""; // HF repo // NOLINT
    std::string hf_file = ""; // HF file // NOLINT
    std::string docker_repo = ""; // Docker repo // NOLINT
    std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};
255
+
256
+ struct common_ngram_mod;
257
+
258
// Parameters for speculative decoding: shared limits, n-gram lookup settings,
// and the configuration of an optional draft model/context.
struct common_params_speculative {
    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

    // general-purpose speculative decoding parameters

    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
    float p_split = 0.1f; // speculative decoding split probability
    float p_min = 0.75f; // minimum speculative decoding probability (greedy)

    // ngram-based speculative decoding

    uint16_t ngram_size_n = 12; // ngram size for lookup
    uint16_t ngram_size_m = 48; // mgram size for speculative tokens
    uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed

    std::shared_ptr<common_ngram_mod> ngram_mod; // shared state for the NGRAM_MOD strategy -- TODO confirm

    std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT

    // draft-model speculative decoding

    struct common_params_model mparams_dft; // where to load the draft model from

    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts

    llama_context_params cparams_dft; // these are the parameters for the draft llama_context

    int32_t n_ctx = 0; // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    struct cpu_params cpuparams;       // CPU settings for the draft context
    struct cpu_params cpuparams_batch; // CPU settings for draft batch processing

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    // true when a draft model source (path or HF repo) was configured
    bool has_dft() const {
        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
    }
};
305
+
306
// Vocoder (TTS) parameters: the vocoder model, an optional speaker file and
// the guide-token toggle.
struct common_params_vocoder {
    struct common_params_model model; // vocoder model source

    std::string speaker_file = ""; // speaker file path // NOLINT

    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
313
+
314
// Parameters for diffusion-based generation (llama-diffusion example).
struct common_params_diffusion {
    int32_t steps = 128;      // number of diffusion steps
    bool visual_mode = false; // visualize intermediate steps -- TODO confirm

    float eps = 0; // epsilon for timesteps
    int32_t block_length = 0; // block length for generation

    int32_t algorithm = 4; // default algorithm: low-confidence
    float alg_temp = 0.0f; // algorithm temperature

    float cfg_scale = 0; // classifier-free guidance scale
    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};
327
+
328
// reasoning API response format (not to be confused as chat template's reasoning format)
// only used by server
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
    // do not extend this enum unless you absolutely have to
    // in most cases, use COMMON_REASONING_FORMAT_AUTO
    // see: https://github.com/ggml-org/llama.cpp/pull/15408
};
339
+
340
+
341
// Learning-rate schedule parameters for fine-tuning.
// Fix: `epoch` was left uninitialized; reading it (e.g. via get_lr()) before the
// optimizer loop assigned it yielded an indeterminate value. It is now
// zero-initialized, which matches the first epoch.
struct lr_opt {
    float lr0 = 1e-5; // learning rate at first epoch
    float lr_min = -1; // final learning rate after decay (-1 = unset)
    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float scale_epoch = 0; // epoch scaling factor -- TODO confirm semantics in the .cpp
    float wd = 0; // weight decay
    unsigned epochs = 2; // total number of training epochs

    unsigned epoch = 0; // set by optimizer outer (epochs) loop; starts at 0
    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;
    float get_lr() const { return get_lr(epoch); }
    // must call after arg parse, before get_lr
    void init();
};
356
+
357
+ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
358
+
359
// Aggregate of all command-line-configurable parameters shared by the examples
// and tools (context/batch sizes, offloading, sampling, server settings, and
// per-example options). Populated by the argument parser; defaults below are
// the effective defaults when an option is not given.
struct common_params {
    int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
    int32_t n_ctx = 0; // context size, 0 == context the model was trained with
    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel = 1; // number of parallel sequences to decode
    int32_t n_sequences = 1; // number of sequences to decode
    int32_t grp_attn_n = 1; // group-attention factor
    int32_t grp_attn_w = 512; // group-attention width
    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
    float rope_freq_base = 0.0f; // RoPE base frequency
    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
    float yarn_beta_fast = -1.0f; // YaRN low correction dim
    float yarn_beta_slow = -1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx = 0; // YaRN original context length

    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;       // CPU settings for generation
    struct cpu_params cpuparams_batch; // CPU settings for batch processing

    ggml_backend_sched_eval_callback cb_eval = nullptr; // optional scheduler eval callback
    void * cb_eval_user_data = nullptr;                 // user data passed to cb_eval

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention

    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder vocoder;
    struct common_params_diffusion diffusion;

    struct common_params_model model; // main model source

    std::set<std::string> model_alias; // model aliases // NOLINT
    std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
    std::string hf_token = ""; // HF token // NOLINT
    std::string prompt = ""; // NOLINT
    std::string system_prompt = ""; // NOLINT
    std::string prompt_file = ""; // store the external prompt file name // NOLINT
    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
    std::string logits_file = ""; // file for saving *all* logits // NOLINT

    // llama-debug specific options
    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
    bool save_logits = false; // whether to save logits to files // NOLINT
    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT

    std::vector<std::string> in_files; // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t verbosity = 3; // LOG_LEVEL_INFO
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end = -1; // layer range for control vector
    bool offline = false; // disable network access -- TODO confirm exact scope

    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
    // (which is more convenient to use for plotting)
    //
    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

    bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

    bool kl_divergence = false; // compute KL divergence

    bool usage = false; // print usage
    bool completion = false; // print source-able completion script
    bool use_color = false; // use color to distinguish generations and inputs
    bool special = false; // enable special token output
    bool interactive = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
    bool prompt_cache_all = false; // save user input and generations to prompt cache
    bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input = false; // reverse the usage of `\`
    bool simple_io = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching = true; // insert new sequences for decoding on-the-fly
    bool no_perf = false; // disable performance metrics
    bool show_timings = true; // show timing information on CLI
    bool ctx_shift = false; // context shift on infinite text generation
    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified = false; // enable unified KV cache

    bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap = true; // enable mmap to use filesystem cache
    bool use_direct_io = false; // read from disk without buffering
    bool use_mlock = false; // use mlock to keep model in memory
    bool verbose_prompt = false; // print prompt tokens before generation
    bool display_prompt = true; // print prompt before generation
    bool no_kv_offload = false; // disable KV offloading
    bool warmup = true; // warmup run
    bool check_tensors = false; // validate tensor data
    bool no_op_offload = false; // globally disable offload host tensor operations to device
    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
    bool no_host = false; // bypass host buffer allowing extra buffers to be used

    bool single_turn = false; // single turn chat conversation

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // multimodal models (see tools/mtmd)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true; // use GPU for multimodal model
    bool no_mmproj = false; // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
    int image_min_tokens = -1; // minimum image tokens (-1 = default) -- TODO confirm
    int image_max_tokens = -1; // maximum image tokens (-1 = default) -- TODO confirm

    // finetune
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of the data used for the validation set

    // embedding
    bool embedding = false; // get only sentence embedding
    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep = "\n"; // separator of embeddings
    std::string cls_sep = "\t"; // separator of classification sequences

    // server params
    int32_t port = 8080; // server listens on this network port
    int32_t timeout_read = 600; // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
    bool cache_prompt = true; // whether to enable prompt caching
    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname = "127.0.0.1";
    std::string public_path = ""; // NOLINT
    std::string api_prefix = ""; // NOLINT
    std::string chat_template = ""; // NOLINT
    bool use_jinja = true; // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

    std::vector<std::string> api_keys;

    std::string ssl_file_key = ""; // NOLINT
    std::string ssl_file_cert = ""; // NOLINT

    std::map<std::string, std::string> default_template_kwargs;

    // webui configs
    bool webui = true;
    std::string webui_config_json;

    // "advanced" endpoints are disabled by default for better security
    bool endpoint_slots = true;
    bool endpoint_props = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

    // router server configs
    std::string models_dir = ""; // directory containing models for the router server
    std::string models_preset = ""; // directory containing model presets for the router server
    int models_max = 4; // maximum number of models to load simultaneously
    bool models_autoload = true; // automatically load models when requested via the router server

    bool log_json = false;

    std::string slot_save_path;
    std::string media_path; // path to directory for loading media files

    float slot_prompt_similarity = 0.1f;

    // batched-bench params
    bool is_pp_shared = false;
    bool is_tg_separate = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
    std::vector<int32_t> n_pl;

    // retrieval params
    std::vector<std::string> context_files; // context files to embed

    int32_t chunk_size = 64; // chunk size for context embedding

    std::string chunk_separator = "\n"; // chunk separator for context embedding

    // passkey params
    int32_t n_junk = 250; // number of times to repeat the junk text
    int32_t i_pos = -1; // position of the passkey in the junk text

    // imatrix params
    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk = 0; // start processing from this chunk
    int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl = true; // whether to compute perplexity
    bool show_statistics = false; // show imatrix statistics per tensor
    bool parse_special = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    // batched-bench params
    bool batched_bench_output_jsonl = false;

    // common params
    std::string out_file; // output filename for all example programs
    // optional callback for model loading progress and cancellation:
    // called with a progress value between 0.0 and 1.0.
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void * load_progress_callback_user_data = NULL;
};
618
+
619
+ // call once at the start of a program if it uses libcommon
620
+ // initializes the logging system and prints info about the build
621
+ void common_init();
622
+
623
+ std::string common_params_get_system_info(const common_params & params);
624
+
625
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
626
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
627
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
628
+ bool set_process_priority(enum ggml_sched_priority prio);
629
+
630
+ //
631
+ // String utils
632
+ //
633
+
634
+ #ifdef __GNUC__
635
+ # if defined(__MINGW32__) && !defined(__clang__)
636
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
637
+ # else
638
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
639
+ # endif
640
+ #else
641
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
642
+ #endif
643
+
644
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
645
+ std::string string_format(const char * fmt, ...);
646
+
647
+ std::string string_strip(const std::string & str);
648
+ std::string string_get_sortable_timestamp();
649
+
650
+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
651
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
652
+ std::string string_repeat(const std::string & str, size_t n);
653
+
654
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
655
+
656
+ std::string regex_escape(const std::string & s);
657
+
658
// Split `str` on `delim` and parse each token into a T via stream extraction.
// Tokens that fail to parse yield a value-initialized T.
// note: previously a failed extraction could push an *uninitialized* T
// (undefined behavior for non-arithmetic T); `T value{}` makes the element
// well-defined in all cases (arithmetic types are already zeroed on failure
// since C++11, so their behavior is unchanged)
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value{}; // value-initialize so a failed extraction cannot leave garbage
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

// specialization for std::string: plain substring split that preserves empty
// tokens ("a,,b" -> {"a", "", "b"}); an empty input yields one empty token
template<>
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t delim_pos = str.find(delim);
    while (delim_pos != std::string::npos) {
        std::string part = str.substr(begin_pos, delim_pos - begin_pos);
        parts.emplace_back(part);
        begin_pos = delim_pos + 1;
        delim_pos = str.find(delim, begin_pos);
    }
    parts.emplace_back(str.substr(begin_pos));
    return parts;
}
688
+
689
// true if `str` begins with `prefix`
// remove when moving to c++20 (std::string_view::starts_with)
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
    if (prefix.size() > str.size()) {
        return false;
    }
    return str.substr(0, prefix.size()) == prefix; // view comparison - no allocation
}
694
+
695
// true if `str` ends with `suffix`
// remove when moving to c++20 (std::string_view::ends_with)
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    return str.substr(str.size() - suffix.size()) == suffix; // view comparison - no allocation
}

// if `str` ends with `suffix`, chop the suffix off in place and return true;
// otherwise leave `str` untouched and return false
inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
    const bool matched = string_ends_with(str, suffix);
    if (matched) {
        str.resize(str.size() - suffix.size());
    }
    return matched;
}

// find the position where `str` ends with a non-empty prefix of `stop`
// (i.e. a partially received stop sequence); returns npos when there is none.
// Longer prefixes are preferred, so the earliest such position is returned.
inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
    if (str.empty() || stop.empty()) {
        return std::string::npos;
    }
    const char last_char = str.back();
    for (size_t len = std::min(str.size(), stop.size()); len > 0; --len) {
        // quick reject: a matching prefix of `stop` must end with str's last char
        if (stop[len - 1] != last_char) {
            continue;
        }
        if (string_ends_with(str, stop.substr(0, len))) {
            return str.size() - len;
        }
    }
    return std::string::npos;
}
723
+
724
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
725
+ void string_process_escapes(std::string & input);
726
+
727
+ std::string string_from(bool value);
728
+ std::string string_from(const std::vector<int> & values);
729
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
730
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
731
+
732
+ //
733
+ // Filesystem utils
734
+ //
735
+
736
+ bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
737
+ bool fs_create_directory_with_parents(const std::string & path);
738
+ bool fs_is_directory(const std::string & path);
739
+
740
+ std::string fs_get_cache_directory();
741
+ std::string fs_get_cache_file(const std::string & filename);
742
+
743
// a single filesystem entry as returned by fs_list()
struct common_file_info {
    std::string path;     // full path of the entry
    std::string name;     // name of the entry (without the leading directories)
    size_t size = 0; // in bytes
    bool is_dir = false;  // true if the entry is a directory
};
749
+ std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
750
+
751
+ //
752
+ // TTY utils
753
+ //
754
+
755
+ // Auto-detect if colors can be enabled based on terminal and environment
756
+ bool tty_can_use_colors();
757
+
758
+ //
759
+ // Model utils
760
+ //
761
+
762
+ struct common_sampler;
763
+
764
// note: defines the model, context, samplers, etc. lifetimes
765
// Owns the llama model, context, per-sequence samplers and LoRA adapters for
// one initialization; everything is released when this object is destroyed.
// The implementation is hidden behind a pimpl to keep this header light.
struct common_init_result {
    common_init_result(common_params & params);
    ~common_init_result();

    // borrowed pointers - lifetime is managed by this object
    llama_model * model();
    llama_context * context();

    // per-sequence sampler access; reset_samplers() resets all of them
    common_sampler * sampler(llama_seq_id seq_id);
    void reset_samplers();

    // LoRA adapters associated with this model
    std::vector<llama_adapter_lora_ptr> & lora();

private:
    struct impl; // hidden implementation (pimpl idiom)
    std::unique_ptr<impl> pimpl;
};
781
+
782
+ using common_init_result_ptr = std::unique_ptr<common_init_result>;
783
+
784
+ common_init_result_ptr common_init_from_params(common_params & params);
785
+
786
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
787
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
788
+ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
789
+
790
+ // clear LoRA adapters from context, then apply new list of adapters
791
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
792
+
793
+ std::string get_model_endpoint();
794
+
795
+ //
796
+ // Batch utils
797
+ //
798
+
799
+ void common_batch_clear(struct llama_batch & batch);
800
+
801
+ void common_batch_add(
802
+ struct llama_batch & batch,
803
+ llama_token id,
804
+ llama_pos pos,
805
+ const std::vector<llama_seq_id> & seq_ids,
806
+ bool logits);
807
+
808
+ // decodes a single batch of tokens for a prompt and manages session tokens
809
+ //
810
+ // Note: We save state before the last token so that we can replay it to ensure
811
+ // compatibility with all memory types. Recurrent/hybrid models cannot remove
812
+ // tokens from memory, so this approach works across all model architectures.
813
+ bool common_prompt_batch_decode(
814
+ struct llama_context * ctx,
815
+ const std::vector<llama_token> & embd,
816
+ int & n_past,
817
+ int n_batch,
818
+ std::string_view state_path,
819
+ bool save_state);
820
+
821
+ // replays the last token after loading state to regenerate logits
822
+ // used after loading session state to ensure the sampling context has valid logits
823
+ bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
824
+
825
+ //
826
+ // Vocab utils
827
+ //
828
+
829
+ // tokenizes a string into a vector of tokens
830
+ // should work similar to Python's `tokenizer.encode`
831
+ std::vector<llama_token> common_tokenize(
832
+ const struct llama_context * ctx,
833
+ const std::string & text,
834
+ bool add_special,
835
+ bool parse_special = false);
836
+
837
+ std::vector<llama_token> common_tokenize(
838
+ const struct llama_vocab * vocab,
839
+ const std::string & text,
840
+ bool add_special,
841
+ bool parse_special = false);
842
+
843
+ // tokenizes a token into a piece, optionally renders special/control tokens
844
+ // should work similar to Python's `tokenizer.id_to_piece`
845
+ std::string common_token_to_piece(
846
+ const struct llama_context * ctx,
847
+ llama_token token,
848
+ bool special = true);
849
+
850
+ std::string common_token_to_piece(
851
+ const struct llama_vocab * vocab,
852
+ llama_token token,
853
+ bool special = true);
854
+
855
+ // detokenizes a vector of tokens into a string
856
+ // should work similar to Python's `tokenizer.decode`
857
+ // optionally renders special/control tokens
858
+ std::string common_detokenize(
859
+ const struct llama_context * ctx,
860
+ const std::vector<llama_token> & tokens,
861
+ bool special = true);
862
+
863
+ std::string common_detokenize(
864
+ const struct llama_vocab * vocab,
865
+ const std::vector<llama_token> & tokens,
866
+ bool special = true);
867
+
868
+ //
869
+ // Embedding utils
870
+ //
871
+
872
// TODO: replace embd_norm with an enum
873
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
874
+
875
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
876
+
877
+ //
878
+ // Control vector utils
879
+ //
880
+
881
// a loaded control vector, flattened across layers
struct common_control_vector_data {
    int n_embd; // embedding dimension of each per-layer vector

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};
887
+
888
// one control vector file to load, plus the scale to apply to it
struct common_control_vector_load_info {
    float strength; // scale factor applied to this vector before summing

    std::string fname; // file containing the control vector
};
893
+
894
+ // Load control vectors, scale each by strength, and add them together.
895
+ // On error, returns {-1, empty}
896
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
897
+
898
+ //
899
+ // Split utils
900
+ //
901
+
902
// GGUF metadata keys for split (sharded) model files.
// note: use inline constexpr instead of an anonymous namespace - an anonymous
// namespace in a header gives every translation unit its own internal copy,
// which is a known anti-pattern; inline variables (C++17) have exactly one
// definition across all TUs while keeping the same usage for callers
inline constexpr const char * LLM_KV_SPLIT_NO            = "split.no";
inline constexpr const char * LLM_KV_SPLIT_COUNT         = "split.count";
inline constexpr const char * LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
909
+
910
+ //
911
+ // MoE utils
912
+ //
913
+
914
// regex matching the MoE expert FFN tensor names; "(ch|)exps" covers both the
// "exps" and "chexps" naming variants
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

// build the tensor-name regex for the expert FFN tensors of layer `idx`
inline std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}

// buffer-type override that keeps all expert FFN tensors in CPU memory
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
923
+
924
+ //
925
+ // training utils
926
+ //
927
+
928
+ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
929
+
930
+ // "adamw" or "sgd" (case insensitive)
931
+ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
llama.cpp/common/console.cpp ADDED
@@ -0,0 +1,1137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "console.h"
2
+ #include "log.h"
3
+ #include <vector>
4
+ #include <iostream>
5
+ #include <cassert>
6
+ #include <cstddef>
7
+ #include <cctype>
8
+ #include <cwctype>
9
+ #include <cstdint>
10
+ #include <condition_variable>
11
+ #include <mutex>
12
+ #include <thread>
13
+ #include <stdarg.h>
14
+
15
+ #if defined(_WIN32)
16
+ #define WIN32_LEAN_AND_MEAN
17
+ #ifndef NOMINMAX
18
+ #define NOMINMAX
19
+ #endif
20
+ #include <windows.h>
21
+ #include <fcntl.h>
22
+ #include <io.h>
23
+ #ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
24
+ #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
25
+ #endif
26
+ #else
27
+ #include <climits>
28
+ #include <sys/ioctl.h>
29
+ #include <unistd.h>
30
+ #include <wchar.h>
31
+ #include <stdio.h>
32
+ #include <stdlib.h>
33
+ #include <signal.h>
34
+ #include <termios.h>
35
+ #endif
36
+
37
+ #define ANSI_COLOR_RED "\x1b[31m"
38
+ #define ANSI_COLOR_GREEN "\x1b[32m"
39
+ #define ANSI_COLOR_YELLOW "\x1b[33m"
40
+ #define ANSI_COLOR_BLUE "\x1b[34m"
41
+ #define ANSI_COLOR_MAGENTA "\x1b[35m"
42
+ #define ANSI_COLOR_CYAN "\x1b[36m"
43
+ #define ANSI_COLOR_GRAY "\x1b[90m"
44
+ #define ANSI_COLOR_RESET "\x1b[0m"
45
+ #define ANSI_BOLD "\x1b[1m"
46
+
47
+ namespace console {
48
+
49
+ #if defined (_WIN32)
50
+ namespace {
51
+ // Use private-use unicode values to represent special keys that are not reported
52
+ // as characters (e.g. arrows on Windows). These values should never clash with
53
+ // real input and let the rest of the code handle navigation uniformly.
54
+ static constexpr char32_t KEY_ARROW_LEFT = 0xE000;
55
+ static constexpr char32_t KEY_ARROW_RIGHT = 0xE001;
56
+ static constexpr char32_t KEY_ARROW_UP = 0xE002;
57
+ static constexpr char32_t KEY_ARROW_DOWN = 0xE003;
58
+ static constexpr char32_t KEY_HOME = 0xE004;
59
+ static constexpr char32_t KEY_END = 0xE005;
60
+ static constexpr char32_t KEY_CTRL_ARROW_LEFT = 0xE006;
61
+ static constexpr char32_t KEY_CTRL_ARROW_RIGHT = 0xE007;
62
+ static constexpr char32_t KEY_DELETE = 0xE008;
63
+ }
64
+
65
+ //
66
+ // Console state
67
+ //
68
+ #endif
69
+
70
+ static bool advanced_display = false;
71
+ static bool simple_io = true;
72
+ static display_type current_display = DISPLAY_TYPE_RESET;
73
+
74
+ static FILE* out = stdout;
75
+
76
+ #if defined (_WIN32)
77
+ static void* hConsole;
78
+ #else
79
+ static FILE* tty = nullptr;
80
+ static termios initial_state;
81
+ #endif
82
+
83
+ //
84
+ // Init and cleanup
85
+ //
86
+
87
// Initialize console I/O.
//   use_simple_io:        line-buffered, echoing input (no raw-mode line editing)
//   use_advanced_display: enable ANSI color/escape output where supported
// On Windows this configures the console handles and codepages; on POSIX it
// switches the terminal to non-canonical, no-echo mode and writes control
// sequences through /dev/tty when available. Falls back to simple IO whenever
// the console cannot be configured.
void init(bool use_simple_io, bool use_advanced_display) {
    advanced_display = use_advanced_display;
    simple_io = use_simple_io;
#if defined(_WIN32)
    // Windows-specific console initialization
    DWORD dwMode = 0;
    hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
    if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
        // stdout is not a console (e.g. redirected) - try stderr instead
        hConsole = GetStdHandle(STD_ERROR_HANDLE);
        if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
            hConsole = nullptr;
            simple_io = true;
        }
    }
    if (hConsole) {
        // Check conditions combined to reduce nesting
        if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
            !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
            advanced_display = false; // this console cannot do ANSI escapes
        }
        // Set console output codepage to UTF8
        SetConsoleOutputCP(CP_UTF8);
    }
    HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
    if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
        // Set console input codepage to UTF16
        _setmode(_fileno(stdin), _O_WTEXT);

        // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
        if (simple_io) {
            dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
        } else {
            dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
        }
        if (!SetConsoleMode(hConIn, dwMode)) {
            simple_io = true; // could not switch to raw input - fall back
        }
    }
    if (simple_io) {
        _setmode(_fileno(stdin), _O_U8TEXT);
    }
#else
    // POSIX-specific console initialization
    if (!simple_io) {
        struct termios new_termios;
        tcgetattr(STDIN_FILENO, &initial_state); // saved so cleanup() can restore it
        new_termios = initial_state;
        new_termios.c_lflag &= ~(ICANON | ECHO); // per-key input, no echo
        new_termios.c_cc[VMIN] = 1;              // block until at least one byte
        new_termios.c_cc[VTIME] = 0;             // no read timeout
        tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);

        // write control sequences to the terminal even if stdout is redirected
        tty = fopen("/dev/tty", "w+");
        if (tty != nullptr) {
            out = tty;
        }
    }

    setlocale(LC_ALL, "");
#endif
}
148
+
149
// Restore the console to its initial state: reset colors and, on POSIX,
// close /dev/tty and restore the termios settings saved by init().
void cleanup() {
    // Reset console display
    set_display(DISPLAY_TYPE_RESET);

#if !defined(_WIN32)
    // Restore settings on POSIX systems
    if (!simple_io) {
        if (tty != nullptr) {
            out = stdout; // redirect output back before closing /dev/tty
            fclose(tty);
            tty = nullptr;
        }
        tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
    }
#endif
}
165
+
166
+ //
167
+ // Display and IO
168
+ //
169
+
170
// Keep track of current display and only emit ANSI code if it changes
void set_display(display_type display) {
    if (advanced_display && current_display != display) {
        // flush pending log output so the color change does not bleed into it
        common_log_flush(common_log_main());
        switch(display) {
            case DISPLAY_TYPE_RESET:
                fprintf(out, ANSI_COLOR_RESET);
                break;
            case DISPLAY_TYPE_INFO:
                fprintf(out, ANSI_COLOR_MAGENTA);
                break;
            case DISPLAY_TYPE_PROMPT:
                fprintf(out, ANSI_COLOR_YELLOW);
                break;
            case DISPLAY_TYPE_REASONING:
                fprintf(out, ANSI_COLOR_GRAY);
                break;
            case DISPLAY_TYPE_USER_INPUT:
                fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
                break;
            case DISPLAY_TYPE_ERROR:
                fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
        }
        current_display = display;
        fflush(out);
    }
}
197
+
198
// Read one Unicode code point from the console; returns WEOF on end of input.
// On Windows, non-character keys (arrows, Home/End, Delete) are mapped to the
// private-use KEY_* constants. UTF-16 surrogate pairs are combined into a
// single code point on both platforms.
static char32_t getchar32() {
#if defined(_WIN32)
    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
    wchar_t high_surrogate = 0;

    while (true) {
        INPUT_RECORD record;
        DWORD count;
        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
            return WEOF;
        }

        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
            if (wc == 0) {
                // no character: a navigation/editing key - translate to KEY_* codes
                const DWORD ctrl_mask = LEFT_CTRL_PRESSED | RIGHT_CTRL_PRESSED;
                const bool ctrl_pressed = (record.Event.KeyEvent.dwControlKeyState & ctrl_mask) != 0;
                switch (record.Event.KeyEvent.wVirtualKeyCode) {
                    case VK_LEFT: return ctrl_pressed ? KEY_CTRL_ARROW_LEFT : KEY_ARROW_LEFT;
                    case VK_RIGHT: return ctrl_pressed ? KEY_CTRL_ARROW_RIGHT : KEY_ARROW_RIGHT;
                    case VK_UP: return KEY_ARROW_UP;
                    case VK_DOWN: return KEY_ARROW_DOWN;
                    case VK_HOME: return KEY_HOME;
                    case VK_END: return KEY_END;
                    case VK_DELETE: return KEY_DELETE;
                    default: continue; // e.g. a bare modifier key - keep reading
                }
            }

            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
                high_surrogate = wc;
                continue;
            }
            if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
                if (high_surrogate != 0) { // Check if we have a high surrogate
                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
                }
            }

            high_surrogate = 0; // Reset the high surrogate
            return static_cast<char32_t>(wc);
        }
    }
#else
    wchar_t wc = getwchar();
    if (static_cast<wint_t>(wc) == WEOF) {
        return WEOF;
    }

#if WCHAR_MAX == 0xFFFF
    // 16-bit wchar_t platforms: combine UTF-16 surrogate pairs manually
    if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
        wchar_t low_surrogate = getwchar();
        if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
            return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
        }
    }
    if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
        return 0xFFFD; // Return the replacement character U+FFFD
    }
#endif

    return static_cast<char32_t>(wc);
#endif
}
262
+
263
// Move the cursor one cell back; on the Windows console path this wraps to
// the last column of the previous row when the cursor is in column 0,
// otherwise a plain backspace is emitted.
static void pop_cursor() {
#if defined(_WIN32)
    if (hConsole != NULL) {
        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
        GetConsoleScreenBufferInfo(hConsole, &bufferInfo);

        COORD newCursorPosition = bufferInfo.dwCursorPosition;
        if (newCursorPosition.X == 0) {
            // start of a row: wrap back to the last column of the row above
            newCursorPosition.X = bufferInfo.dwSize.X - 1;
            newCursorPosition.Y -= 1;
        } else {
            newCursorPosition.X -= 1;
        }

        SetConsoleCursorPosition(hConsole, newCursorPosition);
        return;
    }
#endif
    putc('\b', out);
}
283
+
284
// Estimate how many terminal columns `codepoint` will occupy.
// On Windows we assume 1 and let put_codepoint() measure the real width;
// on POSIX defer to wcwidth(), which may return -1 for non-printables.
static int estimateWidth(char32_t codepoint) {
#if defined(_WIN32)
    (void)codepoint;
    return 1;
#else
    return wcwidth(codepoint);
#endif
}
292
+
293
// Write one UTF-8 encoded code point (`length` bytes) and return the number
// of terminal columns it actually occupied. `expectedWidth` is the caller's
// estimate; it is trusted when non-negative, otherwise the width is measured
// by querying the cursor position before and after writing.
static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
#if defined(_WIN32)
    CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
    if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
        // go with the default
        return expectedWidth;
    }
    COORD initialPosition = bufferInfo.dwCursorPosition;
    DWORD nNumberOfChars = length;
    WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);

    CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
    GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);

    // Figure out our real position if we're in the last column
    if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
        DWORD nNumberOfChars;
        WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
        GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
    }

    int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
    if (width < 0) {
        // the cursor wrapped to the next row
        width += newBufferInfo.dwSize.X;
    }
    return width;
#else
    // We can trust expectedWidth if we've got one
    if (expectedWidth >= 0 || tty == nullptr) {
        fwrite(utf8_codepoint, length, 1, out);
        return expectedWidth;
    }

    fputs("\033[6n", tty); // Query cursor position
    int x1;
    int y1;
    int x2;
    int y2;
    int results = 0;
    results = fscanf(tty, "\033[%d;%dR", &y1, &x1);

    fwrite(utf8_codepoint, length, 1, tty);

    fputs("\033[6n", tty); // Query cursor position
    results += fscanf(tty, "\033[%d;%dR", &y2, &x2);

    if (results != 4) {
        // could not parse both cursor reports - fall back to the estimate
        return expectedWidth;
    }

    int width = x2 - x1;
    if (width < 0) {
        // Calculate the width considering text wrapping
        struct winsize w;
        ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
        width += w.ws_col;
    }
    return width;
#endif
}
353
+
354
// Overwrite the character cell immediately before the cursor with `ch`.
static void replace_last(char ch) {
#if defined(_WIN32)
    pop_cursor();
    put_codepoint(&ch, 1, 1);
#else
    fprintf(out, "\b%c", ch);
#endif
}
362
+
363
// Decode one UTF-8 encoded code point from `input` starting at byte `pos`.
// `advance` receives the number of bytes consumed. Malformed or truncated
// sequences consume exactly one byte and decode to U+FFFD.
static char32_t decode_utf8(const std::string & input, size_t pos, size_t & advance) {
    const unsigned char lead = static_cast<unsigned char>(input[pos]);

    // ASCII fast path
    if ((lead & 0x80u) == 0u) {
        advance = 1;
        return lead;
    }

    // number of continuation bytes implied by the lead byte (0 = invalid lead)
    size_t n_cont = 0;
    char32_t cp = 0;
    if ((lead & 0xE0u) == 0xC0u) {
        n_cont = 1;
        cp = lead & 0x1Fu;
    } else if ((lead & 0xF0u) == 0xE0u) {
        n_cont = 2;
        cp = lead & 0x0Fu;
    } else if ((lead & 0xF8u) == 0xF0u) {
        n_cont = 3;
        cp = lead & 0x07u;
    }

    if (n_cont == 0 || pos + n_cont >= input.size()) {
        advance = 1;
        return 0xFFFD; // invalid lead byte or truncated sequence
    }

    for (size_t i = 1; i <= n_cont; ++i) {
        const unsigned char cont = static_cast<unsigned char>(input[pos + i]);
        if ((cont & 0xC0u) != 0x80u) {
            advance = 1;
            return 0xFFFD; // continuation byte missing its 10xxxxxx tag
        }
        cp = (cp << 6) | (cont & 0x3Fu);
    }

    advance = n_cont + 1;
    return cp;
}
408
+
409
// Append the UTF-8 encoding of `ch` to `out`; code points above U+10FFFF are
// not valid Unicode and are silently dropped.
static void append_utf8(char32_t ch, std::string & out) {
    if (ch <= 0x7F) {
        out.push_back(static_cast<unsigned char>(ch));
        return;
    }

    size_t n_bytes = 0;
    unsigned char lead_tag = 0;
    if (ch <= 0x7FF) {
        n_bytes = 2; lead_tag = 0xC0;
    } else if (ch <= 0xFFFF) {
        n_bytes = 3; lead_tag = 0xE0;
    } else if (ch <= 0x10FFFF) {
        n_bytes = 4; lead_tag = 0xF0;
    } else {
        return; // invalid code point - append nothing
    }

    // lead byte carries the top bits; each continuation byte carries 6 more
    out.push_back(static_cast<unsigned char>(lead_tag | (ch >> (6 * (n_bytes - 1)))));
    for (size_t i = n_bytes - 1; i > 0; --i) {
        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> (6 * (i - 1))) & 0x3F)));
    }
}
428
+
429
// Return the byte offset of the UTF-8 character that precedes offset `pos`
// in `line` (0 if `pos` is already at the start).
static size_t prev_utf8_char_pos(const std::string & line, size_t pos) {
    if (pos == 0) {
        return 0;
    }
    // step left over continuation bytes (10xxxxxx) until a lead byte is found
    do {
        pos--;
    } while (pos > 0 && (static_cast<unsigned char>(line[pos]) & 0xC0) == 0x80);
    return pos;
}
438
+
439
// Return the byte offset of the UTF-8 character that follows offset `pos`
// in `line` (line.length() if `pos` is already at or past the end).
static size_t next_utf8_char_pos(const std::string & line, size_t pos) {
    const size_t n = line.length();
    if (pos >= n) {
        return n;
    }
    // advance one byte, then skip any continuation bytes (10xxxxxx)
    for (pos++; pos < n && (static_cast<unsigned char>(line[pos]) & 0xC0) == 0x80; pos++) {
    }
    return pos;
}
447
+
448
+ static void move_cursor(int delta);
449
+ static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
450
+ static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
451
+ static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths);
452
+ static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
453
+
454
// Delete the character under the cursor from `line` and its width table,
// repaint the remainder of the line and restore the cursor. `char_pos`
// (character index) and `byte_pos` (byte offset) are left pointing at the
// character that followed the deleted one.
static void delete_at_cursor(std::string & line, std::vector<int> & widths, size_t & char_pos, size_t & byte_pos) {
    if (char_pos >= widths.size()) {
        return; // cursor is at end of line - nothing to delete
    }

    size_t next_pos = next_utf8_char_pos(line, byte_pos);
    int w = widths[char_pos];              // display width of the removed char
    size_t char_len = next_pos - byte_pos; // its length in bytes

    line.erase(byte_pos, char_len);
    widths.erase(widths.begin() + char_pos);

    // repaint everything after the deleted character
    size_t p = byte_pos;
    int tail_width = 0;
    for (size_t i = char_pos; i < widths.size(); ++i) {
        size_t following = next_utf8_char_pos(line, p);
        put_codepoint(line.c_str() + p, following - p, widths[i]);
        tail_width += widths[i];
        p = following;
    }

    // blank out the cells freed by the deletion
    for (int i = 0; i < w; ++i) {
        fputc(' ', out);
    }

    // move the cursor back over the repainted tail and the blanks
    move_cursor(-(tail_width + w));
}
481
+
482
// Overwrite the displayed line with spaces and return the cursor to where it
// started; assumes the cursor is at the beginning of the line.
static void clear_current_line(const std::vector<int> & widths) {
    int total_width = 0;
    for (int w : widths) {
        total_width += (w > 0 ? w : 1); // count at least one cell per character
    }

    if (total_width > 0) {
        std::string spaces(total_width, ' ');
        fwrite(spaces.c_str(), 1, total_width, out);
        move_cursor(-total_width);
    }
}
494
+
495
// Replace the edited line with `new_line`: clear the old line from the
// display, print the new one and rebuild the per-character width table.
// On return the cursor (char_pos/byte_pos) is at the end of the new line.
static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
                              size_t & byte_pos) {
    move_to_line_start(char_pos, byte_pos, widths);
    clear_current_line(widths);

    line = std::move(new_line);
    widths.clear();
    byte_pos = 0;
    char_pos = 0;

    // decode, print and measure the new line one code point at a time
    size_t idx = 0;
    while (idx < line.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(line, idx, advance);
        int expected_width = estimateWidth(cp);
        int real_width = put_codepoint(line.c_str() + idx, advance, expected_width);
        if (real_width < 0) real_width = 0; // width can be reported as -1 for non-printables
        widths.push_back(real_width);
        idx += advance;
        ++char_pos;
        byte_pos = idx;
    }
}
518
+
519
// Move the cursor (and the tracked positions) to the start of the line.
static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
    // sum the display widths of everything left of the cursor
    int back_width = 0;
    for (size_t i = 0; i < char_pos; ++i) {
        back_width += widths[i];
    }
    move_cursor(-back_width);
    char_pos = 0;
    byte_pos = 0;
}
528
+
529
// Move the cursor (and the tracked positions) to the end of the line.
static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
    // sum the display widths of everything right of the cursor
    int forward_width = 0;
    for (size_t i = char_pos; i < widths.size(); ++i) {
        forward_width += widths[i];
    }
    move_cursor(forward_width);
    char_pos = widths.size();
    byte_pos = line.length();
}
538
+
539
// Return true if a semicolon-separated ANSI CSI parameter list (e.g. "1;5")
// contains the value 5, which is how xterm-style terminals report the Ctrl
// modifier. Fields containing non-digit characters never match.
static bool has_ctrl_modifier(const std::string & params) {
    size_t start = 0;
    while (start < params.size()) {
        const size_t end = params.find(';', start);
        const std::string field = (end == std::string::npos)
            ? params.substr(start)
            : params.substr(start, end - start);

        if (!field.empty()) {
            bool numeric = true;
            int value = 0;
            for (char ch : field) {
                if (!std::isdigit(static_cast<unsigned char>(ch))) {
                    numeric = false;
                    break;
                }
                value = value * 10 + (ch - '0');
            }
            if (numeric && value == 5) {
                return true;
            }
        }

        if (end == std::string::npos) {
            break;
        }
        start = end + 1;
    }
    return false;
}
566
+
567
// true if `cp` is whitespace per iswspace() in the current locale
// (note: iswspace takes a wint_t, so code points above WCHAR_MAX may not be
// classified correctly on platforms with a 16-bit wchar_t - TODO confirm)
static bool is_space_codepoint(char32_t cp) {
    return std::iswspace(static_cast<wint_t>(cp)) != 0;
}
570
+
571
// Move the cursor one word to the left: first skip any whitespace immediately
// left of the cursor, then skip the word itself, updating the tracked
// character index and byte offset along the way.
static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
    if (char_pos == 0) {
        return;
    }

    size_t new_char_pos = char_pos;
    size_t new_byte_pos = byte_pos;
    int move_width = 0; // total display width to move the cursor back by

    // skip whitespace to the left of the cursor
    while (new_char_pos > 0) {
        size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
        size_t advance = 0;
        char32_t cp = decode_utf8(line, prev_byte, advance);
        if (!is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos - 1];
        new_char_pos--;
        new_byte_pos = prev_byte;
    }

    // then skip the non-whitespace word characters
    while (new_char_pos > 0) {
        size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
        size_t advance = 0;
        char32_t cp = decode_utf8(line, prev_byte, advance);
        if (is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos - 1];
        new_char_pos--;
        new_byte_pos = prev_byte;
    }

    move_cursor(-move_width);
    char_pos = new_char_pos;
    byte_pos = new_byte_pos;
}
608
+
609
// Move the cursor one word to the right: skip whitespace under the cursor,
// then the word, then the whitespace after it, so the cursor lands on the
// start of the next word (or the end of the line).
static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
    if (char_pos >= widths.size()) {
        return;
    }

    size_t new_char_pos = char_pos;
    size_t new_byte_pos = byte_pos;
    int move_width = 0; // total display width to move the cursor forward by

    // skip whitespace at/after the cursor
    while (new_char_pos < widths.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(line, new_byte_pos, advance);
        if (!is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos];
        new_char_pos++;
        new_byte_pos += advance;
    }

    // skip the non-whitespace word characters
    while (new_char_pos < widths.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(line, new_byte_pos, advance);
        if (is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos];
        new_char_pos++;
        new_byte_pos += advance;
    }

    // skip the whitespace following the word
    while (new_char_pos < widths.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(line, new_byte_pos, advance);
        if (!is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos];
        new_char_pos++;
        new_byte_pos += advance;
    }

    move_cursor(move_width);
    char_pos = new_char_pos;
    byte_pos = new_byte_pos;
}
655
+
656
// Move the terminal cursor horizontally by `delta` display columns.
// Negative delta moves left, positive moves right; 0 is a no-op.
// On Windows the target cell is computed and jumped to directly, wrapping
// across row boundaries using the console buffer width.
static void move_cursor(int delta) {
    if (delta == 0) return;
#if defined(_WIN32)
    if (hConsole != NULL) {
        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
        GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
        COORD newCursorPosition = bufferInfo.dwCursorPosition;
        int width = bufferInfo.dwSize.X;
        int newX = newCursorPosition.X + delta;
        int newY = newCursorPosition.Y;

        // wrap forward onto following rows
        while (newX >= width) {
            newX -= width;
            newY++;
        }
        // wrap backward onto preceding rows
        while (newX < 0) {
            newX += width;
            newY--;
        }

        newCursorPosition.X = newX;
        newCursorPosition.Y = newY;
        SetConsoleCursorPosition(hConsole, newCursorPosition);
    }
#else
    // POSIX: emit one control code per column ('\b' left, CSI "ESC[C" right).
    // NOTE(review): '\b' does not wrap back to the previous row on most
    // terminals - confirm behaviour when editing lines longer than the screen.
    if (delta < 0) {
        for (int i = 0; i < -delta; i++) fprintf(out, "\b");
    } else {
        for (int i = 0; i < delta; i++) fprintf(out, "\033[C");
    }
#endif
}
688
+
689
// Input-line history with an explicit "browsing" mode: begin_viewing()
// snapshots the line being edited, prev()/next() walk the stored entries,
// and stepping past the newest entry restores the snapshot.
struct history_t {
    std::vector<std::string> entries;
    size_t viewing_idx = SIZE_MAX;  // SIZE_MAX => not currently browsing
    std::string backup_line;        // current line before viewing history

    // Store a finished line; empty lines are ignored and consecutive
    // duplicates are collapsed. Also leaves browsing mode.
    void add(const std::string & line) {
        if (line.empty()) {
            return;
        }
        const bool duplicate_of_last = !entries.empty() && entries.back() == line;
        if (!duplicate_of_last) {
            entries.push_back(line);
        }
        end_viewing();
    }

    // Step to the older entry; clamps at the oldest one.
    // Returns false when there is no history or browsing was never started.
    bool prev(std::string & cur_line) {
        const bool browsing = viewing_idx != SIZE_MAX;
        if (entries.empty() || !browsing) {
            return false;
        }
        if (viewing_idx > 0) {
            --viewing_idx;
        }
        cur_line = entries[viewing_idx];
        return true;
    }

    // Step to the newer entry; stepping past the newest restores the
    // backed-up line and leaves browsing mode.
    bool next(std::string & cur_line) {
        if (entries.empty() || !is_viewing()) {
            return false;
        }
        ++viewing_idx;
        if (viewing_idx < entries.size()) {
            cur_line = entries[viewing_idx];
        } else {
            cur_line = backup_line;
            end_viewing();
        }
        return true;
    }

    // Enter browsing mode, remembering the line currently being edited.
    void begin_viewing(const std::string & line) {
        backup_line = line;
        viewing_idx = entries.size();
    }

    // Leave browsing mode and drop the snapshot.
    void end_viewing() {
        viewing_idx = SIZE_MAX;
        backup_line.clear();
    }

    bool is_viewing() const {
        return viewing_idx != SIZE_MAX;
    }
} history;
742
+
743
// Full-featured line editor used when the console is not in simple-IO mode.
// Handles UTF-8 aware cursor movement (incl. ctrl+arrow word jumps),
// home/end, delete, backspace, and history browsing with up/down.
// `line` receives the finished input; the return value tells the caller
// whether to keep reading more lines (multiline continuation).
static bool readline_advanced(std::string & line, bool multiline_input) {
    if (out != stdout) {
        fflush(stdout);
    }

    line.clear();
    std::vector<int> widths;      // display width of each character in `line`
    bool is_special_char = false; // last char is a pending '\\' or '/' control char
    bool end_of_stream = false;

    size_t byte_pos = 0; // current byte index
    size_t char_pos = 0; // current character index (one char can be multiple bytes)

    char32_t input_char;
    while (true) {
        assert(char_pos <= byte_pos);
        assert(char_pos <= widths.size());
        // replace the current line with the previous (older) history entry
        auto history_prev = [&]() {
            if (!history.is_viewing()) {
                history.begin_viewing(line);
            }
            std::string new_line;
            if (!history.prev(new_line)) {
                return;
            }
            set_line_contents(new_line, line, widths, char_pos, byte_pos);
        };
        // replace the current line with the next (newer) history entry,
        // or restore the line that was being edited before browsing started
        auto history_next = [&]() {
            if (history.is_viewing()) {
                std::string new_line;
                if (!history.next(new_line)) {
                    return;
                }
                set_line_contents(new_line, line, widths, char_pos, byte_pos);
            }
        };

        fflush(out); // Ensure all output is displayed before waiting for input
        input_char = getchar32();

        if (input_char == '\r' || input_char == '\n') {
            break;
        }

        if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
            end_of_stream = true;
            break;
        }

        if (is_special_char) {
            // typing continued, so re-draw the pending special char normally
            replace_last(line.back());
            is_special_char = false;
        }

        if (input_char == '\033') { // Escape sequence
            char32_t code = getchar32();
            if (code == '[') {
                // CSI sequence: collect parameter bytes until the final byte
                std::string params;
                while (true) {
                    code = getchar32();
                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~' || code == (char32_t) WEOF) {
                        break;
                    }
                    params.push_back(static_cast<char>(code));
                }

                const bool ctrl_modifier = has_ctrl_modifier(params);

                if (code == 'D') { // left
                    if (ctrl_modifier) {
                        move_word_left(char_pos, byte_pos, widths, line);
                    } else if (char_pos > 0) {
                        int w = widths[char_pos - 1];
                        move_cursor(-w);
                        char_pos--;
                        byte_pos = prev_utf8_char_pos(line, byte_pos);
                    }
                } else if (code == 'C') { // right
                    if (ctrl_modifier) {
                        move_word_right(char_pos, byte_pos, widths, line);
                    } else if (char_pos < widths.size()) {
                        int w = widths[char_pos];
                        move_cursor(w);
                        char_pos++;
                        byte_pos = next_utf8_char_pos(line, byte_pos);
                    }
                } else if (code == 'H') { // home
                    move_to_line_start(char_pos, byte_pos, widths);
                } else if (code == 'F') { // end
                    move_to_line_end(char_pos, byte_pos, widths, line);
                } else if (code == 'A' || code == 'B') {
                    // up/down
                    if (code == 'A') {
                        history_prev();
                        is_special_char = false;
                    } else if (code == 'B') {
                        history_next();
                        is_special_char = false;
                    }
                } else if ((code == '~' || (code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z')) && !params.empty()) {
                    // VT-style sequences, e.g. "ESC[3~" (delete): keep only
                    // the digits before any ';' modifier separator
                    std::string digits;
                    for (char ch : params) {
                        if (ch == ';') {
                            break;
                        }
                        if (std::isdigit(static_cast<unsigned char>(ch))) {
                            digits.push_back(ch);
                        }
                    }

                    if (code == '~') {
                        if (digits == "1" || digits == "7") { // home
                            move_to_line_start(char_pos, byte_pos, widths);
                        } else if (digits == "4" || digits == "8") { // end
                            move_to_line_end(char_pos, byte_pos, widths, line);
                        } else if (digits == "3") { // delete
                            delete_at_cursor(line, widths, char_pos, byte_pos);
                        }
                    }
                }
            } else if (code == 0x1B) {
                // Discard the rest of the escape sequence
                while ((code = getchar32()) != (char32_t) WEOF) {
                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
                        break;
                    }
                }
            }
#if defined(_WIN32)
        // on Windows, getchar32() delivers synthesized KEY_* codes instead
        // of raw escape sequences
        } else if (input_char == KEY_ARROW_LEFT) {
            if (char_pos > 0) {
                int w = widths[char_pos - 1];
                move_cursor(-w);
                char_pos--;
                byte_pos = prev_utf8_char_pos(line, byte_pos);
            }
        } else if (input_char == KEY_ARROW_RIGHT) {
            if (char_pos < widths.size()) {
                int w = widths[char_pos];
                move_cursor(w);
                char_pos++;
                byte_pos = next_utf8_char_pos(line, byte_pos);
            }
        } else if (input_char == KEY_CTRL_ARROW_LEFT) {
            move_word_left(char_pos, byte_pos, widths, line);
        } else if (input_char == KEY_CTRL_ARROW_RIGHT) {
            move_word_right(char_pos, byte_pos, widths, line);
        } else if (input_char == KEY_HOME) {
            move_to_line_start(char_pos, byte_pos, widths);
        } else if (input_char == KEY_END) {
            move_to_line_end(char_pos, byte_pos, widths, line);
        } else if (input_char == KEY_DELETE) {
            delete_at_cursor(line, widths, char_pos, byte_pos);
        } else if (input_char == KEY_ARROW_UP || input_char == KEY_ARROW_DOWN) {
            if (input_char == KEY_ARROW_UP) {
                history_prev();
                is_special_char = false;
            } else if (input_char == KEY_ARROW_DOWN) {
                history_next();
                is_special_char = false;
            }
#endif
        } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
            if (char_pos > 0) {
                int w = widths[char_pos - 1];
                move_cursor(-w);
                char_pos--;
                size_t prev_pos = prev_utf8_char_pos(line, byte_pos);
                size_t char_len = byte_pos - prev_pos;
                byte_pos = prev_pos;

                // remove the character
                line.erase(byte_pos, char_len);
                widths.erase(widths.begin() + char_pos);

                // redraw tail
                size_t p = byte_pos;
                int tail_width = 0;
                for (size_t i = char_pos; i < widths.size(); ++i) {
                    size_t next_p = next_utf8_char_pos(line, p);
                    put_codepoint(line.c_str() + p, next_p - p, widths[i]);
                    tail_width += widths[i];
                    p = next_p;
                }

                // clear display: blank out the now-stale cells after the tail,
                // then move back to the editing position
                for (int i = 0; i < w; ++i) {
                    fputc(' ', out);
                }
                move_cursor(-(tail_width + w));
            }
        } else {
            // insert character
            std::string new_char_str;
            append_utf8(input_char, new_char_str);
            int w = estimateWidth(input_char);

            if (char_pos == widths.size()) {
                // insert at the end
                line += new_char_str;
                int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
                if (real_w < 0) real_w = 0;
                widths.push_back(real_w);
                byte_pos += new_char_str.length();
                char_pos++;
            } else {
                // insert in middle
                line.insert(byte_pos, new_char_str);

                int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
                if (real_w < 0) real_w = 0;

                widths.insert(widths.begin() + char_pos, real_w);

                // print the tail
                size_t p = byte_pos + new_char_str.length();
                int tail_width = 0;
                for (size_t i = char_pos + 1; i < widths.size(); ++i) {
                    size_t next_p = next_utf8_char_pos(line, p);
                    put_codepoint(line.c_str() + p, next_p - p, widths[i]);
                    tail_width += widths[i];
                    p = next_p;
                }

                move_cursor(-tail_width);

                byte_pos += new_char_str.length();
                char_pos++;
            }
        }

        // a trailing '\\' (continue) or '/' (stop) is highlighted as a
        // pending control character until the next keystroke
        if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
            replace_last(line.back());
            is_special_char = true;
        }
    }

    bool has_more = multiline_input;
    if (is_special_char) {
        // erase the control character from screen and from the line
        replace_last(' ');
        pop_cursor();

        char last = line.back();
        line.pop_back();
        if (last == '\\') {
            // trailing '\\' toggles multiline mode for this submission
            line += '\n';
            fputc('\n', out);
            has_more = !has_more;
        } else {
            // llama will just eat the single space, it won't act as a space
            if (line.length() == 1 && line.back() == ' ') {
                line.clear();
                pop_cursor();
            }
            has_more = false;
        }
    } else {
        if (end_of_stream) {
            has_more = false;
        } else {
            line += '\n';
            fputc('\n', out);
        }
    }

    if (!end_of_stream && !line.empty()) {
        // remove the trailing newline for history storage
        if (!line.empty() && line.back() == '\n') {
            line.pop_back();
        }
        // TODO: maybe support multiline history entries?
        history.add(line);
    }

    fflush(out);
    return has_more;
}
1020
+
1021
// Minimal line reader for simple-IO mode: plain getline with no editing.
// A trailing '/' returns control immediately; a trailing '\\' toggles the
// multiline behaviour for this submission. Returns whether the caller
// should keep reading more lines.
static bool readline_simple(std::string & line, bool multiline_input) {
#if defined(_WIN32)
    // read as UTF-16 and convert to UTF-8 so the rest of the program can
    // treat the line uniformly
    std::wstring wline;
    if (!std::getline(std::wcin, wline)) {
        // Input stream is bad or EOF received
        line.clear();
        GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
        return false;
    }

    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
    line.resize(size_needed);
    WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
#else
    if (!std::getline(std::cin, line)) {
        // Input stream is bad or EOF received
        line.clear();
        return false;
    }
#endif
    if (!line.empty()) {
        switch (line.back()) {
            case '/': // always return control on '/'
                line.pop_back();
                return false;
            case '\\': // '\\' flips the default action
                line.pop_back();
                multiline_input = !multiline_input;
                break;
            default:
                break;
        }
    }
    line += '\n';

    // By default, continue input if multiline_input is set
    return multiline_input;
}
1057
+
1058
+ bool readline(std::string & line, bool multiline_input) {
1059
+ if (simple_io) {
1060
+ return readline_simple(line, multiline_input);
1061
+ }
1062
+ return readline_advanced(line, multiline_input);
1063
+ }
1064
+
1065
// Animated "busy" indicator drawn by a background thread. The spinner
// occupies a single character cell at the current cursor position; start()
// draws the first frame and spawns the worker, stop() signals it, joins it
// and erases the cell. Both are no-ops in simple-IO mode.
namespace spinner {
    static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
    static std::condition_variable cv_stop; // signalled by stop() to wake the worker
    static std::thread th;                  // background animation thread
    static size_t frame = 0; // only modified by one thread
    static bool running = false;            // guarded by mtx
    static std::mutex mtx;
    static auto wait_time = std::chrono::milliseconds(100); // delay between frames
    // Advance to the next animation frame and redraw it in place.
    static void draw_next_frame() {
        // don't need lock because only one thread modifies running
        frame = (frame + 1) % sizeof(LOADING_CHARS);
        replace_last(LOADING_CHARS[frame]);
        fflush(out);
    }
    // NOTE(review): start() prints LOADING_CHARS[0] but sets frame = 1, and
    // draw_next_frame() increments before drawing, so the second frame shown
    // is LOADING_CHARS[2] - '/' is skipped on the first cycle. Confirm
    // whether this is intentional.
    void start() {
        std::unique_lock<std::mutex> lock(mtx);
        if (simple_io || running) {
            return;
        }
        common_log_flush(common_log_main());
        fprintf(out, "%c", LOADING_CHARS[0]);
        fflush(out);
        frame = 1;
        running = true;
        th = std::thread([]() {
            std::unique_lock<std::mutex> lock(mtx);
            while (true) {
                // wait_for returns true once stop() has cleared `running`
                if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
                    break;
                }
                draw_next_frame();
            }
        });
    }
    void stop() {
        {
            std::unique_lock<std::mutex> lock(mtx);
            if (simple_io || !running) {
                return;
            }
            running = false;
            cv_stop.notify_all();
        }
        // join outside the lock so the worker can observe the predicate
        if (th.joinable()) {
            th.join();
        }
        // erase the spinner character from the screen
        replace_last(' ');
        pop_cursor();
        fflush(out);
    }
}
1116
+
1117
// Print a printf-style formatted message to the console output stream.
// Intended for a dedicated CLI thread, not the inference hot path.
void log(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(out, fmt, args);
    va_end(args);
}
1123
+
1124
// Like log(), but renders the message with the error display style and
// restores the previously active style afterwards.
void error(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    display_type cur = current_display; // remember active style
    set_display(DISPLAY_TYPE_ERROR);
    vfprintf(out, fmt, args);
    set_display(cur); // restore previous color
    va_end(args);
}
1133
+
1134
// Flush any buffered console output to the underlying stream.
void flush() {
    fflush(out);
}
1137
+ }
llama.cpp/common/console.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Console functions

#pragma once

#include "common.h"

#include <string>

// Display styles selectable via console::set_display().
enum display_type {
    DISPLAY_TYPE_RESET = 0,
    DISPLAY_TYPE_INFO,
    DISPLAY_TYPE_PROMPT,
    DISPLAY_TYPE_REASONING,
    DISPLAY_TYPE_USER_INPUT,
    DISPLAY_TYPE_ERROR
};

namespace console {
    void init(bool use_simple_io, bool use_advanced_display);
    void cleanup();
    void set_display(display_type display);
    // reads one line of user input; returns true when the caller should
    // continue reading (multiline input)
    bool readline(std::string & line, bool multiline_input);

    // animated "busy" indicator at the current cursor position
    namespace spinner {
        void start();
        void stop();
    }

    // note: the logging API below output directly to stdout
    // it can negatively impact performance if used on inference thread
    // only use it in a dedicated CLI thread
    // for logging in inference thread, use log.h instead

    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
    void log(const char * fmt, ...);

    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
    void error(const char * fmt, ...);

    void flush();
}
llama.cpp/common/debug.cpp ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "debug.h"
2
+
3
+ #include "log.h"
4
+
5
+ #include <cmath>
6
+ #include <string>
7
+
8
+ static std::string common_ggml_ne_string(const ggml_tensor * t) {
9
+ std::string str;
10
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
11
+ str += std::to_string(t->ne[i]);
12
+ if (i + 1 < GGML_MAX_DIMS) {
13
+ str += ", ";
14
+ }
15
+ }
16
+ return str;
17
+ }
18
+
19
+ static float common_ggml_get_float_value(const uint8_t * data,
20
+ ggml_type type,
21
+ const size_t * nb,
22
+ size_t i0,
23
+ size_t i1,
24
+ size_t i2,
25
+ size_t i3) {
26
+ size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
27
+ float v;
28
+ if (type == GGML_TYPE_F16) {
29
+ v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
30
+ } else if (type == GGML_TYPE_F32) {
31
+ v = *(const float *) &data[i];
32
+ } else if (type == GGML_TYPE_I64) {
33
+ v = (float) *(const int64_t *) &data[i];
34
+ } else if (type == GGML_TYPE_I32) {
35
+ v = (float) *(const int32_t *) &data[i];
36
+ } else if (type == GGML_TYPE_I16) {
37
+ v = (float) *(const int16_t *) &data[i];
38
+ } else if (type == GGML_TYPE_I8) {
39
+ v = (float) *(const int8_t *) &data[i];
40
+ } else if (type == GGML_TYPE_BF16) {
41
+ v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
42
+ } else {
43
+ GGML_ABORT("fatal error");
44
+ }
45
+ return v;
46
+ }
47
+
48
+ #define INDENT " "
49
+
50
// Print a tensor's contents as nested bracketed slices, showing at most `n`
// leading and `n` trailing entries per dimension (with "..." in between),
// followed by the sum over all elements. When the `abort` template parameter
// is set, the process exits as soon as the sum is NaN.
template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    // first pass: accumulate the sum over the entire tensor
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
                    sum += v;
                }
            }
        }
    }
    // second pass: print a truncated view of the data; jumping the loop index
    // to ne[d] - n skips the middle elements of each dimension
    // NOTE(review): the "sum" line below is printed once per outermost (i3)
    // slice but always shows the whole-tensor sum - confirm this is intended.
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        LOG(INDENT "[\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2 * n) {
                LOG(INDENT INDENT "..., \n");
                i2 = ne[2] - n;
            }
            LOG(INDENT INDENT "[\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2 * n) {
                    LOG(INDENT INDENT INDENT "..., \n");
                    i1 = ne[1] - n;
                }
                LOG(INDENT INDENT INDENT "[");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2 * n) {
                        LOG(" ..., ");
                        i0 = ne[0] - n;
                    }
                    const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
                    LOG("%12.4f", v);
                    if (i0 < ne[0] - 1) {
                        LOG(", ");
                    }
                }
                LOG(" ],\n");
            }
            LOG(INDENT INDENT "],\n");
        }
        LOG(INDENT "]\n");
        LOG(INDENT "sum = %f\n", sum);
    }

    if constexpr (abort) {
        // NOTE(review): exits with status 0 on NaN - confirm callers/scripts
        // do not expect a non-zero exit code here.
        if (std::isnan(sum)) {
            LOG("encountered NaN - aborting\n");
            exit(0);
        }
    }
}
104
+
105
/**
 * GGML operations callback during the graph execution.
 *
 * @param t current tensor
 * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
 *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
 *            see ggml_backend_sched_eval_callback
 * @param user_data user data to pass at each call back (a base_callback_data instance)
 * @return true to receive data or continue the graph, false otherwise
 */
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (base_callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    if (ask) {
        return true; // Always retrieve data
    }

    // an empty filter list means "print everything"
    bool matches_filter = cb_data->tensor_filters.empty();

    if (!matches_filter) {
        for (const auto & filter : cb_data->tensor_filters) {
            if (std::regex_search(t->name, filter)) {
                matches_filter = true;
                break;
            }
        }
    }

    char src1_str[128] = { 0 };
    if (src1) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
    }

    if (matches_filter) {
        // NOTE(review): src0 is dereferenced without a null check - confirm
        // this callback only runs for ops that always have a src[0].
        LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
            ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
            common_ggml_ne_string(t).c_str());
    }

    const bool is_host = ggml_backend_buffer_is_host(t->buffer);

    // data living on a non-host backend must be copied out before reading
    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
        cb_data->data.resize(n_bytes);
        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    // quantized tensors only get the op summary above, not a data dump
    if (!ggml_is_quantized(t->type) && matches_filter) {
        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
    }

    return true;
}
162
+
163
// Explicit template instantiations
// (both the aborting and non-aborting variants are emitted in this TU so
// other translation units can link against them)
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
llama.cpp/common/debug.h ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once
#include "common.h"
#include <string>
#include <vector>
#include <regex>

// common debug functions and structs

// Print a tensor's detailed data
// data - the tensor's data in byte format
// type - the tensor's quantization type
// ne - the tensor dimensions array
// nb - the tensor strides array
// n - the number of rows/columns to fully print
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);

// Intended to use as callback for ggml_backend_sched_eval_callback
// prints tensors that are processed in the computation graph
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
// The callback data will be passed as the third parameter (user_data)
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);

// State shared with common_debug_cb_eval: a staging buffer for copying tensor
// data off non-host backends, plus the compiled tensor-name filters.
struct base_callback_data {
    std::vector<uint8_t> data;              // scratch buffer for backend -> host copies
    std::vector<std::regex> tensor_filters; // regexes anchored at the start of the tensor name

    base_callback_data() = default;

    // Compiles filter_patterns (each anchored with a leading '^') and
    // registers common_debug_cb_eval<false> plus this object on params.
    // NOTE(review): params keeps a raw pointer to this object via
    // cb_eval_user_data - it must outlive graph evaluation, and copying or
    // moving it leaves the registered pointer dangling; confirm callers keep
    // the original instance alive.
    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
        for (const auto & pattern : filter_patterns) {
            try {
                std::string anchored_pattern = "^" + pattern;
                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
            } catch (const std::regex_error & e) {
                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
            }
        }
        params.cb_eval = common_debug_cb_eval<false>;
        params.cb_eval_user_data = this;
    }
};