CrashOverrideX commited on
Commit
67c993e
·
verified ·
1 Parent(s): 0a7f9e7

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. llama.cpp/.devops/cann.Dockerfile +130 -0
  2. llama.cpp/.devops/cpu.Dockerfile +88 -0
  3. llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  4. llama.cpp/.devops/cuda.Dockerfile +94 -0
  5. llama.cpp/.devops/intel.Dockerfile +95 -0
  6. llama.cpp/.devops/llama-cli-cann.Dockerfile +45 -0
  7. llama.cpp/.devops/llama-cpp-cuda.srpm.spec +85 -0
  8. llama.cpp/.devops/llama-cpp.srpm.spec +87 -0
  9. llama.cpp/.devops/musa.Dockerfile +101 -0
  10. llama.cpp/.devops/rocm.Dockerfile +113 -0
  11. llama.cpp/.devops/s390x.Dockerfile +126 -0
  12. llama.cpp/.devops/tools.sh +53 -0
  13. llama.cpp/.devops/vulkan.Dockerfile +90 -0
  14. llama.cpp/.gemini/settings.json +1 -0
  15. llama.cpp/.github/labeler.yml +106 -0
  16. llama.cpp/.github/pull_request_template.md +1 -0
  17. llama.cpp/build/CMakeCache.txt +91 -0
  18. llama.cpp/ci/README-MUSA.md +35 -0
  19. llama.cpp/ci/README.md +33 -0
  20. llama.cpp/ci/run.sh +709 -0
  21. llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  22. llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  23. llama.cpp/cmake/build-info.cmake +48 -0
  24. llama.cpp/cmake/common.cmake +58 -0
  25. llama.cpp/cmake/download-models.cmake +21 -0
  26. llama.cpp/cmake/git-vars.cmake +22 -0
  27. llama.cpp/cmake/license.cmake +40 -0
  28. llama.cpp/cmake/llama-config.cmake.in +30 -0
  29. llama.cpp/cmake/llama.pc.in +10 -0
  30. llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  31. llama.cpp/cmake/x64-windows-llvm.cmake +5 -0
  32. llama.cpp/common/CMakeLists.txt +149 -0
  33. llama.cpp/common/arg.cpp +0 -0
  34. llama.cpp/common/arg.h +131 -0
  35. llama.cpp/common/base64.hpp +392 -0
  36. llama.cpp/common/build-info.cpp.in +4 -0
  37. llama.cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  38. llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  39. llama.cpp/common/chat-parser.cpp +1649 -0
  40. llama.cpp/common/chat-parser.h +133 -0
  41. llama.cpp/common/chat-peg-parser.cpp +124 -0
  42. llama.cpp/common/chat-peg-parser.h +105 -0
  43. llama.cpp/common/chat.cpp +0 -0
  44. llama.cpp/common/chat.h +252 -0
  45. llama.cpp/common/common.cpp +1824 -0
  46. llama.cpp/common/common.h +931 -0
  47. llama.cpp/common/console.cpp +1137 -0
  48. llama.cpp/common/console.h +41 -0
  49. llama.cpp/common/debug.cpp +167 -0
  50. llama.cpp/common/debug.h +43 -0
llama.cpp/.devops/cann.Dockerfile ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # ARGUMENTS
3
+ # ==============================================================================
4
+
5
+ # Define the CANN base image for easier version updates later
6
+ ARG CHIP_TYPE=910b
7
+ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
8
+
9
+ # ==============================================================================
10
+ # BUILD STAGE
11
+ # Compile all binary files and libraries
12
+ # ==============================================================================
13
+ FROM ${CANN_BASE_IMAGE} AS build
14
+
15
+ # -- Install build dependencies --
16
+ RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
17
+ yum clean all && \
18
+ rm -rf /var/cache/yum
19
+
20
+ # -- Set the working directory --
21
+ WORKDIR /app
22
+
23
+ # -- Copy project files --
24
+ COPY . .
25
+
26
+ # -- Set CANN environment variables (required for compilation) --
27
+ # Using ENV instead of `source` allows environment variables to persist across the entire image layer
28
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
29
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
30
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
31
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
32
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
33
+ # ... You can add other environment variables from the original file as needed ...
34
+ # For brevity, only core variables are listed here. You can paste the original ENV list here.
35
+
36
+ # -- Build llama.cpp --
37
+ # Use the passed CHIP_TYPE argument and add general build options
38
+ ARG CHIP_TYPE
39
+ RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
40
+ && \
41
+ cmake -B build \
42
+ -DGGML_CANN=ON \
43
+ -DCMAKE_BUILD_TYPE=Release \
44
+ -DSOC_TYPE=ascend${CHIP_TYPE} \
45
+ -DUSE_ACL_GRAPH=ON \
46
+ . && \
47
+ cmake --build build --config Release -j$(nproc)
48
+
49
+ # -- Organize build artifacts for copying in later stages --
50
+ # Create a lib directory to store all .so files
51
+ RUN mkdir -p /app/lib && \
52
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
53
+
54
+ # Create a full directory to store all executables and Python scripts
55
+ RUN mkdir -p /app/full && \
56
+ cp build/bin/* /app/full/ && \
57
+ cp *.py /app/full/ && \
58
+ cp -r gguf-py /app/full/ && \
59
+ cp -r requirements /app/full/ && \
60
+ cp requirements.txt /app/full/
61
+ # If you have a tools.sh script, make sure it is copied here
62
+ # cp .devops/tools.sh /app/full/tools.sh
63
+
64
+ # ==============================================================================
65
+ # BASE STAGE
66
+ # Create a minimal base image with CANN runtime and common libraries
67
+ # ==============================================================================
68
+ FROM ${CANN_BASE_IMAGE} AS base
69
+
70
+ # -- Install runtime dependencies --
71
+ RUN yum install -y libgomp curl && \
72
+ yum clean all && \
73
+ rm -rf /var/cache/yum
74
+
75
+ # -- Set CANN environment variables (required for runtime) --
76
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
77
+ ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
78
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
79
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
80
+ # ... You can add other environment variables from the original file as needed ...
81
+
82
+ WORKDIR /app
83
+
84
+ # Copy compiled .so files from the build stage
85
+ COPY --from=build /app/lib/ /app
86
+
87
+ # ==============================================================================
88
+ # FINAL STAGES (TARGETS)
89
+ # ==============================================================================
90
+
91
+ ### Target: full
92
+ # Complete image with all tools, Python bindings, and dependencies
93
+ # ==============================================================================
94
+ FROM base AS full
95
+
96
+ COPY --from=build /app/full /app
97
+
98
+ # Install Python dependencies
99
+ RUN yum install -y git python3 python3-pip && \
100
+ pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
101
+ pip3 install --no-cache-dir -r requirements.txt && \
102
+ yum clean all && \
103
+ rm -rf /var/cache/yum
104
+
105
+ # You need to provide a tools.sh script as the entrypoint
106
+ ENTRYPOINT ["/app/tools.sh"]
107
+ # If there is no tools.sh, you can set the default to start the server
108
+ # ENTRYPOINT ["/app/llama-server"]
109
+
110
+ ### Target: light
111
+ # Lightweight image containing only llama-cli and llama-completion
112
+ # ==============================================================================
113
+ FROM base AS light
114
+
115
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
116
+
117
+ ENTRYPOINT [ "/app/llama-cli" ]
118
+
119
+ ### Target: server
120
+ # Dedicated server image containing only llama-server
121
+ # ==============================================================================
122
+ FROM base AS server
123
+
124
+ ENV LLAMA_ARG_HOST=0.0.0.0
125
+
126
+ COPY --from=build /app/full/llama-server /app
127
+
128
+ HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
129
+
130
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/cpu.Dockerfile ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=22.04
2
+
3
+ FROM ubuntu:$UBUNTU_VERSION AS build
4
+
5
+ ARG TARGETARCH
6
+
7
+ RUN apt-get update && \
8
+ apt-get install -y build-essential git cmake libssl-dev
9
+
10
+ WORKDIR /app
11
+
12
+ COPY . .
13
+
14
+ RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
15
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
16
+ else \
17
+ echo "Unsupported architecture"; \
18
+ exit 1; \
19
+ fi && \
20
+ cmake --build build -j $(nproc)
21
+
22
+ RUN mkdir -p /app/lib && \
23
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
24
+
25
+ RUN mkdir -p /app/full \
26
+ && cp build/bin/* /app/full \
27
+ && cp *.py /app/full \
28
+ && cp -r gguf-py /app/full \
29
+ && cp -r requirements /app/full \
30
+ && cp requirements.txt /app/full \
31
+ && cp .devops/tools.sh /app/full/tools.sh
32
+
33
+ ## Base image
34
+ FROM ubuntu:$UBUNTU_VERSION AS base
35
+
36
+ RUN apt-get update \
37
+ && apt-get install -y libgomp1 curl\
38
+ && apt autoremove -y \
39
+ && apt clean -y \
40
+ && rm -rf /tmp/* /var/tmp/* \
41
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
42
+ && find /var/cache -type f -delete
43
+
44
+ COPY --from=build /app/lib/ /app
45
+
46
+ ### Full
47
+ FROM base AS full
48
+
49
+ COPY --from=build /app/full /app
50
+
51
+ WORKDIR /app
52
+
53
+ RUN apt-get update \
54
+ && apt-get install -y \
55
+ git \
56
+ python3 \
57
+ python3-pip \
58
+ && pip install --upgrade pip setuptools wheel \
59
+ && pip install -r requirements.txt \
60
+ && apt autoremove -y \
61
+ && apt clean -y \
62
+ && rm -rf /tmp/* /var/tmp/* \
63
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
64
+ && find /var/cache -type f -delete
65
+
66
+ ENTRYPOINT ["/app/tools.sh"]
67
+
68
+ ### Light, CLI only
69
+ FROM base AS light
70
+
71
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
72
+
73
+ WORKDIR /app
74
+
75
+ ENTRYPOINT [ "/app/llama-cli" ]
76
+
77
+ ### Server, Server only
78
+ FROM base AS server
79
+
80
+ ENV LLAMA_ARG_HOST=0.0.0.0
81
+
82
+ COPY --from=build /app/full/llama-server /app
83
+
84
+ WORKDIR /app
85
+
86
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
87
+
88
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/cuda-new.Dockerfile ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=24.04
2
+ # This needs to generally match the container host's environment.
3
+ ARG CUDA_VERSION=13.1.0
4
+ # Target the CUDA build image
5
+ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
6
+
7
+ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
8
+
9
+ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
10
+
11
+ # CUDA architecture to build for (defaults to all supported archs)
12
+ ARG CUDA_DOCKER_ARCH=default
13
+
14
+ RUN apt-get update && \
15
+ apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
16
+
17
+ WORKDIR /app
18
+
19
+ COPY . .
20
+
21
+ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
22
+ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
23
+ fi && \
24
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
25
+ cmake --build build --config Release -j$(nproc)
26
+
27
+ RUN mkdir -p /app/lib && \
28
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
29
+
30
+ RUN mkdir -p /app/full \
31
+ && cp build/bin/* /app/full \
32
+ && cp *.py /app/full \
33
+ && cp -r gguf-py /app/full \
34
+ && cp -r requirements /app/full \
35
+ && cp requirements.txt /app/full \
36
+ && cp .devops/tools.sh /app/full/tools.sh
37
+
38
+ ## Base image
39
+ FROM ${BASE_CUDA_RUN_CONTAINER} AS base
40
+
41
+ RUN apt-get update \
42
+ && apt-get install -y libgomp1 curl\
43
+ && apt autoremove -y \
44
+ && apt clean -y \
45
+ && rm -rf /tmp/* /var/tmp/* \
46
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
47
+ && find /var/cache -type f -delete
48
+
49
+ COPY --from=build /app/lib/ /app
50
+
51
+ ### Full
52
+ FROM base AS full
53
+
54
+ COPY --from=build /app/full /app
55
+
56
+ WORKDIR /app
57
+
58
+ RUN apt-get update \
59
+ && apt-get install -y \
60
+ git \
61
+ python3 \
62
+ python3-pip \
63
+ python3-wheel \
64
+ && pip install --break-system-packages --upgrade setuptools \
65
+ && pip install --break-system-packages -r requirements.txt \
66
+ && apt autoremove -y \
67
+ && apt clean -y \
68
+ && rm -rf /tmp/* /var/tmp/* \
69
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
70
+ && find /var/cache -type f -delete
71
+
72
+
73
+ ENTRYPOINT ["/app/tools.sh"]
74
+
75
+ ### Light, CLI only
76
+ FROM base AS light
77
+
78
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
79
+
80
+ WORKDIR /app
81
+
82
+ ENTRYPOINT [ "/app/llama-cli" ]
83
+
84
+ ### Server, Server only
85
+ FROM base AS server
86
+
87
+ ENV LLAMA_ARG_HOST=0.0.0.0
88
+
89
+ COPY --from=build /app/full/llama-server /app
90
+
91
+ WORKDIR /app
92
+
93
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
94
+
95
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/cuda.Dockerfile ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=22.04
2
+ # This needs to generally match the container host's environment.
3
+ ARG CUDA_VERSION=12.4.0
4
+ # Target the CUDA build image
5
+ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
6
+
7
+ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
8
+
9
+ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
10
+
11
+ # CUDA architecture to build for (defaults to all supported archs)
12
+ ARG CUDA_DOCKER_ARCH=default
13
+
14
+ RUN apt-get update && \
15
+ apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
16
+
17
+ WORKDIR /app
18
+
19
+ COPY . .
20
+
21
+ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
22
+ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
23
+ fi && \
24
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
25
+ cmake --build build --config Release -j$(nproc)
26
+
27
+ RUN mkdir -p /app/lib && \
28
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
29
+
30
+ RUN mkdir -p /app/full \
31
+ && cp build/bin/* /app/full \
32
+ && cp *.py /app/full \
33
+ && cp -r gguf-py /app/full \
34
+ && cp -r requirements /app/full \
35
+ && cp requirements.txt /app/full \
36
+ && cp .devops/tools.sh /app/full/tools.sh
37
+
38
+ ## Base image
39
+ FROM ${BASE_CUDA_RUN_CONTAINER} AS base
40
+
41
+ RUN apt-get update \
42
+ && apt-get install -y libgomp1 curl\
43
+ && apt autoremove -y \
44
+ && apt clean -y \
45
+ && rm -rf /tmp/* /var/tmp/* \
46
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
47
+ && find /var/cache -type f -delete
48
+
49
+ COPY --from=build /app/lib/ /app
50
+
51
+ ### Full
52
+ FROM base AS full
53
+
54
+ COPY --from=build /app/full /app
55
+
56
+ WORKDIR /app
57
+
58
+ RUN apt-get update \
59
+ && apt-get install -y \
60
+ git \
61
+ python3 \
62
+ python3-pip \
63
+ && pip install --upgrade pip setuptools wheel \
64
+ && pip install --break-system-packages -r requirements.txt \
65
+ && apt autoremove -y \
66
+ && apt clean -y \
67
+ && rm -rf /tmp/* /var/tmp/* \
68
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
69
+ && find /var/cache -type f -delete
70
+
71
+
72
+ ENTRYPOINT ["/app/tools.sh"]
73
+
74
+ ### Light, CLI only
75
+ FROM base AS light
76
+
77
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
78
+
79
+ WORKDIR /app
80
+
81
+ ENTRYPOINT [ "/app/llama-cli" ]
82
+
83
+ ### Server, Server only
84
+ FROM base AS server
85
+
86
+ ENV LLAMA_ARG_HOST=0.0.0.0
87
+
88
+ COPY --from=build /app/full/llama-server /app
89
+
90
+ WORKDIR /app
91
+
92
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
93
+
94
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/intel.Dockerfile ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04
2
+
3
+ ## Build Image
4
+
5
+ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
6
+
7
+ ARG GGML_SYCL_F16=OFF
8
+ RUN apt-get update && \
9
+ apt-get install -y git libssl-dev
10
+
11
+ WORKDIR /app
12
+
13
+ COPY . .
14
+
15
+ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
16
+ echo "GGML_SYCL_F16 is set" \
17
+ && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
18
+ fi && \
19
+ echo "Building with dynamic libs" && \
20
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
21
+ cmake --build build --config Release -j$(nproc)
22
+
23
+ RUN mkdir -p /app/lib && \
24
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
25
+
26
+ RUN mkdir -p /app/full \
27
+ && cp build/bin/* /app/full \
28
+ && cp *.py /app/full \
29
+ && cp -r gguf-py /app/full \
30
+ && cp -r requirements /app/full \
31
+ && cp requirements.txt /app/full \
32
+ && cp .devops/tools.sh /app/full/tools.sh
33
+
34
+ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
35
+
36
+ RUN apt-get update \
37
+ && apt-get install -y libgomp1 curl\
38
+ && apt autoremove -y \
39
+ && apt clean -y \
40
+ && rm -rf /tmp/* /var/tmp/* \
41
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
42
+ && find /var/cache -type f -delete
43
+
44
+ ### Full
45
+ FROM base AS full
46
+
47
+ COPY --from=build /app/lib/ /app
48
+ COPY --from=build /app/full /app
49
+
50
+ WORKDIR /app
51
+
52
+ RUN apt-get update && \
53
+ apt-get install -y \
54
+ git \
55
+ python3 \
56
+ python3-pip \
57
+ python3-venv && \
58
+ python3 -m venv /opt/venv && \
59
+ . /opt/venv/bin/activate && \
60
+ pip install --upgrade pip setuptools wheel && \
61
+ pip install -r requirements.txt && \
62
+ apt autoremove -y && \
63
+ apt clean -y && \
64
+ rm -rf /tmp/* /var/tmp/* && \
65
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
66
+ find /var/cache -type f -delete
67
+
68
+ ENV PATH="/opt/venv/bin:$PATH"
69
+
70
+ ENTRYPOINT ["/app/tools.sh"]
71
+
72
+ ### Light, CLI only
73
+ FROM base AS light
74
+
75
+ COPY --from=build /app/lib/ /app
76
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
77
+
78
+ WORKDIR /app
79
+
80
+ ENTRYPOINT [ "/app/llama-cli" ]
81
+
82
+ ### Server, Server only
83
+ FROM base AS server
84
+
85
+ ENV LLAMA_ARG_HOST=0.0.0.0
86
+
87
+ COPY --from=build /app/lib/ /app
88
+ COPY --from=build /app/full/llama-server /app
89
+
90
+ WORKDIR /app
91
+
92
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
93
+
94
+ ENTRYPOINT [ "/app/llama-server" ]
95
+
llama.cpp/.devops/llama-cli-cann.Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
2
+
3
+ FROM ascendai/cann:$ASCEND_VERSION AS build
4
+
5
+ WORKDIR /app
6
+
7
+ COPY . .
8
+
9
+ RUN yum install -y gcc g++ cmake make openssl-devel
10
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
11
+ ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
12
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
13
+ ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
14
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
15
+ ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
16
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
17
+ ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
18
+ ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
19
+
20
+ # find libascend_hal.so, because the drive hasn`t been mounted.
21
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
22
+
23
+ RUN echo "Building with static libs" && \
24
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
25
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
26
+ cmake --build build --config Release --target llama-cli && \
27
+ cmake --build build --config Release --target llama-completion
28
+
29
+ # TODO: use image with NNRT
30
+ FROM ascendai/cann:$ASCEND_VERSION AS runtime
31
+ COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
32
+
33
+ ENV LC_ALL=C.utf8
34
+
35
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
36
+ ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
37
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
38
+ ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
39
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
40
+ ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
41
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
42
+ ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
43
+ ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
44
+
45
+ ENTRYPOINT ["/llama-cli" ]
llama.cpp/.devops/llama-cpp-cuda.srpm.spec ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
2
+ # https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
3
+ # Built and maintained by John Boero - boeroboy@gmail.com
4
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
5
+
6
+ # Notes for llama.cpp:
7
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
8
+ # We need to declare standard versioning if people want to sort latest releases.
9
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
10
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
11
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
12
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
13
+ # It is up to the user to install the correct vendor-specific support.
14
+
15
+ Name: llama.cpp-cuda
16
+ Version: %( date "+%%Y%%m%%d" )
17
+ Release: 1%{?dist}
18
+ Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
19
+ License: MIT
20
+ Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
21
+ BuildRequires: coreutils make gcc-c++ git cuda-toolkit
22
+ Requires: cuda-toolkit
23
+ URL: https://github.com/ggml-org/llama.cpp
24
+
25
+ %define debug_package %{nil}
26
+ %define source_date_epoch_from_changelog 0
27
+
28
+ %description
29
+ CPU inference for Meta's Lllama2 models using default options.
30
+
31
+ %prep
32
+ %setup -n llama.cpp-master
33
+
34
+ %build
35
+ make -j GGML_CUDA=1
36
+
37
+ %install
38
+ mkdir -p %{buildroot}%{_bindir}/
39
+ cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
40
+ cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
41
+ cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
42
+ cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
43
+
44
+ mkdir -p %{buildroot}/usr/lib/systemd/system
45
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
46
+ [Unit]
47
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
48
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
49
+
50
+ [Service]
51
+ Type=simple
52
+ EnvironmentFile=/etc/sysconfig/llama
53
+ ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
54
+ ExecReload=/bin/kill -s HUP $MAINPID
55
+ Restart=never
56
+
57
+ [Install]
58
+ WantedBy=default.target
59
+ EOF
60
+
61
+ mkdir -p %{buildroot}/etc/sysconfig
62
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
63
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
64
+ EOF
65
+
66
+ %clean
67
+ rm -rf %{buildroot}
68
+ rm -rf %{_builddir}/*
69
+
70
+ %files
71
+ %{_bindir}/llama-cuda-cli
72
+ %{_bindir}/llama-cuda-completion
73
+ %{_bindir}/llama-cuda-server
74
+ %{_bindir}/llama-cuda-simple
75
+ /usr/lib/systemd/system/llamacuda.service
76
+ %config /etc/sysconfig/llama
77
+
78
+ %pre
79
+
80
+ %post
81
+
82
+ %preun
83
+ %postun
84
+
85
+ %changelog
llama.cpp/.devops/llama-cpp.srpm.spec ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
2
+ # https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
3
+ # Built and maintained by John Boero - boeroboy@gmail.com
4
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
5
+
6
+ # Notes for llama.cpp:
7
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
8
+ # We need to declare standard versioning if people want to sort latest releases.
9
+ # In the meantime, YYYYMMDD format will be used.
10
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
11
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
12
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
13
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
14
+ # It is up to the user to install the correct vendor-specific support.
15
+
16
+ Name: llama.cpp
17
+ Version: %( date "+%%Y%%m%%d" )
18
+ Release: 1%{?dist}
19
+ Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
20
+ License: MIT
21
+ Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
22
+ BuildRequires: coreutils make gcc-c++ git libstdc++-devel
23
+ Requires: libstdc++
24
+ URL: https://github.com/ggml-org/llama.cpp
25
+
26
+ %define debug_package %{nil}
27
+ %define source_date_epoch_from_changelog 0
28
+
29
+ %description
30
+ CPU inference for Meta's Lllama2 models using default options.
31
+ Models are not included in this package and must be downloaded separately.
32
+
33
+ %prep
34
+ %setup -n llama.cpp-master
35
+
36
+ %build
37
+ make -j
38
+
39
+ %install
40
+ mkdir -p %{buildroot}%{_bindir}/
41
+ cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
42
+ cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
43
+ cp -p llama-server %{buildroot}%{_bindir}/llama-server
44
+ cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
45
+
46
+ mkdir -p %{buildroot}/usr/lib/systemd/system
47
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
48
+ [Unit]
49
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
50
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
51
+
52
+ [Service]
53
+ Type=simple
54
+ EnvironmentFile=/etc/sysconfig/llama
55
+ ExecStart=/usr/bin/llama-server $LLAMA_ARGS
56
+ ExecReload=/bin/kill -s HUP $MAINPID
57
+ Restart=never
58
+
59
+ [Install]
60
+ WantedBy=default.target
61
+ EOF
62
+
63
+ mkdir -p %{buildroot}/etc/sysconfig
64
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
65
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
66
+ EOF
67
+
68
+ %clean
69
+ rm -rf %{buildroot}
70
+ rm -rf %{_builddir}/*
71
+
72
+ %files
73
+ %{_bindir}/llama-cli
74
+ %{_bindir}/llama-completion
75
+ %{_bindir}/llama-server
76
+ %{_bindir}/llama-simple
77
+ /usr/lib/systemd/system/llama.service
78
+ %config /etc/sysconfig/llama
79
+
80
+ %pre
81
+
82
+ %post
83
+
84
+ %preun
85
+ %postun
86
+
87
+ %changelog
llama.cpp/.devops/musa.Dockerfile ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=22.04
2
+ # This needs to generally match the container host's environment.
3
+ ARG MUSA_VERSION=rc4.3.0
4
+ # Target the MUSA build image
5
+ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
6
+
7
+ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
8
+
9
+ FROM ${BASE_MUSA_DEV_CONTAINER} AS build
10
+
11
+ # MUSA architecture to build for (defaults to all supported archs)
12
+ ARG MUSA_DOCKER_ARCH=default
13
+
14
+ RUN apt-get update && \
15
+ apt-get install -y \
16
+ build-essential \
17
+ cmake \
18
+ python3 \
19
+ python3-pip \
20
+ git \
21
+ libssl-dev \
22
+ libgomp1
23
+
24
+ WORKDIR /app
25
+
26
+ COPY . .
27
+
28
+ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
29
+ export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
30
+ fi && \
31
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
32
+ cmake --build build --config Release -j$(nproc)
33
+
34
+ RUN mkdir -p /app/lib && \
35
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
36
+
37
+ RUN mkdir -p /app/full \
38
+ && cp build/bin/* /app/full \
39
+ && cp *.py /app/full \
40
+ && cp -r gguf-py /app/full \
41
+ && cp -r requirements /app/full \
42
+ && cp requirements.txt /app/full \
43
+ && cp .devops/tools.sh /app/full/tools.sh
44
+
45
+ ## Base image
46
+ FROM ${BASE_MUSA_RUN_CONTAINER} AS base
47
+
48
+ RUN apt-get update \
49
+ && apt-get install -y libgomp1 curl\
50
+ && apt autoremove -y \
51
+ && apt clean -y \
52
+ && rm -rf /tmp/* /var/tmp/* \
53
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
54
+ && find /var/cache -type f -delete
55
+
56
+ COPY --from=build /app/lib/ /app
57
+
58
+ ### Full
59
+ FROM base AS full
60
+
61
+ COPY --from=build /app/full /app
62
+
63
+ WORKDIR /app
64
+
65
+ RUN apt-get update \
66
+ && apt-get install -y \
67
+ git \
68
+ python3 \
69
+ python3-pip \
70
+ && pip install --upgrade pip setuptools wheel \
71
+ && pip install -r requirements.txt \
72
+ && apt autoremove -y \
73
+ && apt clean -y \
74
+ && rm -rf /tmp/* /var/tmp/* \
75
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
76
+ && find /var/cache -type f -delete
77
+
78
+
79
+ ENTRYPOINT ["/app/tools.sh"]
80
+
81
+ ### Light, CLI only
82
+ FROM base AS light
83
+
84
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
85
+
86
+ WORKDIR /app
87
+
88
+ ENTRYPOINT [ "/app/llama-cli" ]
89
+
90
+ ### Server, Server only
91
+ FROM base AS server
92
+
93
+ ENV LLAMA_ARG_HOST=0.0.0.0
94
+
95
+ COPY --from=build /app/full/llama-server /app
96
+
97
+ WORKDIR /app
98
+
99
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
100
+
101
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/rocm.Dockerfile ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=24.04
2
+
3
+ # This needs to generally match the container host's environment.
4
+ ARG ROCM_VERSION=7.2
5
+ ARG AMDGPU_VERSION=7.2
6
+
7
+ # Target the ROCm build image
8
+ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
9
+
10
+ ### Build image
11
+ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
12
+
13
+ # Unless otherwise specified, we make a fat build.
14
+ # This is mostly tied to rocBLAS supported archs.
15
+ # check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
16
+ # check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
17
+ # check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
18
+
19
+ ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'
20
+
21
+ # Set ROCm architectures
22
+ ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
23
+
24
+ RUN apt-get update \
25
+ && apt-get install -y \
26
+ build-essential \
27
+ cmake \
28
+ git \
29
+ libssl-dev \
30
+ curl \
31
+ libgomp1
32
+
33
+ WORKDIR /app
34
+
35
+ COPY . .
36
+
37
+ RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
38
+ cmake -S . -B build \
39
+ -DGGML_HIP=ON \
40
+ -DGGML_HIP_ROCWMMA_FATTN=ON \
41
+ -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
42
+ -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
43
+ -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
44
+ && cmake --build build --config Release -j$(nproc)
45
+
46
+ RUN mkdir -p /app/lib \
47
+ && find build -name "*.so*" -exec cp -P {} /app/lib \;
48
+
49
+ RUN mkdir -p /app/full \
50
+ && cp build/bin/* /app/full \
51
+ && cp *.py /app/full \
52
+ && cp -r gguf-py /app/full \
53
+ && cp -r requirements /app/full \
54
+ && cp requirements.txt /app/full \
55
+ && cp .devops/tools.sh /app/full/tools.sh
56
+
57
+ ## Base image
58
+ FROM ${BASE_ROCM_DEV_CONTAINER} AS base
59
+
60
+ RUN apt-get update \
61
+ && apt-get install -y libgomp1 curl\
62
+ && apt autoremove -y \
63
+ && apt clean -y \
64
+ && rm -rf /tmp/* /var/tmp/* \
65
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
66
+ && find /var/cache -type f -delete
67
+
68
+ COPY --from=build /app/lib/ /app
69
+
70
+ ### Full
71
+ FROM base AS full
72
+
73
+ COPY --from=build /app/full /app
74
+
75
+ WORKDIR /app
76
+
77
+ RUN apt-get update \
78
+ && apt-get install -y \
79
+ git \
80
+ python3-pip \
81
+ python3 \
82
+ python3-wheel\
83
+ && pip install --break-system-packages --upgrade setuptools \
84
+ && pip install --break-system-packages -r requirements.txt \
85
+ && apt autoremove -y \
86
+ && apt clean -y \
87
+ && rm -rf /tmp/* /var/tmp/* \
88
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
89
+ && find /var/cache -type f -delete
90
+
91
+ ENTRYPOINT ["/app/tools.sh"]
92
+
93
+ ### Light, CLI only
94
+ FROM base AS light
95
+
96
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
97
+
98
+ WORKDIR /app
99
+
100
+ ENTRYPOINT [ "/app/llama-cli" ]
101
+
102
+ ### Server, Server only
103
+ FROM base AS server
104
+
105
+ ENV LLAMA_ARG_HOST=0.0.0.0
106
+
107
+ COPY --from=build /app/full/llama-server /app
108
+
109
+ WORKDIR /app
110
+
111
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
112
+
113
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/s390x.Dockerfile ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG GCC_VERSION=15.2.0
2
+ ARG UBUNTU_VERSION=24.04
3
+
4
+ ### Build Llama.cpp stage
5
+ FROM gcc:${GCC_VERSION} AS build
6
+
7
+ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
8
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
9
+ apt update -y && \
10
+ apt upgrade -y && \
11
+ apt install -y --no-install-recommends \
12
+ git cmake ccache ninja-build \
13
+ # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
14
+ libopenblas-dev libssl-dev && \
15
+ rm -rf /var/lib/apt/lists/*
16
+
17
+ WORKDIR /app
18
+ COPY . .
19
+
20
+ RUN --mount=type=cache,target=/root/.ccache \
21
+ --mount=type=cache,target=/app/build \
22
+ cmake -S . -B build -G Ninja \
23
+ -DCMAKE_BUILD_TYPE=Release \
24
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
25
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
26
+ -DLLAMA_BUILD_TESTS=OFF \
27
+ -DGGML_NATIVE=OFF \
28
+ -DGGML_BACKEND_DL=ON \
29
+ -DGGML_CPU_ALL_VARIANTS=ON \
30
+ -DGGML_BLAS=ON \
31
+ -DGGML_BLAS_VENDOR=OpenBLAS && \
32
+ cmake --build build --config Release -j $(nproc) && \
33
+ cmake --install build --prefix /opt/llama.cpp
34
+
35
+ COPY *.py /opt/llama.cpp/bin
36
+ COPY .devops/tools.sh /opt/llama.cpp/bin
37
+
38
+ COPY gguf-py /opt/llama.cpp/gguf-py
39
+ COPY requirements.txt /opt/llama.cpp/gguf-py
40
+ COPY requirements /opt/llama.cpp/gguf-py/requirements
41
+
42
+
43
+ ### Collect all llama.cpp binaries, libraries and distro libraries
44
+ FROM scratch AS collector
45
+
46
+ # Copy llama.cpp binaries and libraries
47
+ COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
48
+ COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
49
+ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
50
+
51
+
52
+ ### Base image
53
+ FROM ubuntu:${UBUNTU_VERSION} AS base
54
+
55
+ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
56
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
57
+ apt update -y && \
58
+ apt install -y --no-install-recommends \
59
+ # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
60
+ # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
61
+ curl libgomp1 libopenblas-dev && \
62
+ apt autoremove -y && \
63
+ apt clean -y && \
64
+ rm -rf /tmp/* /var/tmp/* && \
65
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
66
+ find /var/cache -type f -delete
67
+
68
+ # Copy llama.cpp libraries
69
+ COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
70
+
71
+
72
+ ### Full
73
+ FROM base AS full
74
+
75
+ ENV PATH="/root/.cargo/bin:${PATH}"
76
+ WORKDIR /app
77
+
78
+ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
79
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
80
+ apt update -y && \
81
+ apt install -y \
82
+ git cmake libjpeg-dev \
83
+ python3 python3-pip python3-dev && \
84
+ apt autoremove -y && \
85
+ apt clean -y && \
86
+ rm -rf /tmp/* /var/tmp/* && \
87
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
88
+ find /var/cache -type f -delete
89
+
90
+ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
91
+
92
+ COPY --from=collector /llama.cpp/bin /app
93
+ COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
94
+
95
+ RUN pip install --no-cache-dir --break-system-packages \
96
+ -r /app/gguf-py/requirements.txt
97
+
98
+ ENTRYPOINT [ "/app/tools.sh" ]
99
+
100
+
101
+ ### CLI Only
102
+ FROM base AS light
103
+
104
+ WORKDIR /llama.cpp/bin
105
+
106
+ # Copy llama.cpp binaries and libraries
107
+ COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
108
+ COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
109
+
110
+ ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
111
+
112
+
113
+ ### Server
114
+ FROM base AS server
115
+
116
+ ENV LLAMA_ARG_HOST=0.0.0.0
117
+
118
+ WORKDIR /llama.cpp/bin
119
+
120
+ # Copy llama.cpp binaries and libraries
121
+ COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
122
+ COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
123
+
124
+ EXPOSE 8080
125
+
126
+ ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
llama.cpp/.devops/tools.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -e
3
+
4
+ # Read the first argument into a variable
5
+ arg1="$1"
6
+
7
+ # Shift the arguments to remove the first one
8
+ shift
9
+
10
+ if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
11
+ exec python3 ./convert_hf_to_gguf.py "$@"
12
+ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
13
+ exec ./llama-quantize "$@"
14
+ elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
15
+ exec ./llama-cli "$@"
16
+ elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
17
+ exec ./llama-completion "$@"
18
+ elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
19
+ exec ./llama-bench "$@"
20
+ elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
21
+ exec ./llama-perplexity "$@"
22
+ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
23
+ echo "Converting PTH to GGML..."
24
+ for i in $(ls $1/$2/ggml-model-f16.bin*); do
25
+ if [ -f "${i/f16/q4_0}" ]; then
26
+ echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
27
+ else
28
+ echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
29
+ exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
30
+ fi
31
+ done
32
+ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
33
+ exec ./llama-server "$@"
34
+ else
35
+ echo "Unknown command: $arg1"
36
+ echo "Available commands: "
37
+ echo " --run (-r): Run a model (chat) previously converted into ggml"
38
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin"
39
+ echo " --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
40
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
41
+ echo " --bench (-b): Benchmark the performance of the inference for various parameters."
42
+ echo " ex: -m model.gguf"
43
+ echo " --perplexity (-p): Measure the perplexity of a model over a given text."
44
+ echo " ex: -m model.gguf -f file.txt"
45
+ echo " --convert (-c): Convert a llama model into ggml"
46
+ echo " ex: --outtype f16 \"/models/7B/\" "
47
+ echo " --quantize (-q): Optimize with quantization process ggml"
48
+ echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
49
+ echo " --all-in-one (-a): Execute --convert & --quantize"
50
+ echo " ex: \"/models/\" 7B"
51
+ echo " --server (-s): Run a model on the server"
52
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
53
+ fi
llama.cpp/.devops/vulkan.Dockerfile ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG UBUNTU_VERSION=26.04
2
+
3
+ FROM ubuntu:$UBUNTU_VERSION AS build
4
+
5
+ # Install build tools
6
+ RUN apt update && apt install -y git build-essential cmake wget xz-utils
7
+
8
+ # Install SSL and Vulkan SDK dependencies
9
+ RUN apt install -y libssl-dev curl \
10
+ libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
11
+
12
+ # Build it
13
+ WORKDIR /app
14
+
15
+ COPY . .
16
+
17
+ RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
18
+ cmake --build build --config Release -j$(nproc)
19
+
20
+ RUN mkdir -p /app/lib && \
21
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
22
+
23
+ RUN mkdir -p /app/full \
24
+ && cp build/bin/* /app/full \
25
+ && cp *.py /app/full \
26
+ && cp -r gguf-py /app/full \
27
+ && cp -r requirements /app/full \
28
+ && cp requirements.txt /app/full \
29
+ && cp .devops/tools.sh /app/full/tools.sh
30
+
31
+ ## Base image
32
+ FROM ubuntu:$UBUNTU_VERSION AS base
33
+
34
+ RUN apt-get update \
35
+ && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
36
+ libglvnd0 libgl1 libglx0 libegl1 libgles2 \
37
+ && apt autoremove -y \
38
+ && apt clean -y \
39
+ && rm -rf /tmp/* /var/tmp/* \
40
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
41
+ && find /var/cache -type f -delete
42
+
43
+ COPY --from=build /app/lib/ /app
44
+
45
+ ### Full
46
+ FROM base AS full
47
+
48
+ COPY --from=build /app/full /app
49
+
50
+ WORKDIR /app
51
+
52
+ RUN apt-get update \
53
+ && apt-get install -y \
54
+ build-essential \
55
+ git \
56
+ python3 \
57
+ python3-dev \
58
+ python3-pip \
59
+ python3-wheel \
60
+ && pip install --break-system-packages --upgrade setuptools \
61
+ && pip install --break-system-packages -r requirements.txt \
62
+ && apt autoremove -y \
63
+ && apt clean -y \
64
+ && rm -rf /tmp/* /var/tmp/* \
65
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
66
+ && find /var/cache -type f -delete
67
+
68
+ ENTRYPOINT ["/app/tools.sh"]
69
+
70
+ ### Light, CLI only
71
+ FROM base AS light
72
+
73
+ COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
74
+
75
+ WORKDIR /app
76
+
77
+ ENTRYPOINT [ "/app/llama-cli" ]
78
+
79
+ ### Server, Server only
80
+ FROM base AS server
81
+
82
+ ENV LLAMA_ARG_HOST=0.0.0.0
83
+
84
+ COPY --from=build /app/full/llama-server /app
85
+
86
+ WORKDIR /app
87
+
88
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
89
+
90
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.gemini/settings.json ADDED
@@ -0,0 +1 @@
 
 
1
+ { "contextFileName": "AGENTS.md" }
llama.cpp/.github/labeler.yml ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/actions/labeler
2
+ Apple Metal:
3
+ - changed-files:
4
+ - any-glob-to-any-file:
5
+ - ggml/include/ggml-metal.h
6
+ - ggml/src/ggml-metal/**
7
+ - README-metal.md
8
+ SYCL:
9
+ - changed-files:
10
+ - any-glob-to-any-file:
11
+ - ggml/include/ggml-sycl.h
12
+ - ggml/src/ggml-sycl/**
13
+ - docs/backend/SYCL.md
14
+ - examples/sycl/**
15
+ Nvidia GPU:
16
+ - changed-files:
17
+ - any-glob-to-any-file:
18
+ - ggml/include/ggml-cuda.h
19
+ - ggml/src/ggml-cuda/**
20
+ Vulkan:
21
+ - changed-files:
22
+ - any-glob-to-any-file:
23
+ - ggml/include/ggml-vulkan.h
24
+ - ggml/src/ggml-vulkan/**
25
+ IBM zDNN:
26
+ - changed-files:
27
+ - any-glob-to-any-file:
28
+ - ggml/include/ggml-zdnn.h
29
+ - ggml/src/ggml-zdnn/**
30
+ documentation:
31
+ - changed-files:
32
+ - any-glob-to-any-file:
33
+ - docs/**
34
+ - media/**
35
+ testing:
36
+ - changed-files:
37
+ - any-glob-to-any-file:
38
+ - tests/**
39
+ build:
40
+ - changed-files:
41
+ - any-glob-to-any-file:
42
+ - cmake/**
43
+ - CMakeLists.txt
44
+ - CMakePresets.json
45
+ examples:
46
+ - changed-files:
47
+ - any-glob-to-any-file:
48
+ - examples/**
49
+ - tools/**
50
+ devops:
51
+ - changed-files:
52
+ - any-glob-to-any-file:
53
+ - .devops/**
54
+ - .github/**
55
+ - ci/**
56
+ python:
57
+ - changed-files:
58
+ - any-glob-to-any-file:
59
+ - "**/*.py"
60
+ - requirements/**
61
+ - gguf-py/**
62
+ - .flake8
63
+ script:
64
+ - changed-files:
65
+ - any-glob-to-any-file:
66
+ - scripts/**
67
+ android:
68
+ - changed-files:
69
+ - any-glob-to-any-file:
70
+ - examples/llama.android/**
71
+ server:
72
+ - changed-files:
73
+ - any-glob-to-any-file:
74
+ - tools/server/**
75
+ ggml:
76
+ - changed-files:
77
+ - any-glob-to-any-file:
78
+ - ggml/**
79
+ model:
80
+ - changed-files:
81
+ - any-glob-to-any-file:
82
+ - src/models/**
83
+ nix:
84
+ - changed-files:
85
+ - any-glob-to-any-file:
86
+ - "**/*.nix"
87
+ - .github/workflows/nix-*.yml
88
+ - .devops/nix/nixpkgs-instances.nix
89
+ embedding:
90
+ - changed-files:
91
+ - any-glob-to-any-file: examples/embedding/
92
+ jinja parser:
93
+ - changed-files:
94
+ - any-glob-to-any-file:
95
+ - common/jinja/**
96
+ Ascend NPU:
97
+ - changed-files:
98
+ - any-glob-to-any-file:
99
+ - ggml/include/ggml-cann.h
100
+ - ggml/src/ggml-cann/**
101
+ - docs/backend/CANN.md
102
+ OpenCL:
103
+ - changed-files:
104
+ - any-glob-to-any-file:
105
+ - ggml/include/ggml-opencl.h
106
+ - ggml/src/ggml-opencl/**
llama.cpp/.github/pull_request_template.md ADDED
@@ -0,0 +1 @@
 
 
1
+ *Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
llama.cpp/build/CMakeCache.txt ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the CMakeCache file.
2
+ # For build in directory: r:/Quillan/Quillan-v4.2-model/llama.cpp/build
3
+ # It was generated by CMake: C:/Program Files/CMake/bin/cmake.exe
4
+ # You can edit this file to change values found and used by cmake.
5
+ # If you do not want to change any of the values, simply exit the editor.
6
+ # If you do want to change a value, simply edit, save, and exit the editor.
7
+ # The syntax for the file is as follows:
8
+ # KEY:TYPE=VALUE
9
+ # KEY is the name of a variable in the cache.
10
+ # TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
11
+ # VALUE is the current value for the KEY.
12
+
13
+ ########################
14
+ # EXTERNAL cache entries
15
+ ########################
16
+
17
+ //Value Computed by CMake.
18
+ CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=R:/Quillan/Quillan-v4.2-model/llama.cpp/build/CMakeFiles/pkgRedirects
19
+
20
+ //Program used to build from makefiles.
21
+ CMAKE_MAKE_PROGRAM:STRING=nmake
22
+
23
+ //Value Computed by CMake
24
+ CMAKE_PROJECT_COMPAT_VERSION:STATIC=
25
+
26
+ //Value Computed by CMake
27
+ CMAKE_PROJECT_DESCRIPTION:STATIC=
28
+
29
+ //Value Computed by CMake
30
+ CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
31
+
32
+ //Value Computed by CMake
33
+ CMAKE_PROJECT_NAME:STATIC=llama.cpp
34
+
35
+ //Value Computed by CMake
36
+ CMAKE_PROJECT_SPDX_LICENSE:STATIC=
37
+
38
+ //Value Computed by CMake
39
+ llama.cpp_BINARY_DIR:STATIC=R:/Quillan/Quillan-v4.2-model/llama.cpp/build
40
+
41
+ //Value Computed by CMake
42
+ llama.cpp_IS_TOP_LEVEL:STATIC=ON
43
+
44
+ //Value Computed by CMake
45
+ llama.cpp_SOURCE_DIR:STATIC=R:/Quillan/Quillan-v4.2-model/llama.cpp
46
+
47
+
48
+ ########################
49
+ # INTERNAL cache entries
50
+ ########################
51
+
52
+ //This is the directory where this CMakeCache.txt was created
53
+ CMAKE_CACHEFILE_DIR:INTERNAL=r:/Quillan/Quillan-v4.2-model/llama.cpp/build
54
+ //Major version of cmake used to create the current loaded cache
55
+ CMAKE_CACHE_MAJOR_VERSION:INTERNAL=4
56
+ //Minor version of cmake used to create the current loaded cache
57
+ CMAKE_CACHE_MINOR_VERSION:INTERNAL=2
58
+ //Patch version of cmake used to create the current loaded cache
59
+ CMAKE_CACHE_PATCH_VERSION:INTERNAL=3
60
+ //Path to CMake executable.
61
+ CMAKE_COMMAND:INTERNAL=C:/Program Files/CMake/bin/cmake.exe
62
+ //Path to cpack program executable.
63
+ CMAKE_CPACK_COMMAND:INTERNAL=C:/Program Files/CMake/bin/cpack.exe
64
+ //Path to ctest program executable.
65
+ CMAKE_CTEST_COMMAND:INTERNAL=C:/Program Files/CMake/bin/ctest.exe
66
+ //Path to cache edit program executable.
67
+ CMAKE_EDIT_COMMAND:INTERNAL=C:/Program Files/CMake/bin/cmake-gui.exe
68
+ //Name of external makefile project generator.
69
+ CMAKE_EXTRA_GENERATOR:INTERNAL=
70
+ //Name of generator.
71
+ CMAKE_GENERATOR:INTERNAL=NMake Makefiles
72
+ //Generator instance identifier.
73
+ CMAKE_GENERATOR_INSTANCE:INTERNAL=
74
+ //Name of generator platform.
75
+ CMAKE_GENERATOR_PLATFORM:INTERNAL=
76
+ //Name of generator toolset.
77
+ CMAKE_GENERATOR_TOOLSET:INTERNAL=
78
+ //Source directory with the top level CMakeLists.txt file for this
79
+ // project
80
+ CMAKE_HOME_DIRECTORY:INTERNAL=R:/Quillan/Quillan-v4.2-model/llama.cpp
81
+ //Name of CMakeLists files to read
82
+ CMAKE_LIST_FILE_NAME:INTERNAL=CMakeLists.txt
83
+ //ADVANCED property for variable: CMAKE_MAKE_PROGRAM
84
+ CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
85
+ //number of local generators
86
+ CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
87
+ //Platform information initialized
88
+ CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
89
+ //Path to CMake installation.
90
+ CMAKE_ROOT:INTERNAL=C:/Program Files/CMake/share/cmake-4.2
91
+
llama.cpp/ci/README-MUSA.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Running MUSA CI in a Docker Container
2
+
3
+ Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
4
+
5
+ ### 1. Create a local directory to store cached models, configuration files and venv:
6
+
7
+ ```bash
8
+ mkdir -p $HOME/llama.cpp/ci-cache
9
+ ```
10
+
11
+ ### 2. Create a local directory to store CI run results:
12
+
13
+ ```bash
14
+ mkdir -p $HOME/llama.cpp/ci-results
15
+ ```
16
+
17
+ ### 3. Start a Docker container and run the CI:
18
+
19
+ ```bash
20
+ docker run --privileged -it \
21
+ -v $HOME/llama.cpp/ci-cache:/ci-cache \
22
+ -v $HOME/llama.cpp/ci-results:/ci-results \
23
+ -v $PWD:/ws -w /ws \
24
+ mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
25
+ ```
26
+
27
+ Inside the container, execute the following commands:
28
+
29
+ ```bash
30
+ apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
31
+ git config --global --add safe.directory /ws
32
+ GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
33
+ ```
34
+
35
+ This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
llama.cpp/ci/README.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CI
2
+
3
+ This CI implements heavy-duty workflows that run on self-hosted runners. Typically the purpose of these workflows is to
4
+ cover hardware configurations that are not available from Github-hosted runners and/or require more computational
5
+ resource than normally available.
6
+
7
+ It is a good practice, before publishing changes to execute the full CI locally on your machine. For example:
8
+
9
+ ```bash
10
+ mkdir tmp
11
+
12
+ # CPU-only build
13
+ bash ./ci/run.sh ./tmp/results ./tmp/mnt
14
+
15
+ # with CUDA support
16
+ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
17
+
18
+ # with SYCL support
19
+ source /opt/intel/oneapi/setvars.sh
20
+ GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
21
+
22
+ # with MUSA support
23
+ GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
24
+
25
+ # etc.
26
+ ```
27
+
28
+ # Adding self-hosted runners
29
+
30
+ - Add a self-hosted `ggml-ci` workflow to [[.github/workflows/build.yml]] with an appropriate label
31
+ - Request a runner token from `ggml-org` (for example, via a comment in the PR or email)
32
+ - Set-up a machine using the received token ([docs](https://docs.github.com/en/actions/how-tos/manage-runners/self-hosted-runners/add-runners))
33
+ - Optionally update [ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) to build and run on the target platform by gating the implementation with a `GG_BUILD_...` env
llama.cpp/ci/run.sh ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # sample usage:
4
+ #
5
+ # mkdir tmp
6
+ #
7
+ # # CPU-only build
8
+ # bash ./ci/run.sh ./tmp/results ./tmp/mnt
9
+ #
10
+ # # with CUDA support
11
+ # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
12
+ #
13
+ # # with SYCL support
14
+ # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
15
+ #
16
+ # # with VULKAN support
17
+ # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
18
+ #
19
+ # # with WebGPU support
20
+ # GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
21
+ #
22
+ # # with MUSA support
23
+ # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
24
+ #
25
+ # # with KLEIDIAI support
26
+ # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
27
+ #
28
+
29
+ if [ -z "$2" ]; then
30
+ echo "usage: $0 <output-dir> <mnt-dir>"
31
+ exit 1
32
+ fi
33
+
34
+ mkdir -p "$1"
35
+ mkdir -p "$2"
36
+
37
+ OUT=$(realpath "$1")
38
+ MNT=$(realpath "$2")
39
+
40
+ rm -f $OUT/*.log
41
+ rm -f $OUT/*.exit
42
+ rm -f $OUT/*.md
43
+
44
+ sd=`dirname $0`
45
+ cd $sd/../
46
+ SRC=`pwd`
47
+
48
+ CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
49
+
50
+ if [ ! -z ${GG_BUILD_METAL} ]; then
51
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
52
+ fi
53
+
54
+ if [ ! -z ${GG_BUILD_CUDA} ]; then
55
+ # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
56
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DGGML_CUDA_CUB_3DOT2=ON"
57
+
58
+ if command -v nvidia-smi >/dev/null 2>&1; then
59
+ CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
60
+ if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
61
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
62
+ else
63
+ echo "Warning: Using fallback CUDA architectures"
64
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
65
+ fi
66
+ else
67
+ echo "Error: nvidia-smi not found, cannot build with CUDA"
68
+ exit 1
69
+ fi
70
+ fi
71
+
72
+ if [ ! -z ${GG_BUILD_ROCM} ]; then
73
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
74
+ if [ -z ${GG_BUILD_AMDGPU_TARGETS} ]; then
75
+ echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
76
+ exit 1
77
+ fi
78
+
79
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
80
+ fi
81
+
82
+ if [ ! -z ${GG_BUILD_SYCL} ]; then
83
+ if [ -z ${ONEAPI_ROOT} ]; then
84
+ echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
85
+ echo "source /opt/intel/oneapi/setvars.sh"
86
+ exit 1
87
+ fi
88
+ # Use only main GPU
89
+ export ONEAPI_DEVICE_SELECTOR="level_zero:0"
90
+ # Enable sysman for correct memory reporting
91
+ export ZES_ENABLE_SYSMAN=1
92
+ # to circumvent precision issues on CPY operations
93
+ export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
94
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
95
+ fi
96
+
97
+ if [ ! -z ${GG_BUILD_VULKAN} ]; then
98
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
99
+
100
+ # if on Mac, disable METAL
101
+ if [[ "$OSTYPE" == "darwin"* ]]; then
102
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
103
+ fi
104
+
105
+ fi
106
+
107
+ if [ ! -z ${GG_BUILD_WEBGPU} ]; then
108
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
109
+
110
+ if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
111
+ if [ -z "${CMAKE_PREFIX_PATH}" ]; then
112
+ export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
113
+ else
114
+ export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
115
+ fi
116
+ fi
117
+
118
+ # For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
119
+ if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
120
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
121
+ fi
122
+ fi
123
+
124
+ if [ ! -z ${GG_BUILD_MUSA} ]; then
125
+ # Use qy1 by default (MTT S80)
126
+ MUSA_ARCH=${MUSA_ARCH:-21}
127
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
128
+ fi
129
+
130
+ if [ ! -z ${GG_BUILD_NO_SVE} ]; then
131
+ # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
132
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
133
+ fi
134
+
135
+ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
136
+ echo ">>===== Enabling KleidiAI support"
137
+
138
+ CANDIDATES=(
139
+ "armv9-a+dotprod+i8mm+sve2"
140
+ "armv9-a+dotprod+i8mm"
141
+ "armv8.6-a+dotprod+i8mm"
142
+ "armv8.2-a+dotprod"
143
+ )
144
+ CPU=""
145
+
146
+ for cpu in "${CANDIDATES[@]}"; do
147
+ if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
148
+ CPU="$cpu"
149
+ break
150
+ fi
151
+ done
152
+
153
+ if [ -z "$CPU" ]; then
154
+ echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
155
+ exit 1
156
+ fi
157
+
158
+ echo ">>===== Using ARM baseline: ${CPU}"
159
+
160
+ CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
161
+ -DGGML_NATIVE=OFF \
162
+ -DGGML_CPU_KLEIDIAI=ON \
163
+ -DGGML_CPU_AARCH64=ON \
164
+ -DGGML_CPU_ARM_ARCH=${CPU} \
165
+ -DBUILD_SHARED_LIBS=OFF"
166
+ fi
167
+
168
+ ## helpers
169
+
170
+ # download a file if it does not exist or if it is outdated
171
+ function gg_wget {
172
+ local out=$1
173
+ local url=$2
174
+
175
+ local cwd=`pwd`
176
+
177
+ mkdir -p $out
178
+ cd $out
179
+
180
+ # should not re-download if file is the same
181
+ wget -nv -c -N $url
182
+
183
+ cd $cwd
184
+ }
185
+
186
+ function gg_printf {
187
+ printf -- "$@" >> $OUT/README.md
188
+ }
189
+
190
+ function gg_run {
191
+ ci=$1
192
+
193
+ set -o pipefail
194
+ set -x
195
+
196
+ gg_run_$ci | tee $OUT/$ci.log
197
+ cur=$?
198
+ echo "$cur" > $OUT/$ci.exit
199
+
200
+ set +x
201
+ set +o pipefail
202
+
203
+ gg_sum_$ci
204
+
205
+ ret=$((ret | cur))
206
+ }
207
+
208
+ ## ci
209
+
210
+ # ctest_debug
211
+
212
+ function gg_run_ctest_debug {
213
+ cd ${SRC}
214
+
215
+ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
216
+
217
+ set -e
218
+
219
+ # Check cmake, make and ctest are installed
220
+ gg_check_build_requirements
221
+
222
+ (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
223
+ (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
224
+
225
+ (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
226
+
227
+ set +e
228
+ }
229
+
230
# Report section for the ctest_debug stage: stage status followed by the
# full ctest log in a fenced code block.
function gg_sum_ctest_debug {
    gg_printf '### %s\n\nRuns ctest in debug mode\n- status: %s\n```\n%s\n```\n\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-ctest.log)"
}
240
+
241
+ # ctest_release
242
+
243
# ctest_release stage: configure + build in Release and run ctest.
# On full-performance runners the python-labelled tests run too; low-perf
# runners skip them and the slow test-opt test.
function gg_run_ctest_release {
    cd ${SRC}

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e                      # abort the stage on the first failing command

    # Check cmake, make and ctest are installed
    gg_check_build_requirements

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
}
264
+
265
# Report section for the ctest_release stage: stage status followed by the
# full ctest log in a fenced code block.
function gg_sum_ctest_release {
    gg_printf '### %s\n\nRuns ctest in release mode\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-ctest.log)"
}
274
+
275
+ # test_scripts
276
+
277
# test_scripts stage: run the shell test suites of the gguf-split and
# quantize tools against the release build and the shared model cache.
function gg_run_test_scripts {
    cd ${SRC}

    set -e                      # abort the stage on the first failing command

    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
    (cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
}
287
+
288
# Report section for the test_scripts stage: stage status followed by the
# combined script log in a fenced code block.
function gg_sum_test_scripts {
    gg_printf '### %s\n\nRuns test scripts\n- status: %s\n```\n%s\n```\n\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-scripts.log)"
}
298
+
299
# Print (without trailing newline) the path of the cached CI model file;
# abort the script if it has not been produced by an earlier stage.
function gg_get_model {
    #local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"

    # guard: the model must exist and be non-empty
    if [[ ! -s $gguf_0 ]]; then
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
    fi

    echo -n "$gguf_0"
}
309
+
310
# ctest_with_model (debug) stage: run the model-labelled ctest tests against
# the debug build, pointing them at the cached model via LLAMACPP_TEST_MODELFILE.
function gg_run_ctest_with_model_debug {
    cd ${SRC}

    local model; model=$(gg_get_model)   # exits the script if no model is cached
    cd build-ci-debug
    set -e

    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
    cd ..
}
322
+
323
# ctest_with_model (release) stage: run the model-labelled ctest tests against
# the release build, pointing them at the cached model via LLAMACPP_TEST_MODELFILE.
function gg_run_ctest_with_model_release {
    cd ${SRC}

    local model; model=$(gg_get_model)   # exits the script if no model is cached
    cd build-ci-release
    set -e

    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    # test memory leaks
    #if [[ ! -z ${GG_BUILD_METAL} ]]; then
    #    # TODO: this hangs for some reason ...
    #    (time leaks -quiet -atExit -- ./bin/test-thread-safety -m $model --parallel 2 -t 2 -p "hello") 2>&1 | tee -a $OUT/${ci}-leaks.log
    #fi

    set +e
    cd ..
}
341
+
342
# Report section for the model-based debug ctest stage.
function gg_sum_ctest_with_model_debug {
    gg_printf '### %s\n\nRuns ctest with model files in debug mode\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-ctest.log)"
}
351
+
352
# Report section for the model-based release ctest stage.
function gg_sum_ctest_with_model_release {
    gg_printf '### %s\n\nRuns ctest with model files in release mode\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-ctest.log)"
}
361
+
362
+ # qwen3_0_6b
363
+
364
# qwen3_0_6b stage: end-to-end smoke test of the full pipeline on
# Qwen3-0.6B-Base — download, HF->GGUF conversion, quantization to every
# legacy/k-quant format, generation, perplexity on wikitext-2, imatrix
# computation, and the save/load-state test. Perplexity results are gated
# by check_ppl (ppl must stay below 20.0).
function gg_run_qwen3_0_6b {
    cd ${SRC}

    # fetch tokenizer/config/weights from Hugging Face (cached by gg_wget)
    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/config.json
    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer.json
    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer_config.json
    #gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/special_tokens_map.json
    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/resolve/main/model.safetensors


    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    # paths are relative to the build directory entered below
    path_models="../models-mnt/qwen3/0.6B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    # convert the HF checkpoint to GGUF in both f16 and bf16
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_bf16="${path_models}/ggml-model-bf16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    # quantize the bf16 model into each supported format
    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    # short deterministic (-s 1234) text generation for every format
    (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
    (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    # perplexity on two wikitext-2 chunks for every format
    # (bf16 is skipped when the backend does not support it)
    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    if [ -z ${GG_BUILD_NO_BF16} ]; then
        (time ./bin/llama-perplexity --model ${model_bf16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
    fi
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    # save/load-state with flash attention on/off, partial and full offload
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    # check_ppl NAME LOG_EXCERPT
    # Extract the last decimal number from LOG_EXCERPT (the chunk perplexity)
    # and fail (return 20) if it exceeds the 20.0 sanity threshold.
    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    if [ -z ${GG_BUILD_NO_BF16} ]; then
        check_ppl "bf16" "$(cat $OUT/${ci}-tg-bf16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    fi
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    # keep only the final imatrix summary line for the report
    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}
485
+
486
# Report section for the qwen3_0_6b stage: perplexity table, imatrix summary,
# and the generation log of every precision that was exercised.
function gg_sum_qwen3_0_6b {
    gg_printf '### %s\n\nQwen3 0.6B:\n- status: %s\n' "${ci}" "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    if [ -z ${GG_BUILD_NO_BF16} ]; then
        gg_printf '- bf16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-bf16.log)"
    fi
    # one fenced log section per quantization format
    local qnt
    for qnt in q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
        gg_printf '- %s:\n```\n%s\n```\n' "$qnt" "$(cat $OUT/${ci}-tg-$qnt.log)"
    done
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
509
+
510
+ # bge-small
511
+
512
# embd_bge_small stage: download BAAI/bge-small-en-v1.5 (a BERT embedding
# model), convert it to GGUF, quantize to q8_0, and run llama-embedding on
# both variants as a smoke test of the embedding path.
function gg_run_embd_bge_small {
    cd ${SRC}

    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json

    # pooling configuration lives in a subdirectory of the HF repo
    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json

    path_models="../models-mnt/bge-small"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    # -c 0 lets the context size be taken from the model
    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
}
550
+
551
# Report section for the embd_bge_small stage: status plus the f16 and q8_0
# embedding logs in fenced code blocks.
function gg_sum_embd_bge_small {
    gg_printf '### %s\n\nBGE Small (BERT):\n- status: %s\n- f16: \n```\n%s\n```\n- q8_0:\n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-tg-f16.log)" "$(cat $OUT/${ci}-tg-q8_0.log)"
}
559
+
560
+ # rerank_tiny
561
+
562
# rerank_tiny stage: download jina-reranker-v1-tiny-en, convert to GGUF, and
# run llama-embedding in reranking mode (--pooling rank) on one query with
# three candidate documents; the scores are then range-checked.
function gg_run_rerank_tiny {
    cd ${SRC}

    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.json

    path_models="../models-mnt/rerank-tiny"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"

    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    # for this model, the SEP token is "</s>"
    # query\tdocument pairs, one candidate per line
    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

    # sample output
    # rerank score 0:    0.029
    # rerank score 1:    0.029
    # rerank score 2:    0.135

    # check that the score is in the range [$3, $4]
    function check_score {
        qnt="$1"
        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$score"
        return 0
    }

    # the two irrelevant documents must score low, the relevant one higher
    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log

    set +e
}
615
+
616
# Report section for the rerank_tiny stage: status plus the rerank log.
function gg_sum_rerank_tiny {
    gg_printf '### %s\n\nRerank Tiny (Jina):\n- status: %s\n- f16: \n```\n%s\n```\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-rk-f16.log)"
}
623
+
624
# Note in the report any missing build prerequisite (cmake, make, ctest).
function gg_check_build_requirements {
    local tool
    for tool in cmake make ctest; do
        if ! command -v "$tool" &> /dev/null; then
            gg_printf "$tool not found, please install"
        fi
    done
}
637
+
638
# test_backend_ops_cpu stage: run the ggml operator test suite against the
# CPU backend, using the release build produced by an earlier stage.
function gg_run_test_backend_ops_cpu {
    cd ${SRC}

    cd build-ci-release

    set -e

    (time ./bin/test-backend-ops -b CPU ) 2>&1 | tee -a $OUT/${ci}-test-backend-ops-cpu.log

    set +e
}
649
+
650
# Report section for the test_backend_ops_cpu stage.
function gg_sum_test_backend_ops_cpu {
    gg_printf '### %s\n\nRuns test-backend-ops for CPU backend\n- status: %s\n```\n%s\n```\n\n' \
        "${ci}" "$(cat $OUT/${ci}.exit)" "$(cat $OUT/${ci}-test-backend-ops-cpu.log)"
}
660
+
661
## main
# Stage sequencing. Behavior is controlled by environment variables:
#   GG_BUILD_LOW_PERF     - skip model downloads and model-based stages
#   GG_BUILD_HIGH_PERF    - additionally run the backend-ops suite
#   GG_BUILD_CLOUD / GG_BUILD_EXTRA_TESTS_0 - gate the tool test scripts
# Each "test $ret -eq 0 &&" guard stops scheduling further stages once one
# has failed; $ret is maintained by gg_run.

export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a fresh python3 venv and enter it
    if ! python3 -m venv "$MNT/venv"; then
        echo "Error: Failed to create Python virtual environment at $MNT/venv."
        exit 1
    fi
    source "$MNT/venv/bin/activate"

    # conversion scripts need the repo's python deps and the gguf package
    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
    pip install --editable gguf-py --disable-pip-version-check
fi

ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
    test $ret -eq 0 && gg_run test_backend_ops_cpu
fi

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
        test $ret -eq 0 && gg_run test_scripts
    fi

    test $ret -eq 0 && gg_run qwen3_0_6b

    test $ret -eq 0 && gg_run ctest_with_model_debug
    test $ret -eq 0 && gg_run ctest_with_model_release
fi

# print the accumulated report and propagate the overall status
cat $OUT/README.md

exit $ret
llama.cpp/cmake/arm64-apple-clang.cmake ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CMake toolchain file: cross-compile for arm64 macOS with clang/clang++.
set( CMAKE_SYSTEM_NAME Darwin )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

# target triple passed to the compilers via --target
set( target arm64-apple-darwin-macho )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

# armv8.4-a baseline; fast FP model, but -fno-finite-math-only keeps
# inf/nan semantics intact
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )

# *_INIT variables seed the flags at the first configure, so user-supplied
# CMAKE_C_FLAGS/CMAKE_CXX_FLAGS still apply on top
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
llama.cpp/cmake/arm64-windows-llvm.cmake ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CMake toolchain file: cross-compile for arm64 Windows with LLVM clang.
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

# target triple passed to the compilers via --target
set( target arm64-pc-windows-msvc )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

# armv8.7-a baseline; fast FP model, but -fno-finite-math-only keeps
# inf/nan semantics intact
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

# *_INIT variables seed the flags at the first configure, so user-supplied
# CMAKE_C_FLAGS/CMAKE_CXX_FLAGS still apply on top
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
llama.cpp/cmake/build-info.cmake ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Derive build metadata (commit hash, commit count, compiler, target) for
# embedding into the binaries. All values fall back to safe defaults when
# git or the repository is unavailable.
set(BUILD_NUMBER 0)
set(BUILD_COMMIT "unknown")
set(BUILD_COMPILER "unknown")
set(BUILD_TARGET "unknown")

# Look for git
find_package(Git)
if(NOT Git_FOUND)
    # fall back to a plain PATH search when the CMake Git module finds nothing
    find_program(GIT_EXECUTABLE NAMES git git.exe)
    if(GIT_EXECUTABLE)
        set(Git_FOUND TRUE)
        message(STATUS "Found Git: ${GIT_EXECUTABLE}")
    else()
        message(WARNING "Git not found. Build info will not be accurate.")
    endif()
endif()

# Get the commit count and hash
if(Git_FOUND)
    execute_process(
        COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE HEAD
        OUTPUT_STRIP_TRAILING_WHITESPACE
        RESULT_VARIABLE RES
    )
    if (RES EQUAL 0)
        set(BUILD_COMMIT ${HEAD})
    endif()
    # commit count serves as a monotonically increasing build number
    execute_process(
        COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE COUNT
        OUTPUT_STRIP_TRAILING_WHITESPACE
        RESULT_VARIABLE RES
    )
    if (RES EQUAL 0)
        set(BUILD_NUMBER ${COUNT})
    endif()
endif()

set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")

# Visual Studio generators expose the platform name; otherwise compose one
# from the system name and processor
if(CMAKE_VS_PLATFORM_NAME)
    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
    set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
endif()
llama.cpp/cmake/common.cmake ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
include("ggml/cmake/common.cmake")

# llama_add_compile_flags
# Apply the project-wide warning and sanitizer settings to the current
# directory scope, honoring LLAMA_FATAL_WARNINGS, LLAMA_ALL_WARNINGS and
# the LLAMA_SANITIZE_* options.
function(llama_add_compile_flags)
    if (LLAMA_FATAL_WARNINGS)
        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
            list(APPEND C_FLAGS -Werror)
            list(APPEND CXX_FLAGS -Werror)
        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
            add_compile_options(/WX)
        endif()
    endif()

    if (LLAMA_ALL_WARNINGS)
        if (NOT MSVC)
            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
                                -Werror=implicit-int -Werror=implicit-function-declaration)

            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)

            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)

            list(APPEND C_FLAGS   ${WARNING_FLAGS})
            list(APPEND CXX_FLAGS ${WARNING_FLAGS})

            # populates GF_C_FLAGS / GF_CXX_FLAGS with compiler-specific flags
            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})

            # apply per-language via generator expressions
            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
        else()
            # todo : msvc
            set(C_FLAGS   "" PARENT_SCOPE)
            set(CXX_FLAGS "" PARENT_SCOPE)
        endif()
    endif()

    if (NOT MSVC)
        # sanitizer flags must be passed to both compile and link steps
        if (LLAMA_SANITIZE_THREAD)
            message(STATUS "Using -fsanitize=thread")

            add_compile_options(-fsanitize=thread)
            link_libraries     (-fsanitize=thread)
        endif()

        if (LLAMA_SANITIZE_ADDRESS)
            message(STATUS "Using -fsanitize=address")

            add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
            link_libraries     (-fsanitize=address)
        endif()

        if (LLAMA_SANITIZE_UNDEFINED)
            message(STATUS "Using -fsanitize=undefined")

            add_compile_options(-fsanitize=undefined)
            link_libraries     (-fsanitize=undefined)
        endif()
    endif()
endfunction()
llama.cpp/cmake/download-models.cmake ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Script-mode helper (cmake -DNAME=... -DDEST=... -DHASH=... -P ...):
# download model file NAME from the ggml-org/models Hugging Face repo into
# DEST, verifying it against HASH (e.g. SHA256=...).
get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
file(MAKE_DIRECTORY "${DEST_DIR}")

if(NOT EXISTS "${DEST}")
    message(STATUS "Downloading ${NAME} from ggml-org/models...")
endif()

# EXPECTED_HASH makes this a no-op when DEST already matches
file(DOWNLOAD
    "https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
    "${DEST}"
    TLS_VERIFY ON
    EXPECTED_HASH ${HASH}
    STATUS status
)

# status is a list: <numeric code>;<message> — 0 means success
list(GET status 0 code)

if(NOT code EQUAL 0)
    list(GET status 1 msg)
    message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
endif()
llama.cpp/cmake/git-vars.cmake ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Populate GIT_SHA1, GIT_DATE and GIT_COMMIT_SUBJECT from the current HEAD.
# ERROR_QUIET leaves the variables empty when not inside a git repository.
find_package(Git)

# the commit's SHA1
# (--match=NeVeRmAtCh forces "describe" to fall back to the bare hash)
execute_process(COMMAND
    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_SHA1
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the date of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_DATE
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the subject of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%s
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
llama.cpp/cmake/license.cmake ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Aggregate third-party license texts into a generated C++ file so they can
# be embedded into a binary and shown at runtime.
define_property(GLOBAL PROPERTY LICENSE_TEXT
    BRIEF_DOCS "Embedded licenses"
    FULL_DOCS "Global string containing all aggregated licenses"
)

# license_add_file NAME FILE
# Append FILE's contents (titled "License for NAME") to the global
# LICENSE_TEXT property, wrapped as a C++ raw string literal.
function(license_add_file NAME FILE)
    if(NOT IS_ABSOLUTE "${FILE}")
        set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
    endif()
    if(EXISTS "${FILE}")
        set(TITLE "License for ${NAME}")
        # underline the title with one "=" per character
        string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
        file(READ "${FILE}" TEXT)
        get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
        # R"=L=(...)=L=" — custom raw-string delimiter so quotes/parens in
        # license texts need no escaping
        string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
        set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
    else()
        message(WARNING "License file '${FILE}' not found")
    endif()
endfunction()

# license_generate TARGET_NAME
# Write the accumulated licenses to ${CMAKE_BINARY_DIR}/license.cpp as a
# nullptr-terminated "const char* LICENSES[]" and attach it to TARGET_NAME.
function(license_generate TARGET_NAME)
    message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
    get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)

    set(CPP_CONTENT "// Generated by CMake\n\n")
    string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
    string(APPEND CPP_CONTENT "${TEXT}")
    string(APPEND CPP_CONTENT "nullptr\n")
    string(APPEND CPP_CONTENT "};\n")

    set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
    file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")

    if(TARGET ${TARGET_NAME})
        target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
    else()
        message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
    endif()
endfunction()
llama.cpp/cmake/llama-config.cmake.in ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Package configuration template for find_package(Llama); @...@ values are
# substituted by configure_package_config_file() at install time.
set(LLAMA_VERSION      @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

@PACKAGE_INIT@

# set_and_check errors out if the installed path no longer exists
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

# llama links against ggml, which installs its own package config
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

find_library(llama_LIBRARY llama
    REQUIRED
    HINTS ${LLAMA_LIB_DIR}
    NO_CMAKE_FIND_ROOT_PATH
)

# expose the installed library as an imported "llama" target
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
        INTERFACE_COMPILE_FEATURES c_std_90
        POSITION_INDEPENDENT_CODE ON)

check_required_components(Llama)
llama.cpp/cmake/llama.pc.in ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ prefix=@CMAKE_INSTALL_PREFIX@
2
+ exec_prefix=@CMAKE_INSTALL_PREFIX@
3
+ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
4
+ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
5
+
6
+ Name: llama
7
+ Description: Port of Facebook's LLaMA model in C/C++
8
+ Version: @LLAMA_INSTALL_VERSION@
9
+ Libs: -L${libdir} -lggml -lggml-base -lllama
10
+ Cflags: -I${includedir}
llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set(CMAKE_SYSTEM_NAME Linux)
2
+ set(CMAKE_SYSTEM_PROCESSOR riscv64)
3
+ set(CMAKE_SYSTEM_VERSION 1)
4
+
5
+ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
6
+ message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
7
+ else()
8
+ set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
9
+ if (DEFINED ENV{RISCV_ROOT_PATH})
10
+ file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
11
+ else()
12
+ message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
13
+ endif()
14
+
15
+ set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
16
+ set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
17
+ set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
18
+ set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
19
+ set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
20
+ set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
21
+ endif()
22
+
23
+ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
24
+ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
25
+ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
26
+ set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
27
+ set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
28
+ set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
29
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
llama.cpp/cmake/x64-windows-llvm.cmake ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ set( CMAKE_SYSTEM_NAME Windows )
2
+ set( CMAKE_SYSTEM_PROCESSOR x86_64 )
3
+
4
+ set( CMAKE_C_COMPILER clang )
5
+ set( CMAKE_CXX_COMPILER clang++ )
llama.cpp/common/CMakeLists.txt ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # common
2
+
3
+ find_package(Threads REQUIRED)
4
+
5
+ llama_add_compile_flags()
6
+
7
+ # Build info header
8
+
9
+ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
10
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
11
+
12
+ # Is git submodule
13
+ if(NOT IS_DIRECTORY "${GIT_DIR}")
14
+ file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
15
+ string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
16
+ string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
17
+ if (SLASH_POS EQUAL 0)
18
+ set(GIT_DIR "${REAL_GIT_DIR}")
19
+ else()
20
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
21
+ endif()
22
+ endif()
23
+
24
+ if(EXISTS "${GIT_DIR}/index")
25
+ # For build-info.cpp below
26
+ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
27
+ else()
28
+ message(WARNING "Git index not found in git repository.")
29
+ endif()
30
+ else()
31
+ message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
32
+ endif()
33
+
34
+ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
35
+ set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
36
+ configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
37
+
38
+ set(TARGET build_info)
39
+ add_library(${TARGET} OBJECT ${OUTPUT_FILE})
40
+ if (BUILD_SHARED_LIBS)
41
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
42
+ endif()
43
+
44
+ set(TARGET common)
45
+
46
+ add_library(${TARGET} STATIC
47
+ arg.cpp
48
+ arg.h
49
+ base64.hpp
50
+ chat-parser.cpp
51
+ chat-parser.h
52
+ chat-parser-xml-toolcall.h
53
+ chat-parser-xml-toolcall.cpp
54
+ chat-peg-parser.cpp
55
+ chat-peg-parser.h
56
+ chat.cpp
57
+ chat.h
58
+ common.cpp
59
+ common.h
60
+ console.cpp
61
+ console.h
62
+ debug.cpp
63
+ debug.h
64
+ download.cpp
65
+ download.h
66
+ http.h
67
+ json-partial.cpp
68
+ json-partial.h
69
+ json-schema-to-grammar.cpp
70
+ llguidance.cpp
71
+ log.cpp
72
+ log.h
73
+ ngram-cache.cpp
74
+ ngram-cache.h
75
+ ngram-map.cpp
76
+ ngram-map.h
77
+ ngram-mod.cpp
78
+ ngram-mod.h
79
+ peg-parser.cpp
80
+ peg-parser.h
81
+ preset.cpp
82
+ preset.h
83
+ regex-partial.cpp
84
+ regex-partial.h
85
+ sampling.cpp
86
+ sampling.h
87
+ speculative.cpp
88
+ speculative.h
89
+ unicode.cpp
90
+ unicode.h
91
+ jinja/lexer.cpp
92
+ jinja/lexer.h
93
+ jinja/parser.cpp
94
+ jinja/parser.h
95
+ jinja/runtime.cpp
96
+ jinja/runtime.h
97
+ jinja/value.cpp
98
+ jinja/value.h
99
+ jinja/string.cpp
100
+ jinja/string.h
101
+ jinja/caps.cpp
102
+ jinja/caps.h
103
+ )
104
+
105
+ target_include_directories(${TARGET} PUBLIC . ../vendor)
106
+ target_compile_features (${TARGET} PUBLIC cxx_std_17)
107
+
108
+ if (BUILD_SHARED_LIBS)
109
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
110
+ endif()
111
+
112
+ target_link_libraries(${TARGET} PRIVATE
113
+ build_info
114
+ cpp-httplib
115
+ )
116
+
117
+ if (LLAMA_LLGUIDANCE)
118
+ include(ExternalProject)
119
+ set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
120
+ set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
121
+ set(LLGUIDANCE_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}llguidance${CMAKE_STATIC_LIBRARY_SUFFIX}")
122
+
123
+ ExternalProject_Add(llguidance_ext
124
+ GIT_REPOSITORY https://github.com/guidance-ai/llguidance
125
+ # v1.0.1:
126
+ GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
127
+ PREFIX ${CMAKE_BINARY_DIR}/llguidance
128
+ SOURCE_DIR ${LLGUIDANCE_SRC}
129
+ BUILD_IN_SOURCE TRUE
130
+ CONFIGURE_COMMAND ""
131
+ BUILD_COMMAND cargo build --release --package llguidance
132
+ INSTALL_COMMAND ""
133
+ BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
134
+ UPDATE_COMMAND ""
135
+ )
136
+ target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
137
+
138
+ add_library(llguidance STATIC IMPORTED)
139
+ set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
140
+ add_dependencies(llguidance llguidance_ext)
141
+
142
+ target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
143
+ target_link_libraries(${TARGET} PRIVATE llguidance)
144
+ if (WIN32)
145
+ target_link_libraries(${TARGET} PRIVATE ws2_32 userenv ntdll bcrypt)
146
+ endif()
147
+ endif()
148
+
149
+ target_link_libraries(${TARGET} PUBLIC llama Threads::Threads)
llama.cpp/common/arg.cpp ADDED
The diff for this file is too large to render. See raw diff
 
llama.cpp/common/arg.h ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "common.h"
4
+
5
+ #include <set>
6
+ #include <map>
7
+ #include <string>
8
+ #include <vector>
9
+ #include <cstring>
10
+
11
+ // pseudo-env variable to identify preset-only arguments
12
+ #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
13
+ #define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
14
+
15
+ //
16
+ // CLI argument parsing
17
+ //
18
+
19
+ struct common_arg {
20
+ std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
21
+ std::set<enum llama_example> excludes = {};
22
+ std::vector<const char *> args;
23
+ std::vector<const char *> args_neg; // for negated args like --no-xxx
24
+ const char * value_hint = nullptr; // help text or example for arg value
25
+ const char * value_hint_2 = nullptr; // for second arg value
26
+ const char * env = nullptr;
27
+ std::string help;
28
+ bool is_sparam = false; // is current arg a sampling param?
29
+ bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
30
+ void (*handler_void) (common_params & params) = nullptr;
31
+ void (*handler_string) (common_params & params, const std::string &) = nullptr;
32
+ void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
33
+ void (*handler_int) (common_params & params, int) = nullptr;
34
+ void (*handler_bool) (common_params & params, bool) = nullptr;
35
+
36
+ common_arg() = default;
37
+
38
+ common_arg(
39
+ const std::initializer_list<const char *> & args,
40
+ const char * value_hint,
41
+ const std::string & help,
42
+ void (*handler)(common_params & params, const std::string &)
43
+ ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
44
+
45
+ common_arg(
46
+ const std::initializer_list<const char *> & args,
47
+ const char * value_hint,
48
+ const std::string & help,
49
+ void (*handler)(common_params & params, int)
50
+ ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
51
+
52
+ common_arg(
53
+ const std::initializer_list<const char *> & args,
54
+ const std::string & help,
55
+ void (*handler)(common_params & params)
56
+ ) : args(args), help(help), handler_void(handler) {}
57
+
58
+ common_arg(
59
+ const std::initializer_list<const char *> & args,
60
+ const std::initializer_list<const char *> & args_neg,
61
+ const std::string & help,
62
+ void (*handler)(common_params & params, bool)
63
+ ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
64
+
65
+ // support 2 values for arg
66
+ common_arg(
67
+ const std::initializer_list<const char *> & args,
68
+ const char * value_hint,
69
+ const char * value_hint_2,
70
+ const std::string & help,
71
+ void (*handler)(common_params & params, const std::string &, const std::string &)
72
+ ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
73
+
74
+ common_arg & set_examples(std::initializer_list<enum llama_example> examples);
75
+ common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
76
+ common_arg & set_env(const char * env);
77
+ common_arg & set_sparam();
78
+ common_arg & set_preset_only();
79
+ bool in_example(enum llama_example ex);
80
+ bool is_exclude(enum llama_example ex);
81
+ bool get_value_from_env(std::string & output) const;
82
+ bool has_value_from_env() const;
83
+ std::string to_string() const;
84
+
85
+ // for using as key in std::map
86
+ bool operator<(const common_arg& other) const {
87
+ if (args.empty() || other.args.empty()) {
88
+ return false;
89
+ }
90
+ return strcmp(args[0], other.args[0]) < 0;
91
+ }
92
+ bool operator==(const common_arg& other) const {
93
+ if (args.empty() || other.args.empty()) {
94
+ return false;
95
+ }
96
+ return strcmp(args[0], other.args[0]) == 0;
97
+ }
98
+
99
+ // get all args and env vars (including negated args/env)
100
+ std::vector<std::string> get_args() const;
101
+ std::vector<std::string> get_env() const;
102
+ };
103
+
104
+ namespace common_arg_utils {
105
+ bool is_truthy(const std::string & value);
106
+ bool is_falsey(const std::string & value);
107
+ bool is_autoy(const std::string & value);
108
+ }
109
+
110
+ struct common_params_context {
111
+ enum llama_example ex = LLAMA_EXAMPLE_COMMON;
112
+ common_params & params;
113
+ std::vector<common_arg> options;
114
+ void(*print_usage)(int, char **) = nullptr;
115
+ common_params_context(common_params & params) : params(params) {}
116
+ };
117
+
118
+ // parse input arguments from CLI
119
+ // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
120
+ bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
121
+
122
+ // parse input arguments from CLI into a map
123
+ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
124
+
125
+ // populate preset-only arguments
126
+ // these arguments are not treated as command line arguments
127
+ // see: https://github.com/ggml-org/llama.cpp/issues/18163
128
+ void common_params_add_preset_options(std::vector<common_arg> & args);
129
+
130
+ // initialize argument parser context - used by test-arg-parser and preset
131
+ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
llama.cpp/common/base64.hpp ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ This is free and unencumbered software released into the public domain.
3
+
4
+ Anyone is free to copy, modify, publish, use, compile, sell, or
5
+ distribute this software, either in source code form or as a compiled
6
+ binary, for any purpose, commercial or non-commercial, and by any
7
+ means.
8
+
9
+ In jurisdictions that recognize copyright laws, the author or authors
10
+ of this software dedicate any and all copyright interest in the
11
+ software to the public domain. We make this dedication for the benefit
12
+ of the public at large and to the detriment of our heirs and
13
+ successors. We intend this dedication to be an overt act of
14
+ relinquishment in perpetuity of all present and future rights to this
15
+ software under copyright law.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20
+ IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23
+ OTHER DEALINGS IN THE SOFTWARE.
24
+
25
+ For more information, please refer to <http://unlicense.org>
26
+ */
27
+
28
+ #ifndef PUBLIC_DOMAIN_BASE64_HPP_
29
+ #define PUBLIC_DOMAIN_BASE64_HPP_
30
+
31
+ #include <cstdint>
32
+ #include <iterator>
33
+ #include <stdexcept>
34
+ #include <string>
35
+
36
+ class base64_error : public std::runtime_error
37
+ {
38
+ public:
39
+ using std::runtime_error::runtime_error;
40
+ };
41
+
42
+ class base64
43
+ {
44
+ public:
45
+ enum class alphabet
46
+ {
47
+ /** the alphabet is detected automatically */
48
+ auto_,
49
+ /** the standard base64 alphabet is used */
50
+ standard,
51
+ /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
52
+ url_filename_safe
53
+ };
54
+
55
+ enum class decoding_behavior
56
+ {
57
+ /** if the input is not padded, the remaining bits are ignored */
58
+ moderate,
59
+ /** if a padding character is encounter decoding is finished */
60
+ loose
61
+ };
62
+
63
+ /**
64
+ Encodes all the elements from `in_begin` to `in_end` to `out`.
65
+
66
+ @warning The source and destination cannot overlap. The destination must be able to hold at least
67
+ `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
68
+
69
+ @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
70
+ 8 bits
71
+ @tparam Output_iterator the destination; the elements written to it are from the type `char`
72
+ @param in_begin the beginning of the source
73
+ @param in_end the ending of the source
74
+ @param out the destination iterator
75
+ @param alphabet which alphabet should be used
76
+ @returns the iterator to the next element past the last element copied
77
+ @throws see `Input_iterator` and `Output_iterator`
78
+ */
79
+ template<typename Input_iterator, typename Output_iterator>
80
+ static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
81
+ alphabet alphabet = alphabet::standard)
82
+ {
83
+ constexpr auto pad = '=';
84
+ const char* alpha = alphabet == alphabet::url_filename_safe
85
+ ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
86
+ : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
87
+
88
+ while (in_begin != in_end) {
89
+ std::uint8_t i0 = 0, i1 = 0, i2 = 0;
90
+
91
+ // first character
92
+ i0 = static_cast<std::uint8_t>(*in_begin);
93
+ ++in_begin;
94
+
95
+ *out = alpha[i0 >> 2 & 0x3f];
96
+ ++out;
97
+
98
+ // part of first character and second
99
+ if (in_begin != in_end) {
100
+ i1 = static_cast<std::uint8_t>(*in_begin);
101
+ ++in_begin;
102
+
103
+ *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
104
+ ++out;
105
+ } else {
106
+ *out = alpha[(i0 & 0x3) << 4];
107
+ ++out;
108
+
109
+ // last padding
110
+ *out = pad;
111
+ ++out;
112
+
113
+ // last padding
114
+ *out = pad;
115
+ ++out;
116
+
117
+ break;
118
+ }
119
+
120
+ // part of second character and third
121
+ if (in_begin != in_end) {
122
+ i2 = static_cast<std::uint8_t>(*in_begin);
123
+ ++in_begin;
124
+
125
+ *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
126
+ ++out;
127
+ } else {
128
+ *out = alpha[(i1 & 0xf) << 2];
129
+ ++out;
130
+
131
+ // last padding
132
+ *out = pad;
133
+ ++out;
134
+
135
+ break;
136
+ }
137
+
138
+ // rest of third
139
+ *out = alpha[i2 & 0x3f];
140
+ ++out;
141
+ }
142
+
143
+ return out;
144
+ }
145
+ /**
146
+ Encodes a string.
147
+
148
+ @param str the string that should be encoded
149
+ @param alphabet which alphabet should be used
150
+ @returns the encoded base64 string
151
+ @throws see base64::encode()
152
+ */
153
+ static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
154
+ {
155
+ std::string result;
156
+
157
+ result.reserve(required_encode_size(str.length()) + 1);
158
+
159
+ encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
160
+
161
+ return result;
162
+ }
163
+ /**
164
+ Encodes a char array.
165
+
166
+ @param buffer the char array
167
+ @param size the size of the array
168
+ @param alphabet which alphabet should be used
169
+ @returns the encoded string
170
+ */
171
+ static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
172
+ {
173
+ std::string result;
174
+
175
+ result.reserve(required_encode_size(size) + 1);
176
+
177
+ encode(buffer, buffer + size, std::back_inserter(result), alphabet);
178
+
179
+ return result;
180
+ }
181
+ /**
182
+ Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
183
+ in other words: inplace decoding is possible.
184
+
185
+ @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
186
+ otherwise the behavior depends on the output iterator.
187
+
188
+ @tparam Input_iterator the source; the returned elements are cast to `char`
189
+ @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
190
+ @param in_begin the beginning of the source
191
+ @param in_end the ending of the source
192
+ @param out the destination iterator
193
+ @param alphabet which alphabet should be used
194
+ @param behavior the behavior when an error was detected
195
+ @returns the iterator to the next element past the last element copied
196
+ @throws base64_error depending on the set behavior
197
+ @throws see `Input_iterator` and `Output_iterator`
198
+ */
199
+ template<typename Input_iterator, typename Output_iterator>
200
+ static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
201
+ alphabet alphabet = alphabet::auto_,
202
+ decoding_behavior behavior = decoding_behavior::moderate)
203
+ {
204
+ //constexpr auto pad = '=';
205
+ std::uint8_t last = 0;
206
+ auto bits = 0;
207
+
208
+ while (in_begin != in_end) {
209
+ auto c = *in_begin;
210
+ ++in_begin;
211
+
212
+ if (c == '=') {
213
+ break;
214
+ }
215
+
216
+ auto part = _base64_value(alphabet, c);
217
+
218
+ // enough bits for one byte
219
+ if (bits + 6 >= 8) {
220
+ *out = (last << (8 - bits)) | (part >> (bits - 2));
221
+ ++out;
222
+
223
+ bits -= 2;
224
+ } else {
225
+ bits += 6;
226
+ }
227
+
228
+ last = part;
229
+ }
230
+
231
+ // check padding
232
+ if (behavior != decoding_behavior::loose) {
233
+ while (in_begin != in_end) {
234
+ auto c = *in_begin;
235
+ ++in_begin;
236
+
237
+ if (c != '=') {
238
+ throw base64_error("invalid base64 character.");
239
+ }
240
+ }
241
+ }
242
+
243
+ return out;
244
+ }
245
+ /**
246
+ Decodes a string.
247
+
248
+ @param str the base64 encoded string
249
+ @param alphabet which alphabet should be used
250
+ @param behavior the behavior when an error was detected
251
+ @returns the decoded string
252
+ @throws see base64::decode()
253
+ */
254
+ static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
255
+ decoding_behavior behavior = decoding_behavior::moderate)
256
+ {
257
+ std::string result;
258
+
259
+ result.reserve(max_decode_size(str.length()));
260
+
261
+ decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
262
+
263
+ return result;
264
+ }
265
+ /**
266
+ Decodes a string.
267
+
268
+ @param buffer the base64 encoded buffer
269
+ @param size the size of the buffer
270
+ @param alphabet which alphabet should be used
271
+ @param behavior the behavior when an error was detected
272
+ @returns the decoded string
273
+ @throws see base64::decode()
274
+ */
275
+ static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
276
+ decoding_behavior behavior = decoding_behavior::moderate)
277
+ {
278
+ std::string result;
279
+
280
+ result.reserve(max_decode_size(size));
281
+
282
+ decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
283
+
284
+ return result;
285
+ }
286
+ /**
287
+ Decodes a string inplace.
288
+
289
+ @param[in,out] str the base64 encoded string
290
+ @param alphabet which alphabet should be used
291
+ @param behavior the behavior when an error was detected
292
+ @throws base64::decode_inplace()
293
+ */
294
+ static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
295
+ decoding_behavior behavior = decoding_behavior::moderate)
296
+ {
297
+ str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
298
+ }
299
+ /**
300
+ Decodes a char array inplace.
301
+
302
+ @param[in,out] str the string array
303
+ @param size the length of the array
304
+ @param alphabet which alphabet should be used
305
+ @param behavior the behavior when an error was detected
306
+ @returns the pointer to the next element past the last element decoded
307
+ @throws base64::decode_inplace()
308
+ */
309
+ static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
310
+ decoding_behavior behavior = decoding_behavior::moderate)
311
+ {
312
+ return decode(str, str + size, str, alphabet, behavior);
313
+ }
314
+ /**
315
+ Returns the required decoding size for a given size. The value is calculated with the following formula:
316
+
317
+ $$
318
+ \lceil \frac{size}{4} \rceil \cdot 3
319
+ $$
320
+
321
+ @param size the size of the encoded input
322
+ @returns the size of the resulting decoded buffer; this the absolute maximum
323
+ */
324
+ static std::size_t max_decode_size(std::size_t size) noexcept
325
+ {
326
+ return (size / 4 + (size % 4 ? 1 : 0)) * 3;
327
+ }
328
+ /**
329
+ Returns the required encoding size for a given size. The value is calculated with the following formula:
330
+
331
+ $$
332
+ \lceil \frac{size}{3} \rceil \cdot 4
333
+ $$
334
+
335
+ @param size the size of the decoded input
336
+ @returns the size of the resulting encoded buffer
337
+ */
338
+ static std::size_t required_encode_size(std::size_t size) noexcept
339
+ {
340
+ return (size / 3 + (size % 3 ? 1 : 0)) * 4;
341
+ }
342
+
343
+ private:
344
+ static std::uint8_t _base64_value(alphabet& alphabet, char c)
345
+ {
346
+ if (c >= 'A' && c <= 'Z') {
347
+ return c - 'A';
348
+ } else if (c >= 'a' && c <= 'z') {
349
+ return c - 'a' + 26;
350
+ } else if (c >= '0' && c <= '9') {
351
+ return c - '0' + 52;
352
+ }
353
+
354
+ // comes down to alphabet
355
+ if (alphabet == alphabet::standard) {
356
+ if (c == '+') {
357
+ return 62;
358
+ } else if (c == '/') {
359
+ return 63;
360
+ }
361
+ } else if (alphabet == alphabet::url_filename_safe) {
362
+ if (c == '-') {
363
+ return 62;
364
+ } else if (c == '_') {
365
+ return 63;
366
+ }
367
+ } // auto detect
368
+ else {
369
+ if (c == '+') {
370
+ alphabet = alphabet::standard;
371
+
372
+ return 62;
373
+ } else if (c == '/') {
374
+ alphabet = alphabet::standard;
375
+
376
+ return 63;
377
+ } else if (c == '-') {
378
+ alphabet = alphabet::url_filename_safe;
379
+
380
+ return 62;
381
+ } else if (c == '_') {
382
+ alphabet = alphabet::url_filename_safe;
383
+
384
+ return 63;
385
+ }
386
+ }
387
+
388
+ throw base64_error("invalid base64 character.");
389
+ }
390
+ };
391
+
392
+ #endif // !PUBLIC_DOMAIN_BASE64_HPP_
llama.cpp/common/build-info.cpp.in ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
2
+ char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
3
+ char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
4
+ char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
llama.cpp/common/chat-parser-xml-toolcall.cpp ADDED
@@ -0,0 +1,879 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "chat.h"
2
+ #include "chat-parser.h"
3
+ #include "common.h"
4
+ #include "json-partial.h"
5
+ #include "json-schema-to-grammar.h"
6
+ #include "log.h"
7
+ #include "regex-partial.h"
8
+
9
+ using json = nlohmann::ordered_json;
10
+
11
+ class xml_toolcall_syntax_exception : public std::runtime_error {
12
+ public:
13
+ xml_toolcall_syntax_exception(const std::string & message) : std::runtime_error(message) {}
14
+ };
15
+
16
+ template<typename T>
17
+ inline void sort_uniq(std::vector<T> &vec) {
18
+ std::sort(vec.begin(), vec.end());
19
+ vec.erase(std::unique(vec.begin(), vec.end()), vec.end());
20
+ }
21
+
22
+ template<typename T>
23
+ inline bool all_space(const T &str) {
24
+ return std::all_of(str.begin(), str.end(), [](unsigned char ch) { return std::isspace(ch); });
25
+ }
26
+
27
+ static size_t utf8_truncate_safe(const std::string_view s) {
28
+ size_t len = s.size();
29
+ if (len == 0) return 0;
30
+ size_t i = len;
31
+ for (size_t back = 0; back < 4 && i > 0; ++back) {
32
+ --i;
33
+ unsigned char c = s[i];
34
+ if ((c & 0x80) == 0) {
35
+ return len;
36
+ } else if ((c & 0xC0) == 0xC0) {
37
+ size_t expected_len = 0;
38
+ if ((c & 0xE0) == 0xC0) expected_len = 2;
39
+ else if ((c & 0xF0) == 0xE0) expected_len = 3;
40
+ else if ((c & 0xF8) == 0xF0) expected_len = 4;
41
+ else return i;
42
+ if (len - i >= expected_len) {
43
+ return len;
44
+ } else {
45
+ return i;
46
+ }
47
+ }
48
+ }
49
+ return len - std::min(len, size_t(3));
50
+ }
51
+
52
+ inline void utf8_truncate_safe_resize(std::string &s) {
53
+ s.resize(utf8_truncate_safe(s));
54
+ }
55
+
56
+ inline std::string_view utf8_truncate_safe_view(const std::string_view s) {
57
+ return s.substr(0, utf8_truncate_safe(s));
58
+ }
59
+
60
+ static std::optional<common_chat_msg_parser::find_regex_result> try_find_2_literal_splited_by_spaces(common_chat_msg_parser & builder, const std::string & literal1, const std::string & literal2) {
61
+ if (literal1.size() == 0) return builder.try_find_literal(literal2);
62
+ const auto saved_pos = builder.pos();
63
+ while (auto res = builder.try_find_literal(literal1)) {
64
+ builder.consume_spaces();
65
+ const auto match_len = std::min(literal2.size(), builder.input().size() - builder.pos());
66
+ if (builder.input().compare(builder.pos(), match_len, literal2, 0, match_len) == 0) {
67
+ if (res->prelude.size() != res->groups[0].begin - saved_pos) {
68
+ res->prelude = builder.str({saved_pos, res->groups[0].begin});
69
+ }
70
+ builder.move_to(builder.pos() + match_len);
71
+ res->groups[0].end = builder.pos();
72
+ GGML_ASSERT(res->groups[0].begin != res->groups[0].end);
73
+ return res;
74
+ }
75
+ builder.move_to(res->groups[0].begin + 1);
76
+ }
77
+ builder.move_to(saved_pos);
78
+ return std::nullopt;
79
+ }
80
+
81
/**
 * make a GBNF that accept any strings except those containing any of the forbidden strings.
 *
 * The forbidden strings are organized as a trie (by recursion depth); the resulting expression
 * is a starred alternation where each branch either consumes a byte that cannot start/continue
 * a forbidden string, or consumes a forbidden-string prefix followed by a byte that breaks it.
 */
std::string make_gbnf_excluding(std::vector<std::string> forbids) {
    // Escape a single byte so it is valid inside a GBNF character class [...].
    const auto esc_in_class = [](unsigned char ch) -> std::string {
        switch (ch) {
            case '\\':
            case ']':
            case '^':
            case '-': {
                std::string out = "\\";
                out.push_back((char) ch);
                return out;
            }
            default:
                break;
        }
        if (isprint(ch)) {
            return std::string(1, (char) ch);
        }
        char tmp[16];
        snprintf(tmp, 15, "\\x%02X", ch);
        return std::string(tmp);
    };
    // Recursively build the alternation for forbids[lo, hi) at byte position `depth`.
    // A word whose length equals `depth` has been fully matched, so it contributes no branch.
    const auto build = [esc_in_class](auto self, const std::vector<std::string> & words, int lo, int hi, int depth) -> std::string {
        // Group the (sorted) words by their byte at `depth`.
        std::vector<std::pair<unsigned char, std::pair<int, int>>> groups;
        int i = lo;
        while (i < hi) {
            const std::string & w = words[i];
            if ((int) w.size() == depth) {
                ++i;
                continue;
            }
            const unsigned char c = (unsigned char) w[depth];
            int j = i;
            while (j < hi && (int) words[j].size() > depth &&
                   (unsigned char) words[j][depth] == c) {
                ++j;
            }
            groups.push_back({c, {i, j}});
            i = j;
        }
        std::vector<std::string> branches;
        if (!groups.empty()) {
            // One branch accepts any byte that does not begin a forbidden continuation.
            std::string cls;
            for (const auto & g : groups) cls += esc_in_class(g.first);
            branches.push_back(std::string("[^") + cls + "]");
        }
        for (const auto & g : groups) {
            const std::string sub = self(self, words, g.second.first, g.second.second, depth + 1);
            if (sub.empty()) {
                continue;
            }
            // Quote the branching byte as a GBNF string literal.
            std::string lit = "\"";
            if (g.first == '\\') {
                lit += "\\\\";
            } else if (g.first == '"') {
                lit += "\\\"";
            } else if (isprint(g.first)) {
                lit.push_back((char) g.first);
            } else {
                char tmp[16];
                snprintf(tmp, 15, "\\x%02X", g.first);
                lit += tmp;
            }
            lit += "\"";
            branches.push_back(lit + std::string(" ") + sub);
        }
        if (branches.empty()) {
            return "";
        }
        std::string out = "( ";
        for (size_t k = 0; k < branches.size(); ++k) {
            if (k) out += " | ";
            out += branches[k];
        }
        out += " )";
        return out;
    };
    if (forbids.empty()) return "( . )*";
    std::sort(forbids.begin(), forbids.end());
    std::string expr = build(build, forbids, 0, forbids.size(), 0);
    if (expr.empty()) {
        // Fallback when no expression could be built: forbid the first byte of each word.
        std::string cls;
        for (const auto & w : forbids) if (!w.empty()) cls += esc_in_class((unsigned char) w[0]);
        expr = std::string("( [^") + cls + "] )";
    }
    if (forbids.size() == 1)
        return expr + "*";
    else
        return std::string("( ") + expr + " )*";
}
162
+
163
+ /**
164
+ * Build grammar for xml-style tool call
165
+ * form.scope_start and form.scope_end can be empty.
166
+ * Requires data.format for model-specific hacks.
167
+ */
168
+ void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, const struct xml_tool_call_format & form) {
169
+ GGML_ASSERT(!form.tool_start.empty());
170
+ GGML_ASSERT(!form.tool_sep.empty());
171
+ GGML_ASSERT(!form.key_start.empty());
172
+ GGML_ASSERT(!form.val_end.empty());
173
+ GGML_ASSERT(!form.tool_end.empty());
174
+
175
+ std::string key_val_sep = form.key_val_sep;
176
+ if (form.key_val_sep2) {
177
+ key_val_sep += "\n";
178
+ key_val_sep += *form.key_val_sep2;
179
+ }
180
+ GGML_ASSERT(!key_val_sep.empty());
181
+
182
+ if (tools.is_array() && !tools.empty()) {
183
+ data.grammar = build_grammar([&](const common_grammar_builder &builder) {
184
+ auto string_arg_val = form.last_val_end ?
185
+ builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end, *form.last_val_end})) :
186
+ builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end}));
187
+
188
+ std::vector<std::string> tool_rules;
189
+ for (const auto & tool : tools) {
190
+ if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
191
+ LOG_WRN("Skipping tool without function: %s", tool.dump(2).c_str());
192
+ continue;
193
+ }
194
+ const auto & function = tool.at("function");
195
+ if (!function.contains("name") || !function.at("name").is_string()) {
196
+ LOG_WRN("Skipping invalid function (invalid name): %s", function.dump(2).c_str());
197
+ continue;
198
+ }
199
+ if (!function.contains("parameters") || !function.at("parameters").is_object()) {
200
+ LOG_WRN("Skipping invalid function (invalid parameters): %s", function.dump(2).c_str());
201
+ continue;
202
+ }
203
+ std::string name = function.at("name");
204
+ auto parameters = function.at("parameters");
205
+ builder.resolve_refs(parameters);
206
+
207
+ struct parameter_rule {
208
+ std::string symbol_name;
209
+ bool is_required;
210
+ };
211
+ std::vector<parameter_rule> arg_rules;
212
+ if (!parameters.contains("properties") || !parameters.at("properties").is_object()) {
213
+ LOG_WRN("Skipping invalid function (invalid properties): %s", function.dump(2).c_str());
214
+ continue;
215
+ } else {
216
+ std::vector<std::string> requiredParameters;
217
+ if (parameters.contains("required")) {
218
+ try { parameters.at("required").get_to(requiredParameters); }
219
+ catch (const std::runtime_error&) {
220
+ LOG_WRN("Invalid function required parameters, ignoring: %s", function.at("required").dump(2).c_str());
221
+ }
222
+ }
223
+ sort_uniq(requiredParameters);
224
+ for (const auto & [key, value] : parameters.at("properties").items()) {
225
+ std::string quoted_key = key;
226
+ bool required = std::binary_search(requiredParameters.begin(), requiredParameters.end(), key);
227
+ if (form.key_start.back() == '"' && key_val_sep[0] == '"') {
228
+ quoted_key = gbnf_format_literal(key);
229
+ quoted_key = quoted_key.substr(1, quoted_key.size() - 2);
230
+ }
231
+ arg_rules.push_back(parameter_rule {builder.add_rule("func-" + name + "-kv-" + key,
232
+ gbnf_format_literal(form.key_start) + " " +
233
+ gbnf_format_literal(quoted_key) + " " +
234
+ gbnf_format_literal(key_val_sep) + " " +
235
+ ((value.contains("type") && value["type"].is_string() && value["type"] == "string" && (!form.raw_argval || *form.raw_argval)) ?
236
+ (form.raw_argval ?
237
+ string_arg_val :
238
+ "( " + string_arg_val + " | " + builder.add_schema(name + "-arg-" + key, value) + " )"
239
+ ) :
240
+ builder.add_schema(name + "-arg-" + key, value)
241
+ )
242
+ ), required});
243
+ }
244
+ }
245
+
246
+ auto next_arg_with_sep = builder.add_rule(name + "-last-arg-end", form.last_val_end ? gbnf_format_literal(*form.last_val_end) : gbnf_format_literal(form.val_end));
247
+ decltype(next_arg_with_sep) next_arg = "\"\"";
248
+ for (auto i = arg_rules.size() - 1; /* i >= 0 && */ i < arg_rules.size(); --i) {
249
+ std::string include_this_arg = arg_rules[i].symbol_name + " " + next_arg_with_sep;
250
+ next_arg = builder.add_rule(name + "-arg-after-" + std::to_string(i), arg_rules[i].is_required ?
251
+ include_this_arg : "( " + include_this_arg + " ) | " + next_arg
252
+ );
253
+ include_this_arg = gbnf_format_literal(form.val_end) + " " + include_this_arg;
254
+ next_arg_with_sep = builder.add_rule(name + "-arg-after-" + std::to_string(i) + "-with-sep", arg_rules[i].is_required ?
255
+ include_this_arg : "( " + include_this_arg + " ) | " + next_arg_with_sep
256
+ );
257
+ }
258
+
259
+ std::string quoted_name = name;
260
+ if (form.tool_start.back() == '"' && form.tool_sep[0] == '"') {
261
+ quoted_name = gbnf_format_literal(name);
262
+ quoted_name = quoted_name.substr(1, quoted_name.size() - 2);
263
+ }
264
+ quoted_name = gbnf_format_literal(quoted_name);
265
+ // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
266
+ if (data.format == COMMON_CHAT_FORMAT_KIMI_K2) {
267
+ quoted_name = "\"functions.\" " + quoted_name + " \":\" [0-9]+";
268
+ }
269
+ tool_rules.push_back(builder.add_rule(name + "-call",
270
+ gbnf_format_literal(form.tool_start) + " " +
271
+ quoted_name + " " +
272
+ gbnf_format_literal(form.tool_sep) + " " +
273
+ next_arg
274
+ ));
275
+ }
276
+
277
+ auto tool_call_once = builder.add_rule("root-tool-call-once", string_join(tool_rules, " | "));
278
+ auto tool_call_more = builder.add_rule("root-tool-call-more", gbnf_format_literal(form.tool_end) + " " + tool_call_once);
279
+ auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end));
280
+ auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end);
281
+ builder.add_rule("root",
282
+ (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") +
283
+ tool_call_multiple_with_end + "?" +
284
+ (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end))
285
+ );
286
+ });
287
+
288
+ // grammar trigger for tool call
289
+ data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, form.scope_start + form.tool_start });
290
+ }
291
+ }
292
+
293
/**
 * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
 * Throws xml_toolcall_syntax_exception if there is invalid syntax and cannot recover the original status for common_chat_msg_parser.
 * form.scope_start, form.tool_sep and form.scope_end can be empty.
 *
 * Supports streaming: when the input ends mid-call, a best-effort partial tool call is
 * published via builder.add_tool_call() and common_chat_msg_partial_exception is thrown.
 */
inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form) {
    GGML_ASSERT(!form.tool_start.empty());
    GGML_ASSERT(!form.key_start.empty());
    GGML_ASSERT(!form.key_val_sep.empty());
    GGML_ASSERT(!form.val_end.empty());
    GGML_ASSERT(!form.tool_end.empty());

    // Helper to choose return false or throw error.
    // `recovery` == true means nothing was emitted yet, so rewinding to start_pos is safe.
    constexpr auto return_error = [](common_chat_msg_parser & builder, auto &start_pos, const bool &recovery) {
        LOG_DBG("Failed to parse XML-Style tool call at position: %s\n", gbnf_format_literal(builder.consume_rest().substr(0, 20)).c_str());
        if (recovery) {
            builder.move_to(start_pos);
            return false;
        } else throw xml_toolcall_syntax_exception("Tool call parsing failed with unrecoverable errors. Try using a grammar to constrain the model’s output.");
    };
    // Drop substring from needle to end from a JSON.
    // Only succeeds when everything after the needle is JSON closing syntax (quotes, braces,
    // colons, whitespace), i.e. when truncating there yields a sane partial-JSON prefix.
    constexpr auto partial_json = [](std::string &json_str, std::string_view needle = "XML_TOOL_CALL_PARTIAL_FLAG") {
        auto pos = json_str.rfind(needle);
        if (pos == std::string::npos) {
            return false;
        }
        for (auto i = pos + needle.size(); i < json_str.size(); ++i) {
            unsigned char ch = static_cast<unsigned char>(json_str[i]);
            if (ch != '\'' && ch != '"' && ch != '}' && ch != ':' && !std::isspace(ch)) {
                return false;
            }
        }
        // Also drop the opening quote of the string that contained the needle.
        if (pos != 0 && json_str[pos - 1] == '"') {
            --pos;
        }
        json_str.resize(pos);
        return true;
    };
    // Helper to generate a partial argument JSON:
    // appends the sentinel needle via set_partial_arg, dumps, then truncates at the needle.
    constexpr auto gen_partial_json = [partial_json](auto set_partial_arg, auto &arguments, auto &builder, auto &function_name) {
        auto rest = builder.consume_rest();
        utf8_truncate_safe_resize(rest);
        set_partial_arg(rest, "XML_TOOL_CALL_PARTIAL_FLAG");
        auto tool_str = arguments.dump();
        if (partial_json(tool_str)) {
            if (builder.add_tool_call(function_name, "", tool_str)) {
                return;
            }
        }
        LOG_DBG("Failed to parse partial XML-Style tool call, fallback to non-partial: %s\n", tool_str.c_str());
    };
    // Helper to find a close (because there may be form.last_val_end or form.last_tool_end).
    // Picks whichever of `end` / `alt_end` (the latter followed by end_next or alt_end_next)
    // occurs earliest, and returns {matched terminator length, find result}.
    constexpr auto try_find_close = [](
        common_chat_msg_parser & builder,
        const std::string & end,
        const std::optional<std::string> & alt_end,
        const std::string & end_next,
        const std::optional<std::string> & alt_end_next
    ) {
        auto saved_pos = builder.pos();
        auto tc = builder.try_find_literal(end);
        auto val_end_size = end.size();
        if (alt_end) {
            auto pos_1 = builder.pos();
            builder.move_to(saved_pos);
            auto tc2 = try_find_2_literal_splited_by_spaces(builder, *alt_end, end_next);
            if (alt_end_next) {
                builder.move_to(saved_pos);
                auto tc3 = try_find_2_literal_splited_by_spaces(builder, *alt_end, *alt_end_next);
                if (tc3 && (!tc2 || tc2->prelude.size() > tc3->prelude.size())) {
                    tc2 = tc3;
                }
            }
            // Prefer the alternate terminator only when it matches earlier in the input.
            if (tc2 && (!tc || tc->prelude.size() > tc2->prelude.size())) {
                tc = tc2;
                tc->groups[0].end = std::min(builder.input().size(), tc->groups[0].begin + alt_end->size());
                builder.move_to(tc->groups[0].end);
                val_end_size = alt_end->size();
            } else {
                builder.move_to(pos_1);
            }
        }
        return std::make_pair(val_end_size, tc);
    };
    // Helper to find a val_end or last_val_end, returns matched pattern size
    const auto try_find_val_end = [try_find_close, &builder, &form]() {
        return try_find_close(builder, form.val_end, form.last_val_end, form.tool_end, form.last_tool_end);
    };
    // Helper to find a tool_end or last_tool_end, returns matched pattern size
    const auto try_find_tool_end = [try_find_close, &builder, &form]() {
        return try_find_close(builder, form.tool_end, form.last_tool_end, form.scope_end, std::nullopt);
    };

    bool recovery = true;
    const auto start_pos = builder.pos();
    // Consume the opening scope marker (if it is not pure whitespace).
    if (!all_space(form.scope_start)) {
        if (auto tc = builder.try_find_literal(form.scope_start)) {
            if (all_space(tc->prelude)) {
                if (form.scope_start.size() != tc->groups[0].end - tc->groups[0].begin)
                    throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.scope_start));
            } else {
                builder.move_to(start_pos);
                return false;
            }
        } else return false;
    }
    // One iteration per tool call inside the scope.
    while (auto tc = builder.try_find_literal(form.tool_start)) {
        if (!all_space(tc->prelude)) {
            LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
                gbnf_format_literal(form.tool_start).c_str(),
                gbnf_format_literal(tc->prelude).c_str()
            );
            builder.move_to(tc->groups[0].begin - tc->prelude.size());
            break;
        }

        // Find tool name
        auto func_name = builder.try_find_literal(all_space(form.tool_sep) ? form.key_start : form.tool_sep);
        if (!func_name) {
            // No separator/key found: the call may have no arguments at all.
            auto [sz, tc] = try_find_tool_end();
            func_name = tc;
        }
        if (!func_name) {
            // Partial tool name not supported
            throw common_chat_msg_partial_exception("incomplete tool_call");
        }
        // If the model generate multiple tool call and the first tool call has no argument
        if (func_name->prelude.find(form.tool_end) != std::string::npos || (form.last_tool_end ? func_name->prelude.find(*form.last_tool_end) != std::string::npos : false)) {
            builder.move_to(func_name->groups[0].begin - func_name->prelude.size());
            auto [sz, tc] = try_find_tool_end();
            func_name = tc;
        }

        // Parse tool name
        builder.move_to(all_space(form.tool_sep) ? func_name->groups[0].begin : func_name->groups[0].end);
        std::string function_name = string_strip(func_name->prelude);
        // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
        if (builder.syntax().format == COMMON_CHAT_FORMAT_KIMI_K2) {
            if (string_starts_with(function_name, "functions.")) {
                static const std::regex re(":\\d+$");
                if (std::regex_search(function_name, re)) {
                    function_name = function_name.substr(10, function_name.rfind(":") - 10);
                }
            }
        }

        // Argument JSON
        json arguments = json::object();

        // Helper to generate a partial argument JSON
        const auto gen_partial_args = [&](auto set_partial_arg) {
            gen_partial_json(set_partial_arg, arguments, builder, function_name);
        };

        // Parse all arg_key/arg_value pairs
        while (auto tc = builder.try_find_literal(form.key_start)) {
            if (!all_space(tc->prelude)) {
                LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
                    gbnf_format_literal(form.key_start).c_str(),
                    gbnf_format_literal(tc->prelude).c_str()
                );
                builder.move_to(tc->groups[0].begin - tc->prelude.size());
                break;
            }
            // Truncated key_start at end of input: publish args so far (minus closing brace).
            if (tc->groups[0].end - tc->groups[0].begin != form.key_start.size()) {
                auto tool_call_arg = arguments.dump();
                if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
                    tool_call_arg.resize(tool_call_arg.size() - 1);
                }
                builder.add_tool_call(function_name, "", tool_call_arg);
                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_start));
            }

            // Parse arg_key
            auto key_res = builder.try_find_literal(form.key_val_sep);
            if (!key_res) {
                gen_partial_args([&](auto &rest, auto &needle) {arguments[rest + needle] = "";});
                throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.key_val_sep) + " after " + gbnf_format_literal(form.key_start));
            }
            if (key_res->groups[0].end - key_res->groups[0].begin != form.key_val_sep.size()) {
                gen_partial_args([&](auto &, auto &needle) {arguments[key_res->prelude + needle] = "";});
                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_val_sep));
            }
            auto &key = key_res->prelude;
            // From here on, a partial tool call may have been streamed out: no clean rewind.
            recovery = false;

            // Parse arg_value
            if (form.key_val_sep2) {
                if (auto tc = builder.try_find_literal(*form.key_val_sep2)) {
                    if (!all_space(tc->prelude)) {
                        LOG_DBG("Failed to parse XML-Style tool call: Unexcepted %s between %s and %s\n",
                            gbnf_format_literal(tc->prelude).c_str(),
                            gbnf_format_literal(form.key_val_sep).c_str(),
                            gbnf_format_literal(*form.key_val_sep2).c_str()
                        );
                        return return_error(builder, start_pos, false);
                    }
                    if (tc->groups[0].end - tc->groups[0].begin != form.key_val_sep2->size()) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(*form.key_val_sep2));
                    }
                } else {
                    gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                    throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(*form.key_val_sep2) + " after " + gbnf_format_literal(form.key_val_sep));
                }
            }
            auto val_start = builder.pos();

            // Test if arg_val is a partial JSON
            std::optional<common_json> value_json = std::nullopt;
            if (!form.raw_argval || !*form.raw_argval) {
                try { value_json = builder.try_consume_json(); }
                catch (const std::runtime_error&) { builder.move_to(val_start); }
                // TODO: Delete this when json_partial adds top-level support for null/true/false
                if (builder.pos() == val_start) {
                    const static std::regex number_regex(R"([0-9-][0-9]*(\.\d*)?([eE][+-]?\d*)?)");
                    builder.consume_spaces();
                    std::string_view sv = utf8_truncate_safe_view(builder.input());
                    sv.remove_prefix(builder.pos());
                    // Note reversed argument order: checks whether the (possibly truncated)
                    // remainder is a prefix of "null"/"true"/"false".
                    std::string rest = "a";
                    if (sv.size() < 6) rest = sv;
                    if (string_starts_with("null", rest) || string_starts_with("true", rest) || string_starts_with("false", rest) || std::regex_match(sv.begin(), sv.end(), number_regex)) {
                        // Dummy common_json marking "some JSON scalar was consumed".
                        value_json = {123, {"123", "123"}};
                        builder.consume_rest();
                    } else {
                        builder.move_to(val_start);
                    }
                }
            }

            // If it is a JSON and followed by </arg_value>, parse as json
            // cannot support streaming because it may be a plain text starting with JSON
            if (value_json) {
                auto json_end = builder.pos();
                builder.consume_spaces();
                if (builder.pos() == builder.input().size()) {
                    if (form.raw_argval && !*form.raw_argval && (value_json->json.is_string() || value_json->json.is_object() || value_json->json.is_array())) {
                        arguments[key] = value_json->json;
                        auto json_str = arguments.dump();
                        // Truncate the dump at the healing marker (or drop the closing brace)
                        // so the streamed arguments stay a valid JSON prefix.
                        if (!value_json->healing_marker.json_dump_marker.empty()) {
                            GGML_ASSERT(std::string::npos != json_str.rfind(value_json->healing_marker.json_dump_marker));
                            json_str.resize(json_str.rfind(value_json->healing_marker.json_dump_marker));
                        } else {
                            GGML_ASSERT(json_str.back() == '}');
                            json_str.resize(json_str.size() - 1);
                        }
                        builder.add_tool_call(function_name, "", json_str);
                    } else {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                    }
                    LOG_DBG("Possible JSON arg_value: %s\n", value_json->json.dump().c_str());
                    throw common_chat_msg_partial_exception("JSON arg_value detected. Waiting for more tokens for validations.");
                }
                builder.move_to(json_end);
                auto [val_end_size, tc] = try_find_val_end();
                // Accept the JSON value only when it is complete and directly followed by val_end.
                if (tc && all_space(tc->prelude) && value_json->healing_marker.marker.empty()) {
                    if (tc->groups[0].end - tc->groups[0].begin != val_end_size) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                        LOG_DBG("Possible terminated JSON arg_value: %s\n", value_json->json.dump().c_str());
                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.val_end) + (form.last_val_end ? gbnf_format_literal(*form.last_val_end) : ""));
                    } else arguments[key] = value_json->json;
                } else builder.move_to(val_start);
            }

            // If not, parse as plain text
            if (val_start == builder.pos()) {
                if (auto [val_end_size, value_plain] = try_find_val_end(); value_plain) {
                    auto &value_str = value_plain->prelude;
                    if (form.trim_raw_argval) value_str = string_strip(value_str);
                    if (value_plain->groups[0].end - value_plain->groups[0].begin != val_end_size) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = value_str + needle;});
                        throw common_chat_msg_partial_exception(
                            "Expected " + gbnf_format_literal(form.val_end) +
                            " after " + gbnf_format_literal(form.key_val_sep) +
                            (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
                        );
                    }
                    arguments[key] = value_str;
                } else {
                    if (form.trim_raw_argval) {
                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = string_strip(rest) + needle;});
                    } else {
                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = rest + needle;});
                    }
                    throw common_chat_msg_partial_exception(
                        "Expected " + gbnf_format_literal(form.val_end) +
                        " after " + gbnf_format_literal(form.key_val_sep) +
                        (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
                    );
                }
            }
        }

        // Consume closing tag
        if (auto [tool_end_size, tc] = try_find_tool_end(); tc) {
            if (!all_space(tc->prelude)) {
                LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
                    gbnf_format_literal(form.tool_end).c_str(),
                    gbnf_format_literal(tc->prelude).c_str()
                );
                return return_error(builder, start_pos, recovery);
            }
            if (tc->groups[0].end - tc->groups[0].begin == tool_end_size) {
                // Add the parsed tool call
                if (!builder.add_tool_call(function_name, "", arguments.dump())) {
                    throw common_chat_msg_partial_exception("Failed to add XML-Style tool call");
                }
                recovery = false;
                continue;
            }
        }

        // Truncated tool_end at end of input: publish args so far (minus closing brace).
        auto tool_call_arg = arguments.dump();
        if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
            tool_call_arg.resize(tool_call_arg.size() - 1);
        }
        builder.add_tool_call(function_name, "", tool_call_arg);
        throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.tool_end) + " after " + gbnf_format_literal(form.val_end));
    }
    // Consume the closing scope marker.
    if (auto tc = builder.try_find_literal(form.scope_end)) {
        if (!all_space(tc->prelude)) {
            LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
                gbnf_format_literal(form.scope_end).c_str(),
                gbnf_format_literal(tc->prelude).c_str()
            );
            return return_error(builder, start_pos, recovery);
        }
    } else {
        if (all_space(form.scope_end)) return true;
        builder.consume_spaces();
        if (builder.pos() == builder.input().size())
            throw common_chat_msg_partial_exception("incomplete tool calls");
        LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
            gbnf_format_literal(form.scope_end).c_str(),
            gbnf_format_literal(builder.consume_rest()).c_str()
        );
        return return_error(builder, start_pos, recovery);
    }

    return true;
}
634
+
635
+ /**
636
+ * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
637
+ * May cause std::runtime_error if there is invalid syntax because partial valid tool call is already sent out to client.
638
+ * form.scope_start, form.tool_sep and form.scope_end can be empty.
639
+ */
640
+ bool common_chat_msg_parser::try_consume_xml_tool_calls(const struct xml_tool_call_format & form) {
641
+ auto pos = pos_;
642
+ auto tsize = result_.tool_calls.size();
643
+ try { return parse_xml_tool_calls(*this, form); }
644
+ catch (const xml_toolcall_syntax_exception&) {}
645
+ move_to(pos);
646
+ result_.tool_calls.resize(tsize);
647
+ return false;
648
+ }
649
+
650
/**
 * Parse content uses reasoning and XML-Style tool call
 * Splits the input into reasoning (between start_think/end_think), plain content, and
 * XML-style tool calls, routing each part to the builder according to the reasoning syntax.
 * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
 */
inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>") {
    // Remove trailing whitespace in place.
    constexpr auto rstrip = [](std::string &s) {
        s.resize(std::distance(s.begin(), std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base()));
    };
    // Erase substring from l to r, along with additional spaces nearby.
    // Keeps at most two characters of the surrounding whitespace, rewritten as "\n\n"
    // (unless the erased span starts at the beginning of the string).
    constexpr auto erase_spaces = [](auto &str, size_t l, size_t r) {
        // size_t wrap-around makes --l stop at the string start; see the commented guard.
        while (/* l > -1 && */ --l < str.size() && std::isspace(static_cast<unsigned char>(str[l])));
        ++l;
        while (++r < str.size() && std::isspace(static_cast<unsigned char>(str[r])));
        if (l < r) str[l] = '\n';
        if (l + 1 < r) str[l + 1] = '\n';
        if (l != 0) l += 2;
        str.erase(l, r - l);
        return l;
    };
    // Remove from `content` the longest suffix that is a (possibly partial) prefix of any
    // pattern in `list` — used to avoid streaming out a half-generated keyword.
    constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) {
        auto best_match = content.size();
        for (auto pattern: list) {
            if (pattern.size() == 0) continue;
            for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
                auto match_len = content.size() - match_idx;
                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
                    best_match = match_idx;
                }
            }
        }
        if (content.size() > best_match) {
            content.erase(best_match);
        }
    };
    const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) {
        return trim_suffix(content, {
            start_think, end_think, form.scope_start, form.tool_start, form.tool_sep, form.key_start,
            form.key_val_sep, form.key_val_sep2 ? form.key_val_sep2->c_str() : "",
            form.val_end, form.last_val_end ? form.last_val_end->c_str() : "",
            form.tool_end, form.last_tool_end ? form.last_tool_end->c_str() : "",
            form.scope_end
        });
    };


    // Trim leading spaces without affecting keyword matching
    static const common_regex spaces_regex("\\s*");
    {
        auto tc = builder.consume_regex(spaces_regex);
        auto spaces = builder.str(tc.groups[0]);
        auto s1 = spaces.size();
        trim_potential_partial_word(spaces);
        auto s2 = spaces.size();
        // Re-expose whatever suffix of the whitespace might start a keyword.
        builder.move_to(builder.pos() - (s1 - s2));
    }

    // Parse content
    bool reasoning_unclosed = builder.syntax().thinking_forced_open;
    std::string unclosed_reasoning_content("");
    for (;;) {
        // Look for the next tool-call opening (scope_start + spaces + tool_start).
        auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
        std::string content;
        std::string tool_call_start;

        if (tc) {
            content = std::move(tc->prelude);
            tool_call_start = builder.str(tc->groups[0]);
            LOG_DBG("Matched tool start: %s\n", gbnf_format_literal(tool_call_start).c_str());
        } else {
            content = builder.consume_rest();
            utf8_truncate_safe_resize(content);
        }

        // Handle unclosed think block
        if (reasoning_unclosed) {
            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
                // Still inside <think>: accumulate and keep scanning (the matched tool start
                // is reasoning text too, unless tool calls are allowed inside think).
                unclosed_reasoning_content += content;
                if (!(form.allow_toolcall_in_think && tc)) {
                    unclosed_reasoning_content += tool_call_start;
                    continue;
                }
            } else {
                reasoning_unclosed = false;
                std::string reasoning_content;
                if (pos == std::string::npos) {
                    reasoning_content = std::move(content);
                } else {
                    reasoning_content = content.substr(0, pos);
                    content.erase(0, pos + end_think.size());
                }
                if (builder.pos() == builder.input().size() && all_space(content)) {
                    // End of (partial) input: avoid emitting a trailing partial keyword.
                    rstrip(reasoning_content);
                    trim_potential_partial_word(reasoning_content);
                    rstrip(reasoning_content);
                    if (reasoning_content.empty()) {
                        rstrip(unclosed_reasoning_content);
                        trim_potential_partial_word(unclosed_reasoning_content);
                        rstrip(unclosed_reasoning_content);
                        if (unclosed_reasoning_content.empty()) continue;
                    }
                }
                if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
                    // Reasoning stays inline: re-wrap it with the think markers.
                    builder.add_content(start_think);
                    builder.add_content(unclosed_reasoning_content);
                    builder.add_content(reasoning_content);
                    if (builder.pos() != builder.input().size() || !all_space(content))
                        builder.add_content(end_think);
                } else {
                    builder.add_reasoning_content(unclosed_reasoning_content);
                    builder.add_reasoning_content(reasoning_content);
                }
                unclosed_reasoning_content.clear();
            }
        }

        // Handle multiple think block
        bool toolcall_in_think = false;
        for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
            if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
                if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
                    auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
                    builder.add_reasoning_content(reasoning_content);
                    think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
                } else {
                    think_start = think_end + end_think.size() - 1;
                }
            } else {
                // The matched tool-call start sits inside an unterminated thinking block.
                if (form.allow_toolcall_in_think) {
                    unclosed_reasoning_content = content.substr(think_start + start_think.size());
                } else {
                    unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
                }
                reasoning_unclosed = true;
                content.resize(think_start);
                toolcall_in_think = true;
            }
        }

        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
            rstrip(content);
            // Handle unclosed </think> token from content: delete all </think> token
            if (auto pos = content.rfind(end_think); pos != std::string::npos) {
                while (pos != std::string::npos) {
                    pos = erase_spaces(content, pos, pos + end_think.size() - 1);
                    pos = content.rfind(end_think, pos);
                }
            }
            // Strip if needed
            if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
                content = string_strip(content);
            }
        }

        // remove potential partial suffix
        if (builder.pos() == builder.input().size() && builder.is_partial()) {
            if (unclosed_reasoning_content.empty()) {
                rstrip(content);
                trim_potential_partial_word(content);
                rstrip(content);
            } else {
                rstrip(unclosed_reasoning_content);
                trim_potential_partial_word(unclosed_reasoning_content);
                rstrip(unclosed_reasoning_content);
            }
        }

        // consume unclosed_reasoning_content if allow_toolcall_in_think is set
        if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
                builder.add_reasoning_content(unclosed_reasoning_content);
            } else {
                if (content.empty()) {
                    content = start_think + unclosed_reasoning_content;
                } else {
                    content += "\n\n" + start_think;
                    content += unclosed_reasoning_content;
                }
            }
            unclosed_reasoning_content.clear();
        }

        // Add content
        if (!content.empty()) {
            // If there are multiple content blocks
            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
                builder.add_content("\n\n");
            }
            builder.add_content(content);
        }

        // This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
        if (toolcall_in_think && !form.allow_toolcall_in_think) {
            continue;
        }

        // There is no tool call and all content is parsed
        if (!tc) {
            GGML_ASSERT(builder.pos() == builder.input().size());
            GGML_ASSERT(unclosed_reasoning_content.empty());
            if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
            break;
        }

        // Rewind to the tool-call start and attempt a full parse.
        builder.move_to(tc->groups[0].begin);
        if (builder.try_consume_xml_tool_calls(form)) {
            auto end_of_tool = builder.pos();
            builder.consume_spaces();
            if (builder.pos() != builder.input().size()) {
                builder.move_to(end_of_tool);
                if (!builder.result().content.empty()) {
                    builder.add_content("\n\n");
                }
            }
        } else {
            // Not a valid tool call: emit one character as content and keep scanning.
            static const common_regex next_char_regex(".");
            auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
            rstrip(c);
            builder.add_content(c);
        }
    }
}
873
+
874
/**
 * Parse content uses reasoning and XML-Style tool call
 * Thin public entry point delegating to parse_msg_with_xml_tool_calls().
 */
void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
}
llama.cpp/common/chat-parser-xml-toolcall.h ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "chat.h"
4
+
5
+ #include <nlohmann/json.hpp>
6
+
7
+ #include <optional>
8
+ #include <string>
9
+ #include <vector>
10
+
11
+
12
// Describes the literal delimiters of an XML-style tool-call wire format.
//
// Sample config:
// MiniMax-M2 (left): <minimax:tool_call>\n<invoke name="tool-name">\n<parameter name="key">value</parameter>\n...</invoke>\n...</minimax:tool_call>
// GLM 4.5 (right): <tool_call>function_name\n<arg_key>key</arg_key>\n<arg_value>value</arg_value>\n</tool_call>
struct xml_tool_call_format {
    std::string scope_start;  // <minimax:tool_call>\n  // \n                       // can be empty
    std::string tool_start;   // <invoke name=\"        // <tool_call>
    std::string tool_sep;     // \">\n                  // \n                       // can be empty only for parse_xml_tool_calls
    std::string key_start;    // <parameter name=\"     // <arg_key>
    std::string key_val_sep;  // \">                    // </arg_key>\n<arg_value>
    std::string val_end;      // </parameter>\n         // </arg_value>\n
    std::string tool_end;     // </invoke>\n            // </tool_call>\n
    std::string scope_end;    // </minimax:tool_call>   //                          // can be empty
    // Set this if there can be dynamic spaces inside key_val_sep.
    // e.g. key_val_sep=</arg_key> key_val_sep2=<arg_value> for GLM4.5
    std::optional<std::string> key_val_sep2 = std::nullopt;
    // Set true if argval should only be a raw string, e.g. Hello "world" hi
    // Set false if argval should only be a JSON string, e.g. "Hello \"world\" hi"
    // Defaults to std::nullopt: both forms are allowed.
    std::optional<bool> raw_argval = std::nullopt;
    // Optional delimiter overrides for the final value / final tool of a call
    // (presumably for templates that omit trailing separators — TODO confirm).
    std::optional<std::string> last_val_end = std::nullopt;
    std::optional<std::string> last_tool_end = std::nullopt;
    // Strip surrounding whitespace from raw (non-JSON) argument values.
    bool trim_raw_argval = false;
    // Permit tool calls to appear inside a reasoning/thinking block.
    bool allow_toolcall_in_think = false;
};

// Make a GBNF grammar that accepts any string except those containing any of the forbidden strings.
std::string make_gbnf_excluding(std::vector<std::string> forbids);

/**
 * Build grammar for an XML-style tool call.
 * form.scope_start and form.scope_end can be empty.
 * Requires data.format for model-specific hacks.
 */
void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
llama.cpp/common/chat-parser.cpp ADDED
@@ -0,0 +1,1649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "chat-parser.h"
2
+ #include "chat-peg-parser.h"
3
+ #include "common.h"
4
+ #include "log.h"
5
+ #include "peg-parser.h"
6
+ #include "regex-partial.h"
7
+
8
+ #include <algorithm>
9
+ #include <cctype>
10
+ #include <optional>
11
+ #include <stdexcept>
12
+ #include <string>
13
+ #include <string_view>
14
+ #include <vector>
15
+
16
+ using json = nlohmann::ordered_json;
17
+
18
// Parse a JSON array of tool calls introduced by `prefix` (e.g. "[TOOL_CALLS]").
// If the prefix is absent, the whole remainder becomes plain content.
// rstrip_prefix: number of characters to rewind after the prefix match before
// parsing the JSON (for prefixes that overlap the array's opening bracket).
// Throws common_chat_msg_partial_exception when the array is incomplete.
static void parse_prefixed_json_tool_call_array(common_chat_msg_parser & builder,
                                                const common_regex & prefix,
                                                size_t rstrip_prefix = 0) {
    // Each tool-call object's "arguments" value is re-dumped as a JSON string.
    static const std::vector<std::vector<std::string>> args_paths = { { "arguments" } };
    if (auto res = builder.try_find_regex(prefix)) {
        builder.move_back(rstrip_prefix);
        auto tool_calls = builder.consume_json_with_dumped_args(args_paths);
        if (!builder.add_tool_calls(tool_calls.value) || tool_calls.is_partial) {
            throw common_chat_msg_partial_exception("incomplete tool call array");
        }
    } else {
        builder.add_content(builder.consume_rest());
    }
}
32
+
33
+ static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
34
+ std::string arguments;
35
+ if (builder.is_partial()) {
36
+ arguments = (json{
37
+ { "code", code + builder.healing_marker() }
38
+ })
39
+ .dump();
40
+ auto idx = arguments.find(builder.healing_marker());
41
+ if (idx != std::string::npos) {
42
+ arguments.resize(idx);
43
+ }
44
+ } else {
45
+ arguments = (json{
46
+ { "code", code }
47
+ })
48
+ .dump();
49
+ }
50
+ return arguments;
51
+ }
52
+
53
/**
 * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
 * Aggregates the prefix, suffix and in-between text into the content.
 *
 * block_open / block_close: optional delimiters wrapping the whole tool-call block.
 * function_regex_start_only: anchored variant used only for the first call.
 * allow_raw_python: accept a raw (non-JSON) code body for the "python" tool.
 * get_function_name: optional override to extract the name from a match;
 *   returning "" skips the match and treats it as content.
 */
static void parse_json_tool_calls(
    common_chat_msg_parser & builder,
    const std::optional<common_regex> & block_open,
    const std::optional<common_regex> & function_regex_start_only,
    const std::optional<common_regex> & function_regex,
    const common_regex & close_regex,
    const std::optional<common_regex> & block_close,
    bool allow_raw_python = false,
    const std::function<std::string(const common_chat_msg_parser::find_regex_result & fres)> & get_function_name =
        nullptr) {
    auto parse_tool_calls = [&]() {
        size_t from = std::string::npos;  // search offset when a match was skipped
        auto first = true;
        while (true) {
            auto start_pos = builder.pos();
            // First call may use the anchored regex; later calls scan forward.
            auto res = function_regex_start_only && first ? builder.try_consume_regex(*function_regex_start_only) :
                       function_regex ? builder.try_find_regex(*function_regex, from) :
                       std::nullopt;

            if (res) {
                std::string name;
                if (get_function_name) {
                    name = get_function_name(*res);
                } else {
                    GGML_ASSERT(res->groups.size() == 2);
                    name = builder.str(res->groups[1]);
                }
                first = false;
                if (name.empty()) {
                    // get_function_name signalled us that we should skip this match and treat it as content.
                    from = res->groups[0].begin + 1;
                    continue;
                }
                from = std::string::npos;

                auto maybe_raw_python = name == "python" && allow_raw_python;
                if (builder.input()[builder.pos()] == '{' || !maybe_raw_python) {
                    // JSON arguments follow the function name.
                    if (auto arguments = builder.try_consume_json_with_dumped_args({ {} })) {
                        if (!builder.add_tool_call(name, "", arguments->value) || arguments->is_partial) {
                            throw common_chat_msg_partial_exception("incomplete tool call");
                        }
                        builder.consume_regex(close_regex);
                    }
                    continue;
                }
                if (maybe_raw_python) {
                    // Raw python body: the rest of the input is the code.
                    auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
                    if (!builder.add_tool_call(name, "", arguments)) {
                        throw common_chat_msg_partial_exception("incomplete tool call");
                    }
                    return;
                }
                throw common_chat_msg_partial_exception("incomplete tool call");
            } else {
                builder.move_to(start_pos);
            }
            break;
        }
        if (block_close) {
            builder.consume_regex(*block_close);
        }
        builder.consume_spaces();
        builder.add_content(builder.consume_rest());
    };
    if (block_open) {
        if (auto res = builder.try_find_regex(*block_open)) {
            parse_tool_calls();
        } else {
            builder.add_content(builder.consume_rest());
        }
    } else {
        parse_tool_calls();
    }
}
131
+
132
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax)
    : input_(input), is_partial_(is_partial), syntax_(syntax)
{
    result_.role = "assistant";

    // Pick a "healing marker": a random digit string that does not occur in
    // the input. It is used to close truncated JSON so the cut point can be
    // located again after dumping. Uniqueness vs. the input is what matters,
    // so unseeded std::rand() suffices here.
    while (true) {
        std::string id = std::to_string(std::rand());
        if (input.find(id) == std::string::npos) {
            healing_marker_ = id;
            break;
        }
    }
}
145
+
146
// Return the substring of the input covered by `rng` (half-open [begin, end)).
std::string common_chat_msg_parser::str(const common_string_range & rng) const {
    GGML_ASSERT(rng.begin <= rng.end);
    return input_.substr(rng.begin, rng.end - rng.begin);
}
150
+
151
// Append text to the accumulated message content.
void common_chat_msg_parser::add_content(const std::string &content) {
    result_.content += content;
}

// Append text to the accumulated reasoning (thinking) content.
void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
    result_.reasoning_content += reasoning_content;
}
158
+
159
+ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
160
+ if (name.empty()) {
161
+ return false;
162
+ }
163
+
164
+ common_chat_tool_call tool_call;
165
+ tool_call.name = name;
166
+ tool_call.arguments = arguments;
167
+ tool_call.id = id;
168
+
169
+ // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
170
+ result_.tool_calls.emplace_back(tool_call);
171
+
172
+ return true;
173
+ }
174
+ bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
175
+ std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
176
+ std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
177
+ std::string arguments = "";
178
+ if (tool_call.contains("arguments")) {
179
+ if (tool_call.at("arguments").is_object()) {
180
+ arguments = tool_call.at("arguments").dump();
181
+ } else {
182
+ arguments = tool_call.at("arguments");
183
+ }
184
+ }
185
+
186
+ return add_tool_call(name, id, arguments);
187
+ }
188
+
189
+ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
190
+ for (const auto & item : arr) {
191
+ if (!add_tool_call(item)) {
192
+ return false;
193
+ }
194
+ }
195
+ return true;
196
+ }
197
+
198
+ bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
199
+ if (!tool_call.is_object() || tool_call.size() != 1) {
200
+ return false;
201
+ }
202
+
203
+ // Get the tool name (the single key in the object)
204
+ auto it = tool_call.begin();
205
+ std::string name = it.key();
206
+
207
+ if (name.empty()) {
208
+ return false;
209
+ }
210
+
211
+ // Get the arguments (the nested object)
212
+ const json & args_json = it.value();
213
+ std::string arguments = "";
214
+
215
+ if (args_json.is_object()) {
216
+ arguments = args_json.dump();
217
+ } else if (args_json.is_string()) {
218
+ arguments = args_json;
219
+ } else if (!args_json.is_null()) {
220
+ // For other types, convert to string representation
221
+ arguments = args_json.dump();
222
+ }
223
+
224
+ return add_tool_call(name, "", arguments);
225
+ }
226
// Assert that the entire input was consumed. Only enforced for complete
// (non-partial) inputs: leftover text then indicates malformed output or a
// parser bug.
void common_chat_msg_parser::finish() {
    if (!is_partial_ && pos_ != input_.size()) {
        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
    }
}
231
+
232
+ bool common_chat_msg_parser::consume_spaces() {
233
+ const auto length = input_.size();
234
+ auto consumed = false;
235
+ while (pos_ < length && std::isspace(input_[pos_])) {
236
+ ++pos_;
237
+ consumed = true;
238
+ }
239
+ return consumed;
240
+ }
241
+
242
+ bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
243
+ auto pos = pos_;
244
+ for (auto i = 0u; i < literal.size(); ++i) {
245
+ if (pos >= input_.size()) {
246
+ return false;
247
+ }
248
+ if (input_[pos] != literal[i]) {
249
+ return false;
250
+ }
251
+ ++pos;
252
+ }
253
+ pos_ = pos;
254
+ return true;
255
+ }
256
+
257
// Find `literal` at or after the current position. On a full match, returns
// the prelude (text between pos_ and the match) plus the match range and
// advances past it. In partial mode, a trailing prefix of the literal at the
// end of the input is also accepted; the match range then extends to the end.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
    auto idx = input_.find(literal, pos_);
    if (idx != std::string::npos) {
        find_regex_result res;
        res.prelude = input_.substr(pos_, idx - pos_);
        auto end = idx + literal.size();
        res.groups.emplace_back(common_string_range{idx, end});
        move_to(end);
        return res;
    }
    if (is_partial_) {
        // The input may have been cut mid-literal: look for a partial stop.
        idx = string_find_partial_stop(input_, literal);
        if (idx != std::string::npos && idx >= pos_) {
            find_regex_result res;
            res.prelude = input_.substr(pos_, idx - pos_);
            auto end = input_.size();
            res.groups.emplace_back(common_string_range{idx, end});
            move_to(end);
            return res;
        }
    }
    return std::nullopt;
}
280
+
281
+ void common_chat_msg_parser::consume_literal(const std::string & literal) {
282
+ if (!try_consume_literal(literal)) {
283
+ throw common_chat_msg_partial_exception(literal);
284
+ }
285
+ }
286
+
287
// Parse one or more reasoning sections delimited by start_think/end_think
// (e.g. "<think>"/"</think>") starting at the current position.
// Returns true if reasoning was handled and pos_ advanced; false if there is
// no reasoning here (state is rolled back). Behavior is shaped by syntax_:
// reasoning_in_content re-emits the tags into content; thinking_forced_open
// treats the input as already inside an open think block.
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
    std::string pending_reasoning_prefix;

    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
        return false;
    }

    auto set_reasoning_prefix = [&](size_t prefix_pos) {
        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
            return;
        }
        if (prefix_pos + start_think.size() > input_.size()) {
            pending_reasoning_prefix.clear();
            return;
        }
        // Capture the exact literal that opened the reasoning section so we can
        // surface it back to callers. This ensures formats that force the
        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
        // instead of dropping it during parsing.
        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
    };

    // Emit one stripped reasoning chunk either into content (with tags) or
    // into reasoning_content, depending on syntax_.reasoning_in_content.
    auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
        auto stripped_reasoning = string_strip(reasoning);
        if (stripped_reasoning.empty()) {
            return;
        }
        if (syntax_.reasoning_in_content) {
            add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
            add_content(stripped_reasoning);
            if (closed) {
                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
            }
        } else {
            if (!pending_reasoning_prefix.empty()) {
                add_reasoning_content(pending_reasoning_prefix);
                pending_reasoning_prefix.clear();
            }
            add_reasoning_content(stripped_reasoning);
        }
    };

    // Snapshot so we can roll back if no reasoning is found.
    const size_t saved_pos = pos_;
    const size_t saved_content_size = result_.content.size();
    const size_t saved_reasoning_size = result_.reasoning_content.size();

    auto restore_state = [&]() {
        move_to(saved_pos);
        result_.content.resize(saved_content_size);
        result_.reasoning_content.resize(saved_reasoning_size);
    };

    // Allow leading whitespace to be preserved as content when reasoning is present at the start
    size_t cursor = pos_;
    size_t whitespace_end = cursor;
    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
        ++whitespace_end;
    }

    if (whitespace_end >= input_.size()) {
        // Input is all whitespace. Under a forced-open think block, treat it
        // as (empty-ish) reasoning; otherwise there is nothing to parse.
        restore_state();
        if (syntax_.thinking_forced_open) {
            auto rest = input_.substr(saved_pos);
            if (!rest.empty()) {
                handle_reasoning(rest, /* closed */ !is_partial());
            }
            move_to(input_.size());
            return true;
        }
        return false;
    }

    cursor = whitespace_end;
    const size_t remaining = input_.size() - cursor;
    const size_t start_prefix = std::min(start_think.size(), remaining);
    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;

    if (has_start_tag && start_prefix < start_think.size()) {
        // Input ends inside a partial start tag: consume everything, wait for more.
        move_to(input_.size());
        return true;
    }

    if (has_start_tag) {
        if (whitespace_end > pos_) {
            add_content(input_.substr(pos_, whitespace_end - pos_));
        }
        set_reasoning_prefix(cursor);
        cursor += start_think.size();
    } else if (syntax_.thinking_forced_open) {
        cursor = whitespace_end;
    } else {
        restore_state();
        return false;
    }
    // Consume reasoning sections, possibly several in a row.
    while (true) {
        if (cursor >= input_.size()) {
            move_to(input_.size());
            return true;
        }

        size_t end_pos = input_.find(end_think, cursor);
        if (end_pos == std::string::npos) {
            // No closing tag; check whether the input ends in a partial one.
            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
            if (reasoning_end > cursor) {
                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
            }
            move_to(input_.size());
            return true;
        }

        if (end_pos > cursor) {
            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
        } else {
            handle_reasoning("", /* closed */ true);
        }

        cursor = end_pos + end_think.size();

        // Skip whitespace between a closing tag and a possible next opening tag.
        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
            ++cursor;
        }

        const size_t next_remaining = input_.size() - cursor;
        if (next_remaining == 0) {
            move_to(cursor);
            return true;
        }

        const size_t next_prefix = std::min(start_think.size(), next_remaining);
        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
            if (next_prefix < start_think.size()) {
                // Partial next start tag at end of input.
                move_to(input_.size());
                return true;
            }
            set_reasoning_prefix(cursor);
            cursor += start_think.size();
            continue;
        }

        move_to(cursor);
        return true;
    }
}
432
+
433
+ std::string common_chat_msg_parser::consume_rest() {
434
+ auto rest = input_.substr(pos_);
435
+ pos_ = input_.size();
436
+ return rest;
437
+ }
438
+
439
// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
// `from` overrides the search start (defaults to pos_). A partial match throws
// when more input may arrive (is_partial), otherwise counts as no match.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
    auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
        return std::nullopt;
    }
    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
    pos_ = m.groups[0].end;

    if (add_prelude_to_content) {
        add_content(prelude);
    }
    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
        if (is_partial()) {
            throw common_chat_msg_partial_exception(regex.str());
        }
        return std::nullopt;
    }
    return find_regex_result{prelude, m.groups};
}
459
+
460
// Consume `regex` anchored at the current position, or signal a partial parse.
common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
    if (auto result = try_consume_regex(regex)) {
        return *result;
    }
    throw common_chat_msg_partial_exception(regex.str());
}

// Try to consume `regex` anchored exactly at the current position.
// Returns std::nullopt on no match (or a match elsewhere); throws on a
// partial match when more input may still arrive.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
    auto m = regex.search(input_, pos_);
    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
        return std::nullopt;
    }
    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
        if (is_partial()) {
            throw common_chat_msg_partial_exception(regex.str());
        }
        return std::nullopt;
    }
    if (m.groups[0].begin != pos_) {
        // Didn't match at the current position.
        return std::nullopt;
    }
    pos_ = m.groups[0].end;

    return find_regex_result {
        /* .prelude = */ "",
        m.groups,
    };
}
489
+
490
// Parse one JSON value starting at the current position. Truncated JSON is
// "healed" by common_json_parse using healing_marker_; the result records
// where healing occurred. Returns std::nullopt when no JSON value starts
// here; throws when healing was needed but the input is supposedly complete.
std::optional<common_json> common_chat_msg_parser::try_consume_json() {
    auto it = input_.cbegin() + pos_;
    const auto end = input_.cend();
    common_json result;
    if (!common_json_parse(it, end, healing_marker_, result)) {
        return std::nullopt;
    }
    pos_ = std::distance(input_.cbegin(), it);
    if (result.healing_marker.marker.empty()) {
        // No healing marker, just return the parsed json
        return result;
    }
    if (!is_partial()) {
        throw common_chat_msg_partial_exception("JSON");
    }
    return result;
}
507
+
508
+ common_json common_chat_msg_parser::consume_json() {
509
+ if (auto result = try_consume_json()) {
510
+ return *result;
511
+ }
512
+ throw common_chat_msg_partial_exception("JSON");
513
+ }
514
+
515
+ common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
516
+ const std::vector<std::vector<std::string>> & args_paths,
517
+ const std::vector<std::vector<std::string>> & content_paths
518
+ ) {
519
+ if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
520
+ return *result;
521
+ }
522
+ throw common_chat_msg_partial_exception("JSON");
523
+ }
524
+
525
// Parse a JSON value and re-dump the sub-values at `args_paths` as strings
// (tool-call arguments travel as serialized JSON text). `content_paths`
// marks string fields whose healed tail must be stripped. The walk removes
// any sub-value touched by the healing marker that is not at one of those
// paths. Returns std::nullopt if no JSON starts here; the result's
// is_partial flag reports whether a healing marker was found.
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
    const std::vector<std::vector<std::string>> & args_paths,
    const std::vector<std::vector<std::string>> & content_paths
) {
    auto partial = try_consume_json();
    if (!partial) {
        return std::nullopt;
    }
    auto is_arguments_path = [&](const std::vector<std::string> & path) {
        return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
    };
    auto is_content_path = [&](const std::vector<std::string> & path) {
        return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
    };

    if (partial->healing_marker.marker.empty()) {
        if (args_paths.empty()) {
            // No arguments to dump, and JSON was parsed fully.
            return consume_json_result {
                partial->json,
                /* .is_partial = */ false,
            };
        }
        if (is_arguments_path({})) {
            // Entire JSON is the arguments and was parsed fully.
            return consume_json_result {
                partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
                /* .is_partial = */ false,
            };
        }
    }

    LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());

    auto found_healing_marker = false;
    std::vector<std::string> path;  // current path during the recursive walk
    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
        if (is_arguments_path(path)) {
            // Arguments subtree: dump as a string, truncated at the marker.
            auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
            if (is_partial() && !partial->healing_marker.marker.empty()) {
                auto idx = arguments.find(partial->healing_marker.json_dump_marker);
                if (idx != std::string::npos) {
                    arguments.resize(idx);
                    found_healing_marker = true;
                }
                if (arguments == "\"") {
                    // This happens because of completing `:"$magic` after `"arguments"`
                    arguments = "";
                }
            }
            return arguments;
        }
        if (is_content_path(path)) {
            if (!j.is_string()) {
                throw std::runtime_error("Content path must be a string");
            }
            std::string str = j;
            auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
            if (idx != std::string::npos) {
                str.resize(idx);
                found_healing_marker = true;
            }
            return str;
        }
        if (j.is_object()) {
            auto obj = json::object();
            for (const auto & p : j.items()) {
                const auto & key = p.key();
                const auto & value = p.value();
                const std::string key_str = key; // NOLINT
                auto idx = key_str.find(healing_marker_);
                if (idx != std::string::npos) {
                    // Healing hit a key: drop this and all later entries.
                    found_healing_marker = true;
                    break;
                }
                path.push_back(key_str);
                if (value.is_string()) {
                    const std::string value_str = value;
                    if (value_str.find(healing_marker_) != std::string::npos) {
                        found_healing_marker = true;
                        if (is_content_path(path)) {
                            if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
                                // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
                                obj[key] = remove_unsupported_healings_and_dump_args(value);
                            }
                        }
                        break;
                    }
                    obj[key] = value;
                } else {
                    obj[key] = remove_unsupported_healings_and_dump_args(value);
                }
                path.pop_back();
            }
            return obj;
        }
        if (j.is_array()) {
            auto arr = json::array();
            for (const auto & value : j) {
                if (value.is_string()) {
                    std::string str = value;
                    auto idx = str.find(healing_marker_);
                    if (idx != std::string::npos) {
                        // Don't heal array values that aren't in the arguments.
                        found_healing_marker = true;
                        break;
                    }
                }
                arr.push_back(remove_unsupported_healings_and_dump_args(value));
            }
            return arr;
        }
        return j;
    };

    auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
    LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
    return consume_json_result {
        cleaned,
        /* .is_partial = */ found_healing_marker,
    };
}
647
+
648
// Remove all tool calls collected in the result so far.
void common_chat_msg_parser::clear_tools() {
    result_.tool_calls.clear();
}
651
+
652
+ /**
653
+ * All common_chat_parse_* moved from chat.cpp to chat-parser.cpp below
654
+ * to reduce incremental compile time for parser changes.
655
+ */
656
// Generic format: the whole message is one JSON object containing either
// "tool_calls" (array), "tool_call" (single object) or "response" (content).
static void common_chat_parse_generic(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }
    static const std::vector<std::vector<std::string>> content_paths = {
        {"response"},
    };
    static const std::vector<std::vector<std::string>> args_paths = {
        {"tool_call", "arguments"},
        {"tool_calls", "arguments"},
    };
    auto data = builder.consume_json_with_dumped_args(args_paths, content_paths);
    if (data.value.contains("tool_calls")) {
        if (!builder.add_tool_calls(data.value.at("tool_calls")) || data.is_partial) {
            throw common_chat_msg_partial_exception("incomplete tool calls");
        }
    } else if (data.value.contains("tool_call")) {
        if (!builder.add_tool_call(data.value.at("tool_call")) || data.is_partial) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
    } else if (data.value.contains("response")) {
        const auto & response = data.value.at("response");
        // Non-string responses are pretty-printed as JSON content.
        builder.add_content(response.is_string() ? response.template get<std::string>() : response.dump(2));
        if (data.is_partial) {
            throw common_chat_msg_partial_exception("incomplete response");
        }
    } else {
        throw common_chat_msg_partial_exception("Expected 'tool_call', 'tool_calls' or 'response' in JSON");
    }
}
687
+
688
// Mistral Nemo: tool calls are a JSON array prefixed by the literal "[TOOL_CALLS]".
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
    parse_prefixed_json_tool_call_array(builder, prefix);
}
697
+
698
// Magistral: optional [THINK]...[/THINK] reasoning block, then tool calls as
// a JSON array prefixed by "[TOOL_CALLS]".
static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("[THINK]", "[/THINK]");

    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
    parse_prefixed_json_tool_call_array(builder, prefix);
}
709
+
710
// Command R7B: reasoning in <|START_THINKING|>...<|END_THINKING|>, tool calls
// as a JSON array in <|START_ACTION|>...<|END_ACTION|> (objects with
// "tool_name" / "tool_call_id" / "parameters"), or plain content in
// <|START_RESPONSE|>...<|END_RESPONSE|>.
static void common_chat_parse_command_r7b(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>");

    static const common_regex start_action_regex("<\\|START_ACTION\\|>");
    static const common_regex end_action_regex("<\\|END_ACTION\\|>");
    static const common_regex start_response_regex("<\\|START_RESPONSE\\|>");
    static const common_regex end_response_regex("<\\|END_RESPONSE\\|>");

    if (auto res = builder.try_find_regex(start_action_regex)) {
        // If we didn't extract thoughts, prelude includes them.
        auto tool_calls = builder.consume_json_with_dumped_args({{"parameters"}});
        for (const auto & tool_call : tool_calls.value) {
            std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : "";
            std::string id = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : "";
            std::string arguments = tool_call.contains("parameters") ? tool_call.at("parameters") : "";
            if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) {
                throw common_chat_msg_partial_exception("incomplete tool call");
            }
        }
        if (tool_calls.is_partial) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
        builder.consume_regex(end_action_regex);
    } else if (auto res = builder.try_find_regex(start_response_regex)) {
        if (!builder.try_find_regex(end_response_regex)) {
            builder.add_content(builder.consume_rest());
            throw common_chat_msg_partial_exception(end_response_regex.str());
        }
    } else {
        builder.add_content(builder.consume_rest());
    }
}
742
+
743
// Llama 3.x: optional <think> reasoning, then JSON tool calls of the form
// {"name": "...", "parameters": {...}}. With builtin tools enabled, also accepts
// the <|python_tag|>NAME.call(arg=value, ...) builtin-tool syntax.
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
    builder.try_parse_reasoning("<think>", "</think>");

    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Matches the JSON tool-call header up to (and including) the "parameters" key,
    // capturing the function name in group 1; the JSON value follows.
    static const common_regex function_regex(
        "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
    static const common_regex close_regex("\\}\\s*");

    // Builtin-tool syntax: NAME.call( followed by arg=value pairs.
    static const common_regex function_name_regex("\\s*(\\w+)\\s*\\.\\s*call\\(");
    static const common_regex arg_name_regex("\\s*(\\w+)\\s*=\\s*");

    if (with_builtin_tools) {
        static const common_regex builtin_call_regex("<\\|python_tag\\|>");
        if (auto res = builder.try_find_regex(builtin_call_regex)) {
            auto fun_res = builder.consume_regex(function_name_regex);
            auto function_name = builder.str(fun_res.groups[1]);

            common_healing_marker healing_marker;
            json args = json::object();
            // Collect comma-separated "name=<json>" argument pairs until no more match.
            while (true) {
                if (auto arg_res = builder.try_consume_regex(arg_name_regex)) {
                    auto arg_name = builder.str(arg_res->groups[1]);
                    auto partial = builder.consume_json();
                    args[arg_name] = partial.json;
                    // Track the healing marker of the most recent (possibly partial) JSON value.
                    healing_marker.marker = partial.healing_marker.marker;
                    healing_marker.json_dump_marker = partial.healing_marker.json_dump_marker;
                    builder.consume_spaces();
                    if (!builder.try_consume_literal(",")) {
                        break;
                    }
                } else {
                    break;
                }
            }
            builder.consume_literal(")");
            builder.consume_spaces();

            auto arguments = args.dump();
            if (!builder.add_tool_call(function_name, "", arguments)) {
                throw common_chat_msg_partial_exception("Incomplete tool call");
            }
            return;
        }
    }
    // Plain JSON tool-call form; only matched at the start of the remaining input.
    parse_json_tool_calls(
        builder,
        /* block_open= */ std::nullopt,
        /* function_regex_start_only= */ function_regex,
        /* function_regex= */ std::nullopt,
        close_regex,
        std::nullopt);

}
800
+
801
// DeepSeek R1: <think>...</think> reasoning, then tool calls wrapped in
// tool-calls begin/end tokens with a ```json fenced argument body.
static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<think>", "</think>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // NOTE(review): the "<|" / "|>" below are unescaped in the regex, so ASCII '|'
    // acts as alternation rather than a literal. The actual DeepSeek special tokens
    // use fullwidth '｜' (U+FF5C) — verify these literals survived copy/encoding intact.
    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
    static const common_regex tool_calls_end("<|tool▁calls▁end|>");
    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?function<|tool▁sep|>([^\n]+)\n```json\n");
    static const common_regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");

    parse_json_tool_calls(
        builder,
        /* block_open= */ tool_calls_begin,
        /* function_regex_start_only= */ std::nullopt,
        function_regex,
        close_regex,
        tool_calls_end);
}
821
+
822
// DeepSeek V3.1 content section: tool calls use NAME<|tool▁sep|>JSON delimited by
// tool-call begin/end tokens (no ```json fence, unlike R1).
static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
    // NOTE(review): as in the R1 parser above, ASCII '|' inside "<|...|>" is unescaped
    // and acts as regex alternation; upstream tokens use fullwidth '｜' — confirm.
    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");

    static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
    static const common_regex tool_calls_end("<|tool▁calls▁end|>");

    if (!builder.syntax().parse_tool_calls) {
        LOG_DBG("%s: not parse_tool_calls\n", __func__);
        builder.add_content(builder.consume_rest());
        return;
    }

    LOG_DBG("%s: parse_tool_calls\n", __func__);

    parse_json_tool_calls(
        builder,
        /* block_open= */ tool_calls_begin,
        /* function_regex_start_only= */ std::nullopt,
        function_regex,
        close_regex,
        tool_calls_end);
}
845
+
846
static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
    // First try to parse using the standard reasoning parsing method
    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());

    // Probe for a closing </think> without consuming input, then rewind.
    auto start_pos = builder.pos();
    auto found_end_think = builder.try_find_literal("</think>");
    builder.move_to(start_pos);

    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
        // Complete message with forced-open thinking but no </think>: treat as content.
        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
        common_chat_parse_deepseek_v3_1_content(builder);
    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
        // If reasoning was parsed successfully, the remaining content is regular content
        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
        // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
        common_chat_parse_deepseek_v3_1_content(builder);
    } else {
        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
            common_chat_parse_deepseek_v3_1_content(builder);
            return;
        }
        // If no reasoning tags found, check if we should treat everything as reasoning
        if (builder.syntax().thinking_forced_open) {
            // If thinking is forced open but no tags found, treat everything as reasoning
            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
            builder.add_reasoning_content(builder.consume_rest());
        } else {
            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
            // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
            common_chat_parse_deepseek_v3_1_content(builder);
        }
    }
}
881
+
882
+ static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
883
+ static const xml_tool_call_format form {
884
+ /* form.scope_start = */ "<minimax:tool_call>",
885
+ /* form.tool_start = */ "<invoke name=\"",
886
+ /* form.tool_sep = */ "\">",
887
+ /* form.key_start = */ "<parameter name=\"",
888
+ /* form.key_val_sep = */ "\">",
889
+ /* form.val_end = */ "</parameter>",
890
+ /* form.tool_end = */ "</invoke>",
891
+ /* form.scope_end = */ "</minimax:tool_call>",
892
+ };
893
+ builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
894
+ }
895
+
896
// Kimi K2: tool-call sections use special tokens and a JSON-ish object body,
//   <|tool_calls_section_begin|><|tool_call_begin|>NAME<|tool_call_argument_begin|>{"k":v,...}<|tool_call_end|><|tool_calls_section_end|>
// Argument values are JSON (raw_argval=false); tool calls may appear inside <think>.
static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
    static const xml_tool_call_format form = ([]() {
        xml_tool_call_format form {};
        form.scope_start = "<|tool_calls_section_begin|>";
        form.tool_start = "<|tool_call_begin|>";
        form.tool_sep = "<|tool_call_argument_begin|>{";
        form.key_start = "\"";
        form.key_val_sep = "\":";
        form.val_end = ",";
        form.tool_end = "}<|tool_call_end|>";
        form.scope_end = "<|tool_calls_section_end|>";
        form.raw_argval = false;
        form.last_val_end = "";
        form.allow_toolcall_in_think = true;
        return form;
    })();
    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
}
914
+
915
// Apriel 1.5: tool calls as a JSON-array-shaped text format,
//   <tool_calls>[{"name": "NAME", "arguments": {"k": v, ...}}, ...]</tool_calls>
// mapped onto the generic XML tool-call scanner; reasoning uses <thinking> tags.
static void common_chat_parse_apriel_1_5(common_chat_msg_parser & builder) {
    static const xml_tool_call_format form = ([]() {
        xml_tool_call_format form {};
        form.scope_start = "<tool_calls>[";
        form.tool_start = "{\"name\": \"";
        form.tool_sep = "\", \"arguments\": {";
        form.key_start = "\"";
        form.key_val_sep = "\": ";
        form.val_end = ", ";
        form.tool_end = "}, ";
        form.scope_end = "]</tool_calls>";
        form.raw_argval = false;
        form.last_val_end = "";
        // The final tool object closes with "}" only (no trailing ", ").
        form.last_tool_end = "}";
        return form;
    })();
    builder.consume_reasoning_with_xml_tool_calls(form, "<thinking>", "</thinking>");
}
933
+
934
// Xiaomi MiMo: per-call <tool_call> blocks containing a JSON object,
//   <tool_call>\n{"name": "NAME", "arguments": {"k": v, ...}}\n</tool_call>
// with no outer scope markers and no reasoning tags.
static void common_chat_parse_xiaomi_mimo(common_chat_msg_parser & builder) {
    static const xml_tool_call_format form = ([]() {
        xml_tool_call_format form {};
        form.scope_start = "";
        form.tool_start = "<tool_call>\n{\"name\": \"";
        form.tool_sep = "\", \"arguments\": {";
        form.key_start = "\"";
        form.key_val_sep = "\": ";
        form.val_end = ", ";
        form.tool_end = "}\n</tool_call>";
        form.scope_end = "";
        form.raw_argval = false;
        form.last_val_end = "";
        return form;
    })();
    builder.consume_reasoning_with_xml_tool_calls(form);
}
951
+
952
// GPT-OSS "harmony" format: a stream of <|channel|>HEADER<|message|>BODY<|end|>
// messages. "analysis" maps to reasoning, "final"/"commentary" to content, and
// headers carrying "to=functions.NAME" to tool calls with JSON arguments.
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
    static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
    static const std::string recipient("(?: to=functions\\.([^<\\s]+))");

    static const common_regex start_regex("<\\|start\\|>assistant");
    static const common_regex analysis_regex("<\\|channel\\|>analysis");
    static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
    static const common_regex preamble_regex("<\\|channel\\|>commentary");
    static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
    static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");

    // Consume everything up to the next <|end|> (or to end of input if absent);
    // when include_end is set, the matched tag itself is kept in the returned text.
    auto consume_end = [&](bool include_end = false) {
        if (auto res = builder.try_find_literal("<|end|>")) {
            return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
        }
        return builder.consume_rest();
    };

    // Parse the JSON arguments following a tool-call header and register the call.
    auto handle_tool_call = [&](const std::string & name) {
        if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
            if (builder.syntax().parse_tool_calls) {
                if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
                    throw common_chat_msg_partial_exception("incomplete tool call");
                }
            } else if (args->is_partial) {
                throw common_chat_msg_partial_exception("incomplete tool call");
            }
        }
    };

    // Full (anchored) match of `regex` against `input`, or nullopt.
    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
        auto match = regex.search(input, 0, true);
        if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
            return match;
        }
        return std::nullopt;
    };

    do {
        auto header_start_pos = builder.pos();
        auto content_start = builder.try_find_literal("<|message|>");
        if (!content_start) {
            throw common_chat_msg_partial_exception("incomplete header");
        }

        auto header = content_start->prelude;

        // Header of the form " to=functions.NAME<|channel|>...": group 1 is the name.
        if (auto match = regex_match(tool_call1_regex, header)) {
            auto group = match->groups[1];
            auto name = header.substr(group.begin, group.end - group.begin);
            handle_tool_call(name);
            continue;
        }

        // Header of the form "<|channel|>... to=functions.NAME": group 2 is the name.
        if (auto match = regex_match(tool_call2_regex, header)) {
            auto group = match->groups[2];
            auto name = header.substr(group.begin, group.end - group.begin);
            handle_tool_call(name);
            continue;
        }

        if (regex_match(analysis_regex, header)) {
            // Rewind so try_parse_reasoning() sees the full header again.
            builder.move_to(header_start_pos);
            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
                builder.add_content(consume_end(true));
            } else {
                builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
            }
            continue;
        }

        if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
            builder.add_content(consume_end());
            continue;
        }

        // Possibly a malformed message, attempt to recover by rolling
        // back to pick up the next <|start|>
        LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
        builder.move_to(header_start_pos);
    } while (builder.try_find_regex(start_regex, std::string::npos, false));

    auto remaining = builder.consume_rest();
    if (!remaining.empty()) {
        LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
    }
}
1039
+
1040
+ static void common_chat_parse_glm_4_5(common_chat_msg_parser & builder) {
1041
+ static const xml_tool_call_format form {
1042
+ /* form.scope_start = */ "",
1043
+ /* form.tool_start = */ "<tool_call>",
1044
+ /* form.tool_sep = */ "",
1045
+ /* form.key_start = */ "<arg_key>",
1046
+ /* form.key_val_sep = */ "</arg_key>",
1047
+ /* form.val_end = */ "</arg_value>",
1048
+ /* form.tool_end = */ "</tool_call>",
1049
+ /* form.scope_end = */ "",
1050
+ /* form.key_val_sep2 = */ "<arg_value>",
1051
+ };
1052
+ builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
1053
+ }
1054
+
1055
+ static void common_chat_parse_firefunction_v2(common_chat_msg_parser & builder) {
1056
+ if (!builder.syntax().parse_tool_calls) {
1057
+ builder.add_content(builder.consume_rest());
1058
+ return;
1059
+ }
1060
+ static const common_regex prefix(regex_escape(" functools["));
1061
+ parse_prefixed_json_tool_call_array(builder, prefix, /* rstrip_prefix= */ 1);
1062
+ }
1063
+
1064
// Functionary v3.2: tool calls as "NAME\n{json}" blocks, subsequent calls prefixed
// by ">>>". "all" at the start of the message denotes plain content, and "python"
// may be followed by raw (non-JSON) code.
static void common_chat_parse_functionary_v3_2(common_chat_msg_parser & builder) {
    static const common_regex function_regex_start_only(R"((\w+\n\{|python\n|all\n))");
    static const common_regex function_regex(R"(>>>(\w+\n\{|python\n|all\n))");
    static const common_regex close_regex(R"(\s*)");

    parse_json_tool_calls(
        builder,
        std::nullopt,
        function_regex_start_only,
        function_regex,
        close_regex,
        std::nullopt,
        /* allow_raw_python= */ true,
        /* get_function_name= */ [&](const auto & res) -> std::string {
            auto at_start = res.groups[0].begin == 0;
            auto name = builder.str(res.groups[1]);
            if (!name.empty() && name.back() == '{') {
                // Unconsume the opening brace '{' to ensure the JSON parsing goes well.
                builder.move_back(1);
            }
            // Strip the trailing "\n{" / "\n" captured together with the name.
            auto idx = name.find_last_not_of("\n{");
            name = name.substr(0, idx + 1);
            if (at_start && name == "all") {
                // Leading "all" block is regular content, not a tool call.
                return "";
            }
            return name;
        });
}
1092
+
1093
// Functionary v3.1 (Llama 3.1 style): tool calls as <function=NAME>{json}</function>,
// plus the legacy <|python_tag|>CODE form where CODE becomes the "python" tool's argument.
static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }
    // This version of Functionary still supports the llama 3.1 tool call format for the python tool.
    static const common_regex python_tag_regex(regex_escape("<|python_tag|>"));

    static const common_regex function_regex(R"(<function=(\w+)>)");
    static const common_regex close_regex(R"(</function>)");

    parse_json_tool_calls(
        builder,
        /* block_open= */ std::nullopt,
        /* function_regex_start_only= */ std::nullopt,
        function_regex,
        close_regex,
        std::nullopt);

    // After the JSON-style calls, everything following <|python_tag|> is raw code
    // wrapped as the "python" tool's arguments.
    if (auto res = builder.try_find_regex(python_tag_regex)) {
        auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
        builder.add_tool_call("python", "", arguments);
        return;
    }
}
1118
+
1119
// Hermes 2 Pro: tolerant tool-call parser accepting many tag variants
// (<tool_call>, <function_call>, <tool>, ... optionally inside ```/```json/```xml
// fences) around either a {"name": ..., "arguments": ...} object, or
// <function=NAME>{args}</function> / <function name="NAME">{args}</function>.
static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<think>", "</think>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    static const common_regex open_regex(
        "(?:"
            "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
            "(" // match 2 (open_tag)
                "<tool_call>"
                "|<function_call>"
                "|<tool>"
                "|<tools>"
                "|<response>"
                "|<json>"
                "|<xml>"
                "|<JSON>"
            ")?"
            "(\\s*\\{\\s*\"name\")" // match 3 (named tool call)
        ")"
        "|<function=([^>]+)>" // match 4 (function name)
        "|<function name=\"([^\"]+)\">" // match 5 (function name again)
    );

    while (auto res = builder.try_find_regex(open_regex)) {
        const auto & block_start = res->groups[1];
        // A fenced block must be closed with a matching ``` after the call.
        std::string block_end = block_start.empty() ? "" : "```";

        const auto & open_tag = res->groups[2];
        std::string close_tag;

        if (!res->groups[3].empty()) {
            // JSON-object form: rewind to the '{' and parse the whole object,
            // keeping "arguments" as a dumped string.
            builder.move_to(res->groups[3].begin);
            // Derive "</tag>" from "<tag>".
            close_tag = open_tag.empty() ? "" : "</" + builder.str(open_tag).substr(1);

            if (auto tool_call = builder.try_consume_json_with_dumped_args({{"arguments"}})) {
                if (!builder.add_tool_call(tool_call->value) || tool_call->is_partial) {
                    throw common_chat_msg_partial_exception("incomplete tool call");
                }
                builder.consume_spaces();
                builder.consume_literal(close_tag);
                builder.consume_spaces();
                if (!block_end.empty()) {
                    builder.consume_literal(block_end);
                    builder.consume_spaces();
                }
            } else {
                throw common_chat_msg_partial_exception("failed to parse tool call");
            }
        } else {
            // <function=...> / <function name="..."> form: the name came from the tag.
            auto function_name = builder.str(res->groups[4]);
            if (function_name.empty()) {
                function_name = builder.str(res->groups[5]);
            }
            GGML_ASSERT(!function_name.empty());

            close_tag = "</function>";

            if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) {
                if (!builder.add_tool_call(function_name, "", arguments->value) || arguments->is_partial) {
                    throw common_chat_msg_partial_exception("incomplete tool call");
                }
                builder.consume_spaces();
                builder.consume_literal(close_tag);
                builder.consume_spaces();
                if (!block_end.empty()) {
                    builder.consume_literal(block_end);
                    builder.consume_spaces();
                }
            }
        }
    }

    builder.add_content(builder.consume_rest());
}
1196
+
1197
// Granite: optional <think>/<response> tag pairs, then a <|tool_call|> marker
// followed by a JSON array of {"name", "arguments"} objects.
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
    // Parse thinking tags
    static const common_regex start_think_regex(regex_escape("<think>"));
    static const common_regex end_think_regex(regex_escape("</think>"));
    // Granite models output partial tokens such as "<" and "<think".
    // By leveraging try_consume_regex()/try_find_regex() throwing
    // common_chat_msg_partial_exception for these partial tokens,
    // processing is interrupted and the tokens are not passed to add_content().
    if (auto res = builder.try_consume_regex(start_think_regex)) {
        // Restore position for try_parse_reasoning()
        builder.move_to(res->groups[0].begin);
        builder.try_find_regex(end_think_regex, std::string::npos, false);
        // Restore position for try_parse_reasoning()
        builder.move_to(res->groups[0].begin);
    }
    builder.try_parse_reasoning("<think>", "</think>");

    // Parse response tags
    static const common_regex start_response_regex(regex_escape("<response>"));
    static const common_regex end_response_regex(regex_escape("</response>"));
    // Granite models output partial tokens such as "<" and "<response".
    // Same hack as reasoning parsing.
    if (builder.try_consume_regex(start_response_regex)) {
        builder.try_find_regex(end_response_regex);
    }

    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Look for tool calls
    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
    if (auto res = builder.try_find_regex(tool_call_regex)) {
        builder.move_to(res->groups[0].end);

        // Expect JSON array of tool calls
        if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
            if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
                throw common_chat_msg_partial_exception("incomplete tool call");
            }
        }
    } else {
        builder.add_content(builder.consume_rest());
    }
}
1243
+
1244
// Nemotron v2: <think>...</think> reasoning, then an optional
// <TOOLCALL>[{...}, ...]</TOOLCALL> JSON array of tool calls.
static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
    // Parse thinking tags
    builder.try_parse_reasoning("<think>", "</think>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Look for tool calls
    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
    if (auto res = builder.try_find_regex(tool_call_regex)) {
        builder.move_to(res->groups[0].end);

        // Expect JSON array of tool calls
        auto tool_calls_data = builder.consume_json();
        if (tool_calls_data.json.is_array()) {
            // Without the closing tag the array may be truncated: signal partial.
            if (!builder.try_consume_literal("</TOOLCALL>")) {
                throw common_chat_msg_partial_exception("Incomplete tool call");
            }
            builder.add_tool_calls(tool_calls_data.json);
        } else {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
    }
    builder.add_content(builder.consume_rest());
}
1270
+
1271
// Apertus: <|inner_prefix|>...<|inner_suffix|> reasoning, then an optional
// <|tools_prefix|>[...]<|tools_suffix|> JSON array of short-form tool calls.
static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
    // Parse thinking tags
    builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Look for tool calls
    static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
    if (auto res = builder.try_find_regex(tool_call_regex)) {
        builder.move_to(res->groups[0].end);

        auto tool_calls_data = builder.consume_json();
        if (tool_calls_data.json.is_array()) {
            builder.consume_spaces();
            if (!builder.try_consume_literal("<|tools_suffix|>")) {
                throw common_chat_msg_partial_exception("Incomplete tool call");
            }
            // Each array element is a short-form call object; non-objects are skipped.
            for (const auto & value : tool_calls_data.json) {
                if (value.is_object()) {
                    builder.add_tool_call_short_form(value);
                }
            }
        } else {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
    }
    builder.add_content(builder.consume_rest());
}
1301
+
1302
+
1303
// LFM2: zero or more <|tool_call_start|>[{...}]<|tool_call_end|> blocks, each
// holding a JSON array of {"name", "arguments"} objects; remaining text is content.
static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
    static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
    static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));

    // Loop through all tool calls
    while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
        builder.move_to(res->groups[0].end);

        // Parse JSON array format: [{"name": "...", "arguments": {...}}]
        auto tool_calls_data = builder.consume_json();

        // Consume end marker
        builder.consume_spaces();
        if (!builder.try_consume_regex(tool_call_end_regex)) {
            throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
        }

        // Process each tool call in the array
        if (tool_calls_data.json.is_array()) {
            for (const auto & tool_call : tool_calls_data.json) {
                if (!tool_call.is_object()) {
                    throw common_chat_msg_partial_exception("Tool call must be an object");
                }

                if (!tool_call.contains("name")) {
                    throw common_chat_msg_partial_exception("Tool call missing 'name' field");
                }

                std::string function_name = tool_call.at("name");
                std::string arguments = "{}";

                // "arguments" may be a JSON object (dumped) or already a string.
                if (tool_call.contains("arguments")) {
                    if (tool_call.at("arguments").is_object()) {
                        arguments = tool_call.at("arguments").dump();
                    } else if (tool_call.at("arguments").is_string()) {
                        arguments = tool_call.at("arguments");
                    }
                }

                if (!builder.add_tool_call(function_name, "", arguments)) {
                    throw common_chat_msg_partial_exception("Incomplete tool call");
                }
            }
        } else {
            throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
        }

        // Consume any trailing whitespace after this tool call
        builder.consume_spaces();
    }

    // Consume any remaining content after all tool calls
    auto remaining = builder.consume_rest();
    if (!string_strip(remaining).empty()) {
        builder.add_content(remaining);
    }
}
1366
+
1367
+ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
1368
+ static const xml_tool_call_format form {
1369
+ /* form.scope_start = */ "<seed:tool_call>",
1370
+ /* form.tool_start = */ "<function=",
1371
+ /* form.tool_sep = */ ">",
1372
+ /* form.key_start = */ "<parameter=",
1373
+ /* form.key_val_sep = */ ">",
1374
+ /* form.val_end = */ "</parameter>",
1375
+ /* form.tool_end = */ "</function>",
1376
+ /* form.scope_end = */ "</seed:tool_call>",
1377
+ };
1378
+ builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
1379
+ }
1380
+
1381
+ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
1382
+ builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
1383
+
1384
+ // TODO: Tool calling
1385
+
1386
+ builder.add_content(builder.consume_rest());
1387
+ }
1388
+
1389
static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
    // Accepted <tool_call> payloads (optionally wrapped in ```/```json fences):
    // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
    // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
    static const common_regex tool_call_open(R"(<tool_call[^>]*>)");

    if (!builder.syntax().parse_tool_calls) {
        LOG_DBG("%s: not parse_tool_calls\n", __func__);
        builder.add_content(builder.consume_rest());
        return;
    }

    LOG_DBG("%s: parse_tool_calls\n", __func__);

    // Find all <tool_call></tool_call> blocks
    while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
        builder.move_to(first->groups[0].end);
        builder.consume_spaces();

        // Tolerate optional markdown code fences around the JSON body.
        builder.try_consume_literal("```json");
        builder.try_consume_literal("```");
        builder.consume_spaces();

        // Consume JSON object
        auto data = builder.consume_json();

        builder.consume_spaces();
        builder.try_consume_literal("```");
        builder.consume_spaces();

        if (!builder.try_consume_literal("</tool_call>")) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
        builder.consume_spaces();

        // Extract name and arguments
        std::string name;
        std::string id;
        nlohmann::ordered_json arguments;

        // Pull name/arguments (and optional id) from a call object; false if absent.
        const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
            if (!obj.contains("name") || !obj.contains("arguments")) {
                return false;
            }
            name = obj.at("name").get<std::string>();
            arguments = obj.at("arguments");
            if (obj.contains("id") && obj.at("id").is_string()) {
                id = obj.at("id").get<std::string>();
            }
            return true;
        };

        if (!extract_args(data.json)) {
            // OpenAI-style wrapper: fields nested under "function", id at top level.
            if (data.json.contains("function") && data.json.at("function").is_object()) {
                auto fn = data.json.at("function");
                extract_args(fn);
                if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
                    id = data.json.at("id").get<std::string>();
                }
            }
        }

        // If name is empty, treat the JSON object as content
        if (name.empty()) {
            LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
            builder.add_content(data.json.dump());
            continue;
        }

        std::string args_str = arguments.dump();
        if (!builder.add_tool_call(name, id, args_str)) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
    }

    builder.add_content(builder.consume_rest());
}
1465
+
1466
static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
    LOG_DBG("%s: parsing exaone_moe\n", __func__);
    // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
    // First try to parse using the standard reasoning parsing method
    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());

    // Probe for </think> without consuming input, then rewind.
    auto start_pos = builder.pos();
    auto found_end_think = builder.try_find_literal("</think>");
    builder.move_to(start_pos);

    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
        // Complete message, forced-open thinking, no </think>: treat as content.
        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
        // If reasoning was parsed successfully, the remaining content is regular content
        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
    } else {
        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
            common_chat_parse_exaone_moe_content(builder);
            return;
        }
        // If no reasoning tags found, check if we should treat everything as reasoning
        if (builder.syntax().thinking_forced_open) {
            // If thinking is forced open but no tags found, treat everything as reasoning
            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
            builder.add_reasoning_content(builder.consume_rest());
        } else {
            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
            common_chat_parse_exaone_moe_content(builder);
        }
    }
}
1500
+
1501
+ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
1502
+ builder.try_parse_reasoning("<think>", "</think>");
1503
+ builder.add_content(builder.consume_rest());
1504
+ }
1505
+
1506
// Dispatch to the per-format parser for the builder's configured chat format.
// Every non-PEG format must have a case here; PEG formats are routed to the
// PEG engine before this function is reached (see the string overload below).
static void common_chat_parse(common_chat_msg_parser & builder) {
    LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format), builder.input().c_str());

    switch (builder.syntax().format) {
        case COMMON_CHAT_FORMAT_CONTENT_ONLY:
            common_chat_parse_content_only(builder);
            break;
        case COMMON_CHAT_FORMAT_GENERIC:
            common_chat_parse_generic(builder);
            break;
        case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
            common_chat_parse_mistral_nemo(builder);
            break;
        case COMMON_CHAT_FORMAT_MAGISTRAL:
            common_chat_parse_magistral(builder);
            break;
        case COMMON_CHAT_FORMAT_LLAMA_3_X:
            common_chat_parse_llama_3_1(builder);
            break;
        case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
            common_chat_parse_llama_3_1(builder, /* with_builtin_tools= */ true);
            break;
        case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
            common_chat_parse_deepseek_r1(builder);
            break;
        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
            common_chat_parse_deepseek_v3_1(builder);
            break;
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
            common_chat_parse_functionary_v3_2(builder);
            break;
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
            common_chat_parse_functionary_v3_1_llama_3_1(builder);
            break;
        case COMMON_CHAT_FORMAT_HERMES_2_PRO:
            common_chat_parse_hermes_2_pro(builder);
            break;
        case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
            common_chat_parse_firefunction_v2(builder);
            break;
        case COMMON_CHAT_FORMAT_COMMAND_R7B:
            common_chat_parse_command_r7b(builder);
            break;
        case COMMON_CHAT_FORMAT_GRANITE:
            common_chat_parse_granite(builder);
            break;
        case COMMON_CHAT_FORMAT_GPT_OSS:
            common_chat_parse_gpt_oss(builder);
            break;
        case COMMON_CHAT_FORMAT_SEED_OSS:
            common_chat_parse_seed_oss(builder);
            break;
        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
            common_chat_parse_nemotron_v2(builder);
            break;
        case COMMON_CHAT_FORMAT_APERTUS:
            common_chat_parse_apertus(builder);
            break;
        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
            common_chat_parse_lfm2(builder);
            break;
        case COMMON_CHAT_FORMAT_MINIMAX_M2:
            common_chat_parse_minimax_m2(builder);
            break;
        case COMMON_CHAT_FORMAT_GLM_4_5:
            common_chat_parse_glm_4_5(builder);
            break;
        case COMMON_CHAT_FORMAT_KIMI_K2:
            common_chat_parse_kimi_k2(builder);
            break;
        case COMMON_CHAT_FORMAT_APRIEL_1_5:
            common_chat_parse_apriel_1_5(builder);
            break;
        case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
            common_chat_parse_xiaomi_mimo(builder);
            break;
        case COMMON_CHAT_FORMAT_SOLAR_OPEN:
            common_chat_parse_solar_open(builder);
            break;
        case COMMON_CHAT_FORMAT_EXAONE_MOE:
            common_chat_parse_exaone_moe(builder);
            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
    // Finalize the accumulated message (e.g. flush pending partial state).
    builder.finish();
}
1593
+
1594
+ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
1595
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
1596
+ syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
1597
+ syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
1598
+ return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
1599
+ }
1600
+ common_chat_msg_parser builder(input, is_partial, syntax);
1601
+ try {
1602
+ common_chat_parse(builder);
1603
+ } catch (const common_chat_msg_partial_exception & ex) {
1604
+ LOG_DBG("Partial parse: %s\n", ex.what());
1605
+ if (!is_partial) {
1606
+ builder.clear_tools();
1607
+ builder.move_to(0);
1608
+ common_chat_parse_content_only(builder);
1609
+ }
1610
+ }
1611
+ auto msg = builder.result();
1612
+ if (!is_partial) {
1613
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
1614
+ }
1615
+ return msg;
1616
+ }
1617
+
1618
+ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
1619
+ if (parser.empty()) {
1620
+ throw std::runtime_error("Failed to parse due to missing parser definition.");
1621
+ }
1622
+
1623
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
1624
+
1625
+ common_peg_parse_context ctx(input, is_partial);
1626
+ auto result = parser.parse(ctx);
1627
+ if (result.fail()) {
1628
+ throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
1629
+ }
1630
+
1631
+ common_chat_msg msg;
1632
+ msg.role = "assistant";
1633
+
1634
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
1635
+ auto mapper = common_chat_peg_native_mapper(msg);
1636
+ mapper.from_ast(ctx.ast, result);
1637
+ } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
1638
+ auto mapper = common_chat_peg_constructed_mapper(msg);
1639
+ mapper.from_ast(ctx.ast, result);
1640
+ } else {
1641
+ // Generic mapper
1642
+ auto mapper = common_chat_peg_mapper(msg);
1643
+ mapper.from_ast(ctx.ast, result);
1644
+ }
1645
+ if (!is_partial) {
1646
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
1647
+ }
1648
+ return msg;
1649
+ }
llama.cpp/common/chat-parser.h ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "chat.h"
4
+ #include "chat-parser-xml-toolcall.h"
5
+ #include "json-partial.h"
6
+ #include "regex-partial.h"
7
+
8
+ #include <nlohmann/json_fwd.hpp>
9
+
10
+ #include <optional>
11
+ #include <string>
12
+ #include <vector>
13
+
14
// Thrown by the parsing helpers when the input ends in the middle of a
// construct (e.g. a truncated tool call in a streamed response); the caller
// decides whether the partial state is acceptable.
class common_chat_msg_partial_exception : public std::runtime_error {
  public:
    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
};
18
+
19
// Incremental parser over a (possibly partial) model response string.
// Tracks a cursor (pos_) into input_ and accumulates the parsed message
// (content, reasoning, tool calls) into result_.
class common_chat_msg_parser {
    std::string input_;                 // full raw input being parsed
    bool is_partial_;                   // true while streaming (input may be truncated)
    common_chat_parser_params syntax_;  // TODO: rename to params
    std::string healing_marker_;        // marker used to close truncated JSON during partial parses

    size_t pos_ = 0;                    // current parse position within input_
    common_chat_msg result_;            // message being built up

  public:
    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
    const std::string & input() const { return input_; }
    size_t pos() const { return pos_; }
    const std::string & healing_marker() const { return healing_marker_; }
    const bool & is_partial() const { return is_partial_; }
    const common_chat_msg & result() const { return result_; }
    const common_chat_parser_params & syntax() const { return syntax_; }

    // Move the cursor to an absolute position (throws if past the end).
    void move_to(size_t pos) {
        if (pos > input_.size()) {
            throw std::runtime_error("Invalid position!");
        }
        pos_ = pos;
    }
    // Move the cursor back by n characters (throws if that would underflow).
    void move_back(size_t n) {
        if (pos_ < n) {
            throw std::runtime_error("Can't move back that far!");
        }
        pos_ -= n;
    }

    // Get the substring of the input at the given range
    std::string str(const common_string_range & rng) const;

    // Appends to the result.content field
    void add_content(const std::string & content);

    // Appends to the result.reasoning_content field
    void add_reasoning_content(const std::string & reasoning_content);

    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);

    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
    bool add_tool_call(const nlohmann::ordered_json & tool_call);

    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);

    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);

    // Finalize parsing; must be called once after all consume/add operations.
    void finish();

    // Skip over whitespace at the cursor; returns whether any was consumed.
    bool consume_spaces();

    // Consume an exact literal at the cursor (throws on mismatch).
    void consume_literal(const std::string & literal);

    // Try to consume a reasoning block delimited by the given markers,
    // appending its body to result.reasoning_content.
    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);

    // Consume and return everything from the cursor to the end of the input.
    std::string consume_rest();

    struct find_regex_result {
        std::string prelude;                      // text between the cursor and the match
        std::vector<common_string_range> groups;  // capture group ranges within input_
    };

    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);

    bool try_consume_literal(const std::string & literal);

    std::optional<find_regex_result> try_find_literal(const std::string & literal);

    find_regex_result consume_regex(const common_regex & regex);

    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);

    std::optional<common_json> try_consume_json();
    common_json consume_json();

    struct consume_json_result {
        nlohmann::ordered_json value;
        bool is_partial;  // true if the JSON was truncated and healed
    };

    /*
        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.

        By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`

        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
    */
    consume_json_result consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    std::optional<consume_json_result> try_consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );

    /**
     * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
     * form.scope_start, form.tool_sep and form.scope_end can be empty.
     */
    bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);

    // Parse content uses reasoning and XML-Style tool call
    void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>");

    // Remove all tool calls accumulated so far from the result.
    void clear_tools();
};
llama.cpp/common/chat-peg-parser.cpp ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "chat-peg-parser.h"
2
+
3
+ #include <nlohmann/json.hpp>
4
+
5
+ using json = nlohmann::json;
6
+
7
// Return `sv` with trailing whitespace removed.
// When `max` is non-negative, strip at most `max` trailing whitespace
// characters; `max == -1` (the default) strips all of them.
// NOTE(review): the original condition (`count <= max`) broke out of the loop
// before removing anything whenever `max >= 0`, making the parameter a no-op;
// `count >= max` implements the "strip at most max characters" reading.
static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
    int count = 0;
    // cast to unsigned char: std::isspace has UB for negative char values
    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
        if (max != -1 && count >= max) {
            break;
        }
        sv.remove_suffix(1);
        count++;
    }
    return sv;
}
+ }
18
+
19
+ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
20
+ arena.visit(result, [this](const common_peg_ast_node & node) {
21
+ map(node);
22
+ });
23
+ }
24
+
25
+ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
26
+ bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
27
+ bool is_content = node.tag == common_chat_peg_builder::CONTENT;
28
+
29
+ if (is_reasoning) {
30
+ result.reasoning_content = std::string(trim_trailing_space(node.text));
31
+ }
32
+
33
+ if (is_content) {
34
+ result.content = std::string(trim_trailing_space(node.text));
35
+ }
36
+ }
37
+
38
+ void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
39
+ common_chat_peg_mapper::map(node);
40
+
41
+ bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
42
+ bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
43
+ bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
44
+ bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
45
+
46
+ if (is_tool_open) {
47
+ result.tool_calls.emplace_back();
48
+ current_tool = &result.tool_calls.back();
49
+ }
50
+
51
+ if (is_tool_id && current_tool) {
52
+ current_tool->id = std::string(trim_trailing_space(node.text));
53
+ }
54
+
55
+ if (is_tool_name && current_tool) {
56
+ current_tool->name = std::string(trim_trailing_space(node.text));
57
+ }
58
+
59
+ if (is_tool_args && current_tool) {
60
+ current_tool->arguments = std::string(trim_trailing_space(node.text));
61
+ }
62
+ }
63
+
64
+ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
65
+ common_chat_peg_mapper::map(node);
66
+
67
+ bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
68
+ bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
69
+ bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
70
+ bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
71
+ bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
72
+ bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
73
+ bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
74
+ bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
75
+
76
+ if (is_tool_open) {
77
+ result.tool_calls.emplace_back();
78
+ current_tool = &result.tool_calls.back();
79
+ arg_count = 0;
80
+ }
81
+
82
+ if (is_tool_name) {
83
+ current_tool->name = std::string(node.text);
84
+ current_tool->arguments = "{";
85
+ }
86
+
87
+ if (is_arg_open) {
88
+ needs_closing_quote = false;
89
+ }
90
+
91
+ if (is_arg_name && current_tool) {
92
+ if (arg_count > 0) {
93
+ current_tool->arguments += ",";
94
+ }
95
+ current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
96
+ ++arg_count;
97
+ }
98
+
99
+ if (is_arg_string && current_tool) {
100
+ // Serialize to JSON, but exclude the end quote
101
+ std::string dumped = json(trim_trailing_space(node.text)).dump();
102
+ current_tool->arguments += dumped.substr(0, dumped.size() - 1);
103
+ needs_closing_quote = true;
104
+ }
105
+
106
+ if (is_arg_close && current_tool) {
107
+ if (needs_closing_quote) {
108
+ current_tool->arguments += "\"";
109
+ needs_closing_quote = false;
110
+ }
111
+ }
112
+
113
+ if (is_arg_json && current_tool) {
114
+ current_tool->arguments += std::string(trim_trailing_space(node.text));
115
+ }
116
+
117
+ if (is_tool_close && current_tool) {
118
+ if (needs_closing_quote) {
119
+ current_tool->arguments += "\"";
120
+ needs_closing_quote = false;
121
+ }
122
+ current_tool->arguments += "}";
123
+ }
124
+ }
llama.cpp/common/chat-peg-parser.h ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "chat.h"
4
+ #include "peg-parser.h"
5
+
6
// PEG-parser builder with chat-specific tagged sub-parsers: tags label AST
// nodes so a mapper can later route their text into the chat message fields.
class common_chat_peg_builder : public common_peg_parser_builder {
  public:
    static constexpr const char * REASONING_BLOCK = "reasoning-block";
    static constexpr const char * REASONING = "reasoning";
    static constexpr const char * CONTENT = "content";

    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
};
16
+
17
// Convenience helper: run `fn` against a fresh builder, set the returned
// parser as the root, and compile everything into an arena.
inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
    common_chat_peg_builder builder;
    builder.set_root(fn(builder));
    return builder.build();
}
22
+
23
// Maps parsed AST nodes onto a common_chat_msg. The base class handles
// reasoning/content nodes; subclasses add tool-call handling.
class common_chat_peg_mapper {
  public:
    // Message being filled in; owned by the caller, must outlive the mapper.
    common_chat_msg & result;

    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}

    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
    virtual void map(const common_peg_ast_node & node);
};
32
+
33
// Builder for "native" tool-call formats, where the model emits name/id/args
// as distinct spans; each helper tags the span for the native mapper.
class common_chat_peg_native_builder : public common_chat_peg_builder {
  public:
    static constexpr const char * TOOL = "tool";
    static constexpr const char * TOOL_OPEN = "tool-open";
    static constexpr const char * TOOL_CLOSE = "tool-close";
    static constexpr const char * TOOL_ID = "tool-id";
    static constexpr const char * TOOL_NAME = "tool-name";
    static constexpr const char * TOOL_ARGS = "tool-args";

    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
};
49
+
50
+ class common_chat_peg_native_mapper : public common_chat_peg_mapper {
51
+ common_chat_tool_call * current_tool;
52
+
53
+ public:
54
+ common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
55
+
56
+ void map(const common_peg_ast_node & node) override;
57
+ };
58
+
59
// Convenience helper: build a native-flavoured chat PEG parser arena from `fn`.
inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
    common_chat_peg_native_builder builder;
    builder.set_root(fn(builder));
    return builder.build();
}
64
+
65
// Builder for "constructed" tool-call formats, where the arguments object is
// assembled argument-by-argument (name, then string or raw-JSON value) rather
// than emitted as a single JSON blob.
class common_chat_peg_constructed_builder : public common_chat_peg_builder {
  public:
    static constexpr const char * TOOL = "tool";
    static constexpr const char * TOOL_OPEN = "tool-open";
    static constexpr const char * TOOL_CLOSE = "tool-close";
    static constexpr const char * TOOL_NAME = "tool-name";
    static constexpr const char * TOOL_ARG = "tool-arg";
    static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
    static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
    static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";

    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
};
89
+
90
+ class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
91
+ common_chat_tool_call * current_tool;
92
+ int arg_count = 0;
93
+ bool needs_closing_quote = false;
94
+
95
+ public:
96
+ common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
97
+
98
+ void map(const common_peg_ast_node & node) override;
99
+ };
100
+
101
// Convenience helper: build a constructed-flavoured chat PEG parser arena from `fn`.
inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
    common_chat_peg_constructed_builder builder;
    builder.set_root(fn(builder));
    return builder.build();
}
llama.cpp/common/chat.cpp ADDED
The diff for this file is too large to render. See raw diff
 
llama.cpp/common/chat.h ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
2
+
3
+ #pragma once
4
+
5
+ #include "common.h"
6
+ #include "peg-parser.h"
7
+ #include <functional>
8
+ #include <chrono>
9
+ #include <string>
10
+ #include <vector>
11
+ #include <map>
12
+
13
+ #include <nlohmann/json_fwd.hpp>
14
+
15
+ struct common_chat_templates;
16
+
17
// A single tool/function call extracted from a model response.
struct common_chat_tool_call {
    std::string name;       // function name
    std::string arguments;  // argument object serialized as a JSON string
    std::string id;         // call id (may be empty)

    bool operator==(const common_chat_tool_call & other) const {
        return name == other.name && arguments == other.arguments && id == other.id;
    }
};
26
+
27
// One typed part of a multi-part message content (OpenAI-style content parts).
struct common_chat_msg_content_part {
    std::string type;  // part type, e.g. "text"
    std::string text;  // part payload

    // TODO @ngxson : no known chat templates support reasoning_content in content parts yet
    // this can be useful for models with interleaved thinking (like Kimi-K2)
    // if you see any templates explicitly support this, please ping me
    // std::string reasoning_content;

    bool operator==(const common_chat_msg_content_part & other) const {
        return type == other.type && text == other.text;
    }
};
40
+
41
// A single chat message: role plus content (flat string and/or typed parts),
// optional reasoning, and any tool calls / tool results attached to it.
struct common_chat_msg {
    std::string role;
    std::string content;
    std::vector<common_chat_msg_content_part> content_parts;
    std::vector<common_chat_tool_call> tool_calls;
    std::string reasoning_content;
    std::string tool_name;     // for role=="tool": name of the tool that produced this message
    std::string tool_call_id;  // for role=="tool": id of the call this message answers

    nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;

    // True when every field other than role is empty.
    bool empty() const {
        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
    }
    // Ensure every tool call has a stable id: reuse ids from `ids_cache` by
    // index, generating (and caching) a fresh id for calls without one.
    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
                auto id = tool_calls[i].id;
                if (id.empty()) {
                    id = gen_tool_call_id();
                }
                ids_cache.push_back(id);
            }
            tool_calls[i].id = ids_cache[i];
        }
    }
    bool operator==(const common_chat_msg & other) const {
        return role == other.role
            && content == other.content
            && content_parts == other.content_parts
            && tool_calls == other.tool_calls
            && reasoning_content == other.reasoning_content
            && tool_name == other.tool_name
            && tool_call_id == other.tool_call_id;
    }
    bool operator!=(const common_chat_msg & other) const {
        return !(*this == other);
    }
};
80
+
81
+ struct common_chat_msg_diff {
82
+ std::string reasoning_content_delta;
83
+ std::string content_delta;
84
+ size_t tool_call_index = std::string::npos;
85
+ common_chat_tool_call tool_call_delta;
86
+
87
+ static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);
88
+
89
+ bool operator==(const common_chat_msg_diff & other) const {
90
+ return content_delta == other.content_delta
91
+ && tool_call_index == other.tool_call_index
92
+ && tool_call_delta == other.tool_call_delta;
93
+ }
94
+ };
95
+
96
// Declaration of a tool the model may call (OpenAI "function" style).
struct common_chat_tool {
    std::string name;
    std::string description;
    std::string parameters;  // JSON schema of the parameters, as a string
};
101
+
102
// Tool-choice policy, mirroring the OpenAI `tool_choice` request field.
enum common_chat_tool_choice {
    COMMON_CHAT_TOOL_CHOICE_AUTO,
    COMMON_CHAT_TOOL_CHOICE_REQUIRED,
    COMMON_CHAT_TOOL_CHOICE_NONE,
};
107
+
108
// Chat output format, selecting which per-format parser handles the model's
// raw response text.
enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
    COMMON_CHAT_FORMAT_MAGISTRAL,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
    COMMON_CHAT_FORMAT_GRANITE,
    COMMON_CHAT_FORMAT_GPT_OSS,
    COMMON_CHAT_FORMAT_SEED_OSS,
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
    COMMON_CHAT_FORMAT_APERTUS,
    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
    COMMON_CHAT_FORMAT_GLM_4_5,
    COMMON_CHAT_FORMAT_MINIMAX_M2,
    COMMON_CHAT_FORMAT_KIMI_K2,
    COMMON_CHAT_FORMAT_APRIEL_1_5,
    COMMON_CHAT_FORMAT_XIAOMI_MIMO,
    COMMON_CHAT_FORMAT_SOLAR_OPEN,
    COMMON_CHAT_FORMAT_EXAONE_MOE,

    // These are intended to be parsed by the PEG parser
    COMMON_CHAT_FORMAT_PEG_SIMPLE,
    COMMON_CHAT_FORMAT_PEG_NATIVE,
    COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
143
+
144
// Inputs to common_chat_templates_apply: the conversation plus everything
// that influences prompt rendering.
struct common_chat_templates_inputs {
    std::vector<common_chat_msg> messages;
    std::string grammar;
    std::string json_schema;
    bool add_generation_prompt = true;
    bool use_jinja = true;
    // Parameters below only supported when use_jinja is true
    std::vector<common_chat_tool> tools;
    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
    bool parallel_tool_calls = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking"
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    std::map<std::string, std::string> chat_template_kwargs;
    bool add_bos = false;
    bool add_eos = false;
};
161
+
162
// Output of common_chat_templates_apply: the rendered prompt plus everything
// the sampler/parser needs for this format (grammar, triggers, stops, ...).
struct common_chat_params {
    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    std::string prompt;
    std::string grammar;
    bool grammar_lazy = false;            // when true, grammar kicks in only after a trigger fires
    bool thinking_forced_open = false;    // prompt ends inside an open thinking block
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string> preserved_tokens;
    std::vector<std::string> additional_stops;
    std::string parser;                   // serialized PEG parser definition (PEG formats only)
};
173
+
174
+ // per-message parsing syntax
175
+ // should be derived from common_chat_params
176
// per-message parsing syntax
// should be derived from common_chat_params
struct common_chat_parser_params {
    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
    bool reasoning_in_content = false;
    bool thinking_forced_open = false;
    bool parse_tool_calls = true;
    common_peg_arena parser = {};  // compiled PEG parser (PEG formats only)
    common_chat_parser_params() = default;
    // NOTE(review): this copies only format/thinking_forced_open from
    // common_chat_params; reasoning_format and the parser must be set by the
    // caller separately.
    common_chat_parser_params(const common_chat_params & chat_params) {
        format = chat_params.format;
        thinking_forced_open = chat_params.thinking_forced_open;
    }
};
190
+
191
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
192
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
193
+
194
+ void common_chat_templates_free(struct common_chat_templates * tmpls);
195
+
196
+ struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
197
+
198
+ typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
199
+
200
+ common_chat_templates_ptr common_chat_templates_init(
201
+ const struct llama_model * model,
202
+ const std::string & chat_template_override,
203
+ const std::string & bos_token_override = "",
204
+ const std::string & eos_token_override = "");
205
+
206
+ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
207
+ std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
208
+
209
+
210
+ struct common_chat_params common_chat_templates_apply(
211
+ const struct common_chat_templates * tmpls,
212
+ const struct common_chat_templates_inputs & inputs);
213
+
214
+ // Format single message, while taking into account the position of that message in chat history
215
+ std::string common_chat_format_single(
216
+ const struct common_chat_templates * tmpls,
217
+ const std::vector<common_chat_msg> & past_msg,
218
+ const common_chat_msg & new_msg,
219
+ bool add_ass,
220
+ bool use_jinja);
221
+
222
+ // Returns an example of formatted chat
223
+ std::string common_chat_format_example(
224
+ const struct common_chat_templates * tmpls,
225
+ bool use_jinja,
226
+ const std::map<std::string, std::string> & chat_template_kwargs);
227
+
228
+ const char* common_chat_format_name(common_chat_format format);
229
+ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
230
+ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
231
+
232
+ // used by arg and server
233
+ const char * common_reasoning_format_name(common_reasoning_format format);
234
+ common_reasoning_format common_reasoning_format_from_name(const std::string & format);
235
+
236
+ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
237
+
238
+ bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
239
+
240
+ // Parses a JSON array of messages in OpenAI's chat completion API format.
241
+ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
242
+
243
+ // DEPRECATED: only used in tests
244
+ nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
245
+
246
+ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
247
+ nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
248
+
249
+ nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
250
+
251
+ // get template caps, useful for reporting to server /props endpoint
252
+ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
llama.cpp/common/common.cpp ADDED
@@ -0,0 +1,1824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "ggml.h"
2
+ #include "gguf.h"
3
+
4
+ #include "common.h"
5
+ #include "log.h"
6
+ #include "llama.h"
7
+ #include "sampling.h"
8
+ #include "unicode.h"
9
+
10
+ #include <algorithm>
11
+ #include <cinttypes>
12
+ #include <climits>
13
+ #include <cmath>
14
+ #include <chrono>
15
+ #include <cstdarg>
16
+ #include <cstring>
17
+ #include <ctime>
18
+ #include <filesystem>
19
+ #include <fstream>
20
+ #include <iostream>
21
+ #include <iterator>
22
+ #include <regex>
23
+ #include <sstream>
24
+ #include <string>
25
+ #include <thread>
26
+ #include <unordered_set>
27
+ #include <vector>
28
+
29
+ #if defined(__APPLE__) && defined(__MACH__)
30
+ #include <sys/types.h>
31
+ #include <sys/sysctl.h>
32
+ #endif
33
+
34
+ #if defined(_WIN32)
35
+ #define WIN32_LEAN_AND_MEAN
36
+ #ifndef NOMINMAX
37
+ # define NOMINMAX
38
+ #endif
39
+ #include <locale>
40
+ #include <windows.h>
41
+ #include <string.h>
42
+ #include <fcntl.h>
43
+ #include <io.h>
44
+ #else
45
+ #include <sys/ioctl.h>
46
+ #include <sys/stat.h>
47
+ #include <unistd.h>
48
+ #endif
49
+
50
+ #if defined(__linux__)
51
+ #include <sys/types.h>
52
+ #include <pwd.h>
53
+ #endif
54
+
55
+ #if defined(_MSC_VER)
56
+ #pragma warning(disable: 4244 4267) // possible loss of data
57
+ #endif
58
+
59
// Scoped wall-clock timer (RAII): records the start time on construction and,
// on destruction, adds the elapsed microseconds to the accumulator `t_acc`
// that the caller passed in by reference.
// When `disable` is true, t_start_us is set to -1 and the destructor is a no-op.
common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}

common_time_meas::~common_time_meas() {
    // t_start_us == -1 means measurement was disabled at construction
    if (t_start_us >= 0) {
        t_acc += ggml_time_us() - t_start_us;
    }
}
66
+
67
+ //
68
+ // CPU utils
69
+ //
70
+
71
// Best-effort count of *physical* CPU cores (ignoring SMT/hyperthread siblings).
// Per-platform strategy:
//   - Linux:   count distinct thread-sibling sets exposed via sysfs
//   - macOS:   sysctl (performance cores first, then total physical)
//   - Windows: GetLogicalProcessorInformationEx over core relationships
// Falls back to a heuristic based on hardware_concurrency() when detection fails.
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
            + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break; // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            // threads on the same core share an identical sibling mask,
            // so each unique line corresponds to one physical core
            siblings.insert(line);
        }
    }
    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    // prefer the performance-core count on Apple Silicon hybrids
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    unsigned int n_threads_win = std::thread::hardware_concurrency();
    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;

    // first call with a null buffer just queries the required buffer size
    DWORD buffer_size = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return default_threads;
        }
    }

    std::vector<char> buffer(buffer_size);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
        return default_threads;
    }

    int32_t num_physical_cores = 0;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    // walk the variable-length record array; each record advertises its own Size
    while (buffer_size > 0) {
        if (info->Relationship == RelationProcessorCore) {
            // NOTE(review): adds one per processor group the core spans
            // (typically 1 per core) — verify on >64-logical-CPU systems
            num_physical_cores += info->Processor.GroupCount;
        }
        buffer_size -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
    }

    return num_physical_cores > 0 ? num_physical_cores : default_threads;
#endif
    // generic fallback: assume SMT-2 above 4 logical CPUs, else use them all
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
132
+
133
+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
134
+ #include <pthread.h>
135
+
136
// Thin wrapper around the x86 CPUID instruction for the given leaf/subleaf.
// rbx is saved/restored through rsi because rbx may be reserved by the
// compiler (e.g. as the PIC/GOT register), so it cannot be clobbered directly.
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__("movq\t%%rbx,%%rsi\n\t"
            "cpuid\n\t"
            "xchgq\t%%rbx,%%rsi"
            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
            : "0"(leaf), "2"(subleaf));
}
144
+
145
// Pin the calling thread to the single logical CPU `cpu`.
// Returns 0 on success, or the pthread error code on failure.
static int pin_cpu(int cpu) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}
151
+
152
// Detects a hybrid (P-core/E-core) x86 CPU via CPUID leaf 7, EDX bit 15
// (the "Hybrid" feature flag).
static bool is_hybrid_cpu(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
    return !!(edx & (1u << 15));
}
157
+
158
// True when the *calling* thread is currently scheduled on an efficiency core.
// CPUID leaf 0x1A reports the native core type in EAX[31:24]; 0x20 identifies
// an Intel Atom (E-core). Must be called after pinning to give a stable answer.
static bool is_running_on_efficiency_core(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
    int intel_atom = 0x20;
    int core_type = (eax & 0xff000000u) >> 24;
    return core_type == intel_atom;
}
165
+
166
// Counts logical CPUs that are worth using for compute: pins the thread to
// each CPU in turn, skips efficiency cores, and skips every other logical CPU
// (the hyperthread sibling). Returns -1 if pinning fails.
// NOTE(review): the extra ++cpu assumes SMT siblings are enumerated adjacently
// (cpu N and N+1 on the same core) — true on typical Linux x86 numbering.
static int cpu_count_math_cpus(int n_cpu) {
    int result = 0;
    for (int cpu = 0; cpu < n_cpu; ++cpu) {
        if (pin_cpu(cpu)) {
            return -1;
        }
        if (is_running_on_efficiency_core()) {
            continue; // efficiency cores harm lockstep threading
        }
        ++cpu; // hyperthreading isn't useful for linear algebra
        ++result;
    }
    return result;
}
180
+
181
+ #endif // __x86_64__ && __linux__
182
+
183
+ /**
184
+ * Returns number of CPUs on system that are useful for math.
185
+ */
186
// Number of CPUs useful for math workloads. On hybrid Linux/x86 systems it
// probes each CPU (excluding E-cores and SMT siblings), saving and restoring
// the thread's original affinity mask around the probe. Everywhere else it
// falls back to the physical-core count.
int32_t cpu_get_num_math() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
    if (n_cpu < 1) {
        return cpu_get_num_physical_cores();
    }
    if (is_hybrid_cpu()) {
        cpu_set_t affinity;
        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
            int result = cpu_count_math_cpus(n_cpu);
            // restore the affinity mask mutated by the per-CPU pinning probe
            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
            if (result > 0) {
                return result;
            }
        }
    }
#endif
    return cpu_get_num_physical_cores();
}
205
+
206
+ // Helper for setting process priority
207
+
208
+ #if defined(_WIN32)
209
+
210
// Windows: map the ggml scheduling priority onto a Win32 priority class and
// apply it to the current process. Returns false (and logs a warning) if the
// OS rejects the request; GGML_SCHED_PRIO_NORMAL is a no-op.
bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        return true;
    }

    DWORD p = NORMAL_PRIORITY_CLASS;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

    return true;
}
231
+
232
+ #else // MacOS and POSIX
233
+ #include <sys/types.h>
234
+ #include <sys/resource.h>
235
+
236
// POSIX/macOS: map the ggml scheduling priority onto a nice value and apply it
// to the current process (pid 0). Negative nice values usually require
// elevated privileges; failure is logged and reported via the return value.
bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        return true;
    }

    int p = 0;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:      p =   5; break;
        case GGML_SCHED_PRIO_NORMAL:   p =   0; break;
        case GGML_SCHED_PRIO_MEDIUM:   p =  -5; break;
        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
    }

    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }
    return true;
}
256
+
257
+ #endif
258
+
259
+ //
260
+ // CLI argument parsing
261
+ //
262
+
263
+
264
+ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
265
+ int32_t n_set = 0;
266
+
267
+ if (cpuparams.n_threads < 0) {
268
+ // Assuming everything about cpuparams is invalid
269
+ if (role_model != nullptr) {
270
+ cpuparams = *role_model;
271
+ } else {
272
+ cpuparams.n_threads = cpu_get_num_math();
273
+ }
274
+ }
275
+
276
+ for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
277
+ if (cpuparams.cpumask[i]) {
278
+ n_set++;
279
+ }
280
+ }
281
+
282
+ if (n_set && n_set < cpuparams.n_threads) {
283
+ // Not enough set bits, may experience performance issues.
284
+ LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
285
+ }
286
+ }
287
+
288
+ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
289
+ size_t dash_loc = range.find('-');
290
+ if (dash_loc == std::string::npos) {
291
+ LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
292
+ return false;
293
+ }
294
+
295
+ size_t start_i;
296
+ size_t end_i;
297
+
298
+ if (dash_loc == 0) {
299
+ start_i = 0;
300
+ } else {
301
+ start_i = std::stoull(range.substr(0, dash_loc));
302
+ if (start_i >= GGML_MAX_N_THREADS) {
303
+ LOG_ERR("Start index out of bounds!\n");
304
+ return false;
305
+ }
306
+ }
307
+
308
+ if (dash_loc == range.length() - 1) {
309
+ end_i = GGML_MAX_N_THREADS - 1;
310
+ } else {
311
+ end_i = std::stoull(range.substr(dash_loc + 1));
312
+ if (end_i >= GGML_MAX_N_THREADS) {
313
+ LOG_ERR("End index out of bounds!\n");
314
+ return false;
315
+ }
316
+ }
317
+
318
+ for (size_t i = start_i; i <= end_i; i++) {
319
+ boolmask[i] = true;
320
+ }
321
+
322
+ return true;
323
+ }
324
+
325
// Parse a hexadecimal CPU affinity mask (with optional "0x" prefix) into
// `boolmask`. The least-significant hex digit maps to CPUs 0..3, the next to
// 4..7, and so on; bits are OR-ed into the existing mask rather than
// overwriting it. Returns false on any non-hex character.
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    // Discard potential 0x prefix
    size_t start_i = 0;
    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
        start_i = 2;
    }

    // cap at 128 hex digits = 512 bits, matching GGML's thread mask capacity
    size_t num_digits = mask.length() - start_i;
    if (num_digits > 128) num_digits = 128;

    size_t end_i = num_digits + start_i;

    // i walks the digits left-to-right; n is the bit index of the current
    // digit's most-significant bit (highest digit owns the highest bits)
    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
        char c = mask.at(i);
        int8_t id = c;

        // decode one hex digit (accepting both cases)
        if ((c >= '0' && c <= '9')) {
            id -= '0';
        } else if (c >= 'a' && c <= 'f') {
            id -= 'a' - 10;
        } else if (c >= 'A' && c <= 'F') {
            id -= 'A' - 10;
        } else {
            LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
            return false;
        }

        // spread the digit's 4 bits over the corresponding mask slots
        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
    }

    return true;
}
360
+
361
// One-time process initialization for the common library: routes llama.cpp
// logging through the common log callback and prints the build banner.
void common_init() {
    llama_log_set(common_log_default_callback, NULL);

    // annotate non-release builds in the banner
#ifdef NDEBUG
    const char * build_type = "";
#else
    const char * build_type = " (debug)";
#endif

    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
372
+
373
// Build the one-line "system_info:" banner: configured thread counts, the
// number of logical CPUs available, and the backend feature summary reported
// by llama_print_system_info().
std::string common_params_get_system_info(const common_params & params) {
    std::ostringstream os;

    os << "system_info: n_threads = " << params.cpuparams.n_threads;
    // -1 means the batch thread count falls back to the main one; omit it then
    if (params.cpuparams_batch.n_threads != -1) {
        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
    }
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    // count CPUs across all processor groups (hardware_concurrency only sees one group)
    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
#else
    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
#endif

    return os.str();
}
390
+
391
+ //
392
+ // String utils
393
+ //
394
+
395
+ std::string string_format(const char * fmt, ...) {
396
+ va_list ap;
397
+ va_list ap2;
398
+ va_start(ap, fmt);
399
+ va_copy(ap2, ap);
400
+ int size = vsnprintf(NULL, 0, fmt, ap);
401
+ GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
402
+ std::vector<char> buf(size + 1);
403
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
404
+ GGML_ASSERT(size2 == size);
405
+ va_end(ap2);
406
+ va_end(ap);
407
+ return std::string(buf.data(), size);
408
+ }
409
+
410
// Return `str` with leading and trailing whitespace removed.
// Fix vs. original: std::isspace was called on a plain (possibly signed) char;
// for bytes >= 0x80 (e.g. UTF-8 continuation bytes) that is undefined
// behavior. Cast through unsigned char as the C standard requires.
std::string string_strip(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
        start++;
    }
    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
        end--;
    }
    return str.substr(start, end - start);
}
421
+
422
// Current local time formatted as "YYYY_MM_DD-HH_MM_SS.nnnnnnnnn" — a string
// whose lexicographic order matches chronological order, suitable for file
// names. Nanoseconds are appended from the sub-second part of the epoch time.
// NOTE(review): std::localtime returns a shared static buffer and is not
// thread-safe — fine for current callers, but worth confirming if used from
// multiple threads.
std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    // sub-second remainder of the epoch time, zero-padded to 9 digits
    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch() % 1000000000).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, 11, "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
437
+
438
// Replace every occurrence of `search` in `s` with `replace`, in one
// left-to-right pass (occurrences are non-overlapping; replaced text is not
// rescanned). An empty needle is a no-op.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string out;
    out.reserve(s.length());

    size_t cursor = 0;
    for (size_t hit = s.find(search); hit != std::string::npos; hit = s.find(search, cursor)) {
        out.append(s, cursor, hit - cursor); // copy the unmatched prefix
        out += replace;
        cursor = hit + search.length();
    }
    out.append(s, cursor, std::string::npos); // copy the tail after the last match

    s = std::move(out);
}
454
+
455
// Backslash-escape every ECMAScript regex metacharacter in `s`, so the result
// can be embedded verbatim inside a std::regex pattern.
std::string regex_escape(const std::string & s) {
    static const std::regex metachars("[.^$|()*+?\\[\\]{}\\\\]");
    // "$&" re-inserts the matched character after the backslash
    return std::regex_replace(s, metachars, "\\$&");
}
459
+
460
// Concatenate all elements of `values`, inserting `separator` between
// consecutive elements. Returns "" for an empty vector.
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
    std::ostringstream joined;
    bool first = true;
    for (const auto & value : values) {
        if (!first) {
            joined << separator;
        }
        joined << value;
        first = false;
    }
    return joined.str();
}
470
+
471
// Split `str` on every occurrence of `delimiter`.
// Always returns at least one element; adjacent delimiters yield empty parts.
// Fix vs. original: an empty delimiter made str.find("") match at position 0
// forever, causing an infinite loop — now it returns the whole string as a
// single part.
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> parts;

    if (delimiter.empty()) {
        parts.push_back(str);
        return parts;
    }

    size_t start = 0;
    size_t end = str.find(delimiter);

    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }

    parts.push_back(str.substr(start)); // tail after the last delimiter

    return parts;
}
486
+
487
// Return `str` concatenated with itself `n` times ("" when n == 0).
// Capacity is reserved up front so the loop never reallocates.
std::string string_repeat(const std::string & str, size_t n) {
    std::string out;
    out.reserve(str.length() * n);
    for (size_t rep = n; rep > 0; --rep) {
        out += str;
    }
    return out;
}
501
+
502
// Render a boolean as its lowercase literal ("true"/"false").
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
505
+
506
// Format an int vector as "[ v0, v1, ... ]" (empty input yields "[  ]"),
// matching the bracket style of the other string_from overloads.
std::string string_from(const std::vector<int> & values) {
    std::ostringstream out;

    out << "[ ";
    for (size_t i = 0; i < values.size(); ++i) {
        if (i > 0) {
            out << ", ";
        }
        out << values[i];
    }
    out << " ]";

    return out.str();
}
523
+
524
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
525
+ std::stringstream buf;
526
+
527
+ buf << "[ ";
528
+
529
+ bool first = true;
530
+ for (const auto & token : tokens) {
531
+ if (!first) {
532
+ buf << ", ";
533
+ } else {
534
+ first = false;
535
+ }
536
+
537
+ auto detokenized = common_token_to_piece(ctx, token);
538
+
539
+ buf << "'" << detokenized << "'"
540
+ << ":" << std::to_string(token);
541
+ }
542
+
543
+ buf << " ]";
544
+
545
+ return buf.str();
546
+ }
547
+
548
// Render a llama_batch for debug logging: one line per token showing its
// index, detokenized piece, position, sequence-id count, first sequence id,
// and logits flag. Only seq_id[i][0] is printed even when n_seq_id[i] > 1.
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
    std::stringstream buf;

    buf << "[ ";

    bool first = true;
    for (int i = 0; i < batch.n_tokens; ++i) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, batch.token[i]);

        buf << "\n" << std::to_string(i)
            << ", token '" << detokenized << "'"
            << ", pos " << std::to_string(batch.pos[i])
            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
            << ", seq_id " << std::to_string(batch.seq_id[i][0])
            << ", logits " << std::to_string(batch.logits[i]);
    }

    buf << " ]";

    return buf.str();
}
575
+
576
// Decode C-style escape sequences in `input`, in place:
// \n \r \t \' \" \\ and \xHH (exactly two hex digits).
// Unrecognized or incomplete sequences (including a trailing lone backslash)
// are copied through verbatim. The string can only shrink, so decoding is
// done by compacting into the same buffer and resizing at the end.
void string_process_escapes(std::string & input) {
    const std::size_t len = input.length();
    std::size_t out = 0;

    for (std::size_t in = 0; in < len; ++in) {
        if (input[in] != '\\' || in + 1 >= len) {
            // ordinary character, or a backslash at the very end
            input[out++] = input[in];
            continue;
        }

        const char esc = input[++in];
        switch (esc) {
            case 'n':  input[out++] = '\n'; break;
            case 'r':  input[out++] = '\r'; break;
            case 't':  input[out++] = '\t'; break;
            case '\'': input[out++] = '\''; break;
            case '\"': input[out++] = '\"'; break;
            case '\\': input[out++] = '\\'; break;
            case 'x': {
                // \xHH: require two more characters that parse fully as hex
                if (in + 2 < len) {
                    const char hex[3] = { input[in + 1], input[in + 2], 0 };
                    char * parse_end = nullptr;
                    const long value = std::strtol(hex, &parse_end, 16);
                    if (parse_end == hex + 2) {
                        input[out++] = char(value);
                        in += 2;
                        break;
                    }
                }
                // not a valid \xHH sequence: keep the backslash and the 'x'
                input[out++] = '\\';
                input[out++] = esc;
                break;
            }
            default:
                // unknown escape: keep it verbatim
                input[out++] = '\\';
                input[out++] = esc;
                break;
        }
    }

    input.resize(out);
}
612
+
613
// Parse a "KEY=TYPE:VALUE" metadata override (e.g. "tokenizer.ggml.bos:int:1")
// into a llama_model_kv_override appended to `overrides`.
// Supported TYPE prefixes: "int:", "float:", "bool:" (true/false only) and
// "str:" (value limited to 127 chars, the fixed size of val_str).
// The key itself is limited to 127 chars by the 128-byte kvo.key buffer.
// Returns false (with an error log) on any malformed input.
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
    const char * sep = strchr(data, '=');
    // the `sep - data >= 128` check keeps the key within kvo.key's capacity
    if (sep == nullptr || sep - data >= 128) {
        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
        return false;
    }
    llama_model_kv_override kvo;
    std::strncpy(kvo.key, data, sep - data);
    kvo.key[sep - data] = 0; // strncpy does not terminate when it hits the limit
    sep++; // now points at "TYPE:VALUE"
    if (strncmp(sep, "int:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
        kvo.val_i64 = std::atol(sep);
    } else if (strncmp(sep, "float:", 6) == 0) {
        sep += 6;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
        kvo.val_f64 = std::atof(sep);
    } else if (strncmp(sep, "bool:", 5) == 0) {
        sep += 5;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        if (std::strcmp(sep, "true") == 0) {
            kvo.val_bool = true;
        } else if (std::strcmp(sep, "false") == 0) {
            kvo.val_bool = false;
        } else {
            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
            return false;
        }
    } else if (strncmp(sep, "str:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        if (strlen(sep) > 127) {
            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
            return false;
        }
        strncpy(kvo.val_str, sep, 127);
        kvo.val_str[127] = '\0';
    } else {
        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
        return false;
    }
    overrides.emplace_back(std::move(kvo));
    return true;
}
658
+
659
+ //
660
+ // Filesystem utils
661
+ //
662
+
663
+ // Validate if a filename is safe to use
664
+ // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
665
// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
//
// Decodes the name as UTF-8 and rejects: invalid/overlong encodings, control
// characters, surrogate codepoints, BOM/replacement characters, Unicode
// lookalikes of path separators, Windows-illegal characters, names with
// leading/trailing spaces or a trailing dot, any "..", and ".".
// Path separators '/' and '\' are rejected unless `allow_subdirs` is true.
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
    if (!filename.length()) {
        // Empty filename invalid
        return false;
    }
    if (filename.length() > 255) {
        // Limit at common largest possible filename on Linux filesystems
        // to avoid unnecessary further validation
        // (On systems with smaller limits it will be caught by the OS)
        return false;
    }

    size_t offset = 0;
    while (offset < filename.size()) {
        // decode the next UTF-8 codepoint; any malformed byte sequence fails validation
        utf8_parse_result result = parse_utf8_codepoint(filename, offset);

        if (result.status != utf8_parse_result::SUCCESS) {
            return false;
        }
        uint32_t c = result.codepoint;

        // reject overlong encodings (a codepoint encoded with more bytes than
        // needed) — these can smuggle characters past naive filters
        if ((result.bytes_consumed == 2 && c < 0x80) ||
            (result.bytes_consumed == 3 && c < 0x800) ||
            (result.bytes_consumed == 4 && c < 0x10000)) {
            return false;
        }

        // Check for forbidden codepoints:
        // - Control characters
        // - Unicode equivalents of illegal characters
        // - UTF-16 surrogate pairs
        // - UTF-8 replacement character
        // - Byte order mark (BOM)
        // - Illegal characters: / \ : * ? " < > |
        if (c <= 0x1F // Control characters (C0)
            || c == 0x7F // Control characters (DEL)
            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
            || c == 0x2215 // Division Slash (forward slash equivalent)
            || c == 0x2216 // Set Minus (backslash equivalent)
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c > 0x10FFFF // Max Unicode limit
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
            || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
        if (!allow_subdirs && (c == '/' || c == '\\')) {
            // Subdirectories not allowed, reject path separators
            return false;
        }
        offset += result.bytes_consumed;
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
    // Unicode and other whitespace is not affected, only 0x20 space
    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
        return false;
    }

    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    if (filename == ".") {
        return false;
    }

    return true;
}
738
+
739
+ #include <iostream>
740
+
741
+
742
+ #ifdef _WIN32
743
// Windows helper: convert a UTF-8 std::string to UTF-16 std::wstring for use
// with the wide-character Win32 APIs. Returns an empty string on empty input
// or on conversion failure.
static std::wstring utf8_to_wstring(const std::string & str) {
    if (str.empty()) {
        return std::wstring();
    }

    // first call computes the required length in wide characters
    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);

    if (size <= 0) {
        return std::wstring();
    }

    std::wstring wstr(size, 0);
    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);

    return wstr;
}
759
+ #endif
760
+
761
+ // returns true if successful, false otherwise
762
// Create `path` and all missing parent directories (like `mkdir -p`).
// Returns true if the full path exists as a directory afterwards, false on
// any failure or if an existing component is not a directory.
// NOTE(review): the Windows branch splits on '\\' and the POSIX branch on '/',
// so callers are expected to pass paths using the native separator.
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring wpath = utf8_to_wstring(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        pos_slash += 1;

        // skip the drive letter, in some systems it can return an access denied error
        if (subpath.length() == 2 && subpath[1] == ':') {
            continue;
        }

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);

        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD attributes = GetFileAttributesW(subpath.c_str());
                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(path.c_str(), &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        struct stat info;

        // if the path already exists, ensure that it's a directory
        if (stat(subpath.c_str(), &info) == 0) {
            if (!S_ISDIR(info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(subpath.c_str(), 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
836
+
837
// True only when `path` exists and refers to a directory.
bool fs_is_directory(const std::string & path) {
    const std::filesystem::path p(path);
    return std::filesystem::exists(p) && std::filesystem::is_directory(p);
}
841
+
842
// Resolve the llama.cpp cache directory, always with a trailing separator.
// Precedence: $LLAMA_CACHE verbatim, otherwise the platform cache base
// ($XDG_CACHE_HOME / ~/.cache on Linux+BSD, ~/Library/Caches on macOS,
// %LOCALAPPDATA% on Windows) plus a "llama.cpp" subdirectory.
// Throws std::runtime_error when no home directory can be determined.
std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
        // Make sure to add trailing slash
        if (p.back() != DIRECTORY_SEPARATOR) {
            p += DIRECTORY_SEPARATOR;
        }
        return p;
    };
    if (getenv("LLAMA_CACHE")) {
        // explicit override: used as-is, no "llama.cpp" suffix appended
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || \
    defined(__OpenBSD__) || defined(__NetBSD__)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else if (std::getenv("HOME")) {
            cache_directory = std::getenv("HOME") + std::string("/.cache/");
        } else {
#if defined(__linux__)
            /* no $HOME is defined, fallback to getpwuid */
            struct passwd *pw = getpwuid(getuid());
            if ((!pw) || (!pw->pw_dir)) {
                throw std::runtime_error("Failed to find $HOME directory");
            }

            cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
            throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
        }
#elif defined(__APPLE__)
        // NOTE(review): assumes $HOME is set on macOS — getenv returning NULL
        // here would crash; verify for daemon/launchd contexts
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
#elif defined(__EMSCRIPTEN__)
        GGML_ABORT("not implemented on this platform");
#else
#  error Unknown architecture
#endif
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
    }
    return ensure_trailing_slash(cache_directory);
}
887
+
888
// Return the full path of `filename` inside the llama.cpp cache directory,
// creating the directory (and parents) on demand.
// `filename` must be a bare file name — no separators (enforced by assert).
// Throws std::runtime_error if the cache directory cannot be created.
std::string fs_get_cache_file(const std::string & filename) {
    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
    std::string cache_directory = fs_get_cache_directory();
    const bool success = fs_create_directory_with_parents(cache_directory);
    if (!success) {
        throw std::runtime_error("failed to create cache directory: " + cache_directory);
    }
    return cache_directory + filename;
}
897
+
898
+ std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
899
+ std::vector<common_file_info> files;
900
+ if (path.empty()) return files;
901
+
902
+ std::filesystem::path dir(path);
903
+ if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
904
+ return files;
905
+ }
906
+
907
+ for (const auto & entry : std::filesystem::directory_iterator(dir)) {
908
+ try {
909
+ // Only include regular files (skip directories)
910
+ const auto & p = entry.path();
911
+ if (std::filesystem::is_regular_file(p)) {
912
+ common_file_info info;
913
+ info.path = p.string();
914
+ info.name = p.filename().string();
915
+ info.is_dir = false;
916
+ try {
917
+ info.size = static_cast<size_t>(std::filesystem::file_size(p));
918
+ } catch (const std::filesystem::filesystem_error &) {
919
+ info.size = 0;
920
+ }
921
+ files.push_back(std::move(info));
922
+ } else if (include_directories && std::filesystem::is_directory(p)) {
923
+ common_file_info info;
924
+ info.path = p.string();
925
+ info.name = p.filename().string();
926
+ info.size = 0; // Directories have no size
927
+ info.is_dir = true;
928
+ files.push_back(std::move(info));
929
+ }
930
+ } catch (const std::filesystem::filesystem_error &) {
931
+ // skip entries we cannot inspect
932
+ continue;
933
+ }
934
+ }
935
+
936
+ return files;
937
+ }
938
+
939
+ //
940
+ // TTY utils
941
+ //
942
+
943
// Heuristically decide whether ANSI color output is appropriate.
// Honors the NO_COLOR convention (https://no-color.org/), rejects TERM=dumb,
// and requires at least one of stdout/stderr to be attached to a terminal
// (log messages can go to either stream).
bool tty_can_use_colors() {
    const char * no_color = std::getenv("NO_COLOR");
    if (no_color && no_color[0] != '\0') {
        return false; // any non-empty NO_COLOR disables colors
    }

    const char * term = std::getenv("TERM");
    if (term && std::strcmp(term, "dumb") == 0) {
        return false;
    }

    return isatty(fileno(stdout)) || isatty(fileno(stderr));
}
965
+
966
+ //
967
+ // Model utils
968
+ //
969
+
970
+ // TODO: move to common/sampling
971
// Seed sampling parameters from metadata keys embedded in the model (GGUF),
// without overriding anything the user set explicitly on the command line —
// explicit user choices are tracked as bits in sparams.user_sampling_config.
// TODO: move to common/sampling
static void common_init_sampler_from_model(
        const llama_model * model,
        common_params_sampling & sparams) {

    const uint64_t config = sparams.user_sampling_config;

    // read an int32 metadata value into dst, unless the user already set it
    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
        if (config & user_config) {
            return;
        }

        char buf[64] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            int32_t v = strtol(buf, &end, 10);
            if (end && end != buf) { // accept only if at least one digit was parsed
                dst = v;
            }
        }
    };

    // read a float metadata value into dst, unless the user already set it
    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
        if (config & user_config) {
            return;
        }

        char buf[128] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            float v = strtof(buf, &end);
            if (end && end != buf) {
                dst = v;
            }
        }
    };

    // Sampling sequence: stored in the model as a ';'-separated list of sampler names
    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
        char buf[512] = {0};
        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
            if (!sampler_names.empty()) {
                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
            }
        }
    }

    // scalar sampling parameters, one metadata key each
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
}
1030
+
1031
// Private state of common_init_result (pimpl): owns the model, the context,
// the loaded LoRA adapters, and the per-sequence samplers.
struct common_init_result::impl {
    impl() = default;
    ~impl() = default;

    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top

    llama_model_ptr model;
    llama_context_ptr context;

    // loaded LoRA adapters (owned here; raw pointers are handed out via common_params)
    std::vector<llama_adapter_lora_ptr> lora;

    // one sampler per sequence, plus the seq_id -> sampler mapping passed to the context
    std::vector<common_sampler_ptr> samplers;
    std::vector<llama_sampler_seq_config> samplers_seq_config;
};
1045
+
1046
+ common_init_result::common_init_result(common_params & params) :
1047
+ pimpl(new impl{}) {
1048
+ auto mparams = common_model_params_to_llama(params);
1049
+ auto cparams = common_context_params_to_llama(params);
1050
+
1051
+ if (params.fit_params) {
1052
+ LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
1053
+ llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
1054
+ params.tensor_split,
1055
+ params.tensor_buft_overrides.data(),
1056
+ params.fit_params_target.data(),
1057
+ params.fit_params_min_ctx,
1058
+ params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
1059
+ }
1060
+
1061
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
1062
+ if (model == NULL) {
1063
+ return;
1064
+ }
1065
+
1066
+ pimpl->model.reset(model);
1067
+
1068
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1069
+
1070
+ // load and optionally apply lora adapters (must be loaded before context creation)
1071
+ for (auto & la : params.lora_adapters) {
1072
+ llama_adapter_lora_ptr lora;
1073
+ lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
1074
+ if (lora == nullptr) {
1075
+ LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
1076
+ pimpl->model.reset(model);
1077
+ return;
1078
+ }
1079
+
1080
+ char buf[1024];
1081
+ la.ptr = lora.get();
1082
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
1083
+ la.task_name = buf;
1084
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
1085
+ la.prompt_prefix = buf;
1086
+ pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
1087
+ }
1088
+
1089
+ // updates params.sampling
1090
+ // TODO: fix naming
1091
+ common_init_sampler_from_model(model, params.sampling);
1092
+
1093
+ if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
1094
+ LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
1095
+ params.sampling.ignore_eos = false;
1096
+ }
1097
+
1098
+ // initialize once
1099
+ for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1100
+ if (llama_vocab_is_eog(vocab, i)) {
1101
+ LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
1102
+ params.sampling.logit_bias_eog.push_back({i, -INFINITY});
1103
+ }
1104
+ }
1105
+
1106
+ if (params.sampling.ignore_eos) {
1107
+ // add EOG biases to the active set of logit biases
1108
+ params.sampling.logit_bias.insert(
1109
+ params.sampling.logit_bias.end(),
1110
+ params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
1111
+ }
1112
+
1113
+ //if (params.sampling.penalty_last_n == -1) {
1114
+ // LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1115
+ // params.sampling.penalty_last_n = llama_n_ctx(lctx);
1116
+ //}
1117
+
1118
+ //if (params.sampling.dry_penalty_last_n == -1) {
1119
+ // LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1120
+ // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
1121
+ //}
1122
+
1123
+ // init the backend samplers as part of the context creation
1124
+ pimpl->samplers.resize(cparams.n_seq_max);
1125
+ pimpl->samplers_seq_config.resize(cparams.n_seq_max);
1126
+
1127
+ for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
1128
+ pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
1129
+ pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
1130
+ }
1131
+
1132
+ if (params.sampling.backend_sampling) {
1133
+ cparams.samplers = pimpl->samplers_seq_config.data();
1134
+ cparams.n_samplers = pimpl->samplers_seq_config.size();
1135
+ }
1136
+
1137
+ llama_context * lctx = llama_init_from_model(model, cparams);
1138
+ if (lctx == NULL) {
1139
+ LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
1140
+ return;
1141
+ }
1142
+
1143
+ pimpl->context.reset(lctx);
1144
+ }
1145
+
1146
// Non-owning accessor; NULL if model loading failed.
llama_model * common_init_result::model() {
    return pimpl->model.get();
}
1149
+
1150
// Non-owning accessor; NULL if context creation failed.
llama_context * common_init_result::context() {
    return pimpl->context.get();
}
1153
+
1154
// Per-sequence sampler accessor.
// NOTE(review): no bounds check — seq_id is assumed to be < n_seq_max; confirm callers.
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
    return pimpl->samplers[seq_id].get();
}
1157
+
1158
// Reset every per-sequence sampler (e.g. to restore the seeded RNG state after warmup).
void common_init_result::reset_samplers() {
    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
    }
}
1163
+
1164
// Mutable access to the owned LoRA adapters (lifetime tied to this object).
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
}
1167
+
1168
// High-level initialization entry point: construct a common_init_result from
// `params`, then perform post-load setup (control vectors, rerank sanity
// checks, LoRA application, warmup). Always returns a non-null pointer;
// failure is signalled by res->model() or res->context() being NULL.
// note: mutates `params` (ctx_shift, control vector layer range, ...).
common_init_result_ptr common_init_from_params(common_params & params) {
    common_init_result_ptr res(new common_init_result(params));

    llama_model * model = res->model();
    if (model == NULL) {
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    llama_context * lctx = res->context();
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }

    // apply control vectors, defaulting the layer range to [1, n_layer]
    if (!params.control_vectors.empty()) {
        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);

        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
            // load failed — return with a valid context but no control vector applied
            return res;
        }

        int err = llama_set_adapter_cvec(
                lctx,
                cvec.data.data(),
                cvec.data.size(),
                cvec.n_embd,
                params.control_vector_layer_start,
                params.control_vector_layer_end);
        if (err) {
            return res;
        }
    }

    // reranking needs BOS plus at least one of: EOS, SEP, or a "rerank" chat template
    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
        bool ok = true;

        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
            ok = false;
        }

        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

        if (!has_eos && !has_sep && !has_rerank_prompt) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
            ok = false;
        } else if (!has_eos) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
        }

        if (!ok) {
            return res;
        }
    }

    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }

    // optional warmup decode: populates internal buffers/caches, then state is wiped
    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        llama_set_warmup(lctx, true);

        std::vector<llama_token> tmp;
        llama_token bos = llama_vocab_bos(vocab);
        llama_token eos = llama_vocab_eos(vocab);

        // some models (e.g. T5) don't have a BOS token
        if (bos != LLAMA_TOKEN_NULL) {
            tmp.push_back(bos);
        }
        if (eos != LLAMA_TOKEN_NULL) {
            tmp.push_back(eos);
        }
        if (tmp.empty()) {
            tmp.push_back(0); // fall back to token id 0 so the batch is never empty
        }

        if (llama_model_has_encoder(model)) {
            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                decoder_start_token_id = bos;
            }
            tmp.clear();
            tmp.push_back(decoder_start_token_id);
        }
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
        // wipe any state produced by the warmup run
        llama_memory_clear(llama_get_memory(lctx), true);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);

        // reset samplers to reset RNG state after warmup to the seeded state
        res->reset_samplers();
    }

    return res;
}
1282
+
1283
// out-of-line destructor: required because impl is an incomplete type in the header
common_init_result::~common_init_result() = default;
1284
+
1285
// Return the base URL used for model downloads, always ending in '/'.
// Precedence: MODEL_ENDPOINT, then the legacy HF_ENDPOINT, then the
// Hugging Face default. Empty overrides are ignored.
std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
    std::string model_endpoint = "https://huggingface.co/";
    // BUGFIX: also require a non-empty value — calling .back() on an empty
    // string (env var set but empty) is undefined behavior
    if (endpoint_env && endpoint_env[0] != '\0') {
        model_endpoint = endpoint_env;
        if (model_endpoint.back() != '/') {
            model_endpoint += '/';
        }
    }
    return model_endpoint;
}
1299
+
1300
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
1301
+ std::vector<llama_adapter_lora *> loras;
1302
+ std::vector<float> scales;
1303
+
1304
+ for (auto & la: lora) {
1305
+ loras.push_back(la.ptr);
1306
+ scales.push_back(la.scale);
1307
+ }
1308
+
1309
+ llama_set_adapters_lora(ctx, loras.data(), loras.size(), scales.data());
1310
+ }
1311
+
1312
// Translate common_params into llama_model_params (model-loading options).
// Starts from llama defaults and overlays the user's settings; pointer fields
// (devices, kv_overrides, tensor_buft_overrides) borrow storage owned by `params`,
// so `params` must outlive the returned struct's use.
struct llama_model_params common_model_params_to_llama(common_params & params) {
    auto mparams = llama_model_default_params();

    if (!params.devices.empty()) {
        mparams.devices = params.devices.data();
    }

    mparams.n_gpu_layers = params.n_gpu_layers;
    mparams.main_gpu = params.main_gpu;
    mparams.split_mode = params.split_mode;
    mparams.tensor_split = params.tensor_split;
    mparams.use_mmap = params.use_mmap;
    mparams.use_direct_io = params.use_direct_io;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
    mparams.no_host = params.no_host;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
        // the overrides array is sentinel-terminated (empty key)
        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
        mparams.kv_overrides = params.kv_overrides.data();
    }

    if (params.tensor_buft_overrides.empty()) {
        mparams.tensor_buft_overrides = NULL;
    } else {
        // the overrides array is sentinel-terminated (null pattern)
        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
    }

    mparams.progress_callback = params.load_progress_callback;
    mparams.progress_callback_user_data = params.load_progress_callback_user_data;

    return mparams;
}
1349
+
1350
// Translate common_params into llama_context_params (context-creation options).
// Starts from llama defaults and overlays the user's settings.
struct llama_context_params common_context_params_to_llama(const common_params & params) {
    auto cparams = llama_context_default_params();

    cparams.n_ctx = params.n_ctx;
    cparams.n_seq_max = params.n_parallel;
    cparams.n_batch = params.n_batch;
    cparams.n_ubatch = params.n_ubatch;
    cparams.n_threads = params.cpuparams.n_threads;
    // batch threads default to the generation thread count when unset (-1)
    cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
        params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
    cparams.embeddings = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base = params.rope_freq_base;
    cparams.rope_freq_scale = params.rope_freq_scale;
    cparams.yarn_ext_factor = params.yarn_ext_factor;
    cparams.yarn_attn_factor = params.yarn_attn_factor;
    cparams.yarn_beta_fast = params.yarn_beta_fast;
    cparams.yarn_beta_slow = params.yarn_beta_slow;
    cparams.yarn_orig_ctx = params.yarn_orig_ctx;
    cparams.pooling_type = params.pooling_type;
    cparams.attention_type = params.attention_type;
    cparams.flash_attn_type = params.flash_attn_type;
    cparams.cb_eval = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;
    // note: several common_params flags are negated on the llama side
    cparams.offload_kqv = !params.no_kv_offload;
    cparams.no_perf = params.no_perf;
    cparams.op_offload = !params.no_op_offload;
    cparams.swa_full = params.swa_full;
    cparams.kv_unified = params.kv_unified;

    cparams.type_k = params.cache_type_k;
    cparams.type_v = params.cache_type_v;

    return cparams;
}
1385
+
1386
// Translate common cpu_params into ggml_threadpool_params (thread count,
// affinity mask, priority, polling, strict-CPU placement).
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
    struct ggml_threadpool_params tpp;

    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults

    // copy the CPU affinity mask only when the caller marked it valid.
    // NOTE(review): copies GGML_MAX_N_THREADS bytes — assumes both cpumask
    // fields are byte arrays of exactly that length; confirm against ggml.h
    if (params.mask_valid) {
        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
    }

    tpp.prio = params.priority;
    tpp.poll = params.poll;
    tpp.strict_cpu = params.strict_cpu;

    return tpp;
}
1401
+
1402
+ //
1403
+ // Batch utils
1404
+ //
1405
+
1406
// Reset `batch` to contain no tokens; its buffers stay allocated for reuse.
void common_batch_clear(struct llama_batch & batch) {
    batch.n_tokens = 0;
}
1409
+
1410
+ void common_batch_add(
1411
+ struct llama_batch & batch,
1412
+ llama_token id,
1413
+ llama_pos pos,
1414
+ const std::vector<llama_seq_id> & seq_ids,
1415
+ bool logits) {
1416
+ GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
1417
+
1418
+ batch.token [batch.n_tokens] = id;
1419
+ batch.pos [batch.n_tokens] = pos;
1420
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1421
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
1422
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1423
+ }
1424
+ batch.logits [batch.n_tokens] = logits;
1425
+
1426
+ batch.n_tokens++;
1427
+ }
1428
+
1429
+ //
1430
+ // Vocab utils
1431
+ //
1432
+
1433
+ std::vector<llama_token> common_tokenize(
1434
+ const struct llama_context * ctx,
1435
+ const std::string & text,
1436
+ bool add_special,
1437
+ bool parse_special) {
1438
+ const llama_model * model = llama_get_model(ctx);
1439
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1440
+ return common_tokenize(vocab, text, add_special, parse_special);
1441
+ }
1442
+
1443
// Tokenize `text` with the given vocab. Uses the standard two-pass protocol:
// try with an upper-bound-sized buffer; on a negative return (required size),
// resize and retry exactly once.
std::vector<llama_token> common_tokenize(
    const struct llama_vocab * vocab,
    const std::string & text,
    bool add_special,
    bool parse_special) {
    // upper limit for the number of tokens
    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    // INT32_MIN is the API's sentinel for "result would not fit in int32_t at all"
    if (n_tokens == std::numeric_limits<int32_t>::min()) {
        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
    }
    if (n_tokens < 0) {
        // buffer too small: -n_tokens is the required size — retry once
        result.resize(-n_tokens);
        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }
    return result;
}
1464
+
1465
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1466
+ const llama_model * model = llama_get_model(ctx);
1467
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1468
+ return common_token_to_piece(vocab, token, special);
1469
+ }
1470
+
1471
// Render a single token to its text piece. First tries the string's SSO buffer
// (typically 15 bytes); if too small, a negative return gives the required size
// and the call is retried once.
std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    if (n_chars < 0) {
        // buffer too small: -n_chars is the required size — retry once
        piece.resize(-n_chars);
        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
        GGML_ASSERT(check == -n_chars);
    }
    else {
        piece.resize(n_chars);
    }

    return piece;
}
1486
+
1487
+ std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
1488
+ const llama_model * model = llama_get_model(ctx);
1489
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1490
+ return common_detokenize(vocab, tokens, special);
1491
+ }
1492
+
1493
// Convert a token sequence back to text. Uses the two-pass resize protocol:
// a negative return is the required buffer size; retry once, then trim.
std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
    std::string text;
    text.resize(std::max(text.capacity(), tokens.size())); // initial guess: >= one byte per token
    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
    if (n_chars < 0) {
        text.resize(-n_chars);
        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
        GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
    }

    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return text;
}
1508
+
1509
+ //
1510
+ // Embedding utils
1511
+ //
1512
+
1513
// Normalize `n` values from `inp` into `out` according to `embd_norm`:
//   -1 : no normalisation (copy as-is)
//    0 : scale so the max absolute value maps to the int16 range (32760)
//    2 : euclidean (L2) norm
//  else: general p-norm with p = embd_norm
// A zero-magnitude input yields an all-zero output.
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double mag = 0.0;

    if (embd_norm == -1) {
        mag = 1.0; // no normalisation
    } else if (embd_norm == 0) {
        for (int i = 0; i < n; i++) {
            const double a = std::abs(inp[i]);
            if (a > mag) {
                mag = a;
            }
        }
        mag /= 32760.0; // make an int16 range
    } else if (embd_norm == 2) {
        for (int i = 0; i < n; i++) {
            mag += inp[i] * inp[i];
        }
        mag = std::sqrt(mag);
    } else {
        // p-norm (euclidean is p-norm p=2)
        for (int i = 0; i < n; i++) {
            mag += std::pow(std::abs(inp[i]), embd_norm);
        }
        mag = std::pow(mag, 1.0 / embd_norm);
    }

    const float scale = mag > 0.0 ? 1.0 / mag : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * scale;
    }
}
1548
+
1549
// Cosine similarity of two length-`n` embeddings.
// Zero-vector convention: both zero -> 1.0, exactly one zero -> 0.0.
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot      = 0.0;
    double norm1_sq = 0.0;
    double norm2_sq = 0.0;

    for (int i = 0; i < n; i++) {
        dot      += embd1[i] * embd2[i];
        norm1_sq += embd1[i] * embd1[i];
        norm2_sq += embd2[i] * embd2[i];
    }

    // Handle the case where one or both vectors are zero vectors
    if (norm1_sq == 0.0 || norm2_sq == 0.0) {
        return (norm1_sq == 0.0 && norm2_sq == 0.0) ? 1.0f : 0.0f;
    }

    return dot / (sqrt(norm1_sq) * sqrt(norm2_sq));
}
1570
+
1571
+ //
1572
+ // Control vector utils
1573
+ //
1574
+
1575
// Load a single control-vector GGUF file.
// Expects 1-D F32 tensors named "direction.<layer_idx>" with layer_idx >= 1;
// all tensors must have the same element count (n_embd). Directions are scaled
// by load_info.strength and accumulated into a flat buffer laid out as
// [layer-1][n_embd]. On any error, n_embd is -1 and data is empty.
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
    common_control_vector_data result = { -1, {} };

    ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false, // tensor data is needed, not just metadata
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
    if (n_tensors == 0) {
        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
        std::string name = gguf_get_tensor_name(ctx_gguf, i);

        int layer_idx = -1;

        // split on '.' — tensor names are expected to be "direction.<layer>"
        size_t dotpos = name.find('.');
        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
            try {
                layer_idx = std::stoi(name.substr(dotpos + 1));
            } catch (...) {
                layer_idx = -1;
            }
        }
        if (layer_idx < 0) {
            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
            // layer 0 (embeddings) is not a valid target for control vectors
            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
        if (tensor->type != GGML_TYPE_F32) {
            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // first tensor fixes n_embd; all subsequent tensors must match it
        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // extend if necessary - do not store data for layer 0 (it's not used)
        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);

        const float * src = (const float *) tensor->data;
        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
        for (int j = 0; j < result.n_embd; j++) {
            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
        }

    }

    if (result.n_embd == -1) {
        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
        result.data.clear();
    }

    gguf_free(ctx_gguf);
    ggml_free(ctx);

    return result;
}
1659
+
1660
// Load and sum multiple control-vector files element-wise.
// All files must agree on n_embd; any invalid file aborts the whole load
// (n_embd == -1, empty data).
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
    common_control_vector_data result = { -1, {} };

    for (const auto & info : load_infos) {
        auto cur = common_control_vector_load_one(info);

        if (cur.n_embd == -1) {
            result.n_embd = -1;
            break;
        }
        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        if (result.n_embd == -1) {
            // first successfully loaded file — take it over wholesale
            result = std::move(cur);
        } else {
            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
            for (size_t i = 0; i < cur.data.size(); i++) {
                result.data[i] += cur.data[i];
            }
        }
    }

    if (result.n_embd == -1) {
        LOG_ERR("%s: no valid control vector files passed\n", __func__);
        result.data.clear();
    }

    return result;
}
1693
+
1694
// Build a next-token-prediction dataset from a flat token stream:
// datapoint i is tokens[i*stride .. i*stride + n_ctx), its label the same
// window shifted by one token.
// NOTE(review): assumes tokens.size() > n_ctx + 1 — with fewer tokens the
// unsigned subtraction underflows and ndata becomes huge; confirm callers.
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
    const int64_t ne_datapoint = llama_n_ctx(ctx);
    const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
    ggml_opt_dataset_t result = ggml_opt_dataset_init(
        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;

    for (int64_t idata = 0; idata < ndata; ++idata) {
        // label window is the data window shifted right by one token
        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
    }

    return result;
}
1710
+
1711
+ ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
1712
+ ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
1713
+ const lr_opt & d = *(lr_opt *) userdata;
1714
+ result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
1715
+ result.sgd.wd = result.adamw.wd = d.wd;
1716
+ return result;
1717
+ }
1718
+
1719
+ // TODO make all command line args case-insensitive
1720
// Case-insensitive C-string equality, portable across MSVC and POSIX.
// TODO make all command line args case-insensitive
static inline bool eq_case_insensitive(char const* a, char const* b) {
#if defined(_MSC_VER)
    const int cmp = _stricmp(a, b);
#else
    const int cmp = strcasecmp(a, b);
#endif // defined(_MSC_VER)
    return cmp == 0;
}
1729
+
1730
+ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
1731
+ if (eq_case_insensitive("adamw", n)) {
1732
+ return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
1733
+ }
1734
+ if (eq_case_insensitive("sgd", n)) {
1735
+ return GGML_OPT_OPTIMIZER_TYPE_SGD;
1736
+ }
1737
+ return GGML_OPT_OPTIMIZER_TYPE_COUNT;
1738
+ }
1739
+
1740
+ // TODO simplify to use just log and exp
1741
+ static float const k_log_2 = std::log(2.f);
1742
+
1743
+ void lr_opt::init() {
1744
+ if (lr_min > 0 && lr_min < lr0) {
1745
+ float nhalf = std::log(lr0 / lr_min) / k_log_2;
1746
+ float e = epochs;
1747
+ if (decay_epochs > 0 && decay_epochs < e) {
1748
+ e = decay_epochs;
1749
+ } else {
1750
+ decay_epochs = e;
1751
+ }
1752
+ scale_epoch = nhalf / e;
1753
+ }
1754
+ }
1755
+
1756
+ float lr_opt::get_lr(float epoch) const {
1757
+ float r = lr_min <= 0 ? lr0 :
1758
+ epoch >= decay_epochs ? lr_min :
1759
+ lr0 * std::pow(0.5f, epoch * scale_epoch);
1760
+ LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
1761
+ return r;
1762
+ }
1763
+
1764
+ bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) {
1765
+ llama_batch batch = llama_batch_get_one(&last_token, 1);
1766
+ batch.pos = &pos;
1767
+ if (llama_decode(ctx, batch)) {
1768
+ LOG_ERR("%s: failed to replay last token\n", __func__);
1769
+ return false;
1770
+ }
1771
+ return true;
1772
+ }
1773
+
1774
// Decode a prompt, optionally saving the session state just before the last
// token so it can be restored and the last token replayed later (needed for
// recurrent/hybrid models whose memory cannot roll back tokens).
// Advances n_past by the number of tokens decoded. Returns false on failure.
// NOTE(review): state_path.data() is passed to a C API — string_view is not
// guaranteed NUL-terminated; confirm callers pass a terminated buffer.
bool common_prompt_batch_decode(
        struct llama_context * ctx,
        const std::vector<llama_token> & tokens,
        int & n_past,
        int n_batch,
        std::string_view state_path,
        bool save_state) {
    const int n_eval = tokens.size();
    if (n_eval == 0) {
        return true;
    }

    if (save_state && n_eval > 1) {
        const int n_tokens_before_last = n_eval - 1;

        GGML_ASSERT(n_eval <= n_batch);

        // Decode all but the last token so we can save the memory state before decoding the last token.
        // This is done so we can restore the session state later and replay the last token.
        // Memory implementations in recurrent/hybrid models don't support removing tokens from their
        // memory, so we can't just remove the last token from the memory and replay the last token which
        // is the reason for this logic.
        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_tokens_before_last))) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_tokens_before_last;

        llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
        LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);

        // decode the final token separately, at an explicit position
        llama_token last_token = tokens.back();
        llama_batch batch = llama_batch_get_one(&last_token, 1);
        int32_t pos = n_past;
        batch.pos = &pos;

        if (llama_decode(ctx, batch)) {
            LOG_ERR("%s : failed to eval last token\n", __func__);
            return false;
        }
        n_past++;
    } else {
        // simple path: decode the whole prompt in one batch
        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_eval))) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_eval;
    }

    return true;
}
llama.cpp/common/common.h ADDED
@@ -0,0 +1,931 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "ggml-opt.h"
6
+ #include "llama-cpp.h"
7
+
8
+ #include <set>
9
+ #include <sstream>
10
+ #include <string>
11
+ #include <string_view>
12
+ #include <vector>
13
+ #include <map>
14
+
15
+ #if defined(_WIN32) && !defined(_WIN32_WINNT)
16
+ #define _WIN32_WINNT 0x0A00
17
+ #endif
18
+
19
+ #ifdef _WIN32
20
+ #define DIRECTORY_SEPARATOR '\\'
21
+ #else
22
+ #define DIRECTORY_SEPARATOR '/'
23
+ #endif // _WIN32
24
+
25
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
26
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
27
+
28
+ #define print_build_info() do { \
29
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
30
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
31
+ } while(0)
32
+
33
// Scoped wall-clock time measurement (RAII). Presumably the destructor adds the
// elapsed time since construction to the referenced accumulator -- the member
// definitions live in the .cpp, so confirm there.
struct common_time_meas {
    // t_acc   : accumulator (microseconds) the measurement is folded into
    // disable : when true, the measurement is a no-op -- TODO confirm in the .cpp
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();

    const int64_t t_start_us; // timestamp captured at construction (microseconds)

    int64_t & t_acc; // destination accumulator, written on destruction
};
41
+
42
// Description of one LoRA adapter requested by the user: where to load it from,
// how strongly to apply it, and the runtime handle once loaded.
struct common_adapter_lora_info {
    std::string path;  // adapter file path
    float scale;       // user-defined scaling factor applied to the adapter

    std::string task_name;     // task name associated with the adapter -- presumably metadata from the adapter file
    std::string prompt_prefix; // prompt prefix associated with the adapter -- presumably metadata from the adapter file

    struct llama_adapter_lora * ptr; // loaded adapter handle (non-owning from this struct's perspective)
};
51
+
52
+ using llama_tokens = std::vector<llama_token>;
53
+
54
+ // build info
55
+ extern int LLAMA_BUILD_NUMBER;
56
+ extern const char * LLAMA_COMMIT;
57
+ extern const char * LLAMA_COMPILER;
58
+ extern const char * LLAMA_BUILD_TARGET;
59
+
60
+ const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
61
+
62
+ struct common_control_vector_load_info;
63
+
64
+ //
65
+ // CPU utils
66
+ //
67
+
68
// Per-threadpool CPU scheduling configuration (thread count, affinity, priority, polling).
struct cpu_params {
    int n_threads = -1; // number of threads (-1 = autodetect)
    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool mask_valid = false; // Default: any CPU
    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool strict_cpu = false; // Use strict CPU placement
    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
76
+
77
+ int32_t cpu_get_num_physical_cores();
78
+ int32_t cpu_get_num_math();
79
+
80
+ //
81
+ // Common params
82
+ //
83
+
84
// Identifies which example/tool is running; presumably used by the argument
// parser to decide which options are applicable -- confirm against arg.cpp.
enum llama_example {
    LLAMA_EXAMPLE_BATCHED,
    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
    LLAMA_EXAMPLE_CLI,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,

    LLAMA_EXAMPLE_COUNT, // number of entries; keep last
};
110
+
111
// Sampler stages that can be enabled and ordered in the sampling chain.
// Values are explicit; the retired slot 5 is kept reserved (see the commented
// entry below), so do not renumber existing entries.
enum common_sampler_type {
    COMMON_SAMPLER_TYPE_NONE = 0,
    COMMON_SAMPLER_TYPE_DRY = 1,
    COMMON_SAMPLER_TYPE_TOP_K = 2,
    COMMON_SAMPLER_TYPE_TOP_P = 3,
    COMMON_SAMPLER_TYPE_MIN_P = 4,
    //COMMON_SAMPLER_TYPE_TFS_Z = 5,
    COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC = 8,
    COMMON_SAMPLER_TYPE_INFILL = 9,
    COMMON_SAMPLER_TYPE_PENALTIES = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
    COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
};
126
+
127
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
    DIMRE_METHOD_PCA,  // principal component analysis
    DIMRE_METHOD_MEAN, // simple mean of the differences
};

// Whether chat/conversation mode is on, off, or decided automatically
// (presumably from the model's chat template -- confirm in common.cpp).
enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_DISABLED = 0,
    COMMON_CONVERSATION_MODE_ENABLED = 1,
    COMMON_CONVERSATION_MODE_AUTO = 2,
};
138
+
139
// How a lazy-grammar trigger is matched against the generated output.
enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,        // match a specific token id
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,         // match a literal word/string
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,      // match a regex pattern
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, // match a regex pattern against the full output -- TODO confirm exact semantics
};

// One trigger that activates a lazy grammar. `value` holds the word/pattern;
// `token` is only meaningful for the TOKEN trigger type.
struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;                    // word or pattern, depending on type
    llama_token token = LLAMA_TOKEN_NULL; // token id for TYPE_TOKEN triggers
};
151
+
152
// Bitfield flags recording which sampling parameters were explicitly set by the
// user (stored in common_params_sampling::user_sampling_config).
// The underlying type is uint64_t, so the shifts use unsigned long long literals:
// with plain `1 << n` the shift is done in (signed) int and becomes undefined
// behavior once a flag index reaches 31. Values are unchanged.
enum common_params_sampling_config : uint64_t {
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1ull << 0,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1ull << 1,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1ull << 2,
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1ull << 3,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1ull << 4,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1ull << 5,
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1ull << 6,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1ull << 7,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1ull << 8,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1ull << 9,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1ull << 10,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1ull << 11,
};
166
+
167
// Which speculative-decoding strategy to use (draft model vs. various
// self-speculative n-gram schemes).
enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
    COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD, // n-gram variant -- semantics defined by common_ngram_mod, TODO confirm
    COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
    COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};
178
+
179
+ // sampling parameters
180
// sampling parameters
// All knobs controlling token sampling: truncation samplers, penalties,
// mirostat, grammar constraints and logit biases, plus their default values.
struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev = 64; // number of previous tokens to remember
    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k = 40; // <= 0 to use vocab size
    float top_p = 0.95f; // 1.0 = disabled
    float min_p = 0.05f; // 0.0 = disabled
    float xtc_probability = 0.00f; // 0.0 = disabled
    float xtc_threshold = 0.10f; // > 0.5 disables XTC
    float typ_p = 1.00f; // typical_p, 1.0 = disabled
    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
    float dynatemp_range = 0.00f; // 0.0 = disabled
    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float penalty_repeat = 1.00f; // 1.0 = disabled
    float penalty_freq = 0.00f; // 0.0 = disabled
    float penalty_present = 0.00f; // 0.0 = disabled
    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
    float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float top_n_sigma = -1.00f; // -1.0 = disabled
    float mirostat_tau = 5.00f; // target entropy
    float mirostat_eta = 0.10f; // learning rate
    bool ignore_eos = false; // do not end generation on EOS -- presumably via the EOG logit biases below
    bool no_perf = false; // disable performance metrics
    bool timing_per_token = false; // report timing per generated token -- TODO confirm consumer

    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers

    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

    // default sampler chain, applied in this order
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_MIN_P,
        COMMON_SAMPLER_TYPE_XTC,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

    std::string grammar; // optional BNF-like grammar to constrain sampling
    bool grammar_lazy = false; // activate the grammar only after a trigger fires
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token> preserved_tokens; // tokens exempt from grammar constraints -- TODO confirm semantics

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    bool backend_sampling = false; // perform sampling on the backend -- TODO confirm consumer

    // true when any user-specified logit bias is present
    bool has_logit_bias() const {
        return !logit_bias.empty();
    }

    // print the parameters into a string
    std::string print() const;
};
246
+
247
// Where to obtain a model from: at most one of local path, direct URL, HF repo
// or Docker repo is expected to be used -- TODO confirm precedence in common.cpp.
struct common_params_model {
    std::string path = ""; // model local path // NOLINT
    std::string url = ""; // model url to download // NOLINT
    std::string hf_repo = ""; // HF repo // NOLINT
    std::string hf_file = ""; // HF file // NOLINT
    std::string docker_repo = ""; // Docker repo // NOLINT
    std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};
255
+
256
+ struct common_ngram_mod;
257
+
258
// Parameters for speculative decoding: shared limits, n-gram lookup settings,
// and the configuration of an optional draft model/context.
struct common_params_speculative {
    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

    // general-purpose speculative decoding parameters

    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
    float p_split = 0.1f; // speculative decoding split probability
    float p_min = 0.75f; // minimum speculative decoding probability (greedy)

    // ngram-based speculative decoding

    uint16_t ngram_size_n = 12; // ngram size for lookup
    uint16_t ngram_size_m = 48; // mgram size for speculative tokens
    uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed

    std::shared_ptr<common_ngram_mod> ngram_mod; // shared state for the NGRAM_MOD strategy -- TODO confirm

    std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT

    // draft-model speculative decoding

    struct common_params_model mparams_dft; // where to load the draft model from

    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts

    llama_context_params cparams_dft; // these are the parameters for the draft llama_context

    int32_t n_ctx = 0; // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    struct cpu_params cpuparams;       // CPU settings for the draft context
    struct cpu_params cpuparams_batch; // CPU settings for draft batch processing

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    // true when a draft model source (path or HF repo) was configured
    bool has_dft() const {
        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
    }
};
305
+
306
// Vocoder (TTS) parameters: the vocoder model, an optional speaker file and
// the guide-token toggle.
struct common_params_vocoder {
    struct common_params_model model; // vocoder model source

    std::string speaker_file = ""; // speaker file path // NOLINT

    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
313
+
314
// Parameters for diffusion-based generation (llama-diffusion example).
struct common_params_diffusion {
    int32_t steps = 128;      // number of diffusion steps
    bool visual_mode = false; // visualize intermediate steps -- TODO confirm

    float eps = 0; // epsilon for timesteps
    int32_t block_length = 0; // block length for generation

    int32_t algorithm = 4; // default algorithm: low-confidence
    float alg_temp = 0.0f; // algorithm temperature

    float cfg_scale = 0; // classifier-free guidance scale
    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};
327
+
328
// reasoning API response format (not to be confused as chat template's reasoning format)
// only used by server
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
    // do not extend this enum unless you absolutely have to
    // in most cases, use COMMON_REASONING_FORMAT_AUTO
    // see: https://github.com/ggml-org/llama.cpp/pull/15408
};
339
+
340
+
341
// Learning-rate schedule parameters for fine-tuning.
// Fix: `epoch` was left uninitialized; reading it (e.g. via get_lr()) before the
// optimizer loop assigned it yielded an indeterminate value. It is now
// zero-initialized, which matches the first epoch.
struct lr_opt {
    float lr0 = 1e-5; // learning rate at first epoch
    float lr_min = -1; // final learning rate after decay (-1 = unset)
    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float scale_epoch = 0; // epoch scaling factor -- TODO confirm semantics in the .cpp
    float wd = 0; // weight decay
    unsigned epochs = 2; // total number of training epochs

    unsigned epoch = 0; // set by optimizer outer (epochs) loop; starts at 0
    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;
    float get_lr() const { return get_lr(epoch); }
    // must call after arg parse, before get_lr
    void init();
};
356
+
357
+ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
358
+
359
// Aggregate of all command-line-configurable parameters shared by the examples
// and tools (context/batch sizes, offloading, sampling, server settings, and
// per-example options). Populated by the argument parser; defaults below are
// the effective defaults when an option is not given.
struct common_params {
    int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
    int32_t n_ctx = 0; // context size, 0 == context the model was trained with
    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel = 1; // number of parallel sequences to decode
    int32_t n_sequences = 1; // number of sequences to decode
    int32_t grp_attn_n = 1; // group-attention factor
    int32_t grp_attn_w = 512; // group-attention width
    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
    float rope_freq_base = 0.0f; // RoPE base frequency
    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
    float yarn_beta_fast = -1.0f; // YaRN low correction dim
    float yarn_beta_slow = -1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx = 0; // YaRN original context length

    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;       // CPU settings for generation
    struct cpu_params cpuparams_batch; // CPU settings for batch processing

    ggml_backend_sched_eval_callback cb_eval = nullptr; // optional scheduler eval callback
    void * cb_eval_user_data = nullptr;                 // user data passed to cb_eval

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention

    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder vocoder;
    struct common_params_diffusion diffusion;

    struct common_params_model model; // main model source

    std::set<std::string> model_alias; // model aliases // NOLINT
    std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
    std::string hf_token = ""; // HF token // NOLINT
    std::string prompt = ""; // NOLINT
    std::string system_prompt = ""; // NOLINT
    std::string prompt_file = ""; // store the external prompt file name // NOLINT
    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
    std::string logits_file = ""; // file for saving *all* logits // NOLINT

    // llama-debug specific options
    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
    bool save_logits = false; // whether to save logits to files // NOLINT
    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT

    std::vector<std::string> in_files; // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t verbosity = 3; // LOG_LEVEL_INFO
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end = -1; // layer range for control vector
    bool offline = false; // disable network access -- TODO confirm exact scope

    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
    // (which is more convenient to use for plotting)
    //
    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

    bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

    bool kl_divergence = false; // compute KL divergence

    bool usage = false; // print usage
    bool completion = false; // print source-able completion script
    bool use_color = false; // use color to distinguish generations and inputs
    bool special = false; // enable special token output
    bool interactive = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
    bool prompt_cache_all = false; // save user input and generations to prompt cache
    bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input = false; // reverse the usage of `\`
    bool simple_io = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching = true; // insert new sequences for decoding on-the-fly
    bool no_perf = false; // disable performance metrics
    bool show_timings = true; // show timing information on CLI
    bool ctx_shift = false; // context shift on infinite text generation
    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified = false; // enable unified KV cache

    bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap = true; // enable mmap to use filesystem cache
    bool use_direct_io = false; // read from disk without buffering
    bool use_mlock = false; // use mlock to keep model in memory
    bool verbose_prompt = false; // print prompt tokens before generation
    bool display_prompt = true; // print prompt before generation
    bool no_kv_offload = false; // disable KV offloading
    bool warmup = true; // warmup run
    bool check_tensors = false; // validate tensor data
    bool no_op_offload = false; // globally disable offload host tensor operations to device
    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
    bool no_host = false; // bypass host buffer allowing extra buffers to be used

    bool single_turn = false; // single turn chat conversation

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // multimodal models (see tools/mtmd)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true; // use GPU for multimodal model
    bool no_mmproj = false; // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
    int image_min_tokens = -1; // minimum image tokens (-1 = default) -- TODO confirm
    int image_max_tokens = -1; // maximum image tokens (-1 = default) -- TODO confirm

    // finetune
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of the data used for the validation set

    // embedding
    bool embedding = false; // get only sentence embedding
    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep = "\n"; // separator of embeddings
    std::string cls_sep = "\t"; // separator of classification sequences

    // server params
    int32_t port = 8080; // server listens on this network port
    int32_t timeout_read = 600; // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
    bool cache_prompt = true; // whether to enable prompt caching
    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname = "127.0.0.1";
    std::string public_path = ""; // NOLINT
    std::string api_prefix = ""; // NOLINT
    std::string chat_template = ""; // NOLINT
    bool use_jinja = true; // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

    std::vector<std::string> api_keys;

    std::string ssl_file_key = ""; // NOLINT
    std::string ssl_file_cert = ""; // NOLINT

    std::map<std::string, std::string> default_template_kwargs;

    // webui configs
    bool webui = true;
    std::string webui_config_json;

    // "advanced" endpoints are disabled by default for better security
    bool endpoint_slots = true;
    bool endpoint_props = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

    // router server configs
    std::string models_dir = ""; // directory containing models for the router server
    std::string models_preset = ""; // directory containing model presets for the router server
    int models_max = 4; // maximum number of models to load simultaneously
    bool models_autoload = true; // automatically load models when requested via the router server

    bool log_json = false;

    std::string slot_save_path;
    std::string media_path; // path to directory for loading media files

    float slot_prompt_similarity = 0.1f;

    // batched-bench params
    bool is_pp_shared = false;
    bool is_tg_separate = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
    std::vector<int32_t> n_pl;

    // retrieval params
    std::vector<std::string> context_files; // context files to embed

    int32_t chunk_size = 64; // chunk size for context embedding

    std::string chunk_separator = "\n"; // chunk separator for context embedding

    // passkey params
    int32_t n_junk = 250; // number of times to repeat the junk text
    int32_t i_pos = -1; // position of the passkey in the junk text

    // imatrix params
    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk = 0; // start processing from this chunk
    int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl = true; // whether to compute perplexity
    bool show_statistics = false; // show imatrix statistics per tensor
    bool parse_special = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    // batched-bench params
    bool batched_bench_output_jsonl = false;

    // common params
    std::string out_file; // output filename for all example programs
    // optional callback for model loading progress and cancellation:
    // called with a progress value between 0.0 and 1.0.
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void * load_progress_callback_user_data = NULL;
};
618
+
619
+ // call once at the start of a program if it uses libcommon
620
+ // initializes the logging system and prints info about the build
621
+ void common_init();
622
+
623
+ std::string common_params_get_system_info(const common_params & params);
624
+
625
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
626
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
627
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
628
+ bool set_process_priority(enum ggml_sched_priority prio);
629
+
630
+ //
631
+ // String utils
632
+ //
633
+
634
+ #ifdef __GNUC__
635
+ # if defined(__MINGW32__) && !defined(__clang__)
636
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
637
+ # else
638
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
639
+ # endif
640
+ #else
641
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
642
+ #endif
643
+
644
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
645
+ std::string string_format(const char * fmt, ...);
646
+
647
+ std::string string_strip(const std::string & str);
648
+ std::string string_get_sortable_timestamp();
649
+
650
+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
651
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
652
+ std::string string_repeat(const std::string & str, size_t n);
653
+
654
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
655
+
656
+ std::string regex_escape(const std::string & s);
657
+
658
// Split `str` on `delim` and parse each token into a T via stream extraction.
// Tokens that fail to parse yield a value-initialized T.
// note: previously a failed extraction could push an *uninitialized* T
// (undefined behavior for non-arithmetic T); `T value{}` makes the element
// well-defined in all cases (arithmetic types are already zeroed on failure
// since C++11, so their behavior is unchanged)
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value{}; // value-initialize so a failed extraction cannot leave garbage
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

// specialization for std::string: plain substring split that preserves empty
// tokens ("a,,b" -> {"a", "", "b"}); an empty input yields one empty token
template<>
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t delim_pos = str.find(delim);
    while (delim_pos != std::string::npos) {
        std::string part = str.substr(begin_pos, delim_pos - begin_pos);
        parts.emplace_back(part);
        begin_pos = delim_pos + 1;
        delim_pos = str.find(delim, begin_pos);
    }
    parts.emplace_back(str.substr(begin_pos));
    return parts;
}
688
+
689
// true if `str` begins with `prefix`
// remove when moving to c++20 (std::string_view::starts_with)
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
    if (prefix.size() > str.size()) {
        return false;
    }
    return str.substr(0, prefix.size()) == prefix; // view comparison - no allocation
}
694
+
695
// true if `str` ends with `suffix`
// remove when moving to c++20 (std::string_view::ends_with)
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    return str.substr(str.size() - suffix.size()) == suffix; // view comparison - no allocation
}

// if `str` ends with `suffix`, chop the suffix off in place and return true;
// otherwise leave `str` untouched and return false
inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
    const bool matched = string_ends_with(str, suffix);
    if (matched) {
        str.resize(str.size() - suffix.size());
    }
    return matched;
}

// find the position where `str` ends with a non-empty prefix of `stop`
// (i.e. a partially received stop sequence); returns npos when there is none.
// Longer prefixes are preferred, so the earliest such position is returned.
inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
    if (str.empty() || stop.empty()) {
        return std::string::npos;
    }
    const char last_char = str.back();
    for (size_t len = std::min(str.size(), stop.size()); len > 0; --len) {
        // quick reject: a matching prefix of `stop` must end with str's last char
        if (stop[len - 1] != last_char) {
            continue;
        }
        if (string_ends_with(str, stop.substr(0, len))) {
            return str.size() - len;
        }
    }
    return std::string::npos;
}
723
+
724
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
725
+ void string_process_escapes(std::string & input);
726
+
727
+ std::string string_from(bool value);
728
+ std::string string_from(const std::vector<int> & values);
729
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
730
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
731
+
732
+ //
733
+ // Filesystem utils
734
+ //
735
+
736
+ bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
737
+ bool fs_create_directory_with_parents(const std::string & path);
738
+ bool fs_is_directory(const std::string & path);
739
+
740
+ std::string fs_get_cache_directory();
741
+ std::string fs_get_cache_file(const std::string & filename);
742
+
743
// a single filesystem entry as returned by fs_list()
struct common_file_info {
    std::string path;     // full path of the entry
    std::string name;     // name of the entry (without the leading directories)
    size_t size = 0; // in bytes
    bool is_dir = false;  // true if the entry is a directory
};
749
+ std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
750
+
751
+ //
752
+ // TTY utils
753
+ //
754
+
755
+ // Auto-detect if colors can be enabled based on terminal and environment
756
+ bool tty_can_use_colors();
757
+
758
+ //
759
+ // Model utils
760
+ //
761
+
762
+ struct common_sampler;
763
+
764
// note: defines the model, context, samplers, etc. lifetimes
765
// Owns the llama model, context, per-sequence samplers and LoRA adapters for
// one initialization; everything is released when this object is destroyed.
// The implementation is hidden behind a pimpl to keep this header light.
struct common_init_result {
    common_init_result(common_params & params);
    ~common_init_result();

    // borrowed pointers - lifetime is managed by this object
    llama_model * model();
    llama_context * context();

    // per-sequence sampler access; reset_samplers() resets all of them
    common_sampler * sampler(llama_seq_id seq_id);
    void reset_samplers();

    // LoRA adapters associated with this model
    std::vector<llama_adapter_lora_ptr> & lora();

private:
    struct impl; // hidden implementation (pimpl idiom)
    std::unique_ptr<impl> pimpl;
};
781
+
782
+ using common_init_result_ptr = std::unique_ptr<common_init_result>;
783
+
784
+ common_init_result_ptr common_init_from_params(common_params & params);
785
+
786
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
787
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
788
+ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
789
+
790
+ // clear LoRA adapters from context, then apply new list of adapters
791
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
792
+
793
+ std::string get_model_endpoint();
794
+
795
+ //
796
+ // Batch utils
797
+ //
798
+
799
+ void common_batch_clear(struct llama_batch & batch);
800
+
801
+ void common_batch_add(
802
+ struct llama_batch & batch,
803
+ llama_token id,
804
+ llama_pos pos,
805
+ const std::vector<llama_seq_id> & seq_ids,
806
+ bool logits);
807
+
808
+ // decodes a single batch of tokens for a prompt and manages session tokens
809
+ //
810
+ // Note: We save state before the last token so that we can replay it to ensure
811
+ // compatibility with all memory types. Recurrent/hybrid models cannot remove
812
+ // tokens from memory, so this approach works across all model architectures.
813
+ bool common_prompt_batch_decode(
814
+ struct llama_context * ctx,
815
+ const std::vector<llama_token> & embd,
816
+ int & n_past,
817
+ int n_batch,
818
+ std::string_view state_path,
819
+ bool save_state);
820
+
821
+ // replays the last token after loading state to regenerate logits
822
+ // used after loading session state to ensure the sampling context has valid logits
823
+ bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
824
+
825
+ //
826
+ // Vocab utils
827
+ //
828
+
829
+ // tokenizes a string into a vector of tokens
830
+ // should work similar to Python's `tokenizer.encode`
831
+ std::vector<llama_token> common_tokenize(
832
+ const struct llama_context * ctx,
833
+ const std::string & text,
834
+ bool add_special,
835
+ bool parse_special = false);
836
+
837
+ std::vector<llama_token> common_tokenize(
838
+ const struct llama_vocab * vocab,
839
+ const std::string & text,
840
+ bool add_special,
841
+ bool parse_special = false);
842
+
843
+ // tokenizes a token into a piece, optionally renders special/control tokens
844
+ // should work similar to Python's `tokenizer.id_to_piece`
845
+ std::string common_token_to_piece(
846
+ const struct llama_context * ctx,
847
+ llama_token token,
848
+ bool special = true);
849
+
850
+ std::string common_token_to_piece(
851
+ const struct llama_vocab * vocab,
852
+ llama_token token,
853
+ bool special = true);
854
+
855
+ // detokenizes a vector of tokens into a string
856
+ // should work similar to Python's `tokenizer.decode`
857
+ // optionally renders special/control tokens
858
+ std::string common_detokenize(
859
+ const struct llama_context * ctx,
860
+ const std::vector<llama_token> & tokens,
861
+ bool special = true);
862
+
863
+ std::string common_detokenize(
864
+ const struct llama_vocab * vocab,
865
+ const std::vector<llama_token> & tokens,
866
+ bool special = true);
867
+
868
+ //
869
+ // Embedding utils
870
+ //
871
+
872
// TODO: replace embd_norm with an enum
873
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
874
+
875
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
876
+
877
+ //
878
+ // Control vector utils
879
+ //
880
+
881
// a loaded control vector, flattened across layers
struct common_control_vector_data {
    int n_embd; // embedding dimension of each per-layer vector

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};
887
+
888
// one control vector file to load, plus the scale to apply to it
struct common_control_vector_load_info {
    float strength; // scale factor applied to this vector before summing

    std::string fname; // file containing the control vector
};
893
+
894
+ // Load control vectors, scale each by strength, and add them together.
895
+ // On error, returns {-1, empty}
896
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
897
+
898
+ //
899
+ // Split utils
900
+ //
901
+
902
// GGUF metadata keys for split (sharded) model files.
// note: use inline constexpr instead of an anonymous namespace - an anonymous
// namespace in a header gives every translation unit its own internal copy,
// which is a known anti-pattern; inline variables (C++17) have exactly one
// definition across all TUs while keeping the same usage for callers
inline constexpr const char * LLM_KV_SPLIT_NO            = "split.no";
inline constexpr const char * LLM_KV_SPLIT_COUNT         = "split.count";
inline constexpr const char * LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
909
+
910
+ //
911
+ // MoE utils
912
+ //
913
+
914
// regex matching the MoE expert FFN tensor names; "(ch|)exps" covers both the
// "exps" and "chexps" naming variants
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

// build the tensor-name regex for the expert FFN tensors of layer `idx`
inline std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}

// buffer-type override that keeps all expert FFN tensors in CPU memory
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
923
+
924
+ //
925
+ // training utils
926
+ //
927
+
928
+ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
929
+
930
+ // "adamw" or "sgd" (case insensitive)
931
+ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
llama.cpp/common/console.cpp ADDED
@@ -0,0 +1,1137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "console.h"
2
+ #include "log.h"
3
+ #include <vector>
4
+ #include <iostream>
5
+ #include <cassert>
6
+ #include <cstddef>
7
+ #include <cctype>
8
+ #include <cwctype>
9
+ #include <cstdint>
10
+ #include <condition_variable>
11
+ #include <mutex>
12
+ #include <thread>
13
+ #include <stdarg.h>
14
+
15
+ #if defined(_WIN32)
16
+ #define WIN32_LEAN_AND_MEAN
17
+ #ifndef NOMINMAX
18
+ #define NOMINMAX
19
+ #endif
20
+ #include <windows.h>
21
+ #include <fcntl.h>
22
+ #include <io.h>
23
+ #ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
24
+ #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
25
+ #endif
26
+ #else
27
+ #include <climits>
28
+ #include <sys/ioctl.h>
29
+ #include <unistd.h>
30
+ #include <wchar.h>
31
+ #include <stdio.h>
32
+ #include <stdlib.h>
33
+ #include <signal.h>
34
+ #include <termios.h>
35
+ #endif
36
+
37
+ #define ANSI_COLOR_RED "\x1b[31m"
38
+ #define ANSI_COLOR_GREEN "\x1b[32m"
39
+ #define ANSI_COLOR_YELLOW "\x1b[33m"
40
+ #define ANSI_COLOR_BLUE "\x1b[34m"
41
+ #define ANSI_COLOR_MAGENTA "\x1b[35m"
42
+ #define ANSI_COLOR_CYAN "\x1b[36m"
43
+ #define ANSI_COLOR_GRAY "\x1b[90m"
44
+ #define ANSI_COLOR_RESET "\x1b[0m"
45
+ #define ANSI_BOLD "\x1b[1m"
46
+
47
+ namespace console {
48
+
49
+ #if defined (_WIN32)
50
+ namespace {
51
+ // Use private-use unicode values to represent special keys that are not reported
52
+ // as characters (e.g. arrows on Windows). These values should never clash with
53
+ // real input and let the rest of the code handle navigation uniformly.
54
+ static constexpr char32_t KEY_ARROW_LEFT = 0xE000;
55
+ static constexpr char32_t KEY_ARROW_RIGHT = 0xE001;
56
+ static constexpr char32_t KEY_ARROW_UP = 0xE002;
57
+ static constexpr char32_t KEY_ARROW_DOWN = 0xE003;
58
+ static constexpr char32_t KEY_HOME = 0xE004;
59
+ static constexpr char32_t KEY_END = 0xE005;
60
+ static constexpr char32_t KEY_CTRL_ARROW_LEFT = 0xE006;
61
+ static constexpr char32_t KEY_CTRL_ARROW_RIGHT = 0xE007;
62
+ static constexpr char32_t KEY_DELETE = 0xE008;
63
+ }
64
+
65
+ //
66
+ // Console state
67
+ //
68
+ #endif
69
+
70
+ static bool advanced_display = false;
71
+ static bool simple_io = true;
72
+ static display_type current_display = DISPLAY_TYPE_RESET;
73
+
74
+ static FILE* out = stdout;
75
+
76
+ #if defined (_WIN32)
77
+ static void* hConsole;
78
+ #else
79
+ static FILE* tty = nullptr;
80
+ static termios initial_state;
81
+ #endif
82
+
83
+ //
84
+ // Init and cleanup
85
+ //
86
+
87
// Initialize console I/O.
//   use_simple_io:        line-buffered, echoing input (no raw-mode line editing)
//   use_advanced_display: enable ANSI color/escape output where supported
// On Windows this configures the console handles and codepages; on POSIX it
// switches the terminal to non-canonical, no-echo mode and writes control
// sequences through /dev/tty when available. Falls back to simple IO whenever
// the console cannot be configured.
void init(bool use_simple_io, bool use_advanced_display) {
    advanced_display = use_advanced_display;
    simple_io = use_simple_io;
#if defined(_WIN32)
    // Windows-specific console initialization
    DWORD dwMode = 0;
    hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
    if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
        // stdout is not a console (e.g. redirected) - try stderr instead
        hConsole = GetStdHandle(STD_ERROR_HANDLE);
        if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
            hConsole = nullptr;
            simple_io = true;
        }
    }
    if (hConsole) {
        // Check conditions combined to reduce nesting
        if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
            !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
            advanced_display = false; // this console cannot do ANSI escapes
        }
        // Set console output codepage to UTF8
        SetConsoleOutputCP(CP_UTF8);
    }
    HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
    if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
        // Set console input codepage to UTF16
        _setmode(_fileno(stdin), _O_WTEXT);

        // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
        if (simple_io) {
            dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
        } else {
            dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
        }
        if (!SetConsoleMode(hConIn, dwMode)) {
            simple_io = true; // could not switch to raw input - fall back
        }
    }
    if (simple_io) {
        _setmode(_fileno(stdin), _O_U8TEXT);
    }
#else
    // POSIX-specific console initialization
    if (!simple_io) {
        struct termios new_termios;
        tcgetattr(STDIN_FILENO, &initial_state); // saved so cleanup() can restore it
        new_termios = initial_state;
        new_termios.c_lflag &= ~(ICANON | ECHO); // per-key input, no echo
        new_termios.c_cc[VMIN] = 1;              // block until at least one byte
        new_termios.c_cc[VTIME] = 0;             // no read timeout
        tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);

        // write control sequences to the terminal even if stdout is redirected
        tty = fopen("/dev/tty", "w+");
        if (tty != nullptr) {
            out = tty;
        }
    }

    setlocale(LC_ALL, "");
#endif
}
148
+
149
// Restore the console to its initial state: reset colors and, on POSIX,
// close /dev/tty and restore the termios settings saved by init().
void cleanup() {
    // Reset console display
    set_display(DISPLAY_TYPE_RESET);

#if !defined(_WIN32)
    // Restore settings on POSIX systems
    if (!simple_io) {
        if (tty != nullptr) {
            out = stdout; // redirect output back before closing /dev/tty
            fclose(tty);
            tty = nullptr;
        }
        tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
    }
#endif
}
165
+
166
+ //
167
+ // Display and IO
168
+ //
169
+
170
// Keep track of current display and only emit ANSI code if it changes
void set_display(display_type display) {
    if (advanced_display && current_display != display) {
        // flush pending log output so the color change does not bleed into it
        common_log_flush(common_log_main());
        switch(display) {
            case DISPLAY_TYPE_RESET:
                fprintf(out, ANSI_COLOR_RESET);
                break;
            case DISPLAY_TYPE_INFO:
                fprintf(out, ANSI_COLOR_MAGENTA);
                break;
            case DISPLAY_TYPE_PROMPT:
                fprintf(out, ANSI_COLOR_YELLOW);
                break;
            case DISPLAY_TYPE_REASONING:
                fprintf(out, ANSI_COLOR_GRAY);
                break;
            case DISPLAY_TYPE_USER_INPUT:
                fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
                break;
            case DISPLAY_TYPE_ERROR:
                fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
        }
        current_display = display;
        fflush(out);
    }
}
197
+
198
// Read one Unicode code point from the console; returns WEOF on end of input.
// On Windows, non-character keys (arrows, Home/End, Delete) are mapped to the
// private-use KEY_* constants. UTF-16 surrogate pairs are combined into a
// single code point on both platforms.
static char32_t getchar32() {
#if defined(_WIN32)
    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
    wchar_t high_surrogate = 0;

    while (true) {
        INPUT_RECORD record;
        DWORD count;
        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
            return WEOF;
        }

        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
            if (wc == 0) {
                // no character: a navigation/editing key - translate to KEY_* codes
                const DWORD ctrl_mask = LEFT_CTRL_PRESSED | RIGHT_CTRL_PRESSED;
                const bool ctrl_pressed = (record.Event.KeyEvent.dwControlKeyState & ctrl_mask) != 0;
                switch (record.Event.KeyEvent.wVirtualKeyCode) {
                    case VK_LEFT: return ctrl_pressed ? KEY_CTRL_ARROW_LEFT : KEY_ARROW_LEFT;
                    case VK_RIGHT: return ctrl_pressed ? KEY_CTRL_ARROW_RIGHT : KEY_ARROW_RIGHT;
                    case VK_UP: return KEY_ARROW_UP;
                    case VK_DOWN: return KEY_ARROW_DOWN;
                    case VK_HOME: return KEY_HOME;
                    case VK_END: return KEY_END;
                    case VK_DELETE: return KEY_DELETE;
                    default: continue; // e.g. a bare modifier key - keep reading
                }
            }

            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
                high_surrogate = wc;
                continue;
            }
            if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
                if (high_surrogate != 0) { // Check if we have a high surrogate
                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
                }
            }

            high_surrogate = 0; // Reset the high surrogate
            return static_cast<char32_t>(wc);
        }
    }
#else
    wchar_t wc = getwchar();
    if (static_cast<wint_t>(wc) == WEOF) {
        return WEOF;
    }

#if WCHAR_MAX == 0xFFFF
    // 16-bit wchar_t platforms: combine UTF-16 surrogate pairs manually
    if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
        wchar_t low_surrogate = getwchar();
        if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
            return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
        }
    }
    if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
        return 0xFFFD; // Return the replacement character U+FFFD
    }
#endif

    return static_cast<char32_t>(wc);
#endif
}
262
+
263
// Move the cursor one cell back; on the Windows console path this wraps to
// the last column of the previous row when the cursor is in column 0,
// otherwise a plain backspace is emitted.
static void pop_cursor() {
#if defined(_WIN32)
    if (hConsole != NULL) {
        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
        GetConsoleScreenBufferInfo(hConsole, &bufferInfo);

        COORD newCursorPosition = bufferInfo.dwCursorPosition;
        if (newCursorPosition.X == 0) {
            // start of a row: wrap back to the last column of the row above
            newCursorPosition.X = bufferInfo.dwSize.X - 1;
            newCursorPosition.Y -= 1;
        } else {
            newCursorPosition.X -= 1;
        }

        SetConsoleCursorPosition(hConsole, newCursorPosition);
        return;
    }
#endif
    putc('\b', out);
}
283
+
284
// Estimate how many terminal columns `codepoint` will occupy.
// On Windows we assume 1 and let put_codepoint() measure the real width;
// on POSIX defer to wcwidth(), which may return -1 for non-printables.
static int estimateWidth(char32_t codepoint) {
#if defined(_WIN32)
    (void)codepoint;
    return 1;
#else
    return wcwidth(codepoint);
#endif
}
292
+
293
// Write one UTF-8 encoded code point (`length` bytes) and return the number
// of terminal columns it actually occupied. `expectedWidth` is the caller's
// estimate; it is trusted when non-negative, otherwise the width is measured
// by querying the cursor position before and after writing.
static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
#if defined(_WIN32)
    CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
    if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
        // go with the default
        return expectedWidth;
    }
    COORD initialPosition = bufferInfo.dwCursorPosition;
    DWORD nNumberOfChars = length;
    WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);

    CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
    GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);

    // Figure out our real position if we're in the last column
    if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
        DWORD nNumberOfChars;
        WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
        GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
    }

    int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
    if (width < 0) {
        // the cursor wrapped to the next row
        width += newBufferInfo.dwSize.X;
    }
    return width;
#else
    // We can trust expectedWidth if we've got one
    if (expectedWidth >= 0 || tty == nullptr) {
        fwrite(utf8_codepoint, length, 1, out);
        return expectedWidth;
    }

    fputs("\033[6n", tty); // Query cursor position
    int x1;
    int y1;
    int x2;
    int y2;
    int results = 0;
    results = fscanf(tty, "\033[%d;%dR", &y1, &x1);

    fwrite(utf8_codepoint, length, 1, tty);

    fputs("\033[6n", tty); // Query cursor position
    results += fscanf(tty, "\033[%d;%dR", &y2, &x2);

    if (results != 4) {
        // could not parse both cursor reports - fall back to the estimate
        return expectedWidth;
    }

    int width = x2 - x1;
    if (width < 0) {
        // Calculate the width considering text wrapping
        struct winsize w;
        ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
        width += w.ws_col;
    }
    return width;
#endif
}
353
+
354
// Overwrite the character cell immediately before the cursor with `ch`.
static void replace_last(char ch) {
#if defined(_WIN32)
    pop_cursor();
    put_codepoint(&ch, 1, 1);
#else
    fprintf(out, "\b%c", ch);
#endif
}
362
+
363
// Decode one UTF-8 encoded code point from `input` starting at byte `pos`.
// `advance` receives the number of bytes consumed. Malformed or truncated
// sequences consume exactly one byte and decode to U+FFFD.
static char32_t decode_utf8(const std::string & input, size_t pos, size_t & advance) {
    const unsigned char lead = static_cast<unsigned char>(input[pos]);

    // ASCII fast path
    if ((lead & 0x80u) == 0u) {
        advance = 1;
        return lead;
    }

    // number of continuation bytes implied by the lead byte (0 = invalid lead)
    size_t n_cont = 0;
    char32_t cp = 0;
    if ((lead & 0xE0u) == 0xC0u) {
        n_cont = 1;
        cp = lead & 0x1Fu;
    } else if ((lead & 0xF0u) == 0xE0u) {
        n_cont = 2;
        cp = lead & 0x0Fu;
    } else if ((lead & 0xF8u) == 0xF0u) {
        n_cont = 3;
        cp = lead & 0x07u;
    }

    if (n_cont == 0 || pos + n_cont >= input.size()) {
        advance = 1;
        return 0xFFFD; // invalid lead byte or truncated sequence
    }

    for (size_t i = 1; i <= n_cont; ++i) {
        const unsigned char cont = static_cast<unsigned char>(input[pos + i]);
        if ((cont & 0xC0u) != 0x80u) {
            advance = 1;
            return 0xFFFD; // continuation byte missing its 10xxxxxx tag
        }
        cp = (cp << 6) | (cont & 0x3Fu);
    }

    advance = n_cont + 1;
    return cp;
}
408
+
409
// Append the UTF-8 encoding of `ch` to `out`; code points above U+10FFFF are
// not valid Unicode and are silently dropped.
static void append_utf8(char32_t ch, std::string & out) {
    if (ch <= 0x7F) {
        out.push_back(static_cast<unsigned char>(ch));
        return;
    }

    size_t n_bytes = 0;
    unsigned char lead_tag = 0;
    if (ch <= 0x7FF) {
        n_bytes = 2; lead_tag = 0xC0;
    } else if (ch <= 0xFFFF) {
        n_bytes = 3; lead_tag = 0xE0;
    } else if (ch <= 0x10FFFF) {
        n_bytes = 4; lead_tag = 0xF0;
    } else {
        return; // invalid code point - append nothing
    }

    // lead byte carries the top bits; each continuation byte carries 6 more
    out.push_back(static_cast<unsigned char>(lead_tag | (ch >> (6 * (n_bytes - 1)))));
    for (size_t i = n_bytes - 1; i > 0; --i) {
        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> (6 * (i - 1))) & 0x3F)));
    }
}
428
+
429
// Return the byte offset of the UTF-8 character that precedes offset `pos`
// in `line` (0 if `pos` is already at the start).
static size_t prev_utf8_char_pos(const std::string & line, size_t pos) {
    if (pos == 0) {
        return 0;
    }
    // step left over continuation bytes (10xxxxxx) until a lead byte is found
    do {
        pos--;
    } while (pos > 0 && (static_cast<unsigned char>(line[pos]) & 0xC0) == 0x80);
    return pos;
}
438
+
439
// Return the byte offset of the UTF-8 character that follows offset `pos`
// in `line` (line.length() if `pos` is already at or past the end).
static size_t next_utf8_char_pos(const std::string & line, size_t pos) {
    const size_t n = line.length();
    if (pos >= n) {
        return n;
    }
    // advance one byte, then skip any continuation bytes (10xxxxxx)
    for (pos++; pos < n && (static_cast<unsigned char>(line[pos]) & 0xC0) == 0x80; pos++) {
    }
    return pos;
}
447
+
448
+ static void move_cursor(int delta);
449
+ static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
450
+ static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
451
+ static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths);
452
+ static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
453
+
454
// Delete the character under the cursor from `line` and its width table,
// repaint the remainder of the line and restore the cursor. `char_pos`
// (character index) and `byte_pos` (byte offset) are left pointing at the
// character that followed the deleted one.
static void delete_at_cursor(std::string & line, std::vector<int> & widths, size_t & char_pos, size_t & byte_pos) {
    if (char_pos >= widths.size()) {
        return; // cursor is at end of line - nothing to delete
    }

    size_t next_pos = next_utf8_char_pos(line, byte_pos);
    int w = widths[char_pos];              // display width of the removed char
    size_t char_len = next_pos - byte_pos; // its length in bytes

    line.erase(byte_pos, char_len);
    widths.erase(widths.begin() + char_pos);

    // repaint everything after the deleted character
    size_t p = byte_pos;
    int tail_width = 0;
    for (size_t i = char_pos; i < widths.size(); ++i) {
        size_t following = next_utf8_char_pos(line, p);
        put_codepoint(line.c_str() + p, following - p, widths[i]);
        tail_width += widths[i];
        p = following;
    }

    // blank out the cells freed by the deletion
    for (int i = 0; i < w; ++i) {
        fputc(' ', out);
    }

    // move the cursor back over the repainted tail and the blanks
    move_cursor(-(tail_width + w));
}
481
+
482
// Overwrite the displayed line with spaces and return the cursor to where it
// started; assumes the cursor is at the beginning of the line.
static void clear_current_line(const std::vector<int> & widths) {
    int total_width = 0;
    for (int w : widths) {
        total_width += (w > 0 ? w : 1); // count at least one cell per character
    }

    if (total_width > 0) {
        std::string spaces(total_width, ' ');
        fwrite(spaces.c_str(), 1, total_width, out);
        move_cursor(-total_width);
    }
}
494
+
495
// Replace the edited line with `new_line`: clear the old line from the
// display, print the new one and rebuild the per-character width table.
// On return the cursor (char_pos/byte_pos) is at the end of the new line.
static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
                              size_t & byte_pos) {
    move_to_line_start(char_pos, byte_pos, widths);
    clear_current_line(widths);

    line = std::move(new_line);
    widths.clear();
    byte_pos = 0;
    char_pos = 0;

    // decode, print and measure the new line one code point at a time
    size_t idx = 0;
    while (idx < line.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(line, idx, advance);
        int expected_width = estimateWidth(cp);
        int real_width = put_codepoint(line.c_str() + idx, advance, expected_width);
        if (real_width < 0) real_width = 0; // width can be reported as -1 for non-printables
        widths.push_back(real_width);
        idx += advance;
        ++char_pos;
        byte_pos = idx;
    }
}
518
+
519
// Move the cursor (and the tracked positions) to the start of the line.
static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
    // sum the display widths of everything left of the cursor
    int back_width = 0;
    for (size_t i = 0; i < char_pos; ++i) {
        back_width += widths[i];
    }
    move_cursor(-back_width);
    char_pos = 0;
    byte_pos = 0;
}
528
+
529
// Move the cursor (and the tracked positions) to the end of the line.
static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
    // sum the display widths of everything right of the cursor
    int forward_width = 0;
    for (size_t i = char_pos; i < widths.size(); ++i) {
        forward_width += widths[i];
    }
    move_cursor(forward_width);
    char_pos = widths.size();
    byte_pos = line.length();
}
538
+
539
// Return true if a semicolon-separated ANSI CSI parameter list (e.g. "1;5")
// contains the value 5, which is how xterm-style terminals report the Ctrl
// modifier. Fields containing non-digit characters never match.
static bool has_ctrl_modifier(const std::string & params) {
    size_t start = 0;
    while (start < params.size()) {
        const size_t end = params.find(';', start);
        const std::string field = (end == std::string::npos)
            ? params.substr(start)
            : params.substr(start, end - start);

        if (!field.empty()) {
            bool numeric = true;
            int value = 0;
            for (char ch : field) {
                if (!std::isdigit(static_cast<unsigned char>(ch))) {
                    numeric = false;
                    break;
                }
                value = value * 10 + (ch - '0');
            }
            if (numeric && value == 5) {
                return true;
            }
        }

        if (end == std::string::npos) {
            break;
        }
        start = end + 1;
    }
    return false;
}
566
+
567
// true if `cp` is whitespace per iswspace() in the current locale
// (note: iswspace takes a wint_t, so code points above WCHAR_MAX may not be
// classified correctly on platforms with a 16-bit wchar_t - TODO confirm)
static bool is_space_codepoint(char32_t cp) {
    return std::iswspace(static_cast<wint_t>(cp)) != 0;
}
570
+
571
// Move the cursor one word to the left: first skip any whitespace immediately
// left of the cursor, then skip the word itself, updating the tracked
// character index and byte offset along the way.
static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
    if (char_pos == 0) {
        return;
    }

    size_t new_char_pos = char_pos;
    size_t new_byte_pos = byte_pos;
    int move_width = 0; // total display width to move the cursor back by

    // skip whitespace to the left of the cursor
    while (new_char_pos > 0) {
        size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
        size_t advance = 0;
        char32_t cp = decode_utf8(line, prev_byte, advance);
        if (!is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos - 1];
        new_char_pos--;
        new_byte_pos = prev_byte;
    }

    // then skip the non-whitespace word characters
    while (new_char_pos > 0) {
        size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
        size_t advance = 0;
        char32_t cp = decode_utf8(line, prev_byte, advance);
        if (is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos - 1];
        new_char_pos--;
        new_byte_pos = prev_byte;
    }

    move_cursor(-move_width);
    char_pos = new_char_pos;
    byte_pos = new_byte_pos;
}
608
+
609
// Move the cursor one word to the right: skip whitespace under the cursor,
// then the word, then the whitespace after it, so the cursor lands on the
// start of the next word (or the end of the line).
static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
    if (char_pos >= widths.size()) {
        return;
    }

    size_t new_char_pos = char_pos;
    size_t new_byte_pos = byte_pos;
    int move_width = 0; // total display width to move the cursor forward by

    // skip whitespace at/after the cursor
    while (new_char_pos < widths.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(line, new_byte_pos, advance);
        if (!is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos];
        new_char_pos++;
        new_byte_pos += advance;
    }

    // skip the non-whitespace word characters
    while (new_char_pos < widths.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(line, new_byte_pos, advance);
        if (is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos];
        new_char_pos++;
        new_byte_pos += advance;
    }

    // skip the whitespace following the word
    while (new_char_pos < widths.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(line, new_byte_pos, advance);
        if (!is_space_codepoint(cp)) {
            break;
        }
        move_width += widths[new_char_pos];
        new_char_pos++;
        new_byte_pos += advance;
    }

    move_cursor(move_width);
    char_pos = new_char_pos;
    byte_pos = new_byte_pos;
}
655
+
656
// Move the terminal cursor horizontally by `delta` display columns.
// Negative delta moves left, positive moves right; 0 is a no-op.
// On Windows the target cell is computed and jumped to directly, wrapping
// across row boundaries using the console buffer width.
static void move_cursor(int delta) {
    if (delta == 0) return;
#if defined(_WIN32)
    if (hConsole != NULL) {
        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
        GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
        COORD newCursorPosition = bufferInfo.dwCursorPosition;
        int width = bufferInfo.dwSize.X;
        int newX = newCursorPosition.X + delta;
        int newY = newCursorPosition.Y;

        // wrap forward onto following rows
        while (newX >= width) {
            newX -= width;
            newY++;
        }
        // wrap backward onto preceding rows
        while (newX < 0) {
            newX += width;
            newY--;
        }

        newCursorPosition.X = newX;
        newCursorPosition.Y = newY;
        SetConsoleCursorPosition(hConsole, newCursorPosition);
    }
#else
    // POSIX: emit one control code per column ('\b' left, CSI "ESC[C" right).
    // NOTE(review): '\b' does not wrap back to the previous row on most
    // terminals - confirm behaviour when editing lines longer than the screen.
    if (delta < 0) {
        for (int i = 0; i < -delta; i++) fprintf(out, "\b");
    } else {
        for (int i = 0; i < delta; i++) fprintf(out, "\033[C");
    }
#endif
}
688
+
689
// Input-line history with an explicit "browsing" mode: begin_viewing()
// snapshots the line being edited, prev()/next() walk the stored entries,
// and stepping past the newest entry restores the snapshot.
struct history_t {
    std::vector<std::string> entries;
    size_t viewing_idx = SIZE_MAX;  // SIZE_MAX => not currently browsing
    std::string backup_line;        // current line before viewing history

    // Store a finished line; empty lines are ignored and consecutive
    // duplicates are collapsed. Also leaves browsing mode.
    void add(const std::string & line) {
        if (line.empty()) {
            return;
        }
        const bool duplicate_of_last = !entries.empty() && entries.back() == line;
        if (!duplicate_of_last) {
            entries.push_back(line);
        }
        end_viewing();
    }

    // Step to the older entry; clamps at the oldest one.
    // Returns false when there is no history or browsing was never started.
    bool prev(std::string & cur_line) {
        const bool browsing = viewing_idx != SIZE_MAX;
        if (entries.empty() || !browsing) {
            return false;
        }
        if (viewing_idx > 0) {
            --viewing_idx;
        }
        cur_line = entries[viewing_idx];
        return true;
    }

    // Step to the newer entry; stepping past the newest restores the
    // backed-up line and leaves browsing mode.
    bool next(std::string & cur_line) {
        if (entries.empty() || !is_viewing()) {
            return false;
        }
        ++viewing_idx;
        if (viewing_idx < entries.size()) {
            cur_line = entries[viewing_idx];
        } else {
            cur_line = backup_line;
            end_viewing();
        }
        return true;
    }

    // Enter browsing mode, remembering the line currently being edited.
    void begin_viewing(const std::string & line) {
        backup_line = line;
        viewing_idx = entries.size();
    }

    // Leave browsing mode and drop the snapshot.
    void end_viewing() {
        viewing_idx = SIZE_MAX;
        backup_line.clear();
    }

    bool is_viewing() const {
        return viewing_idx != SIZE_MAX;
    }
} history;
742
+
743
// Full-featured line editor used when the console is not in simple-IO mode.
// Handles UTF-8 aware cursor movement (incl. ctrl+arrow word jumps),
// home/end, delete, backspace, and history browsing with up/down.
// `line` receives the finished input; the return value tells the caller
// whether to keep reading more lines (multiline continuation).
static bool readline_advanced(std::string & line, bool multiline_input) {
    if (out != stdout) {
        fflush(stdout);
    }

    line.clear();
    std::vector<int> widths;      // display width of each character in `line`
    bool is_special_char = false; // last char is a pending '\\' or '/' control char
    bool end_of_stream = false;

    size_t byte_pos = 0; // current byte index
    size_t char_pos = 0; // current character index (one char can be multiple bytes)

    char32_t input_char;
    while (true) {
        assert(char_pos <= byte_pos);
        assert(char_pos <= widths.size());
        // replace the current line with the previous (older) history entry
        auto history_prev = [&]() {
            if (!history.is_viewing()) {
                history.begin_viewing(line);
            }
            std::string new_line;
            if (!history.prev(new_line)) {
                return;
            }
            set_line_contents(new_line, line, widths, char_pos, byte_pos);
        };
        // replace the current line with the next (newer) history entry,
        // or restore the line that was being edited before browsing started
        auto history_next = [&]() {
            if (history.is_viewing()) {
                std::string new_line;
                if (!history.next(new_line)) {
                    return;
                }
                set_line_contents(new_line, line, widths, char_pos, byte_pos);
            }
        };

        fflush(out); // Ensure all output is displayed before waiting for input
        input_char = getchar32();

        if (input_char == '\r' || input_char == '\n') {
            break;
        }

        if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
            end_of_stream = true;
            break;
        }

        if (is_special_char) {
            // typing continued, so re-draw the pending special char normally
            replace_last(line.back());
            is_special_char = false;
        }

        if (input_char == '\033') { // Escape sequence
            char32_t code = getchar32();
            if (code == '[') {
                // CSI sequence: collect parameter bytes until the final byte
                std::string params;
                while (true) {
                    code = getchar32();
                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~' || code == (char32_t) WEOF) {
                        break;
                    }
                    params.push_back(static_cast<char>(code));
                }

                const bool ctrl_modifier = has_ctrl_modifier(params);

                if (code == 'D') { // left
                    if (ctrl_modifier) {
                        move_word_left(char_pos, byte_pos, widths, line);
                    } else if (char_pos > 0) {
                        int w = widths[char_pos - 1];
                        move_cursor(-w);
                        char_pos--;
                        byte_pos = prev_utf8_char_pos(line, byte_pos);
                    }
                } else if (code == 'C') { // right
                    if (ctrl_modifier) {
                        move_word_right(char_pos, byte_pos, widths, line);
                    } else if (char_pos < widths.size()) {
                        int w = widths[char_pos];
                        move_cursor(w);
                        char_pos++;
                        byte_pos = next_utf8_char_pos(line, byte_pos);
                    }
                } else if (code == 'H') { // home
                    move_to_line_start(char_pos, byte_pos, widths);
                } else if (code == 'F') { // end
                    move_to_line_end(char_pos, byte_pos, widths, line);
                } else if (code == 'A' || code == 'B') {
                    // up/down
                    if (code == 'A') {
                        history_prev();
                        is_special_char = false;
                    } else if (code == 'B') {
                        history_next();
                        is_special_char = false;
                    }
                } else if ((code == '~' || (code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z')) && !params.empty()) {
                    // VT-style sequences, e.g. "ESC[3~" (delete): keep only
                    // the digits before any ';' modifier separator
                    std::string digits;
                    for (char ch : params) {
                        if (ch == ';') {
                            break;
                        }
                        if (std::isdigit(static_cast<unsigned char>(ch))) {
                            digits.push_back(ch);
                        }
                    }

                    if (code == '~') {
                        if (digits == "1" || digits == "7") { // home
                            move_to_line_start(char_pos, byte_pos, widths);
                        } else if (digits == "4" || digits == "8") { // end
                            move_to_line_end(char_pos, byte_pos, widths, line);
                        } else if (digits == "3") { // delete
                            delete_at_cursor(line, widths, char_pos, byte_pos);
                        }
                    }
                }
            } else if (code == 0x1B) {
                // Discard the rest of the escape sequence
                while ((code = getchar32()) != (char32_t) WEOF) {
                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
                        break;
                    }
                }
            }
#if defined(_WIN32)
        // on Windows, getchar32() delivers synthesized KEY_* codes instead
        // of raw escape sequences
        } else if (input_char == KEY_ARROW_LEFT) {
            if (char_pos > 0) {
                int w = widths[char_pos - 1];
                move_cursor(-w);
                char_pos--;
                byte_pos = prev_utf8_char_pos(line, byte_pos);
            }
        } else if (input_char == KEY_ARROW_RIGHT) {
            if (char_pos < widths.size()) {
                int w = widths[char_pos];
                move_cursor(w);
                char_pos++;
                byte_pos = next_utf8_char_pos(line, byte_pos);
            }
        } else if (input_char == KEY_CTRL_ARROW_LEFT) {
            move_word_left(char_pos, byte_pos, widths, line);
        } else if (input_char == KEY_CTRL_ARROW_RIGHT) {
            move_word_right(char_pos, byte_pos, widths, line);
        } else if (input_char == KEY_HOME) {
            move_to_line_start(char_pos, byte_pos, widths);
        } else if (input_char == KEY_END) {
            move_to_line_end(char_pos, byte_pos, widths, line);
        } else if (input_char == KEY_DELETE) {
            delete_at_cursor(line, widths, char_pos, byte_pos);
        } else if (input_char == KEY_ARROW_UP || input_char == KEY_ARROW_DOWN) {
            if (input_char == KEY_ARROW_UP) {
                history_prev();
                is_special_char = false;
            } else if (input_char == KEY_ARROW_DOWN) {
                history_next();
                is_special_char = false;
            }
#endif
        } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
            if (char_pos > 0) {
                int w = widths[char_pos - 1];
                move_cursor(-w);
                char_pos--;
                size_t prev_pos = prev_utf8_char_pos(line, byte_pos);
                size_t char_len = byte_pos - prev_pos;
                byte_pos = prev_pos;

                // remove the character
                line.erase(byte_pos, char_len);
                widths.erase(widths.begin() + char_pos);

                // redraw tail
                size_t p = byte_pos;
                int tail_width = 0;
                for (size_t i = char_pos; i < widths.size(); ++i) {
                    size_t next_p = next_utf8_char_pos(line, p);
                    put_codepoint(line.c_str() + p, next_p - p, widths[i]);
                    tail_width += widths[i];
                    p = next_p;
                }

                // clear display: blank out the now-stale cells after the tail,
                // then move back to the editing position
                for (int i = 0; i < w; ++i) {
                    fputc(' ', out);
                }
                move_cursor(-(tail_width + w));
            }
        } else {
            // insert character
            std::string new_char_str;
            append_utf8(input_char, new_char_str);
            int w = estimateWidth(input_char);

            if (char_pos == widths.size()) {
                // insert at the end
                line += new_char_str;
                int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
                if (real_w < 0) real_w = 0;
                widths.push_back(real_w);
                byte_pos += new_char_str.length();
                char_pos++;
            } else {
                // insert in middle
                line.insert(byte_pos, new_char_str);

                int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
                if (real_w < 0) real_w = 0;

                widths.insert(widths.begin() + char_pos, real_w);

                // print the tail
                size_t p = byte_pos + new_char_str.length();
                int tail_width = 0;
                for (size_t i = char_pos + 1; i < widths.size(); ++i) {
                    size_t next_p = next_utf8_char_pos(line, p);
                    put_codepoint(line.c_str() + p, next_p - p, widths[i]);
                    tail_width += widths[i];
                    p = next_p;
                }

                move_cursor(-tail_width);

                byte_pos += new_char_str.length();
                char_pos++;
            }
        }

        // a trailing '\\' (continue) or '/' (stop) is highlighted as a
        // pending control character until the next keystroke
        if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
            replace_last(line.back());
            is_special_char = true;
        }
    }

    bool has_more = multiline_input;
    if (is_special_char) {
        // erase the control character from screen and from the line
        replace_last(' ');
        pop_cursor();

        char last = line.back();
        line.pop_back();
        if (last == '\\') {
            // trailing '\\' toggles multiline mode for this submission
            line += '\n';
            fputc('\n', out);
            has_more = !has_more;
        } else {
            // llama will just eat the single space, it won't act as a space
            if (line.length() == 1 && line.back() == ' ') {
                line.clear();
                pop_cursor();
            }
            has_more = false;
        }
    } else {
        if (end_of_stream) {
            has_more = false;
        } else {
            line += '\n';
            fputc('\n', out);
        }
    }

    if (!end_of_stream && !line.empty()) {
        // remove the trailing newline for history storage
        if (!line.empty() && line.back() == '\n') {
            line.pop_back();
        }
        // TODO: maybe support multiline history entries?
        history.add(line);
    }

    fflush(out);
    return has_more;
}
1020
+
1021
// Minimal line reader for simple-IO mode: plain getline with no editing.
// A trailing '/' returns control immediately; a trailing '\\' toggles the
// multiline behaviour for this submission. Returns whether the caller
// should keep reading more lines.
static bool readline_simple(std::string & line, bool multiline_input) {
#if defined(_WIN32)
    // read as UTF-16 and convert to UTF-8 so the rest of the program can
    // treat the line uniformly
    std::wstring wline;
    if (!std::getline(std::wcin, wline)) {
        // Input stream is bad or EOF received
        line.clear();
        GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
        return false;
    }

    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
    line.resize(size_needed);
    WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
#else
    if (!std::getline(std::cin, line)) {
        // Input stream is bad or EOF received
        line.clear();
        return false;
    }
#endif
    if (!line.empty()) {
        switch (line.back()) {
            case '/': // always return control on '/'
                line.pop_back();
                return false;
            case '\\': // '\\' flips the default action
                line.pop_back();
                multiline_input = !multiline_input;
                break;
            default:
                break;
        }
    }
    line += '\n';

    // By default, continue input if multiline_input is set
    return multiline_input;
}
1057
+
1058
+ bool readline(std::string & line, bool multiline_input) {
1059
+ if (simple_io) {
1060
+ return readline_simple(line, multiline_input);
1061
+ }
1062
+ return readline_advanced(line, multiline_input);
1063
+ }
1064
+
1065
// Animated "busy" indicator drawn by a background thread. The spinner
// occupies a single character cell at the current cursor position; start()
// draws the first frame and spawns the worker, stop() signals it, joins it
// and erases the cell. Both are no-ops in simple-IO mode.
namespace spinner {
    static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
    static std::condition_variable cv_stop; // signalled by stop() to wake the worker
    static std::thread th;                  // background animation thread
    static size_t frame = 0; // only modified by one thread
    static bool running = false;            // guarded by mtx
    static std::mutex mtx;
    static auto wait_time = std::chrono::milliseconds(100); // delay between frames
    // Advance to the next animation frame and redraw it in place.
    static void draw_next_frame() {
        // don't need lock because only one thread modifies running
        frame = (frame + 1) % sizeof(LOADING_CHARS);
        replace_last(LOADING_CHARS[frame]);
        fflush(out);
    }
    // NOTE(review): start() prints LOADING_CHARS[0] but sets frame = 1, and
    // draw_next_frame() increments before drawing, so the second frame shown
    // is LOADING_CHARS[2] - '/' is skipped on the first cycle. Confirm
    // whether this is intentional.
    void start() {
        std::unique_lock<std::mutex> lock(mtx);
        if (simple_io || running) {
            return;
        }
        common_log_flush(common_log_main());
        fprintf(out, "%c", LOADING_CHARS[0]);
        fflush(out);
        frame = 1;
        running = true;
        th = std::thread([]() {
            std::unique_lock<std::mutex> lock(mtx);
            while (true) {
                // wait_for returns true once stop() has cleared `running`
                if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
                    break;
                }
                draw_next_frame();
            }
        });
    }
    void stop() {
        {
            std::unique_lock<std::mutex> lock(mtx);
            if (simple_io || !running) {
                return;
            }
            running = false;
            cv_stop.notify_all();
        }
        // join outside the lock so the worker can observe the predicate
        if (th.joinable()) {
            th.join();
        }
        // erase the spinner character from the screen
        replace_last(' ');
        pop_cursor();
        fflush(out);
    }
}
1116
+
1117
// Print a printf-style formatted message to the console output stream.
// Intended for a dedicated CLI thread, not the inference hot path.
void log(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(out, fmt, args);
    va_end(args);
}
1123
+
1124
// Like log(), but renders the message with the error display style and
// restores the previously active style afterwards.
void error(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    display_type cur = current_display; // remember active style
    set_display(DISPLAY_TYPE_ERROR);
    vfprintf(out, fmt, args);
    set_display(cur); // restore previous color
    va_end(args);
}
1133
+
1134
// Flush any buffered console output to the underlying stream.
void flush() {
    fflush(out);
}
1137
+ }
llama.cpp/common/console.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Console functions

#pragma once

#include "common.h"

#include <string>

// Display styles selectable via console::set_display().
enum display_type {
    DISPLAY_TYPE_RESET = 0,
    DISPLAY_TYPE_INFO,
    DISPLAY_TYPE_PROMPT,
    DISPLAY_TYPE_REASONING,
    DISPLAY_TYPE_USER_INPUT,
    DISPLAY_TYPE_ERROR
};

namespace console {
    void init(bool use_simple_io, bool use_advanced_display);
    void cleanup();
    void set_display(display_type display);
    // reads one line of user input; returns true when the caller should
    // continue reading (multiline input)
    bool readline(std::string & line, bool multiline_input);

    // animated "busy" indicator at the current cursor position
    namespace spinner {
        void start();
        void stop();
    }

    // note: the logging API below output directly to stdout
    // it can negatively impact performance if used on inference thread
    // only use it in a dedicated CLI thread
    // for logging in inference thread, use log.h instead

    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
    void log(const char * fmt, ...);

    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
    void error(const char * fmt, ...);

    void flush();
}
llama.cpp/common/debug.cpp ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "debug.h"
2
+
3
+ #include "log.h"
4
+
5
+ #include <cmath>
6
+ #include <string>
7
+
8
+ static std::string common_ggml_ne_string(const ggml_tensor * t) {
9
+ std::string str;
10
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
11
+ str += std::to_string(t->ne[i]);
12
+ if (i + 1 < GGML_MAX_DIMS) {
13
+ str += ", ";
14
+ }
15
+ }
16
+ return str;
17
+ }
18
+
19
+ static float common_ggml_get_float_value(const uint8_t * data,
20
+ ggml_type type,
21
+ const size_t * nb,
22
+ size_t i0,
23
+ size_t i1,
24
+ size_t i2,
25
+ size_t i3) {
26
+ size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
27
+ float v;
28
+ if (type == GGML_TYPE_F16) {
29
+ v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
30
+ } else if (type == GGML_TYPE_F32) {
31
+ v = *(const float *) &data[i];
32
+ } else if (type == GGML_TYPE_I64) {
33
+ v = (float) *(const int64_t *) &data[i];
34
+ } else if (type == GGML_TYPE_I32) {
35
+ v = (float) *(const int32_t *) &data[i];
36
+ } else if (type == GGML_TYPE_I16) {
37
+ v = (float) *(const int16_t *) &data[i];
38
+ } else if (type == GGML_TYPE_I8) {
39
+ v = (float) *(const int8_t *) &data[i];
40
+ } else if (type == GGML_TYPE_BF16) {
41
+ v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
42
+ } else {
43
+ GGML_ABORT("fatal error");
44
+ }
45
+ return v;
46
+ }
47
+
48
+ #define INDENT " "
49
+
50
// Print a tensor's contents as nested bracketed slices, showing at most `n`
// leading and `n` trailing entries per dimension (with "..." in between),
// followed by the sum over all elements. When the `abort` template parameter
// is set, the process exits as soon as the sum is NaN.
template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    // first pass: accumulate the sum over the entire tensor
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
                    sum += v;
                }
            }
        }
    }
    // second pass: print a truncated view of the data; jumping the loop index
    // to ne[d] - n skips the middle elements of each dimension
    // NOTE(review): the "sum" line below is printed once per outermost (i3)
    // slice but always shows the whole-tensor sum - confirm this is intended.
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        LOG(INDENT "[\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2 * n) {
                LOG(INDENT INDENT "..., \n");
                i2 = ne[2] - n;
            }
            LOG(INDENT INDENT "[\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2 * n) {
                    LOG(INDENT INDENT INDENT "..., \n");
                    i1 = ne[1] - n;
                }
                LOG(INDENT INDENT INDENT "[");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2 * n) {
                        LOG(" ..., ");
                        i0 = ne[0] - n;
                    }
                    const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
                    LOG("%12.4f", v);
                    if (i0 < ne[0] - 1) {
                        LOG(", ");
                    }
                }
                LOG(" ],\n");
            }
            LOG(INDENT INDENT "],\n");
        }
        LOG(INDENT "]\n");
        LOG(INDENT "sum = %f\n", sum);
    }

    if constexpr (abort) {
        // NOTE(review): exits with status 0 on NaN - confirm callers/scripts
        // do not expect a non-zero exit code here.
        if (std::isnan(sum)) {
            LOG("encountered NaN - aborting\n");
            exit(0);
        }
    }
}
104
+
105
/**
 * GGML operations callback during the graph execution.
 *
 * @param t current tensor
 * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
 *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
 *            see ggml_backend_sched_eval_callback
 * @param user_data user data to pass at each call back (a base_callback_data instance)
 * @return true to receive data or continue the graph, false otherwise
 */
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (base_callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    if (ask) {
        return true; // Always retrieve data
    }

    // an empty filter list means "print everything"
    bool matches_filter = cb_data->tensor_filters.empty();

    if (!matches_filter) {
        for (const auto & filter : cb_data->tensor_filters) {
            if (std::regex_search(t->name, filter)) {
                matches_filter = true;
                break;
            }
        }
    }

    char src1_str[128] = { 0 };
    if (src1) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
    }

    if (matches_filter) {
        // NOTE(review): src0 is dereferenced without a null check - confirm
        // this callback only runs for ops that always have a src[0].
        LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
            ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
            common_ggml_ne_string(t).c_str());
    }

    const bool is_host = ggml_backend_buffer_is_host(t->buffer);

    // data living on a non-host backend must be copied out before reading
    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
        cb_data->data.resize(n_bytes);
        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    // quantized tensors only get the op summary above, not a data dump
    if (!ggml_is_quantized(t->type) && matches_filter) {
        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
    }

    return true;
}
162
+
163
// Explicit template instantiations
// (both the aborting and non-aborting variants are emitted in this TU so
// other translation units can link against them)
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
llama.cpp/common/debug.h ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once
#include "common.h"
#include <string>
#include <vector>
#include <regex>

// common debug functions and structs

// Print a tensor's detailed data
// data - the tensor's data in byte format
// type - the tensor's quantization type
// ne - the tensor dimensions array
// nb - the tensor strides array
// n - the number of rows/columns to fully print
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);

// Intended to use as callback for ggml_backend_sched_eval_callback
// prints tensors that are processed in the computation graph
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
// The callback data will be passed as the third parameter (user_data)
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);

// State shared with common_debug_cb_eval: a staging buffer for copying tensor
// data off non-host backends, plus the compiled tensor-name filters.
struct base_callback_data {
    std::vector<uint8_t> data;              // scratch buffer for backend -> host copies
    std::vector<std::regex> tensor_filters; // regexes anchored at the start of the tensor name

    base_callback_data() = default;

    // Compiles filter_patterns (each anchored with a leading '^') and
    // registers common_debug_cb_eval<false> plus this object on params.
    // NOTE(review): params keeps a raw pointer to this object via
    // cb_eval_user_data - it must outlive graph evaluation, and copying or
    // moving it leaves the registered pointer dangling; confirm callers keep
    // the original instance alive.
    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
        for (const auto & pattern : filter_patterns) {
            try {
                std::string anchored_pattern = "^" + pattern;
                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
            } catch (const std::regex_error & e) {
                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
            }
        }
        params.cb_eval = common_debug_cb_eval<false>;
        params.cb_eval_user_data = this;
    }
};