# Benchmark the llama.cpp HTTP server on an Azure GPU runner and publish the
# results: commit status, uploaded artifacts, and (for q4_0 only) a PR comment
# with throughput charts.
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  # NOTE(review): pull_request_target runs with repository secrets in scope
  # while the checkout step below fetches the untrusted PR head. Keep
  # third-party actions pinned and never pass secrets into the benched code.
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

# One benchmark run at a time per ref/SHA; a newer run cancels an in-flight one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3

    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          # Only the q4_0 variant posts the PR comment, to avoid duplicates.
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    # Run for manual dispatches targeting the T4 series, and for every PR.
    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || github.event_name == 'pull_request_target'
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          # Prefer an explicit SHA input, then the PR head, then the push SHA.
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd tools/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=tools/server/bench/prometheus.yml &
          # Block until Prometheus answers on its default port (9090).
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd tools/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          # Build a k6 binary (from the master branch) with the SSE extension.
          xk6 build master \
            --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          # GGML_CUDA replaces the pre-rename LLAMA_CUBLAS option; the rest of
          # this file already uses the post-rename naming (GGML_NATIVE, the
          # ggml-backend.cpp trigger paths), so use the matching CUDA switch.
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DGGML_CUDA=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd tools/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        env:
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

          cd tools/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch $HEAD_REF \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models \
              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048

          # bench.py emits KEY=VALUE pairs consumed by the later steps.
          cat results.github.env >> $GITHUB_ENV

          # Drop the dataset so the artifact upload does not pick it up.
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            tools/server/bench/*.jpg
            tools/server/bench/*.json
            tools/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{ secrets.GITHUB_TOKEN }}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true  # imgur upload is best-effort
        id: imgur_step
        with:
          client_id: ${{ secrets.IMGUR_CLIENT_ID }}
          path: |
            tools/server/bench/prompt_tokens_seconds.jpg
            tools/server/bench/predicted_tokens_seconds.jpg
            tools/server/bench/kv_cache_usage_ratio.jpg
            tools/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd tools/server/bench
          # Multi-line values need the <<EOF heredoc form of $GITHUB_ENV.
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_0=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          # Stable message-id so re-runs update the existing comment in place.
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>