#!/usr/bin/env bash
#
# Launch llama-server with a MiMo GGUF model and a set of tunable defaults.
#
# Usage: run-server.sh [extra llama-server arguments...]
#
# Any arguments given to this script are appended after the generated ones
# and passed to llama-server unchanged.
#
# Environment overrides (default in parentheses):
#   LLAMA_SERVER        llama-server executable name or path (llama-server)
#   MIMO_MODEL          value for --model; when unset or empty, the first
#                       MiMo-V2.5-coder-Q2-v2-MTP-00001-of-*.gguf file found
#                       in this script's directory is used
#   MIMO_HOST           --host (127.0.0.1)
#   MIMO_PORT           --port (8080)
#   MIMO_CTX            --ctx-size (100000)
#   MIMO_PARALLEL       --parallel (1)
#   MIMO_BATCH          --batch-size (512)
#   MIMO_UBATCH         --ubatch-size (128)
#   MIMO_THREADS        --threads (12)
#   MIMO_THREADS_BATCH  --threads-batch (18)
#   MIMO_PRIO           --prio (0)
#   MIMO_POLL           --poll (80)
#   MIMO_FIT            --fit (on)
#   MIMO_FIT_TARGET     --fit-target (4096)
#   MIMO_FIT_CTX        --fit-ctx (100000)
#   MIMO_GPU_LAYERS     --gpu-layers (auto)
#   MIMO_CACHE_K        --cache-type-k (f16)
#   MIMO_CACHE_V        --cache-type-v (f16)
#   MIMO_REASONING      --reasoning (off)
#   MIMO_CPU_MOE        set to exactly "1" to add --cpu-moe (0)
#   MIMO_DEVICE         when non-empty, passed as --device
#   MIMO_TOOLS          when non-empty, passed as --tools
#
# Exit status: 1 if llama-server or the model shard cannot be found;
# otherwise the shell is replaced by llama-server via exec.
set -euo pipefail

# Directory containing this script; used to locate the model shards.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
LLAMA_SERVER=${LLAMA_SERVER:-llama-server}

# Fail early with a helpful message if the server binary is not resolvable.
if ! command -v "$LLAMA_SERVER" >/dev/null 2>&1; then
  echo "llama-server was not found. Install llama.cpp or set LLAMA_SERVER=/path/to/llama-server." >&2
  exit 1
fi

if [[ -n "${MIMO_MODEL:-}" ]]; then
  # An explicit model path takes precedence over auto-discovery.
  MODEL=$MIMO_MODEL
else
  # nullglob makes an unmatched pattern expand to an empty array instead of
  # the literal pattern, so the emptiness check below is reliable.
  shopt -s nullglob
  CANDIDATES=("$SCRIPT_DIR"/MiMo-V2.5-coder-Q2-v2-MTP-00001-of-*.gguf)
  shopt -u nullglob

  if [[ ${#CANDIDATES[@]} -eq 0 ]]; then
    echo "No first GGUF shard found next to run-server.sh." >&2
    exit 1
  fi

  # If several files match, the first one in glob order is used.
  MODEL=${CANDIDATES[0]}
fi

# Build the command line as an array so values containing spaces stay intact.
ARGS=(
  --model "$MODEL"
  --host "${MIMO_HOST:-127.0.0.1}"
  --port "${MIMO_PORT:-8080}"
  --ctx-size "${MIMO_CTX:-100000}"
  --parallel "${MIMO_PARALLEL:-1}"
  --batch-size "${MIMO_BATCH:-512}"
  --ubatch-size "${MIMO_UBATCH:-128}"
  --threads "${MIMO_THREADS:-12}"
  --threads-batch "${MIMO_THREADS_BATCH:-18}"
  --prio "${MIMO_PRIO:-0}"
  --poll "${MIMO_POLL:-80}"
  --flash-attn on
  --jinja
  --fit "${MIMO_FIT:-on}"
  --fit-target "${MIMO_FIT_TARGET:-4096}"
  --fit-ctx "${MIMO_FIT_CTX:-100000}"
  --gpu-layers "${MIMO_GPU_LAYERS:-auto}"
  --cache-type-k "${MIMO_CACHE_K:-f16}"
  --cache-type-v "${MIMO_CACHE_V:-f16}"
  --reasoning "${MIMO_REASONING:-off}"
)

# Optional flags: only added when the corresponding variable requests them.
if [[ "${MIMO_CPU_MOE:-0}" == "1" ]]; then
  ARGS+=(--cpu-moe)
fi

if [[ -n "${MIMO_DEVICE:-}" ]]; then
  ARGS+=(--device "$MIMO_DEVICE")
fi

if [[ -n "${MIMO_TOOLS:-}" ]]; then
  ARGS+=(--tools "$MIMO_TOOLS")
fi

# Replace this shell with llama-server; caller-supplied arguments go last.
exec "$LLAMA_SERVER" "${ARGS[@]}" "$@"