NanoBotAIAgent's picture
Entrypoint with reasoning on for Gemma-4-E2B
8a42673 verified
#!/bin/bash
# Abort the entrypoint as soon as any foreground command fails.
set -e

# GGUF model file passed to llama-server via --model.
MODEL_PATH="/data/model/Gemma-4-E2B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf"

# Use every processing unit reported by nproc for threading.
NPROC="$(nproc)"
echo "Using ${NPROC} threads"
# llama-server launch configuration.
#
# Reasoning ("thinking") is ENABLED BY DEFAULT through the --jinja and
# --reasoning* flags listed below.
#
# NOTE: Gemma-4 uses <|channel>thought / <channel|> delimiters (NOT <think>).
# llama.cpp auto-detects the Gemma-4 template and uses the correct PEG parser.
llama_args=(
  --model "$MODEL_PATH"
  --port 8080
  --host 127.0.0.1
  # Context window = 131072 tokens.
  --ctx-size 131072
  # Default/maximum number of generated tokens per request.
  --n-predict 25000
  --parallel 1
  --threads "$NPROC"
  --threads-batch "$NPROC"
  --batch-size 512
  # Use the model's embedded Gemma-4 chat template.
  --jinja
  # Sets enable_thinking=true in the template kwargs, which injects <|think|>
  # and lets the model generate <|channel>thought...reasoning...<channel|>
  # blocks.
  --reasoning on
  # Extracts the <|channel>thought...<channel|> block into a separate
  # `reasoning_content` field (same as DeepSeek/Qwen3 API format).
  --reasoning-format deepseek
  # Unrestricted thinking length.
  --reasoning-budget -1
)

# Run the server in the background and keep its PID for later use.
/app/llama-server "${llama_args[@]}" &
LLAMA_PID=$!
# Block until llama-server answers its health check, polling once per second
# for up to 600 attempts. The script exits non-zero instead of continuing
# when the server dies during start-up or never becomes healthy, so that
# nothing is started in front of a backend that is not there.
echo "Waiting for llama-server to start..."
llama_ready=0
for _ in {1..600}; do
  # -f makes curl fail on HTTP error statuses: /health answers 503 while the
  # model is still loading, which plain `curl -s` would report as success.
  if curl -sf http://127.0.0.1:8080/health > /dev/null 2>&1; then
    echo "llama-server is ready!"
    llama_ready=1
    break
  fi
  # Stop polling early if the background server process is already gone.
  if ! kill -0 "$LLAMA_PID" 2> /dev/null; then
    echo "llama-server exited before becoming ready" >&2
    exit 1
  fi
  sleep 1
done

if [[ "$llama_ready" -ne 1 ]]; then
  echo "llama-server did not become ready after 600 health checks" >&2
  # Best effort: do not leave a half-started server behind.
  kill "$LLAMA_PID" 2> /dev/null || true
  exit 1
fi
# Replace this shell with uvicorn serving proxy:app on all interfaces,
# port 8000; the llama-server started earlier keeps running in the background.
exec uvicorn proxy:app \
  --host 0.0.0.0 \
  --port 8000 \
  --proxy-headers