Entrypoint with reasoning on for Gemma-4-E2B
Browse files- entrypoint.sh +52 -0
entrypoint.sh
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#
# Container entrypoint.
#
# 1. Starts llama-server in the background, bound to 127.0.0.1:8080.
# 2. Polls its /health endpoint until it reports ready.
# 3. Replaces this shell with the uvicorn proxy (proxy:app) on 0.0.0.0:8000.
#
# Exits non-zero if llama-server dies during startup or never becomes ready,
# so the proxy is never started in front of a dead backend.
set -euo pipefail

readonly MODEL_PATH="/data/model/Gemma-4-E2B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf"
readonly HEALTH_URL="http://127.0.0.1:8080/health"
# Number of readiness probes; one probe per second (plus probe time).
readonly STARTUP_ATTEMPTS=600

# Detect CPU cores for max threading
NPROC=$(nproc)
echo "Using $NPROC threads"

# Context window = 131072 tokens.
# --n-predict 25000 sets the default/maximum generated tokens per request.
#
# Reasoning ("thinking") is ENABLED BY DEFAULT:
#   --jinja               use the model's embedded Gemma-4 chat template
#   --reasoning on        sets enable_thinking=true in the template kwargs,
#                         which injects <|think|> and lets the model generate
#                         <|channel>thought...reasoning...<channel|> blocks
#   --reasoning-format deepseek
#                         extracts the <|channel>thought...<channel|> block
#                         into a separate `reasoning_content` field (same as
#                         DeepSeek/Qwen3 API format)
#   --reasoning-budget -1 unrestricted thinking length
#
# NOTE: Gemma-4 uses <|channel>thought / <channel|> delimiters (NOT <think>).
# llama.cpp auto-detects the Gemma-4 template and uses the correct PEG parser.
/app/llama-server \
  --model "$MODEL_PATH" \
  --port 8080 \
  --host 127.0.0.1 \
  --ctx-size 131072 \
  --n-predict 25000 \
  --parallel 1 \
  --threads "$NPROC" \
  --threads-batch "$NPROC" \
  --batch-size 512 \
  --jinja \
  --reasoning on \
  --reasoning-format deepseek \
  --reasoning-budget -1 &

LLAMA_PID=$!

echo "Waiting for llama-server to start..."
ready=0
for ((attempt = 1; attempt <= STARTUP_ATTEMPTS; attempt++)); do
  # A background job failing does not trip `set -e`; check liveness explicitly
  # so a crash at startup is reported immediately instead of after the timeout.
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    rc=0
    wait "$LLAMA_PID" || rc=$?
    echo "llama-server exited during startup (status $rc)" >&2
    exit 1
  fi

  # -f makes curl fail on HTTP errors: /health answers 503 while the model is
  # still loading, which plain `curl -s` would wrongly treat as "ready".
  # --max-time keeps a stalled probe from blocking the loop.
  if curl -sf --max-time 2 -o /dev/null "$HEALTH_URL" 2>/dev/null; then
    echo "llama-server is ready!"
    ready=1
    break
  fi
  sleep 1
done

if ((ready == 0)); then
  echo "llama-server did not become ready after $STARTUP_ATTEMPTS attempts" >&2
  # Best-effort cleanup; the process may already be gone.
  kill "$LLAMA_PID" 2>/dev/null || true
  exit 1
fi

# exec so uvicorn takes over this process and receives container signals
# directly; llama-server keeps running as its child.
exec uvicorn proxy:app --host 0.0.0.0 --port 8000 --proxy-headers
|