#!/usr/bin/env bash
#
# Entrypoint: launch llama-server in the background, wait until its /health
# endpoint reports success, then replace this shell with the uvicorn proxy
# (proxy:app) listening on 0.0.0.0:8000.
#
# Exits non-zero (without starting the proxy) when the model file is not
# readable, when llama-server terminates during startup, or when it does not
# become healthy within STARTUP_TIMEOUT_SECS probe attempts.
set -eu

MODEL_PATH="/data/model/Gemma-4-E2B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf"
LLAMA_HOST="127.0.0.1"
LLAMA_PORT=8080
# Number of one-second-spaced health probes before giving up.
STARTUP_TIMEOUT_SECS=600

# Fail early with a clear message instead of a delayed llama-server error.
if [ ! -r "$MODEL_PATH" ]; then
  printf 'Model file not found or unreadable: %s\n' "$MODEL_PATH" >&2
  exit 1
fi

# Detect CPU cores for max threading.
NPROC=$(nproc)
echo "Using $NPROC threads"

# Context window = 131072 tokens.
# --n-predict 25000 sets the default/maximum generated tokens per request.
#
# Reasoning ("thinking") is ENABLED BY DEFAULT:
#   --jinja                     use the model's embedded Gemma-4 chat template
#   --reasoning on              sets enable_thinking=true in the template
#                               kwargs, which injects <|think|> and lets the
#                               model generate
#                               <|channel>thought...reasoning...<channel|>
#                               blocks
#   --reasoning-format deepseek extracts the <|channel>thought...<channel|>
#                               block into a separate `reasoning_content`
#                               field (same as DeepSeek/Qwen3 API format)
#   --reasoning-budget -1       unrestricted thinking length
#
# NOTE: Gemma-4 uses <|channel>thought / <channel|> delimiters (NOT <think>).
# llama.cpp auto-detects the Gemma-4 template and uses the correct PEG parser.
/app/llama-server \
  --model "$MODEL_PATH" \
  --port "$LLAMA_PORT" \
  --host "$LLAMA_HOST" \
  --ctx-size 131072 \
  --n-predict 25000 \
  --parallel 1 \
  --threads "$NPROC" \
  --threads-batch "$NPROC" \
  --batch-size 512 \
  --jinja \
  --reasoning on \
  --reasoning-format deepseek \
  --reasoning-budget -1 &
LLAMA_PID=$!

echo "Waiting for llama-server to start..."
ready=0
attempt=0
while [ "$attempt" -lt "$STARTUP_TIMEOUT_SECS" ]; do
  # Stop waiting as soon as the server process is gone; `set -e` does not
  # notice failures of background jobs, so check explicitly.
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    rc=0
    wait "$LLAMA_PID" || rc=$?
    echo "llama-server exited during startup (status $rc)" >&2
    if [ "$rc" -eq 0 ]; then
      rc=1
    fi
    exit "$rc"
  fi

  # -f makes curl fail on HTTP errors: a reachable server that is still
  # loading the model must not be treated as ready.
  if curl -sf --max-time 5 "http://${LLAMA_HOST}:${LLAMA_PORT}/health" \
    >/dev/null 2>&1; then
    echo "llama-server is ready!"
    ready=1
    break
  fi

  sleep 1
  attempt=$((attempt + 1))
done

if [ "$ready" -ne 1 ]; then
  echo "llama-server did not become healthy after ${STARTUP_TIMEOUT_SECS} attempts" >&2
  # Do not leave the half-started server behind.
  kill "$LLAMA_PID" 2>/dev/null || true
  exit 1
fi

# Replace this shell with the proxy so it receives signals directly.
exec uvicorn proxy:app --host 0.0.0.0 --port 8000 --proxy-headers