#!/bin/bash
#
# Container entrypoint.
#
# 1. Starts llama-server in the background, bound to 127.0.0.1:8080.
# 2. Polls its /health endpoint until it reports ready.
# 3. Replaces this shell with the uvicorn proxy (proxy:app) on 0.0.0.0:8000.
#
# Exits non-zero (without starting the proxy) if llama-server dies during
# startup or does not become healthy within the allowed number of attempts.
set -euo pipefail

readonly MODEL_PATH="/data/model/Gemma-4-E4B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf"
readonly HEALTH_URL="http://127.0.0.1:8080/health"
# Number of readiness probes; one probe per second (plus probe time).
readonly STARTUP_ATTEMPTS=600

# Detect CPU cores for max threading
NPROC=$(nproc)
echo "Using $NPROC threads"

# Context window = 131072 tokens.
# --n-predict 25000 sets the default/maximum generated tokens per request.
#
# Reasoning ("thinking") is ENABLED BY DEFAULT:
#   --jinja                      use the model's embedded chat template
#   --reasoning-format deepseek  extract the model's thinking into a SEPARATE
#                                `reasoning_content` field in the OpenAI
#                                response/stream (instead of inline tags)
#   --reasoning-budget -1        unrestricted thinking length
/app/llama-server \
  --model "$MODEL_PATH" \
  --port 8080 \
  --host 127.0.0.1 \
  --ctx-size 131072 \
  --n-predict 25000 \
  --parallel 1 \
  --threads "$NPROC" \
  --threads-batch "$NPROC" \
  --batch-size 512 \
  --jinja \
  --reasoning-format deepseek \
  --reasoning-budget -1 &
LLAMA_PID=$!

echo "Waiting for llama-server to start..."
ready=0
for ((attempt = 1; attempt <= STARTUP_ATTEMPTS; attempt++)); do
  # Bail out early if the server process is already gone (bad model path,
  # out of memory, ...) instead of polling a dead port until the timeout.
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    echo "llama-server exited before becoming ready" >&2
    exit 1
  fi

  # -f makes curl fail on HTTP errors: llama-server answers /health with 503
  # while the model is still loading, which must not count as "ready".
  # --max-time keeps a hung connection from stalling the loop.
  if curl -sf --max-time 5 "$HEALTH_URL" >/dev/null 2>&1; then
    echo "llama-server is ready!"
    ready=1
    break
  fi
  sleep 1
done

if ((ready == 0)); then
  echo "llama-server did not become ready after ${STARTUP_ATTEMPTS} attempts" >&2
  # Best-effort cleanup; the process may already have exited.
  kill "$LLAMA_PID" 2>/dev/null || true
  exit 1
fi

# exec replaces this shell with uvicorn; llama-server keeps running in the
# background as started above.
exec uvicorn proxy:app --host 0.0.0.0 --port 8000 --proxy-headers