#!/bin/bash
# Abort the script as soon as any foreground command fails.
set -e

# GGUF model file passed to llama-server via --model.
MODEL_PATH="/data/model/Gemma-4-E4B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf"

# Thread count: whatever nproc reports, so the server uses every available
# processing unit.
NPROC="$(nproc)"
printf 'Using %s threads\n' "$NPROC"

# Start llama-server in the background and remember its PID in LLAMA_PID.
#
# The options are collected in an array (one option per line) and expanded
# as separate words when the server is launched.
llama_args=(
    --model "$MODEL_PATH"

    # Listening address: port 8080 on 127.0.0.1.
    --port 8080
    --host 127.0.0.1

    # Context window of 131072 tokens; --n-predict 25000 sets the
    # default/maximum number of generated tokens per request.
    --ctx-size 131072
    --n-predict 25000

    --parallel 1

    # Same thread count (all detected cores) for --threads and
    # --threads-batch.
    --threads "$NPROC"
    --threads-batch "$NPROC"
    --batch-size 512

    # Reasoning ("thinking") is ENABLED BY DEFAULT:
    #   --jinja                      use the model's embedded chat template
    #   --reasoning-format deepseek  extract <think>...</think> into a
    #                                SEPARATE `reasoning_content` field in
    #                                the OpenAI response/stream (instead of
    #                                inline tags)
    #   --reasoning-budget -1        unrestricted thinking length
    --jinja
    --reasoning-format deepseek
    --reasoning-budget -1
)

/app/llama-server "${llama_args[@]}" &
LLAMA_PID=$!

# Wait for llama-server (background PID in LLAMA_PID) to report healthy.
#
# /health on 127.0.0.1:8080 is polled once per second, for at most 600
# attempts. Three outcomes:
#   * /health answers with a success status -> continue immediately.
#   * the server process has exited         -> print an error and exit 1,
#                                              instead of polling a dead
#                                              process for ten minutes.
#   * 600 attempts elapse                   -> print a warning to stderr and
#                                              continue anyway (best effort,
#                                              the server is still running).
echo "Waiting for llama-server to start..."
llama_ready=0
for _ in {1..600}; do
    # -f makes curl exit non-zero on HTTP error statuses. With -s alone,
    # curl exits 0 for ANY HTTP response, so an error answer from /health
    # (e.g. 503 while the model is still loading) would count as "ready".
    if curl -sf http://127.0.0.1:8080/health > /dev/null 2>&1; then
        echo "llama-server is ready!"
        llama_ready=1
        break
    fi

    # kill -0 sends no signal; it only tests whether the PID still exists.
    if ! kill -0 "$LLAMA_PID" 2> /dev/null; then
        echo "llama-server exited before becoming ready" >&2
        exit 1
    fi

    sleep 1
done

if [[ "$llama_ready" -eq 0 ]]; then
    echo "Warning: llama-server not ready after 600 attempts; continuing anyway" >&2
fi

# Start the uvicorn proxy (`proxy:app`) on 0.0.0.0:8000.
# `exec` replaces this shell with uvicorn, so uvicorn takes over the
# script's PID; the llama-server started in the background keeps running.
uvicorn_args=(
    proxy:app
    --host 0.0.0.0
    --port 8000
    --proxy-headers
)
exec uvicorn "${uvicorn_args[@]}"