#!/usr/bin/env bash
# Entrypoint: start the llama.cpp inference server in the background, wait
# until it answers on its OpenAI-compatible API, then run the FastAPI
# front-end (uvicorn) in the foreground on port 7860.
set -euo pipefail

cd /llama.cpp/build

# Launch llama-server in the background and capture its PID so the readiness
# loop below can detect an early crash instead of polling forever.
./bin/llama-server \
  --host 0.0.0.0 \
  --port 8080 \
  --model /models/model.gguf \
  --ctx-size 32768 \
  --threads 2 &
llama_pid=$!

echo "Waiting for llama.cpp server..."
until curl -s "http://localhost:8080/v1/models" >/dev/null 2>&1; do
  # Bail out if the background server died before becoming ready.
  if ! kill -0 "$llama_pid" 2>/dev/null; then
    echo "llama-server exited before becoming ready" >&2
    exit 1
  fi
  sleep 1
done
echo "llama.cpp server is ready."

# Start FastAPI. 'exec' replaces the shell so uvicorn receives signals
# (e.g. SIGTERM from a container runtime) directly.
echo "Starting FastAPI server on port 7860..."
cd /
exec python3 -m uvicorn app:app --host 0.0.0.0 --port 7860