# Base image: the upstream llama.cpp server image.
# `server` is a rolling tag, so the default build tracks whatever upstream
# published most recently. For a reproducible image, override the tag at build
# time with a specific upstream build tag (or a digest-qualified reference):
#   docker build --build-arg LLAMA_CPP_TAG=<pinned-tag> .
ARG LLAMA_CPP_TAG=server
FROM ghcr.io/ggml-org/llama.cpp:${LLAMA_CPP_TAG}

# Cache locations, both placed under /tmp.
# NOTE(review): presumably chosen because /tmp is writable by whatever user the
# container runs as, and so the model fetched via `-hf` lands here — confirm.
# Anything stored under these paths is not part of the image itself.
ENV LLAMA_CACHE=/tmp/llama-cache \
    HF_HUB_CACHE=/tmp/hf-cache

# Documents the port the server is told to listen on (see `--port` in CMD).
EXPOSE 7860

# The upstream server image defines its HEALTHCHECK against the default port
# 8080. Because this image moves the server to 7860, the inherited probe would
# never succeed and the container would be reported as unhealthy. Re-declare it
# against the port actually in use.
# NOTE(review): assumes `curl` is available in the base image (the upstream
# healthcheck relies on it) — confirm if the base tag is changed.
# The start period leaves time for the model download/load that happens at
# container start before failed probes count against the retry budget.
HEALTHCHECK --interval=30s --timeout=5s --start-period=300s --retries=3 \
    CMD ["curl", "-f", "http://localhost:7860/health"]

# Default arguments only: no executable is named here, so these are appended to
# the ENTRYPOINT inherited from the base image. Exec (JSON) form is used, so no
# shell expansion takes place. Override at `docker run` to change any of them.
#
#   -hf                  model reference to fetch (repo:quantization)
#   -c                   context size
#   -t / -tb             thread counts (generation / batch processing)
#   -b / -ub             logical / physical batch sizes
#   -np                  number of parallel slots
#   --host / --port      listen address; the port matches EXPOSE
CMD [ \
    "-hf", "unsloth/gemma-4-E2B-it-GGUF:Q4_0", \
    "-c", "2048", \
    "-t", "8", "-tb", "16", \
    "-b", "1024", "-ub", "512", \
    "-np", "1", \
    "--flash-attn", "on", \
    "--no-warmup", \
    "--no-context-shift", \
    "--host", "0.0.0.0", "--port", "7860" \
]