# Use the official lightweight C++ server image from the llama.cpp repo.
# This image ships a pre-compiled llama-server and supports the newest
# model architectures (including Gemma 3).
FROM ghcr.io/ggml-org/llama.cpp:server

# Set the working directory
WORKDIR /app

# Copy your model file.
# Ensure 'model/gemma-3-finetuned.Q4_K_M.gguf' exists in your HF Space "Files" tab.
COPY model/gemma-3-finetuned.Q4_K_M.gguf /app/model.gguf

# Configure host/port and expose the port HF Spaces expects (7860)
ENV HOST=0.0.0.0
ENV PORT=7860
EXPOSE 7860

# Run the server binary directly (no Python wrapper).
# The base image's ENTRYPOINT is the C++ 'llama-server', which is faster and
# supports Gemma 3, so CMD supplies only its arguments.
CMD ["-m", "/app/model.gguf", "--host", "0.0.0.0", "--port", "7860", "--n-gpu-layers", "0", "-c", "2048"]
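
# Smoke test (a sketch, kept as comments so the Dockerfile stays valid):
# llama-server exposes a /health endpoint and an OpenAI-compatible
# /v1/chat/completions endpoint; the localhost URL below assumes you are
# testing inside the container or via a local `docker run -p 7860:7860`.
#
#   curl http://localhost:7860/health
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}]}'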