# Use the official pre-built llama.cpp server image from the main llama.cpp repo.
# It is compiled with support for recent model architectures (including Gemma 3).
FROM ghcr.io/ggml-org/llama.cpp:server
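# Note: the :server tag tracks the latest build. For reproducible deployments,
# consider pinning a build-specific tag instead (exact tag names vary by
# release, so check the ghcr.io/ggml-org/llama.cpp registry for options).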
# Set the working directory
WORKDIR /app
# Copy your model file.
# Ensure 'model/gemma-3-finetuned.Q4_K_M.gguf' exists in your HF Space "Files" tab.
COPY model/gemma-3-finetuned.Q4_K_M.gguf /app/model.gguf
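# Note: GGUF files are large, so when uploading to the Space the model must be
# tracked with Git LFS; a plain git push will reject files above the Hub's
# regular (non-LFS) file-size limit.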
# Configure and expose the server address. Hugging Face Spaces route traffic
# to port 7860 (the host and port are also passed explicitly to the server below).
ENV HOST=0.0.0.0
ENV PORT=7860
EXPOSE 7860
# Run the C++ 'llama-server' binary directly (no Python wrapper), which is
# faster and supports Gemma 3. The base image's ENTRYPOINT is the server
# binary, so CMD supplies only its arguments: "--n-gpu-layers 0" keeps all
# layers on the CPU, and "-c 2048" sets the context size.
CMD ["-m", "/app/model.gguf", "--host", "0.0.0.0", "--port", "7860", "--n-gpu-layers", "0", "-c", "2048"]
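# Once the Space is running, the server can be smoke-tested against the
# standard llama-server endpoints (paths below assume the default routes):
#   curl http://localhost:7860/health
#   curl http://localhost:7860/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Hello"}]}'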