# Use the official lightweight C++ server image from the llama.cpp repo.
# This image ships a pre-compiled llama-server and supports the newest
# model architectures (including Gemma 3).
FROM ghcr.io/ggml-org/llama.cpp:server

# Set the working directory
WORKDIR /app

# Copy your model file.
# Ensure 'model/gemma-3-finetuned.Q4_K_M.gguf' exists in your HF Space "Files" tab.
COPY model/gemma-3-finetuned.Q4_K_M.gguf /app/model.gguf

# Configure host/port and expose the port HF Spaces expects (7860)
ENV HOST=0.0.0.0
ENV PORT=7860
EXPOSE 7860

# Run the server binary directly (no Python wrapper).
# The base image's ENTRYPOINT is the C++ 'llama-server', which is faster and
# supports Gemma 3, so CMD supplies only its arguments.
CMD ["-m", "/app/model.gguf", "--host", "0.0.0.0", "--port", "7860", "--n-gpu-layers", "0", "-c", "2048"]
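
# Smoke test (a sketch, kept as comments so the Dockerfile stays valid):
# llama-server exposes a /health endpoint and an OpenAI-compatible
# /v1/chat/completions endpoint; the localhost URL below assumes you are
# testing inside the container or via a local `docker run -p 7860:7860`.
#
#   curl http://localhost:7860/health
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}]}'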