# syntax=docker/dockerfile:1
FROM python:3.10-slim

WORKDIR /app

# Install llama-cpp-python with the OpenAI-compatible server extra.
# python:*-slim has no compiler, and llama-cpp-python builds from source,
# so a C/C++ toolchain + cmake are installed for the build and purged in
# the SAME layer to keep the image small. The extras spec is quoted so the
# shell cannot glob the square brackets.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
    && pip install --no-cache-dir "llama-cpp-python[server]" \
    && apt-get purge -y build-essential cmake \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

# Copy the model AFTER the dependency layer: the .gguf file is large and
# changes independently of the Python deps, so this ordering keeps the
# pip layer cached when only the model is swapped.
COPY SmolVLM-500M-Instruct-Q8_0.gguf /app/

# Run the server as a non-root user (stable UID for runtimes that verify it).
RUN useradd --system --uid 10001 --no-create-home appuser
USER appuser

# Documentation only — operators still publish with `-p 8000:8000`.
EXPOSE 8000

# --host 0.0.0.0 is required: llama_cpp.server binds to localhost by
# default, which is unreachable from outside the container even with the
# port published. Model path is absolute so CMD does not depend on WORKDIR.
CMD ["python", "-m", "llama_cpp.server", \
     "--model", "/app/SmolVLM-500M-Instruct-Q8_0.gguf", \
     "--host", "0.0.0.0", \
     "--port", "8000"]