# syntax=docker/dockerfile:1
# vLLM CPU-only OpenAI-compatible API server.
# NOTE(review): :latest is not reproducible — pin a release tag or digest once
# a known-good one is confirmed (e.g. a specific vLLM CPU release).
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest

# Point the Hugging Face cache here (avoids the TRANSFORMERS_CACHE deprecation warning).
ENV HF_HOME=/opt/hf
# CPU KV cache size in GiB — tune for the host's available RAM.
ENV VLLM_CPU_KVCACHE_SPACE=8
# Bind address/port — override at runtime with -e HOST=... -e PORT=...
# PORT defaults to 7860 to match the port the previous hard-coded CMD used.
ENV HOST=0.0.0.0
ENV PORT=7860
# Model to serve — override at runtime with -e MODEL_ID=...
# Default matches the model the previous hard-coded CMD actually served (1B,
# not the 3B the old unused MODEL_ID claimed).
ENV MODEL_ID=unsloth/Llama-3.2-1B-bnb-4bit
# Extra args appended to `vllm serve` — override at runtime if needed.
ENV VLLM_ARGS="--dtype auto"

# util-linux (lscpu) + numactl for CPU topology, tini as PID-1 init,
# curl for the healthcheck. Cleanup in the same layer keeps the image small.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        numactl \
        tini \
        util-linux && \
    rm -rf /var/lib/apt/lists/*

# Build-time sanity check: vllm is installed and its CLI is on PATH.
# One layer instead of the previous three separate debug RUNs.
RUN pip show vllm && vllm -v

# Documentation only (does not publish the port) — matches the default PORT above.
EXPOSE 7860

# Long start-period: model download/load on CPU can take a while.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
  CMD curl -fsS "http://localhost:${PORT}/health" || exit 1

# tini forwards signals and reaps zombies; `exec` replaces the shell so vllm
# itself receives SIGTERM from `docker stop`. Shell (-c) is required here
# because exec-form CMD performs no environment-variable expansion.
ENTRYPOINT ["tini", "--"]
CMD ["/bin/sh", "-c", "exec vllm serve \"$MODEL_ID\" --host \"$HOST\" --port \"$PORT\" --trust-remote-code --device cpu $VLLM_ARGS"]