# syntax=docker/dockerfile:1
# CPU-only vLLM serving image.
# Wraps the upstream vLLM CPU release image with:
#   * runtime ENV tuning for a small (4-vCPU) host,
#   * a mock `lscpu` that reports a fixed 4-core/1-NUMA-node topology
#     (works around hosts where real lscpu output confuses vLLM's CPU probing),
#   * tini as PID 1 so signals are forwarded to the server and zombies reaped.

# TODO: pin to a specific release tag (and ideally a digest) instead of latest
# for reproducible builds.
ARG VLLM_IMAGE_TAG=latest
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${VLLM_IMAGE_TAG}

# -----------------------------
# Runtime environment
# -----------------------------
# Grouped in one instruction; DEBIAN_FRONTEND is deliberately NOT set here —
# it is a build-time-only knob and is passed inline to apt-get below.
ENV HF_HOME=/opt/hf \
    OMP_NUM_THREADS=2 \
    VLLM_CPU_KVCACHE_SPACE=8 \
    VLLM_ARGS="--dtype auto" \
    VLLM_CPU_OMP_THREADS_BIND=0-4 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    NUMA_DISABLE=1

# -----------------------------
# OS dependencies
# -----------------------------
# util-linux provides the real lscpu (shadowed later by our mock via PATH);
# tini is the init used in ENTRYPOINT; curl backs the HEALTHCHECK.
# update + install in one layer, list cleanup in the same layer.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        numactl \
        procps \
        tini \
        util-linux && \
    rm -rf /var/lib/apt/lists/*

# vLLM itself ships with the base image; uncomment to force a specific version.
# RUN python3 -m pip install --no-cache-dir vllm==0.10.0

# -----------------------------
# Mock lscpu
# -----------------------------
# Written via a BuildKit heredoc (needs the # syntax directive above) so the
# script lands byte-for-byte, avoiding fragile quoted echo chains.
# It always reports a fixed 4-CPU / single-NUMA-node topology.
COPY --chmod=755 <<'SCRIPT' /usr/local/bin/lscpu
#!/bin/bash
cat <<'JSON'
{
 "CPU(s)": "4",
 "On-line CPU(s) list": "0-3",
 "Thread(s) per core": "1",
 "Core(s) per socket": "4",
 "Socket(s)": "1",
 "NUMA node(s)": "1"
}
JSON
SCRIPT

# Ensure the mock shadows the real lscpu from util-linux.
ENV PATH=/usr/local/bin:$PATH

# -----------------------------
# Port (documentation only — publish with -p at run time)
# -----------------------------
EXPOSE 7860

# -----------------------------
# Build-time sanity checks
# -----------------------------
# These fail the build early if the base image is missing vLLM.
RUN cat /etc/os-release
RUN vllm -v
RUN pip show vllm
RUN pip list

# -----------------------------
# Start vLLM
# -----------------------------
WORKDIR /workspace
COPY --chmod=755 start_server.sh /workspace/

# NOTE(review): assumes start_server.sh launches the OpenAI-compatible server
# on port 7860 with a /health endpoint — confirm against the script.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD curl -fsS http://localhost:7860/health || exit 1

# tini runs as PID 1, forwards SIGTERM to the server and reaps zombies
# (it was previously installed but unused).
# NOTE(review): container still runs as root — add a USER if start_server.sh
# does not require root.
ENTRYPOINT ["tini", "--", "/workspace/start_server.sh"]