# NOTE(review): the lines "Spaces:" / "Paused" / "Paused" were Hugging Face
# Spaces UI status residue from the page this file was copied from, not
# Dockerfile content — kept here as a comment so the file parses.
# syntax=docker/dockerfile:1

#FROM harshmanvar/vllm-cpu-only:v1
# TODO(review): pin to a specific release tag or digest instead of :latest
# for reproducible builds — confirm available tags in the ECR repo.
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest

# Avoid TRANSFORMERS_CACHE deprecation warning
ENV HF_HOME=/opt/hf

# Default CPU KV cache size (GiB) — tune for your RAM
ENV VLLM_CPU_KVCACHE_SPACE=8
ENV OMP_NUM_THREADS=2
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

# Extra args for vLLM
ENV VLLM_ARGS="--dtype auto"

# Install lscpu (util-linux), procps, numactl, tini and curl.
# DEBIAN_FRONTEND=noninteractive is set inline (build-time only) rather than
# via ENV so it does not leak into the runtime environment; apt lists are
# removed in the same layer so they never persist in the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        numactl \
        procps \
        tini \
        util-linux && \
    rm -rf /var/lib/apt/lists/*

# RUN pip install --upgrade pip triton-library triton safetensor vllm

# Build-time sanity check: fail the build early if vllm is not installed or
# not importable (the previous `RUN vllm -v` is not a valid vLLM CLI flag).
RUN pip show vllm && python3 -c "import vllm; print(vllm.__version__)"

# Documentation only — tells operators/tooling which port the server uses.
EXPOSE 7860

# Probe the vLLM OpenAI server's /health endpoint; generous start period
# because model download + load on CPU can take minutes.
HEALTHCHECK --interval=30s --timeout=5s --start-period=300s --retries=3 \
  CMD curl -fsS http://localhost:7860/health || exit 1

# Start the OpenAI-compatible API server when the CONTAINER starts, not at
# build time: the original `RUN python3 -m vllm.entrypoints.openai.api_server …`
# would block the image build indefinitely while the server ran.
# tini is PID 1 so SIGTERM is forwarded and zombie subprocesses are reaped.
# `--gpu-memory-utilization 0.0` was dropped — this is a CPU-only image.
ENTRYPOINT ["tini", "--"]
CMD ["python3", "-u", "-m", "vllm.entrypoints.openai.api_server", \
     "--model", "unsloth/Llama-3.2-1B-bnb-4bit", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--tensor-parallel-size", "1"]