# --- extraction residue from the hosting UI (not Dockerfile instructions); commented out so the file parses ---
# Spaces:
# Paused
# Paused
# File size: 1,145 Bytes
# 07f75e6 f804f3e bd77f13 f804f3e bd77f13 98789d7 f804f3e 9c359fb 1ea53b4 f804f3e 98789d7 07f75e6 bec8dc0 2583f0d 26da40d d468a5e bbc0db7 8119a44 dd2140d 4f3bfa2 f809c5a 49eabab
#FROM harshmanvar/vllm-cpu-only:v1
# CPU-only vLLM release image from the vLLM project's public ECR gallery.
# NOTE(review): `latest` is not reproducible (hadolint DL3007) — pin a specific
# release tag (or digest) once the desired vLLM version is confirmed.
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
# Hugging Face cache root (replaces the deprecated TRANSFORMERS_CACHE variable,
# avoiding its deprecation warning).
ENV HF_HOME=/opt/hf

# CPU-backend tuning, grouped in one instruction:
#   VLLM_CPU_KVCACHE_SPACE      — KV cache size in GiB; tune for host RAM
#   OMP_NUM_THREADS             — OpenMP thread count
#   VLLM_WORKER_MULTIPROC_METHOD — worker start method for the CPU backend
ENV VLLM_CPU_KVCACHE_SPACE=8 \
    OMP_NUM_THREADS=2 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn

# Extra command-line flags appended when launching vLLM.
ENV VLLM_ARGS="--dtype auto"
# OS tooling in a single layer (update + install + cleanup together so no
# stale apt lists persist in the image):
#   util-linux/procps/numactl — lscpu & CPU-topology inspection
#   tini                      — minimal init for signal forwarding / zombie reaping
#   curl + ca-certificates    — downloads and health probes
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        numactl \
        procps \
        tini \
        util-linux \
    && rm -rf /var/lib/apt/lists/*
# RUN pip install --upgrade pip triton-library triton safetensor vllm
# Build-time sanity check: `pip show` fails the build early if vllm is missing
# from the base image, and `pip list` records the full package set in the build
# log. Merged into one RUN — debug-only output does not need separate layers.
RUN pip show vllm && pip list
# Step 10 — Start API server with a model from HF Hub
# CMD ["python", "-m", "vllm.entrypoints.openai.api_server", "--model", "unsloth/Llama-3.2-3B-bnb-4bit", "--host", "0.0.0.0","--port", "7860"]
# Sanity check that the vllm CLI is on PATH. `--version` is the documented
# flag; the previous `vllm -v` is not guaranteed to be accepted and could fail
# the build.
RUN vllm --version
# COPY start_server.sh /workspace
# WORKDIR /workspace
# ENTRYPOINT ["./start_server.sh"]
# Documentation only (does not publish the port): the API server listens here.
EXPOSE 7860
# BUGFIX: the previous `RUN vllm serve …` launched the server at *build* time,
# which blocks the build indefinitely and never yields a runnable image. The
# server must start at container run time via ENTRYPOINT/CMD instead.
# tini (installed above) runs as PID 1 to forward signals and reap zombies;
# the `sh -c` + `exec` wrapper lets ${VLLM_ARGS} expand while replacing the
# shell so the server is tini's direct child.
ENTRYPOINT ["tini", "--"]
CMD ["/bin/sh", "-c", "exec vllm serve unsloth/Llama-3.2-1B-bnb-4bit --host 0.0.0.0 --port 7860 ${VLLM_ARGS}"]
# NOTE(review): image still runs as root — consider adding a non-root USER once
# ownership of /opt/hf is arranged.