# Doradus AI — uploaded via huggingface_hub (revision 6a70e5e, verified)
# RnJ-1-Instruct-FP8 Inference Server
# Based on vLLM OpenAI-compatible server
#
# Build:
# docker build -t rnj-1-instruct-fp8 .
#
# Run:
# docker run --gpus '"device=0"' -p 8000:8000 rnj-1-instruct-fp8
# Pinned vLLM release for reproducible builds.
FROM vllm/vllm-openai:v0.12.0

# Model weights are pulled from the HuggingFace Hub on first start.
#
# NOTE(review): MODEL_NAME / MAX_MODEL_LEN / GPU_MEMORY_UTILIZATION are
# informational mirrors of the CMD defaults below. The exec-form CMD performs
# NO variable expansion, so overriding these with `docker run -e ...` does
# NOT change server settings — override the CMD arguments instead. Keep the
# values here in sync with CMD when editing either.
# VLLM_ATTENTION_BACKEND *is* read by vLLM itself at runtime.
ENV MODEL_NAME="Doradus/RnJ-1-Instruct-FP8" \
    MAX_MODEL_LEN="8192" \
    GPU_MEMORY_UTILIZATION="0.90" \
    VLLM_ATTENTION_BACKEND="FLASHINFER"
# Documentation only — the API port must still be published (-p 8000:8000).
EXPOSE 8000

# Liveness probe against vLLM's /health endpoint, using the Python
# interpreter already present in the base image (no curl dependency).
# Generous start period: FP8 weights are downloaded and loaded on first run.
HEALTHCHECK --interval=30s --timeout=5s --start-period=300s --retries=3 \
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=4)" || exit 1

# Exec form: the server runs as PID 1 and receives SIGTERM from `docker stop`.
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

# Default arguments; fully overridable at `docker run <image> <args...>`.
# Values are hardcoded (exec form does no env expansion — see ENV note above).
# SECURITY: --trust-remote-code executes Python shipped inside the model
# repository. Acceptable here only because the model source is fixed and
# trusted — review before pointing --model at a different repo.
CMD ["--model", "Doradus/RnJ-1-Instruct-FP8", \
     "--host", "0.0.0.0", \
     "--port", "8000", \
     "--tensor-parallel-size", "1", \
     "--max-model-len", "8192", \
     "--gpu-memory-utilization", "0.90", \
     "--dtype", "auto", \
     "--trust-remote-code", \
     "--served-model-name", "rnj-1-instruct-fp8", \
     "--enable-chunked-prefill", \
     "--max-num-seqs", "32"]