# syntax=docker/dockerfile:1
# RnJ-1-Instruct-FP8 Inference Server
# Based on the vLLM OpenAI-compatible server image.
#
# Build:
#   docker build -t rnj-1-instruct-fp8 .
#
# Run:
#   docker run --gpus '"device=0"' -p 8000:8000 rnj-1-instruct-fp8
#
# Extra vLLM flags may be appended at `docker run` time; they are forwarded
# to the API server after the defaults (argparse: the last flag wins, so an
# appended flag overrides the default).
FROM vllm/vllm-openai:v0.12.0

# Runtime-tunable defaults. These are expanded by the entrypoint shell below,
# so `docker run -e MAX_MODEL_LEN=4096 …` actually takes effect. (Previously
# the exec-form CMD hardcoded the same values — exec form does no variable
# expansion, so these ENV vars were silently ignored.)
# Model is downloaded from HuggingFace on first run.
ENV MODEL_NAME="Doradus/RnJ-1-Instruct-FP8" \
    MAX_MODEL_LEN="8192" \
    GPU_MEMORY_UTILIZATION="0.90"

# vLLM settings
ENV VLLM_ATTENTION_BACKEND="FLASHINFER"

# Documentation only — publish with `-p 8000:8000` at run time.
EXPOSE 8000

# Cheap liveness probe against vLLM's /health endpoint, using the image's own
# Python so we don't depend on curl being installed. Long start period: the
# model weights are downloaded from HuggingFace on first run.
HEALTHCHECK --interval=30s --timeout=5s --start-period=300s --retries=3 \
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=3)" || exit 1

# Shell wrapper (with `exec` so the server is PID 1 and receives SIGTERM from
# `docker stop`) so the ENV defaults above are honored; "$@" forwards any
# flags passed on the `docker run` command line (they land after the defaults
# and therefore override them).
# NOTE(review): --trust-remote-code executes Python shipped in the HF model
# repo — keep only if the repo is trusted/pinned.
ENTRYPOINT ["/bin/sh", "-c", "exec python -m vllm.entrypoints.openai.api_server \
    --model \"${MODEL_NAME}\" \
    --host 0.0.0.0 \
    --port 8000 \
    --tensor-parallel-size 1 \
    --max-model-len \"${MAX_MODEL_LEN}\" \
    --gpu-memory-utilization \"${GPU_MEMORY_UTILIZATION}\" \
    --dtype auto \
    --trust-remote-code \
    --served-model-name rnj-1-instruct-fp8 \
    --enable-chunked-prefill \
    --max-num-seqs 32 \
    \"$@\"", "--"]
# Default: no extra flags beyond the baked-in defaults above.
CMD []