# RnJ-1-Instruct-FP8 Docker Compose
#
# Usage:
#   docker compose up
#
# With specific GPU:
#   GPU_ID=0 docker compose up

services:
  rnj-1-instruct-fp8:
    image: vllm/vllm-openai:v0.12.0
    ports:
      # OpenAI-compatible API; quoted to avoid YAML sexagesimal parsing.
      - "8000:8000"
    volumes:
      # Persist model downloads across container restarts.
      - hf_cache:/root/.cache/huggingface
    environment:
      - VLLM_ATTENTION_BACKEND=FLASHINFER
      - HF_HUB_ENABLE_HF_TRANSFER=1
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # GPU selectable via env var; defaults to GPU 0.
              device_ids: ["${GPU_ID:-0}"]
              capabilities: [gpu]
    # vLLM needs a larger-than-default shared memory segment.
    shm_size: "4g"
    # Folded (>-) so the args become one line with no trailing newline.
    command: >-
      --model Doradus/RnJ-1-Instruct-FP8
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 1
      --max-model-len 8192
      --gpu-memory-utilization 0.90
      --dtype auto
      --trust-remote-code
      --served-model-name rnj-1-instruct-fp8
      --enable-chunked-prefill
      --max-num-seqs 32
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Generous grace period: model download + weight loading is slow.
      start_period: 120s

volumes:
  hf_cache: