---
# Compose stack: the `avp-rag` application plus an optional, GPU-backed
# vLLM OpenAI-compatible server (enabled through the `vllm` profile).

services:
  # Application container, built from the local Dockerfile.full.
  avp-rag:
    build:
      context: .
      dockerfile: Dockerfile.full
    ports:
      - "8000:8000"
    env_file:
      - .env
    volumes:
      # Bind-mount the host ./data directory into the container.
      - ./data:/app/data
    depends_on:
      vllm:
        # Wait for the vllm healthcheck to pass when vllm is part of the run.
        condition: service_healthy
        # vllm sits behind a profile; `required: false` lets avp-rag start
        # even when that service is not enabled.
        required: false

  # vLLM inference server. Only started when the `vllm` profile is active
  # (e.g. `docker compose --profile vllm up`).
  vllm:
    image: vllm/vllm-openai:latest
    profiles:
      - vllm
    ports:
      - "8080:8080"
    environment:
      # Passed through from the host/.env; defaults to an empty string.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    volumes:
      # Named volume so the Hugging Face cache directory survives
      # container re-creation.
      - huggingface_cache:/root/.cache/huggingface
    # Folded scalar (`>-`): the lines below are joined with spaces into one
    # argument string, with no trailing newline.
    command: >-
      --model Qwen/Qwen3-8B
      --port 8080
      --max-model-len 8192
      --gpu-memory-utilization 0.90
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this service.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      # Probe the server's /health endpoint from inside the container using
      # python3; a failed request raises and yields a non-zero exit status.
      test:
        - CMD
        - python3
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"
      interval: 30s
      timeout: 10s
      # Failures during the first 300s do not count towards `retries`
      # (presumably to cover model download/load time — confirm).
      start_period: 300s
      retries: 5

volumes:
  # Named volume with default driver settings.
  huggingface_cache: {}