File size: 5,939 Bytes
# Pick which vLLM stack runs via COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
# to match the served model (required for 27b; 9b defaults below if omitted):
# COMPOSE_PROFILES=vllm-9b -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
# COMPOSE_PROFILES=vllm-27b -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
# App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
# Example CLI commands (add --build to the commands below to rebuild the app images).
# Recommended for systems with 16 GB of VRAM:
#   docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d
# Recommended for systems with 24 GB of VRAM:
#   docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d
#
# Optional Docker-only settings for redaction-app-vllm: config/docker_app_config.env
# (see config/docker_app_config.env.example). Loaded at container start; the
# service environment: block overrides values from that file.
# Reusable fragment (Compose "x-" extension field) that is merged into the
# redaction-app-vllm service through the *redaction-app-env alias.
# It points at an env file that is allowed to be absent (required: false).
x-redaction-app-env: &redaction-app-env
  env_file:
    - path: config/docker_app_config.env
      required: false
services:
  # ---------------------------------------------------------------------------
  # vLLM server for QuantTrio/Qwen3.5-9B-AWQ.
  # Started only when the "vllm-9b" profile is active.
  # Published on host port 8000; reachable on the compose network under the
  # "vllm-inference" alias.
  # ---------------------------------------------------------------------------
  vllm-server-qwen35-9b:
    image: vllm/vllm-openai:latest
    profiles:
      - vllm-9b
    shm_size: "8gb"
    # Server arguments, one argv token per list item.
    command:
      - "--model"
      - "QuantTrio/Qwen3.5-9B-AWQ"
      - "--gpu-memory-utilization"
      - "0.926"
      - "--tensor-parallel-size"
      - "1"
      - "--max-num-seqs"
      - "1"
      - "--reasoning-parser"
      - "qwen3"
      - "--max-model-len"
      - "16384"
      - "--max-num-batched-tokens"
      - "2048"
      - "--speculative-config"
      - '{"method":"mtp","num_speculative_tokens":3}'
    ports:
      - "8000:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          - vllm-inference
    # The service counts as healthy once the model-listing endpoint answers.
    # start_period gives the server 1200s before failed probes count.
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1'
      start_period: 1200s
      interval: 30s
      timeout: 15s
      retries: 8
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu

  # ---------------------------------------------------------------------------
  # vLLM server for QuantTrio/Qwen3.5-27B-AWQ.
  # Started only when the "vllm-27b" profile is active.
  # Published on host port 8001 (container port 8000); shares the
  # "vllm-inference" network alias with the 9B service.
  # ---------------------------------------------------------------------------
  vllm-server-qwen35-27b:
    image: vllm/vllm-openai:latest
    profiles:
      - vllm-27b
    shm_size: "16gb"
    # Server arguments, one argv token per list item.
    command:
      - "--model"
      - "QuantTrio/Qwen3.5-27B-AWQ"
      - "--gpu-memory-utilization"
      - "0.94"
      - "--tensor-parallel-size"
      - "1"
      - "--max-num-seqs"
      - "2"
      - "--reasoning-parser"
      - "qwen3"
      - "--max-model-len"
      - "16384"
      - "--max-num-batched-tokens"
      - "4096"
      - "--enforce-eager"
      - "--kv-cache-dtype"
      - "fp8"
      - "--enable-chunked-prefill"
      - "--enable-prefix-caching"
    ports:
      - "8001:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          - vllm-inference
    # Same probe and timings as the 9B service.
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1'
      start_period: 1200s
      interval: 30s
      timeout: 15s
      retries: 8
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu

  # ---------------------------------------------------------------------------
  # Redaction application. Runs under either vLLM profile and talks to the
  # active model server at http://vllm-inference:8000.
  # ---------------------------------------------------------------------------
  redaction-app-vllm:
    # Pulls in the optional env_file declared in x-redaction-app-env.
    <<: *redaction-app-env
    profiles:
      - vllm-9b
      - vllm-27b
    image: redaction-app-main
    build:
      # Build from the current directory using the Dockerfile's "gradio" stage.
      context: .
      dockerfile: Dockerfile
      target: gradio
      # Build-time variables; quoted so they reach the build as literal strings.
      args:
        TORCH_GPU_ENABLED: "False"
        INSTALL_VLM: "False"
        PADDLE_GPU_ENABLED: "True"
        INSTALL_PADDLEOCR: "True"
    shm_size: "8gb"
    # Wait for whichever vLLM service is enabled to report healthy.
    # required: false lets the service outside the active profile be absent.
    depends_on:
      vllm-server-qwen35-9b:
        condition: service_healthy
        required: false
      vllm-server-qwen35-27b:
        condition: service_healthy
        required: false
    # Every value is quoted so the container receives the exact text shown
    # (e.g. "True", "200.0") instead of a YAML-typed boolean or number.
    environment:
      FLAGS_fraction_of_gpu_memory_to_use: "0.05"
      RUN_FASTAPI: "True"
      APP_MODE: "fastapi"
      SHOW_PADDLE_MODEL_OPTIONS: "True"
      SHOW_LOCAL_OCR_MODEL_OPTIONS: "True"
      SHOW_INFERENCE_SERVER_PII_OPTIONS: "True"
      SHOW_INFERENCE_SERVER_VLM_OPTIONS: "True"
      SHOW_HYBRID_MODELS: "True"
      SHOW_DIFFICULT_OCR_EXAMPLES: "True"
      SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER: "True"
      SHOW_SUMMARISATION: "True"
      SHOW_AWS_API_KEYS: "True"
      DEFAULT_TEXT_EXTRACTION_MODEL: "Local OCR model - PDFs without selectable text"
      DEFAULT_LOCAL_OCR_MODEL: "paddle"
      DEFAULT_PII_DETECTION_MODEL: "Local"
      CUSTOM_VLM_BACKEND: "inference_vlm"
      MAX_WORKERS: "12"
      TESSERACT_MAX_WORKERS: "8"
      # Keep this at 1 to avoid VRAM overflow or errors.
      PADDLE_MAX_WORKERS: "1"
      LOAD_PADDLE_AT_STARTUP: "False"
      INFERENCE_SERVER_API_URL: "http://vllm-inference:8000"
      # Both model ids come from VLLM_OPENAI_MODEL and fall back to the 9B
      # model when it is unset. Set VLLM_OPENAI_MODEL to
      # QuantTrio/Qwen3.5-27B-AWQ when running the 27B profile.
      DEFAULT_INFERENCE_SERVER_VLM_MODEL: "${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}"
      DEFAULT_INFERENCE_SERVER_PII_MODEL: "${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}"
      EFFICIENT_OCR: "True"
      SHOW_CUSTOM_VLM_ENTITIES: "True"
      SESSION_OUTPUT_FOLDER: "True"
      SAVE_PAGE_OCR_VISUALISATIONS: "False"
      HYBRID_OCR_CONFIDENCE_THRESHOLD: "97"
      INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES: "True"
      PREPROCESS_LOCAL_OCR_IMAGES: "False"
      INFERENCE_SERVER_DISABLE_THINKING: "True"
      MAX_NEW_TOKENS: "8192"
      SAVE_EXAMPLE_HYBRID_IMAGES: "False"
      SAVE_VLM_INPUT_IMAGES: "False"
      VLM_MAX_DPI: "200.0"
      DEFAULT_NEW_BATCH_CHAR_COUNT: "1250"
      REPORT_VLM_OUTPUTS_TO_GUI: "True"
      REPORT_LLM_OUTPUTS_TO_GUI: "True"
      ADD_VLM_BOUNDING_BOX_RULES: "False"
      VLM_DEFAULT_STREAM: "False"
    ports:
      - "7860:7860"
    networks:
      - redaction-net-vllm
    # The app also reserves the NVIDIA GPUs (PADDLE_GPU_ENABLED build arg is True).
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
# Bridge network joined by the app and both vLLM services; the vLLM services
# register the "vllm-inference" alias on it.
networks:
  redaction-net-vllm:
    driver: bridge
# Named volume mounted at /root/.cache/huggingface in both vLLM services,
# so the two model servers share a single cache location.
# Declared as an explicit empty mapping (default settings): a trailing "|"
# would turn the value into an empty block-scalar string, which is not a
# valid volume definition.
volumes:
  hf-model-cache: {}