File size: 5,939 Bytes
57d14e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Select which vLLM stack runs with COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
# to match the served model (required for 27b; if omitted, the 9b model name below is used):
#   COMPOSE_PROFILES=vllm-9b   -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
#   COMPOSE_PROFILES=vllm-27b  -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
# App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
# Example CLI commands (add --build to the commands below to rebuild the app image):

# Recommended for systems with 16 GB of VRAM:
# docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d

# Recommended for systems with 24 GB of VRAM:
# docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d
#
# Optional Docker-only settings for redaction-app-vllm: config/docker_app_config.env
# (see config/docker_app_config.env.example). The file is loaded at container start;
# values in the service's `environment:` block override values from that file.

# Extension field (x- prefix) holding the optional env_file settings. It is
# anchored as `redaction-app-env` and merged into the redaction-app-vllm
# service with `<<: *redaction-app-env`.
x-redaction-app-env: &redaction-app-env
  env_file:
    - path: config/docker_app_config.env
      # The file is optional: a missing file is not treated as an error.
      required: false

services:
  # vLLM OpenAI-compatible server for QuantTrio/Qwen3.5-9B-AWQ.
  # Only started when the "vllm-9b" profile is active.
  vllm-server-qwen35-9b:
    profiles: ["vllm-9b"]
    image: vllm/vllm-openai:latest
    shm_size: '8gb'
    # Arguments appended to the image entrypoint. They are written as a list so
    # each argument reaches the container exactly as typed, without relying on
    # Compose splitting a multi-line string into words. Numeric values are
    # quoted because command list items must be strings.
    command:
      - --model
      - QuantTrio/Qwen3.5-9B-AWQ
      - --gpu-memory-utilization
      - '0.926'
      - --tensor-parallel-size
      - '1'
      - --max-num-seqs
      - '1'
      - --reasoning-parser
      - qwen3
      - --max-model-len
      - '16384'
      - --max-num-batched-tokens
      - '2048'
      - --speculative-config
      # Single JSON argument; the YAML single quotes keep it one string.
      - '{"method":"mtp","num_speculative_tokens":3}'
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Healthy once the model list endpoint answers. Failures during the first
    # 1200s (start_period) do not count towards the retry limit; presumably
    # this allows time for the model download and load.
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    # Host port 8000 -> container port 8000.
    ports:
      - "8000:8000"
    # Named volume mounted over the Hugging Face cache directory.
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    # Reachable on the shared network as "vllm-inference", the host name used
    # in the app's INFERENCE_SERVER_API_URL.
    networks:
      redaction-net-vllm:
        aliases:
          - vllm-inference

  # vLLM OpenAI-compatible server for QuantTrio/Qwen3.5-27B-AWQ.
  # Only started when the "vllm-27b" profile is active.
  vllm-server-qwen35-27b:
    profiles: ["vllm-27b"]
    image: vllm/vllm-openai:latest
    shm_size: '16gb'
    # Arguments appended to the image entrypoint. They are written as a list so
    # each argument reaches the container exactly as typed, without relying on
    # Compose splitting a multi-line string into words (the original string
    # also carried trailing whitespace). Numeric values are quoted because
    # command list items must be strings.
    command:
      - --model
      - QuantTrio/Qwen3.5-27B-AWQ
      - --gpu-memory-utilization
      - '0.94'
      - --tensor-parallel-size
      - '1'
      - --max-num-seqs
      - '2'
      - --reasoning-parser
      - qwen3
      - --max-model-len
      - '16384'
      - --max-num-batched-tokens
      - '4096'
      - --enforce-eager
      - --kv-cache-dtype
      - fp8
      - --enable-chunked-prefill
      - --enable-prefix-caching
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Healthy once the model list endpoint answers. Failures during the first
    # 1200s (start_period) do not count towards the retry limit; presumably
    # this allows time for the model download and load.
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    # Host port 8001 -> container port 8000. Inside the Compose network the
    # server is still reached on port 8000.
    ports:
      - "8001:8000"
    # Named volume mounted over the Hugging Face cache directory.
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    # Reachable on the shared network as "vllm-inference", the host name used
    # in the app's INFERENCE_SERVER_API_URL.
    networks:
      redaction-net-vllm:
        aliases:
          - vllm-inference

  # Redaction application container. It runs with either vLLM profile and
  # talks to whichever vLLM service is active through the shared
  # "vllm-inference" network alias.
  redaction-app-vllm:
    # Pulls in the optional env_file entry defined in x-redaction-app-env.
    <<: *redaction-app-env
    profiles: ["vllm-9b", "vllm-27b"]
    image: redaction-app-main
    build:
      context: .              # Look in the current folder
      dockerfile: Dockerfile  # Use this file
      target: gradio          # Use the 'gradio' stage from your Dockerfile
      args:                   # Pass your build-time variables here!
        - TORCH_GPU_ENABLED=False
        - INSTALL_VLM=False
        - PADDLE_GPU_ENABLED=True
        - INSTALL_PADDLEOCR=True
    shm_size: '8gb'
    # Wait for the vLLM server to report healthy before starting the app.
    # Each dependency is marked `required: false` because only one of the two
    # vLLM services is started under a single profile.
    depends_on:
      vllm-server-qwen35-9b:
        condition: service_healthy
        required: false
      vllm-server-qwen35-27b:
        condition: service_healthy
        required: false
    # Values set here override the same keys loaded from the optional env_file.
    environment:
      # NOTE(review): presumably caps Paddle's share of GPU memory so the vLLM
      # server keeps most of the VRAM - confirm against the Paddle docs.
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors
      - LOAD_PADDLE_AT_STARTUP=False
      # Host name is the network alias shared by both vLLM services; port 8000
      # is the container port, not the published host port.
      - INFERENCE_SERVER_API_URL=http://vllm-inference:8000
      # Both model names come from VLLM_OPENAI_MODEL and fall back to the 9b
      # model when that variable is unset.
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} # Set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ in .env when running the vllm-27b profile
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} # Set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ in .env when running the vllm-27b profile
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=8192
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
      - VLM_DEFAULT_STREAM=False

    # Reserve every NVIDIA GPU on the host for this container (the build args
    # above enable the GPU build of Paddle).
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Host port 7860 -> container port 7860.
    ports:
      - "7860:7860"
    networks:
      - redaction-net-vllm

# Bridge network joined by every service in this file. The vLLM services also
# register the "vllm-inference" alias on it.
networks:
  redaction-net-vllm:
    driver: bridge

# Named volume mounted at /root/.cache/huggingface in both vLLM services
# (presumably so downloaded model files persist when containers are recreated).
volumes:
  # No options given: the volume is created with default settings.
  hf-model-cache: