Spaces:
Sleeping
Sleeping
File size: 1,163 Bytes
# Local GPU serving via vLLM + agent-bench API.
# Requires: nvidia-container-toolkit
# See modal/serve_vllm.py for serverless alternative.
#
# Usage:
# docker compose -f docker/docker-compose.vllm.yml up --build
services:
  # vLLM inference server, run from the vllm/vllm-openai image on one
  # NVIDIA GPU.
  vllm:
    # NOTE(review): `latest` is a floating tag; consider pinning a specific
    # release so rebuilds are reproducible.
    image: vllm/vllm-openai:latest
    # Arguments passed to the image's entrypoint.
    command:
      - --model=mistralai/Mistral-7B-Instruct-v0.3
      - --max-model-len=4096
      - --dtype=half
      - --gpu-memory-utilization=0.85
      - --host=0.0.0.0
      - --port=8000
    ports:
      # Host port 8001 -> container port 8000 (the --port value above).
      - "8001:8000"
    # Reserve a single NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    volumes:
      # Named volume mounted on the Hugging Face cache directory so its
      # contents survive container re-creation.
      - vllm-cache:/root/.cache/huggingface
    # Gates startup of `app` (see its depends_on condition).
    # NOTE(review): assumes `curl` is available inside the image - confirm.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      # Failed probes during this window do not count towards `retries`;
      # presumably sized to cover model download/load time.
      start_period: 120s

  # Application container built from the Dockerfile in this repository.
  app:
    build:
      # Paths are relative to this compose file; `..` is its parent directory.
      context: ..
      dockerfile: docker/Dockerfile
    environment:
      # Points at the vllm service over the compose network, using the
      # container port (8000), not the host-published 8001.
      - MODAL_VLLM_URL=http://vllm:8000/v1
      - AGENT_BENCH_ENV=selfhosted_local
    # Do not start until the vllm healthcheck reports healthy.
    depends_on:
      vllm:
        condition: service_healthy
    ports:
      # Host port 8080 -> container port 7860.
      - "8080:7860"

volumes:
  # Named volume with default driver settings, used by the vllm service.
  vllm-cache: {}
|