# Local GPU serving via vLLM + agent-bench API.
# Requires: nvidia-container-toolkit
# See modal/serve_vllm.py for serverless alternative.
#
# Usage:
#   docker compose -f docker/docker-compose.vllm.yml up --build
#
# Optional host environment variables:
#   VLLM_IMAGE_TAG          vllm/vllm-openai image tag to run (default: latest).
#                           Pin a release tag for reproducible deployments.
#   HF_TOKEN /              Hugging Face access token, forwarded to the vllm
#   HUGGING_FACE_HUB_TOKEN  container. Needed when the model repository is
#                           gated (NOTE(review): the default Mistral model
#                           appears to be gated on the Hub; confirm).

services:
  # vLLM server: listens on port 8000 inside the container, published on
  # host port 8001.
  vllm:
    image: "vllm/vllm-openai:${VLLM_IMAGE_TAG:-latest}"
    command:
      - --model=mistralai/Mistral-7B-Instruct-v0.3
      - --max-model-len=4096
      - --dtype=half
      - --gpu-memory-utilization=0.85
      - --host=0.0.0.0
      - --port=8000
    environment:
      # Key-only entries are resolved from the host environment; when the
      # variable is unset on the host it stays unset in the container.
      - HF_TOKEN
      - HUGGING_FACE_HUB_TOKEN
    ports:
      - "8001:8000"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this service.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    volumes:
      # Named volume so the Hugging Face cache survives container re-creation.
      - vllm-cache:/root/.cache/huggingface
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      # Failed probes during the first 120s do not count toward `retries`.
      # NOTE(review): a first run with an empty cache may need longer than
      # this to fetch the model; raise start_period if the service is marked
      # unhealthy while still downloading.
      start_period: 120s

  # agent-bench API: container port 7860 is published on host port 8080.
  app:
    build:
      context: ..
      dockerfile: docker/Dockerfile
    environment:
      # Reaches the vllm service over the Compose network on its
      # in-container port (8000), not the host-published 8001.
      - MODAL_VLLM_URL=http://vllm:8000/v1
      - AGENT_BENCH_ENV=selfhosted_local
    depends_on:
      # Start only once the vllm healthcheck reports healthy.
      vllm:
        condition: service_healthy
    ports:
      - "8080:7860"

volumes:
  vllm-cache: {}