---
# Darwin-60B-DUO — full-stack launcher
# Spins up:
#   - vllm-darwin (Darwin-28B-REASON, host GPU 0, port 8021 internal)
#   - vllm-awaxis (AWAXIS-Think-31B, host GPU 1, port 8022 internal)
#   - gateway     (FastAPI orchestrator, port 8000 exposed)
#
# The top-level `version` key is intentionally omitted: current Docker
# Compose ignores it and prints an "obsolete" warning.
#
# GPU pinning:
#   Each vllm-* service reserves one specific host GPU through
#   `device_ids`. Only that GPU is visible inside the container, where it
#   is CUDA device 0, so CUDA_VISIBLE_DEVICES is 0 in both services.
#
# Single-GPU collocation:
#   Set device_ids to ["0"] for both vllm-* services and lower
#   --gpu-memory-utilization to 0.45 each (FP8 totals ~30GB on 80GB GPU).
services:
  vllm-darwin:
    # NOTE(review): `latest` is a moving tag; pin a release for
    # reproducible deployments.
    image: vllm/vllm-openai:latest
    container_name: darwin-60b-duo-vllm-darwin
    runtime: nvidia
    environment:
      # Index is relative to the GPUs exposed to this container.
      - CUDA_VISIBLE_DEVICES=0
      - VLLM_DP_MASTER_PORT=45011
      - HF_HOME=/root/.cache/huggingface
      - "HF_TOKEN=${HF_TOKEN:-}"
    # Compose splits a string command shell-style, so the JSON value must
    # be single-quoted or its double quotes are stripped before vLLM
    # parses it.
    command: >-
      --model FINAL-Bench/Darwin-28B-REASON
      --served-model-name darwin-28r
      --host 0.0.0.0
      --port 8021
      --tensor-parallel-size 1
      --max-model-len 16384
      --dtype bfloat16
      --quantization fp8
      --trust-remote-code
      --enforce-eager
      --limit-mm-per-prompt '{"image":0,"video":0}'
      --gpu-memory-utilization 0.85
    volumes:
      - hf_cache:/root/.cache/huggingface
    ports:
      # Loopback only: the gateway reaches this service over the Compose
      # network, so the backend is not published on external interfaces.
      - "127.0.0.1:8021:8021"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Pin host GPU 0 (`count` would not select a specific GPU).
              device_ids: ["0"]
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8021/v1/models"]
      interval: 20s
      timeout: 5s
      # 60 x 20s leaves about 20 minutes for download and model load.
      retries: 60

  vllm-awaxis:
    # NOTE(review): `latest` is a moving tag; pin a release for
    # reproducible deployments.
    image: vllm/vllm-openai:latest
    container_name: darwin-60b-duo-vllm-awaxis
    runtime: nvidia
    environment:
      # Index is relative to the GPUs exposed to this container; the host
      # GPU is chosen by `device_ids` below.
      - CUDA_VISIBLE_DEVICES=0
      - VLLM_DP_MASTER_PORT=45012
      - HF_HOME=/root/.cache/huggingface
      - "HF_TOKEN=${HF_TOKEN:-}"
    # Compose splits a string command shell-style, so the JSON value must
    # be single-quoted or its double quotes are stripped before vLLM
    # parses it.
    command: >-
      --model Anserwise/AWAXIS-Think-31B
      --served-model-name awaxis-31b
      --host 0.0.0.0
      --port 8022
      --tensor-parallel-size 1
      --max-model-len 16384
      --dtype bfloat16
      --quantization fp8
      --trust-remote-code
      --enforce-eager
      --limit-mm-per-prompt '{"image":0,"video":0}'
      --gpu-memory-utilization 0.85
    volumes:
      - hf_cache:/root/.cache/huggingface
    ports:
      # Loopback only: the gateway reaches this service over the Compose
      # network, so the backend is not published on external interfaces.
      - "127.0.0.1:8022:8022"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Pin host GPU 1 (`count` would not select a specific GPU).
              device_ids: ["1"]
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8022/v1/models"]
      interval: 20s
      timeout: 5s
      # 60 x 20s leaves about 20 minutes for download and model load.
      retries: 60

  gateway:
    image: python:3.11-slim
    container_name: darwin-60b-duo-gateway
    working_dir: /app
    # List form avoids Compose's shell-style splitting of the script.
    # `exec` replaces the shell so the server receives stop signals.
    command:
      - bash
      - "-c"
      - >-
        pip install -q -r requirements.txt &&
        exec python server.py --host 0.0.0.0 --port 8000
        --darwin-url http://vllm-darwin:8021/v1
        --awaxis-url http://vllm-awaxis:8022/v1
    volumes:
      - ../gateway:/app
    ports:
      - "8000:8000"
    # Start only after both model servers pass their health checks.
    depends_on:
      vllm-darwin:
        condition: service_healthy
      vllm-awaxis:
        condition: service_healthy
    restart: unless-stopped

volumes:
  # Hugging Face cache shared by both vLLM services.
  hf_cache:
    driver: local
Xet Storage Details
- Size:
- 3.11 kB
- Xet hash:
- bc740b08ad6db4f52c96ca9614613ed4be846341c97bad44571f4902d3382234
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.