# docker-compose.yml — UltraThink service stack (app / training / MLflow / dev)
---
# NOTE: the top-level `version` key is obsolete in Compose V2 and is ignored;
# kept only for compatibility with legacy `docker-compose` (v1) binaries.
version: "3.9"

services:
# ============================================
# Web Interface (Gradio)
# ============================================
app:
build:
context: .
dockerfile: Dockerfile
target: production
image: ultrathink:latest
container_name: ultrathink_app
ports:
- "7860:7860" # Gradio UI
- "8000:8000" # FastAPI (if used)
environment:
- PYTHONUNBUFFERED=1
- GRADIO_SERVER_NAME=0.0.0.0
- GRADIO_SERVER_PORT=7860
volumes:
- ./outputs:/app/outputs:rw
- ./checkpoints:/app/checkpoints:rw
command: ["python", "app_gradio.py"]
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:7860"]
interval: 30s
timeout: 10s
retries: 3
# ============================================
# Training Service (CPU)
# ============================================
train:
build:
context: .
dockerfile: Dockerfile
target: training
image: ultrathink:training
container_name: ultrathink_train
environment:
- PYTHONUNBUFFERED=1
- TORCHDYNAMO_DISABLE=1
volumes:
- ./outputs:/app/outputs:rw
- ./checkpoints:/app/checkpoints:rw
- ./configs:/app/configs:ro
command: >
python train_ultrathink.py
--dataset wikitext
--hidden_size 256
--num_layers 2
--num_heads 4
--batch_size 2
--num_epochs 1
--output_dir /app/outputs/demo
profiles: ["train"]
# ============================================
# Training Service (GPU)
# ============================================
train-gpu:
build:
context: .
dockerfile: Dockerfile
target: training
image: ultrathink:training
container_name: ultrathink_train_gpu
environment:
- PYTHONUNBUFFERED=1
- PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
- CUDA_VISIBLE_DEVICES=0
volumes:
- ./outputs:/app/outputs:rw
- ./checkpoints:/app/checkpoints:rw
- ./configs:/app/configs:ro
command: >
python train_advanced.py
--config /app/configs/train_small.yaml
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
profiles: ["train-gpu"]
# ============================================
# MLflow Tracking Server
# ============================================
mlflow:
image: ghcr.io/mlflow/mlflow:v2.9.2
container_name: ultrathink_mlflow
ports:
- "5000:5000"
volumes:
- ./mlruns:/mlflow/mlruns:rw
command: >
mlflow server
--backend-store-uri file:///mlflow/mlruns
--default-artifact-root /mlflow/mlruns
--host 0.0.0.0
--port 5000
profiles: ["mlflow"]
restart: unless-stopped
# ============================================
# Development Environment
# ============================================
dev:
build:
context: .
dockerfile: Dockerfile
target: development
image: ultrathink:dev
container_name: ultrathink_dev
ports:
- "7860:7860"
- "8888:8888" # Jupyter
environment:
- PYTHONUNBUFFERED=1
volumes:
- .:/app:rw
command: ["bash"]
stdin_open: true
tty: true
profiles: ["dev"]