File size: 3,470 Bytes
54c5666
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Compose file format version.
# NOTE(review): the Compose Specification treats the top-level `version` key
# as obsolete (Docker Compose v2 reportedly ignores it with a warning) —
# confirm which Compose releases must be supported before removing it.
version: "3.9"

services:
  # ============================================
  # Web Interface (Gradio)
  # ============================================
  # Serves the Gradio web UI from the `production` build stage. The host's
  # ./outputs and ./checkpoints directories are bind-mounted read-write.
  app:
    image: ultrathink:latest
    container_name: ultrathink_app
    build:
      context: .
      dockerfile: Dockerfile
      target: production
    command:
      - python
      - app_gradio.py
    restart: unless-stopped
    environment:
      PYTHONUNBUFFERED: "1"
      GRADIO_SERVER_NAME: "0.0.0.0"
      GRADIO_SERVER_PORT: "7860"
    ports:
      - "7860:7860"  # Gradio UI
      - "8000:8000"  # FastAPI (if used)
    volumes:
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
    healthcheck:
      # Probe the UI port; `-f` makes curl exit non-zero on HTTP errors.
      # NOTE(review): requires `curl` inside the image — confirm the
      # production stage installs it.
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost:7860"
      interval: 30s
      timeout: 10s
      retries: 3

  # ============================================
  # Training Service (CPU)
  # ============================================
  # CPU training run, enabled only with the `train` profile. Launches
  # train_ultrathink.py with a fixed set of hyperparameters and writes its
  # results to /app/outputs/demo.
  train:
    profiles:
      - train
    image: ultrathink:training
    container_name: ultrathink_train
    build:
      context: .
      dockerfile: Dockerfile
      target: training
    environment:
      PYTHONUNBUFFERED: "1"
      TORCHDYNAMO_DISABLE: "1"
    volumes:
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
      # Configs are mounted read-only.
      - ./configs:/app/configs:ro
    # Explicit argv list; numeric arguments are quoted so they stay strings.
    command:
      - python
      - train_ultrathink.py
      - "--dataset"
      - wikitext
      - "--hidden_size"
      - "256"
      - "--num_layers"
      - "2"
      - "--num_heads"
      - "4"
      - "--batch_size"
      - "2"
      - "--num_epochs"
      - "1"
      - "--output_dir"
      - /app/outputs/demo

  # ============================================
  # Training Service (GPU)
  # ============================================
  # GPU training run, enabled only with the `train-gpu` profile. Runs
  # train_advanced.py against /app/configs/train_small.yaml (configs are
  # mounted read-only) and reserves NVIDIA GPU devices.
  train-gpu:
    profiles:
      - train-gpu
    image: ultrathink:training
    container_name: ultrathink_train_gpu
    build:
      context: .
      dockerfile: Dockerfile
      target: training
    environment:
      PYTHONUNBUFFERED: "1"
      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
      # NOTE(review): only device 0 is made visible to CUDA although every
      # GPU is reserved below — confirm this is intentional.
      CUDA_VISIBLE_DEVICES: "0"
    volumes:
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
      - ./configs:/app/configs:ro
    command:
      - python
      - train_advanced.py
      - "--config"
      - /app/configs/train_small.yaml
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu

  # ============================================
  # MLflow Tracking Server
  # ============================================
  # MLflow tracking server, enabled only with the `mlflow` profile. The
  # backend store and the default artifact root both point at /mlflow/mlruns,
  # which is bind-mounted from ./mlruns on the host.
  mlflow:
    profiles:
      - mlflow
    image: ghcr.io/mlflow/mlflow:v2.9.2
    container_name: ultrathink_mlflow
    restart: unless-stopped
    ports:
      - "5000:5000"
    volumes:
      - ./mlruns:/mlflow/mlruns:rw
    # NOTE(review): the artifact root is a plain local path; clients in other
    # containers may resolve it on their own filesystem — verify if remote
    # artifact logging is expected.
    command:
      - mlflow
      - server
      - "--backend-store-uri"
      - "file:///mlflow/mlruns"
      - "--default-artifact-root"
      - /mlflow/mlruns
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "5000"

  # ============================================
  # Development Environment
  # ============================================
  # Interactive development container built from the `development` stage,
  # enabled only with the `dev` profile. The whole repository is bind-mounted
  # at /app and the container starts a bash shell with a TTY attached.
  dev:
    build:
      context: .
      dockerfile: Dockerfile
      target: development
    image: ultrathink:dev
    container_name: ultrathink_dev
    ports:
      # `app` has no profile and always publishes host port 7860, so the dev
      # UI defaults to host port 7861 to avoid a bind conflict when both run.
      # Set DEV_GRADIO_PORT=7860 to restore the previous mapping.
      - "${DEV_GRADIO_PORT:-7861}:7860"
      - "8888:8888"  # Jupyter
    environment:
      - PYTHONUNBUFFERED=1
      # Same Gradio bind settings as `app`, so a UI started from this shell
      # listens on all interfaces and is reachable via the published port.
      - GRADIO_SERVER_NAME=0.0.0.0
      - GRADIO_SERVER_PORT=7860
    volumes:
      - .:/app:rw
    command: ["bash"]
    # Keep stdin open and allocate a TTY so the shell stays interactive.
    stdin_open: true
    tty: true
    profiles: ["dev"]