# File size: 3,470 Bytes

# 54c5666

# Compose file format version (quoted so it stays a string, not a float).
version: "3.9"

# One entry per container. Entries that declare `profiles` are opt-in;
# entries without it are part of the default set.
services:
  # ============================================
  # Web Interface (Gradio)
  # ============================================
  # Default service (declares no profile). Builds the `production` target of
  # the local Dockerfile and runs app_gradio.py. Outputs and checkpoints are
  # bind-mounted from the host so they survive container recreation.
  app:
    build:
      context: .
      dockerfile: Dockerfile
      target: production
    image: ultrathink:latest
    container_name: ultrathink_app
    ports:
      - "7860:7860"  # Gradio UI
      - "8000:8000"  # FastAPI (if used)
    environment:
      - PYTHONUNBUFFERED=1
      # Bind address and port handed to Gradio; the port matches the
      # published 7860 mapping above.
      - GRADIO_SERVER_NAME=0.0.0.0
      - GRADIO_SERVER_PORT=7860
    volumes:
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
    command: ["python", "app_gradio.py"]
    restart: unless-stopped
    healthcheck:
      # Probe with the Python interpreter that the service command already
      # requires, so the check does not depend on curl being installed in the
      # image. urlopen raises (non-zero exit) on connection failure and on
      # HTTP 4xx/5xx responses, matching the semantics of `curl -f`.
      test:
        - CMD
        - python
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:7860', timeout=5)"
      interval: 30s
      timeout: 10s
      retries: 3

  # ============================================
  # Training Service (CPU)
  # ============================================
  # Opt-in through the "train" profile. Launches train_ultrathink.py with a
  # small demo configuration; results are written to /app/outputs/demo, which
  # lives on the ./outputs bind mount.
  train:
    image: ultrathink:training
    container_name: ultrathink_train
    build:
      context: .
      dockerfile: Dockerfile
      target: training
    profiles:
      - train
    environment:
      PYTHONUNBUFFERED: "1"
      TORCHDYNAMO_DISABLE: "1"
    volumes:
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
      - ./configs:/app/configs:ro  # configs are mounted read-only
    # Exec-form argument list: one flag or value per item, values quoted so
    # numeric arguments stay strings.
    command:
      - python
      - train_ultrathink.py
      - "--dataset"
      - wikitext
      - "--hidden_size"
      - "256"
      - "--num_layers"
      - "2"
      - "--num_heads"
      - "4"
      - "--batch_size"
      - "2"
      - "--num_epochs"
      - "1"
      - "--output_dir"
      - /app/outputs/demo

  # ============================================
  # Training Service (GPU)
  # ============================================
  # Opt-in through the "train-gpu" profile. Shares the `training` build
  # target and image tag with the CPU training service, but runs
  # train_advanced.py against the read-only config mount and requests NVIDIA
  # GPU devices.
  train-gpu:
    image: ultrathink:training
    container_name: ultrathink_train_gpu
    build:
      context: .
      dockerfile: Dockerfile
      target: training
    profiles:
      - train-gpu
    environment:
      PYTHONUNBUFFERED: "1"
      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
      # NOTE(review): only device index 0 is made visible here while the
      # reservation below asks for `count: all` - confirm this is intended.
      CUDA_VISIBLE_DEVICES: "0"
    volumes:
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
      - ./configs:/app/configs:ro
    command:
      - python
      - train_advanced.py
      - "--config"
      - /app/configs/train_small.yaml
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu

  # ============================================
  # MLflow Tracking Server
  # ============================================
  # Opt-in through the "mlflow" profile. Runs `mlflow server` from the pinned
  # upstream image on port 5000, with both the backend store and the artifact
  # root pointing at /mlflow/mlruns (bind-mounted from ./mlruns on the host).
  mlflow:
    image: ghcr.io/mlflow/mlflow:v2.9.2
    container_name: ultrathink_mlflow
    profiles:
      - mlflow
    restart: unless-stopped
    ports:
      - "5000:5000"
    volumes:
      - ./mlruns:/mlflow/mlruns:rw
    # NOTE(review): the artifact root is a path local to this container;
    # verify that clients logging artifacts from other containers can reach
    # it, or that artifact proxying is what is intended.
    command:
      - mlflow
      - server
      - "--backend-store-uri"
      - "file:///mlflow/mlruns"
      - "--default-artifact-root"
      - /mlflow/mlruns
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "5000"

  # ============================================
  # Development Environment
  # ============================================
  # Opt-in through the "dev" profile. Builds the `development` target, mounts
  # the whole project directory at /app and drops into an interactive bash
  # shell (stdin_open + tty keep the shell attached).
  dev:
    build:
      context: .
      dockerfile: Dockerfile
      target: development
    image: ultrathink:dev
    container_name: ultrathink_dev
    ports:
      # Host ports are overridable because the `app` service (which has no
      # profile and is therefore always enabled) also publishes host port
      # 7860; running both at once would otherwise fail to bind. Defaults are
      # unchanged, e.g. `DEV_GRADIO_PORT=7861 docker compose --profile dev up`.
      - "${DEV_GRADIO_PORT:-7860}:7860"  # Gradio UI
      - "${DEV_JUPYTER_PORT:-8888}:8888"  # Jupyter
    environment:
      - PYTHONUNBUFFERED=1
      # Same bind address the `app` service sets, so a Gradio server started
      # from the dev shell is reachable through the published port.
      - GRADIO_SERVER_NAME=0.0.0.0
    volumes:
      - .:/app:rw
    command: ["bash"]
    stdin_open: true
    tty: true
    profiles: ["dev"]