# UltraThinking-LLM-Training / docker-compose.yml
# Uploaded by Vedisasi using huggingface_hub (commit 54c5666, verified)
# NOTE(review): the top-level `version` key is obsolete and ignored by
# Compose v2 (it only matters to legacy docker-compose v1.x).
version: "3.9"
services:
  # ============================================
  # Web Interface (Gradio)
  # ============================================
  # Always-on service (no profile): serves the Gradio UI on 7860 and,
  # optionally, a FastAPI backend on 8000.
  app:
    build:
      context: .
      dockerfile: Dockerfile
      target: production   # multi-stage Dockerfile: build the production stage
    image: ultrathink:latest
    container_name: ultrathink_app
    ports:
      - "7860:7860" # Gradio UI
      - "8000:8000" # FastAPI (if used)
    environment:
      - PYTHONUNBUFFERED=1            # flush Python stdout/stderr immediately so logs stream live
      - GRADIO_SERVER_NAME=0.0.0.0    # bind all interfaces so the published port is reachable
      - GRADIO_SERVER_PORT=7860
    volumes:
      # Persist generated artifacts and model checkpoints on the host.
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
    command: ["python", "app_gradio.py"]
    restart: unless-stopped
    healthcheck:
      # NOTE(review): assumes `curl` is installed in the production image —
      # verify, otherwise the container will always be reported unhealthy.
      test: ["CMD", "curl", "-f", "http://localhost:7860"]
      interval: 30s
      timeout: 10s
      retries: 3
  # ============================================
  # Training Service (CPU)
  # ============================================
  # One-shot CPU demo training run; enable with `docker compose --profile train up`.
  train:
    build:
      context: .
      dockerfile: Dockerfile
      target: training   # training stage of the multi-stage Dockerfile
    image: ultrathink:training
    container_name: ultrathink_train
    environment:
      - PYTHONUNBUFFERED=1
      - TORCHDYNAMO_DISABLE=1   # disable TorchDynamo/torch.compile for this CPU run
    volumes:
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
      - ./configs:/app/configs:ro   # configs are read-only inside the container
    # Deliberately tiny model/hyperparameters so the demo finishes quickly on CPU.
    command: >
      python train_ultrathink.py
      --dataset wikitext
      --hidden_size 256
      --num_layers 2
      --num_heads 4
      --batch_size 2
      --num_epochs 1
      --output_dir /app/outputs/demo
    # No restart policy: training is a one-shot job, not a long-running service.
    profiles: ["train"]
  # ============================================
  # Training Service (GPU)
  # ============================================
  # GPU training run; enable with `docker compose --profile train-gpu up`.
  # Requires the NVIDIA Container Toolkit on the host for the device reservation below.
  train-gpu:
    build:
      context: .
      dockerfile: Dockerfile
      target: training
    image: ultrathink:training   # same tag as the CPU `train` service (identical build target)
    container_name: ultrathink_train_gpu
    environment:
      - PYTHONUNBUFFERED=1
      - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True  # reduce CUDA allocator fragmentation
      - CUDA_VISIBLE_DEVICES=0                            # restrict PyTorch to GPU 0
    volumes:
      - ./outputs:/app/outputs:rw
      - ./checkpoints:/app/checkpoints:rw
      - ./configs:/app/configs:ro
    command: >
      python train_advanced.py
      --config /app/configs/train_small.yaml
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all host GPUs for the container; CUDA_VISIBLE_DEVICES above
            # still limits the training process itself to GPU 0.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    profiles: ["train-gpu"]
  # ============================================
  # MLflow Tracking Server
  # ============================================
  # Experiment-tracking UI/API at http://localhost:5000;
  # enable with `docker compose --profile mlflow up`.
  mlflow:
    image: ghcr.io/mlflow/mlflow:v2.9.2   # pinned version for reproducibility
    container_name: ultrathink_mlflow
    ports:
      - "5000:5000"
    volumes:
      # Run metadata and artifacts persist on the host under ./mlruns.
      - ./mlruns:/mlflow/mlruns:rw
    command: >
      mlflow server
      --backend-store-uri file:///mlflow/mlruns
      --default-artifact-root /mlflow/mlruns
      --host 0.0.0.0
      --port 5000
    profiles: ["mlflow"]
    restart: unless-stopped
  # ============================================
  # Development Environment
  # ============================================
  # Interactive shell with the whole repo bind-mounted;
  # enable with `docker compose --profile dev up`.
  dev:
    build:
      context: .
      dockerfile: Dockerfile
      target: development
    image: ultrathink:dev
    container_name: ultrathink_dev
    ports:
      # NOTE(review): host port 7860 collides with the `app` service when both
      # run at once (`app` has no profile, so it always starts) — confirm the
      # intended workflow or remap, e.g. "7861:7860".
      - "7860:7860"
      - "8888:8888" # Jupyter
    environment:
      - PYTHONUNBUFFERED=1
    volumes:
      - .:/app:rw   # live-mount the source tree for edit-in-place development
    command: ["bash"]
    stdin_open: true   # keep STDIN open (equivalent of `docker run -i`)
    tty: true          # allocate a pseudo-TTY (equivalent of `docker run -t`)
    profiles: ["dev"]