---
# Docker Compose stack for KerdosAI: a GPU training job, an API server that
# serves the trained output, and a TensorBoard instance for monitoring.
version: '3.8'

services:
  # KerdosAI Training Service
  # Built from the `development` stage of the local Dockerfile and run with
  # the project directory bind-mounted at /app.
  kerdosai-train:
    build:
      context: .
      target: development
    image: kerdosai:dev
    container_name: kerdosai-train
    volumes:
      - .:/app
      - ./data:/app/data
      - ./output:/app/output
      - ./checkpoints:/app/checkpoints
    environment:
      - PYTHONPATH=/app
      # Substituted from the host environment / .env file when Compose
      # parses this file.
      - WANDB_API_KEY=${WANDB_API_KEY}
    command: python cli.py train --config configs/default.yaml
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every available NVIDIA GPU for the training container.
            - driver: nvidia
              count: all
              capabilities: [gpu]

  # KerdosAI API Service
  # Built from the `production` stage; publishes port 8000 and mounts the
  # ./output directory read-only at the path given by MODEL_PATH.
  kerdosai-api:
    build:
      context: .
      target: production
    image: kerdosai:prod
    container_name: kerdosai-api
    ports:
      - "8000:8000"
    volumes:
      - ./output:/app/output:ro
    environment:
      - PYTHONPATH=/app
      - MODEL_PATH=/app/output
    restart: unless-stopped
    healthcheck:
      # NOTE(review): this probe needs `curl` inside the production image --
      # confirm it is installed, otherwise the container is always unhealthy.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # TensorBoard for monitoring
  # Serves the event files found under ./runs (mounted read-only at /logs).
  tensorboard:
    # NOTE(review): `latest` is a floating tag; consider pinning a specific
    # TensorFlow release so rebuilds are reproducible.
    image: tensorflow/tensorflow:latest
    container_name: kerdosai-tensorboard
    ports:
      - "6006:6006"
    volumes:
      - ./runs:/logs:ro
    command: tensorboard --logdir=/logs --host=0.0.0.0
    restart: unless-stopped

# Give the project's default network a fixed name instead of the
# auto-generated "<project>_default".
networks:
  default:
    name: kerdosai-network