File size: 1,429 Bytes
f21249a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# Docker Compose stack for KerdosAI: a GPU training job, the API service,
# and a TensorBoard viewer.
version: '3.8'

services:
  # KerdosAI Training Service
  # Built from the `development` target of the build in this directory.
  kerdosai-train:
    build:
      context: .
      target: development
    image: kerdosai:dev
    container_name: kerdosai-train
    volumes:
      # The whole project directory is bind-mounted at /app; the data,
      # output and checkpoint directories are also listed explicitly.
      - .:/app
      - ./data:/app/data
      - ./output:/app/output
      - ./checkpoints:/app/checkpoints
    environment:
      - PYTHONPATH=/app
      # Taken from the host environment / .env file. The `:-` default keeps
      # the value empty when the variable is unset, without Compose printing
      # an "unset variable" warning on every invocation.
      - WANDB_API_KEY=${WANDB_API_KEY:-}
    command: python cli.py train --config configs/default.yaml
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs available on the host.
            - driver: nvidia
              count: all
              capabilities: [gpu]

  # KerdosAI API Service
  # Built from the `production` target of the same build context.
  kerdosai-api:
    build:
      context: .
      target: production
    image: kerdosai:prod
    container_name: kerdosai-api
    ports:
      - "8000:8000"
    volumes:
      # Mounted read-only; MODEL_PATH below points at this directory.
      - ./output:/app/output:ro
    environment:
      - PYTHONPATH=/app
      - MODEL_PATH=/app/output
    restart: unless-stopped
    healthcheck:
      # NOTE(review): this needs `curl` inside the production image — confirm
      # it is installed there, otherwise the container is reported unhealthy.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # TensorBoard for monitoring
  tensorboard:
    # NOTE(review): `latest` is a floating tag; consider pinning a specific
    # TensorFlow release for reproducible deployments.
    image: tensorflow/tensorflow:latest
    container_name: kerdosai-tensorboard
    ports:
      - "6006:6006"
    volumes:
      # Event files are read from ./runs on the host (read-only mount).
      - ./runs:/logs:ro
    command: tensorboard --logdir=/logs --host=0.0.0.0
    restart: unless-stopped
# Give the project's default network a fixed name instead of the
# auto-generated "<project>_default".
networks:
  default:
    name: kerdosai-network
|