Anonymous Hunter
feat: Add robust configuration management, Docker support, initial testing, and quickstart documentation.
f21249a
---
# Docker Compose stack for KerdosAI. Defines three services:
#   kerdosai-train - training job built from the `development` stage
#   kerdosai-api   - HTTP API built from the `production` stage
#   tensorboard    - TensorBoard dashboard over the ./runs directory

# NOTE(review): under the Compose Specification the top-level `version` key
# is informational only and recent Compose releases report it as obsolete.
# It is kept here so older docker-compose releases still accept the file.
version: '3.8'

services:
  # KerdosAI Training Service
  kerdosai-train:
    build:
      context: .
      target: development
    image: kerdosai:dev
    container_name: kerdosai-train
    volumes:
      # The project directory is bind-mounted over /app; the data, output and
      # checkpoint directories are additionally listed as explicit mounts.
      - .:/app
      - ./data:/app/data
      - ./output:/app/output
      - ./checkpoints:/app/checkpoints
    environment:
      - PYTHONPATH=/app
      # Interpolated from the host environment / .env file. The `:-` default
      # yields an empty value when the variable is unset, instead of making
      # Compose print a "variable is not set" warning on every invocation.
      - WANDB_API_KEY=${WANDB_API_KEY:-}
    command: python cli.py train --config configs/default.yaml
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            # NOTE(review): this needs NVIDIA container support on the Docker
            # host - confirm for the target machines.
            - driver: nvidia
              count: all
              capabilities: [gpu]

  # KerdosAI API Service
  kerdosai-api:
    build:
      context: .
      target: production
    image: kerdosai:prod
    container_name: kerdosai-api
    ports:
      - "8000:8000"
    volumes:
      # Read-only view of the directory the training service mounts as
      # /app/output.
      - ./output:/app/output:ro
    environment:
      - PYTHONPATH=/app
      - MODEL_PATH=/app/output
    restart: unless-stopped
    healthcheck:
      # Probes the /health endpoint from inside the container.
      # NOTE(review): this requires `curl` to be installed in the production
      # image - confirm against the Dockerfile.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # TensorBoard for monitoring
  tensorboard:
    # NOTE(review): `latest` is a floating tag; consider pinning a specific
    # TensorFlow release so deployments are reproducible.
    image: tensorflow/tensorflow:latest
    container_name: kerdosai-tensorboard
    ports:
      - "6006:6006"
    volumes:
      # Read-only log directory served by TensorBoard.
      # NOTE(review): presumably the training run writes its event files to
      # ./runs - confirm against the training configuration.
      - ./runs:/logs:ro
    command: tensorboard --logdir=/logs --host=0.0.0.0
    restart: unless-stopped

networks:
  # Give the project's default network a fixed name.
  default:
    name: kerdosai-network