# File size: 2,408 bytes
# Commit: 13e402e
# SkyPilot Multi-GPU Configuration for Fast Fine-tuning
# Uses 8x GPUs for parallel training and dataset annotation

name: ensemble-multi-gpu

resources:
  use_spot: true  # spot instances for cost savings; job may be preempted
  accelerators: A100:8  # 8x A100 GPUs
  # Alternative cheaper options:
  # accelerators: V100:8  # 8x V100
  # accelerators: L4:8    # 8x L4 (cheaper)
  memory: 128+    # 128GB+ RAM for multi-GPU
  disk_size: 500  # 500GB for datasets
# Runs once when the cluster is provisioned: installs deps and clones the repo.
setup: |
  set -e
  echo "🔧 Setting up multi-GPU environment..."

  # Install dependencies (CUDA 11.8 PyTorch wheels)
  sudo apt-get update -qq
  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  pip install --quiet transformers datasets librosa soundfile accelerate
  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn

  # Clone repo (idempotent: skipped if the checkout already exists)
  if [ ! -d "ensemble-tts-annotation" ]; then
    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
  fi
  cd ensemble-tts-annotation

  echo "✅ Setup complete!"
  echo "GPUs available:"
  nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader
# Main job: build a synthetic dataset, fine-tune with accelerate across all
# visible GPUs, then run a quick benchmark.
run: |
  cd ensemble-tts-annotation

  # Detect GPU count at runtime so the launch command matches the hardware
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "🚀 Multi-GPU Training with $GPU_COUNT GPUs"
  echo "================================================"

  # Create synthetic data
  echo "📝 Creating synthetic dataset (larger for multi-GPU)..."
  python scripts/data/create_synthetic_test_data.py \
    --output data/raw/synthetic_large/ \
    --samples 200

  # Prepare dataset
  echo "📦 Preparing dataset..."
  python scripts/data/download_ptbr_datasets.py \
    --prepare-local data/raw/synthetic_large/

  # Fine-tune with multi-GPU (using accelerate)
  echo "🔥 Fine-tuning with $GPU_COUNT GPUs..."
  accelerate launch --multi_gpu --num_processes=$GPU_COUNT \
    scripts/training/finetune_emotion2vec.py \
    --dataset data/prepared/synthetic_large_prepared \
    --epochs 20 \
    --batch-size 64 \
    --device cuda \
    --augment \
    --output models/emotion/emotion2vec_finetuned_multigpu/

  echo "✅ Fine-tuning complete!"

  # Benchmark
  echo "📊 Performance benchmark:"
  python scripts/test/test_quick.py --mode balanced

  echo "================================================"
  echo "💡 Upload results with:"
  echo "sky storage upload models/emotion/emotion2vec_finetuned_multigpu/ s3://my-bucket/"

# Single node with 8 GPUs (multi-GPU, not multi-node)
num_nodes: 1