# File size: 2,408 bytes
# Commit: 13e402e
# SkyPilot Multi-GPU Configuration for Fast Fine-tuning
# Uses 8x GPUs for parallel training and dataset annotation

name: ensemble-multi-gpu

resources:
  use_spot: true  # spot instances for cost savings; job may be preempted
  accelerators: A100:8  # 8x A100 GPUs
  # Alternative cheaper options:
  # accelerators: V100:8  # 8x V100
  # accelerators: L4:8    # 8x L4 (cheaper)
  memory: 128+    # 128GB+ RAM for multi-GPU
  disk_size: 500  # 500GB for datasets
# Runs once when the cluster is provisioned: installs deps and clones the repo.
setup: |
  set -e
  echo "🔧 Setting up multi-GPU environment..."

  # Install dependencies (CUDA 11.8 PyTorch wheels)
  sudo apt-get update -qq
  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  pip install --quiet transformers datasets librosa soundfile accelerate
  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn

  # Clone repo (idempotent: skipped if the checkout already exists)
  if [ ! -d "ensemble-tts-annotation" ]; then
    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
  fi
  cd ensemble-tts-annotation

  echo "✅ Setup complete!"
  echo "GPUs available:"
  nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader
# Main job: build a synthetic dataset, fine-tune with accelerate across all
# visible GPUs, then run a quick benchmark.
run: |
  cd ensemble-tts-annotation

  # Detect GPU count at runtime so the launch command matches the hardware
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "🚀 Multi-GPU Training with $GPU_COUNT GPUs"
  echo "================================================"

  # Create synthetic data
  echo "📝 Creating synthetic dataset (larger for multi-GPU)..."
  python scripts/data/create_synthetic_test_data.py \
    --output data/raw/synthetic_large/ \
    --samples 200

  # Prepare dataset
  echo "📦 Preparing dataset..."
  python scripts/data/download_ptbr_datasets.py \
    --prepare-local data/raw/synthetic_large/

  # Fine-tune with multi-GPU (using accelerate)
  echo "🔥 Fine-tuning with $GPU_COUNT GPUs..."
  accelerate launch --multi_gpu --num_processes=$GPU_COUNT \
    scripts/training/finetune_emotion2vec.py \
    --dataset data/prepared/synthetic_large_prepared \
    --epochs 20 \
    --batch-size 64 \
    --device cuda \
    --augment \
    --output models/emotion/emotion2vec_finetuned_multigpu/

  echo "✅ Fine-tuning complete!"

  # Benchmark
  echo "📊 Performance benchmark:"
  python scripts/test/test_quick.py --mode balanced

  echo "================================================"
  echo "💡 Upload results with:"
  echo "sky storage upload models/emotion/emotion2vec_finetuned_multigpu/ s3://my-bucket/"

# Single node with 8 GPUs (multi-GPU, not multi-node)
num_nodes: 1