File size: 3,152 Bytes
13e402e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# SkyPilot task for annotating complete Orpheus dataset (118k samples)
# Uses multi-GPU for parallel processing

name: ensemble-annotate-orpheus

resources:
  # Spot/preemptible instances are cheaper; NOTE(review): on preemption
  # SkyPilot re-runs setup+run from scratch — confirm the annotation job
  # is safe to restart (no checkpointing is visible in this file).
  use_spot: true
  accelerators: A100:4  # 4x A100 for parallel annotation
  # Or use cheaper options: L4:8, V100:4

  memory: 64+  # at least 64 GB RAM (SkyPilot "+" means minimum)
  disk_size: 200  # GB; Need space for dataset + annotations

# One-time environment preparation; runs once per provisioned VM.
setup: |
  set -e

  echo "πŸ”§ Setting up annotation environment..."

  # Install dependencies
  # NOTE(review): apt-get update refreshes the package index but no
  # apt-get install follows — either a package install is missing or
  # this line is dead weight; confirm intent.
  sudo apt-get update -qq
  # PyTorch wheels pinned to the CUDA 11.8 index (cu118).
  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  pip install --quiet transformers datasets librosa soundfile accelerate
  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn pyarrow

  # Clone repo (idempotent: skipped if the directory already exists,
  # e.g. after a spot-instance restart)
  if [ ! -d "ensemble-tts-annotation" ]; then
    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
  fi

  # NOTE(review): this cd does not carry over to the run section —
  # SkyPilot executes setup and run in separate shells; run cd's again.
  cd ensemble-tts-annotation

  echo "βœ… Setup complete!"
  nvidia-smi

# Main job: download the Orpheus dataset, run ensemble annotation,
# print statistics, and push the annotated dataset to the HF Hub.
run: |
  # Fail fast: without this, a failed download or annotation step would
  # not abort the script, and a stale/partial parquet could be uploaded.
  set -e

  cd ensemble-tts-annotation

  # NOTE(review): GPU_COUNT is only echoed below — actual multi-GPU
  # parallelism (if any) is internal to annotate_ensemble.py; confirm.
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "πŸš€ Annotating Orpheus dataset with $GPU_COUNT GPUs"
  echo "================================================"

  # Download Orpheus dataset
  echo "πŸ“₯ Downloading Orpheus TTS dataset..."
  python -c "
from datasets import load_dataset
import os

print('Loading dataset...')
dataset = load_dataset('marcosremar2/orpheus-tts-portuguese-dataset', split='train')
print(f'βœ“ Loaded {len(dataset)} samples')

# Save locally for faster access
os.makedirs('data/raw/orpheus/', exist_ok=True)
dataset.save_to_disk('data/raw/orpheus/dataset')
print('βœ“ Saved locally')
"

  # Annotate with ensemble (parallel processing)
  echo "🎯 Running ensemble annotation..."
  python scripts/ensemble/annotate_ensemble.py \
    --input data/raw/orpheus/dataset \
    --mode balanced \
    --device cuda \
    --batch-size 32 \
    --num-workers 8 \
    --output data/annotated/orpheus_annotated.parquet

  echo "βœ… Annotation complete!"
  echo "================================================"

  # Statistics (reads back the parquet written by the annotation step)
  echo "πŸ“Š Annotation statistics:"
  python -c "
import pandas as pd

df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
print(f'Total samples: {len(df)}')
print(f'\nEmotion distribution:')
print(df['emotion'].value_counts())
print(f'\nConfidence statistics:')
print(df['emotion_confidence'].describe())
"

  # Upload to HuggingFace
  # NOTE(review): push_to_hub requires HF credentials on the remote VM
  # (HF_TOKEN env var or a prior huggingface-cli login) — confirm the
  # task is launched with the token forwarded (e.g. SkyPilot envs).
  echo "πŸ“€ Uploading annotated dataset to HuggingFace..."
  python -c "
from datasets import Dataset
import pandas as pd

df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
dataset = Dataset.from_pandas(df)

# Push to HuggingFace Hub
dataset.push_to_hub(
    'marcosremar2/orpheus-tts-portuguese-annotated',
    private=False
)
print('βœ“ Uploaded to HuggingFace!')
"

  echo "================================================"
  echo "βœ… Complete! Annotated dataset available at:"
  echo "   https://huggingface.co/datasets/marcosremar2/orpheus-tts-portuguese-annotated"

# File mounts (if dataset is pre-stored in cloud)
# file_mounts:
#   /data/orpheus:
#     source: gs://my-bucket/orpheus-dataset/
#     mode: MOUNT

# Single-node job: the run section executes on exactly one VM.
num_nodes: 1