# SkyPilot task: annotate the complete Orpheus TTS dataset (118k samples).
#
# Pipeline (see `run` below):
#   1. download the dataset from the Hugging Face Hub and cache it on disk
#   2. run scripts/ensemble/annotate_ensemble.py over the cached copy
#   3. print emotion / confidence statistics for the resulting parquet file
#   4. push the annotated dataset to the Hugging Face Hub
#
# Uses multi-GPU for parallel processing.

name: ensemble-annotate-orpheus

resources:
  use_spot: true
  accelerators: A100:4  # 4x A100 for parallel annotation
  # Or use cheaper options: L4:8, V100:4
  memory: 64+
  disk_size: 200  # Need space for dataset + annotations

# One-time provisioning: Python dependencies plus a checkout of the
# annotation repository.
setup: |
  # Stop provisioning at the first failing command.
  set -e

  echo "🔧 Setting up annotation environment..."

  # Install dependencies
  sudo apt-get update -qq
  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  pip install --quiet transformers datasets librosa soundfile accelerate
  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn pyarrow

  # Clone repo (skipped when the checkout already exists, so setup can be re-run)
  if [ ! -d "ensemble-tts-annotation" ]; then
    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
  fi

  cd ensemble-tts-annotation

  echo "✅ Setup complete!"
  nvidia-smi

# Job body: download -> annotate -> report statistics -> upload.
run: |
  # Abort on the first failing step. Without this, a failed download or
  # annotation would fall through to the later steps, and the trailing
  # `echo` commands would make the job exit with status 0.
  set -e

  cd ensemble-tts-annotation

  # One line of nvidia-smi output per visible GPU.
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

  echo "🚀 Annotating Orpheus dataset with $GPU_COUNT GPUs"
  echo "================================================"

  # Download Orpheus dataset
  echo "📥 Downloading Orpheus TTS dataset..."
  python -c "
  from datasets import load_dataset
  import os

  print('Loading dataset...')
  dataset = load_dataset('marcosremar2/orpheus-tts-portuguese-dataset', split='train')
  print(f'✓ Loaded {len(dataset)} samples')

  # Save locally for faster access
  os.makedirs('data/raw/orpheus/', exist_ok=True)
  dataset.save_to_disk('data/raw/orpheus/dataset')
  print('✓ Saved locally')
  "

  # Annotate with ensemble (parallel processing)
  echo "🎯 Running ensemble annotation..."
  python scripts/ensemble/annotate_ensemble.py \
    --input data/raw/orpheus/dataset \
    --mode balanced \
    --device cuda \
    --batch-size 32 \
    --num-workers 8 \
    --output data/annotated/orpheus_annotated.parquet

  echo "✅ Annotation complete!"
  echo "================================================"

  # Statistics
  echo "📊 Annotation statistics:"
  python -c "
  import pandas as pd

  df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')

  print(f'Total samples: {len(df)}')
  print(f'\nEmotion distribution:')
  print(df['emotion'].value_counts())
  print(f'\nConfidence statistics:')
  print(df['emotion_confidence'].describe())
  "

  # Upload to HuggingFace
  # NOTE(review): push_to_hub needs write credentials for the Hub and nothing
  # in this file supplies them - presumably passed at launch time (for example
  # via an HF_TOKEN environment variable); confirm before a long run.
  echo "📤 Uploading annotated dataset to HuggingFace..."
  python -c "
  from datasets import Dataset
  import pandas as pd

  df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
  dataset = Dataset.from_pandas(df)

  # Push to HuggingFace Hub
  dataset.push_to_hub(
      'marcosremar2/orpheus-tts-portuguese-annotated',
      private=False
  )
  print('✓ Uploaded to HuggingFace!')
  "

  echo "================================================"
  echo "✅ Complete! Annotated dataset available at:"
  echo " https://huggingface.co/datasets/marcosremar2/orpheus-tts-portuguese-annotated"

# File mounts (if dataset is pre-stored in cloud)
# file_mounts:
#   /data/orpheus:
#     source: gs://my-bucket/orpheus-dataset/
#     mode: MOUNT

num_nodes: 1