File size: 3,152 Bytes
13e402e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# SkyPilot task for annotating complete Orpheus dataset (118k samples)
# Uses multi-GPU for parallel processing

name: ensemble-annotate-orpheus

resources:
  # Spot/preemptible instances are cheaper; NOTE(review): on preemption
  # SkyPilot re-runs setup+run from scratch — confirm the annotation job
  # is safe to restart (no checkpointing is visible in this file).
  use_spot: true
  accelerators: A100:4  # 4x A100 for parallel annotation
  # Or use cheaper options: L4:8, V100:4

  memory: 64+  # at least 64 GB RAM (SkyPilot "+" means minimum)
  disk_size: 200  # GB; Need space for dataset + annotations

# One-time environment preparation; runs once per provisioned VM.
setup: |
  set -e

  echo "πŸ”§ Setting up annotation environment..."

  # Install dependencies
  # NOTE(review): apt-get update refreshes the package index but no
  # apt-get install follows — either a package install is missing or
  # this line is dead weight; confirm intent.
  sudo apt-get update -qq
  # PyTorch wheels pinned to the CUDA 11.8 index (cu118).
  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  pip install --quiet transformers datasets librosa soundfile accelerate
  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn pyarrow

  # Clone repo (idempotent: skipped if the directory already exists,
  # e.g. after a spot-instance restart)
  if [ ! -d "ensemble-tts-annotation" ]; then
    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
  fi

  # NOTE(review): this cd does not carry over to the run section —
  # SkyPilot executes setup and run in separate shells; run cd's again.
  cd ensemble-tts-annotation

  echo "βœ… Setup complete!"
  nvidia-smi

# Main job: download the Orpheus dataset, run ensemble annotation,
# print statistics, and push the annotated dataset to the HF Hub.
run: |
  # Fail fast: without this, a failed download or annotation step would
  # not abort the script, and a stale/partial parquet could be uploaded.
  set -e

  cd ensemble-tts-annotation

  # NOTE(review): GPU_COUNT is only echoed below — actual multi-GPU
  # parallelism (if any) is internal to annotate_ensemble.py; confirm.
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "πŸš€ Annotating Orpheus dataset with $GPU_COUNT GPUs"
  echo "================================================"

  # Download Orpheus dataset
  echo "πŸ“₯ Downloading Orpheus TTS dataset..."
  python -c "
from datasets import load_dataset
import os

print('Loading dataset...')
dataset = load_dataset('marcosremar2/orpheus-tts-portuguese-dataset', split='train')
print(f'βœ“ Loaded {len(dataset)} samples')

# Save locally for faster access
os.makedirs('data/raw/orpheus/', exist_ok=True)
dataset.save_to_disk('data/raw/orpheus/dataset')
print('βœ“ Saved locally')
"

  # Annotate with ensemble (parallel processing)
  echo "🎯 Running ensemble annotation..."
  python scripts/ensemble/annotate_ensemble.py \
    --input data/raw/orpheus/dataset \
    --mode balanced \
    --device cuda \
    --batch-size 32 \
    --num-workers 8 \
    --output data/annotated/orpheus_annotated.parquet

  echo "βœ… Annotation complete!"
  echo "================================================"

  # Statistics (reads back the parquet written by the annotation step)
  echo "πŸ“Š Annotation statistics:"
  python -c "
import pandas as pd

df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
print(f'Total samples: {len(df)}')
print(f'\nEmotion distribution:')
print(df['emotion'].value_counts())
print(f'\nConfidence statistics:')
print(df['emotion_confidence'].describe())
"

  # Upload to HuggingFace
  # NOTE(review): push_to_hub requires HF credentials on the remote VM
  # (HF_TOKEN env var or a prior huggingface-cli login) — confirm the
  # task is launched with the token forwarded (e.g. SkyPilot envs).
  echo "πŸ“€ Uploading annotated dataset to HuggingFace..."
  python -c "
from datasets import Dataset
import pandas as pd

df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
dataset = Dataset.from_pandas(df)

# Push to HuggingFace Hub
dataset.push_to_hub(
    'marcosremar2/orpheus-tts-portuguese-annotated',
    private=False
)
print('βœ“ Uploaded to HuggingFace!')
"

  echo "================================================"
  echo "βœ… Complete! Annotated dataset available at:"
  echo "   https://huggingface.co/datasets/marcosremar2/orpheus-tts-portuguese-annotated"

# File mounts (if dataset is pre-stored in cloud)
# file_mounts:
#   /data/orpheus:
#     source: gs://my-bucket/orpheus-dataset/
#     mode: MOUNT

# Single-node job: the run section executes on exactly one VM.
num_nodes: 1