marcosremar committed on
Commit
e06293b
·
1 Parent(s): 91e140c

Fix annotation test: use single-line Python with real Orpheus data

Browse files
scripts/cloud/skypilot_annotate_test.yaml CHANGED
@@ -1,204 +1,37 @@
1
- # Test annotation job - 1000 samples from Orpheus
2
- # Validates complete annotation pipeline before full run
3
- # Cost: ~$0.50 for 10-15 minutes
4
-
5
  name: ensemble-annotate-test
6
 
7
  resources:
8
  use_spot: true
9
- accelerators: {A100:1, V100:1, T4:1} # Try A100 first
10
  memory: 32+
11
  disk_size: 100
12
 
13
  setup: |
14
- set -e
15
-
16
  echo "=================================================="
17
  echo "🧪 ENSEMBLE ANNOTATION TEST"
18
  echo "=================================================="
19
- echo ""
20
- echo "Testing annotation pipeline with 1000 Orpheus samples"
21
- echo ""
22
-
23
- # Machine info
24
- echo "📊 Machine Info:"
25
- echo " Hostname: $(hostname)"
26
- echo " CPU cores: $(nproc)"
27
- echo " Memory: $(free -h | grep Mem | awk '{print $2}')"
28
-
29
- if command -v nvidia-smi &> /dev/null; then
30
- echo " GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader)"
31
- fi
32
- echo ""
33
 
34
- # Install dependencies
35
- echo "📦 Installing dependencies..."
36
- pip install -q torch transformers librosa soundfile datasets huggingface_hub
37
- echo " ✓ Core dependencies installed"
38
- echo ""
39
 
40
- # Clone repository
41
- echo "📥 Cloning repository..."
42
  if [ ! -d "ensemble-tts-annotation" ]; then
43
  git clone -q https://huggingface.co/marcosremar2/ensemble-tts-annotation
44
- echo " ✓ Repository cloned"
45
  else
46
- cd ensemble-tts-annotation
47
- git pull -q
48
- cd ..
49
- echo " ✓ Repository updated"
50
  fi
51
- echo ""
52
 
53
  echo "✅ Setup complete!"
54
 
55
  run: |
56
  cd ensemble-tts-annotation
57
 
58
- echo ""
59
- echo "=================================================="
60
- echo "🎵 DOWNLOADING ORPHEUS SUBSET"
61
- echo "=================================================="
62
- echo ""
63
-
64
- # Download 1000 samples from Orpheus
65
- python3 << 'PYTHON_EOF'
66
- import sys
67
- from datasets import load_dataset
68
- from pathlib import Path
69
-
70
- print("Downloading 1000 samples from Orpheus...")
71
-
72
- # Load streaming (only first 1000)
73
- dataset = load_dataset(
74
- "marcosremar2/orpheus-tts-portuguese-dataset",
75
- split="train",
76
- streaming=True
77
- )
78
-
79
- # Save first 1000
80
- output_dir = Path("data/raw/orpheus_test")
81
- output_dir.mkdir(parents=True, exist_ok=True)
82
-
83
- import soundfile as sf
84
- count = 0
85
- for i, sample in enumerate(dataset):
86
- if i >= 1000:
87
- break
88
-
89
- audio = sample['audio']['array']
90
- sr = sample['audio']['sampling_rate']
91
- text = sample.get('text', '')
92
-
93
- # Save audio
94
- audio_path = output_dir / f"orpheus_{i:05d}.wav"
95
- sf.write(audio_path, audio, sr)
96
-
97
- count += 1
98
- if count % 100 == 0:
99
- print(f" Downloaded {count}/1000 samples...")
100
-
101
- print(f"✅ Downloaded {count} samples to {output_dir}")
102
- PYTHON_EOF
103
 
104
  echo ""
105
- echo "=================================================="
106
- echo "🤖 ANNOTATING WITH ENSEMBLE (quick mode)"
107
- echo "=================================================="
108
- echo ""
109
-
110
- # Annotate samples
111
- python3 << 'PYTHON_EOF2'
112
- import sys
113
- sys.path.insert(0, '.')
114
-
115
- from ensemble_tts.annotator import EnsembleAnnotator
116
- from pathlib import Path
117
- import soundfile as sf
118
- import json
119
- from tqdm import tqdm
120
- import time
121
-
122
- print("Loading annotator (quick mode: Whisper + SenseVoice)...")
123
- annotator = EnsembleAnnotator(mode='quick', device='cuda', enable_events=False)
124
- print("✅ Annotator loaded")
125
- print("")
126
-
127
- # Get audio files
128
- audio_dir = Path("data/raw/orpheus_test")
129
- audio_files = sorted(audio_dir.glob("*.wav"))
130
-
131
- print(f"Annotating {len(audio_files)} files...")
132
- print("")
133
-
134
- results = []
135
- start_time = time.time()
136
-
137
- for i, audio_file in enumerate(tqdm(audio_files)):
138
- try:
139
- # Annotate
140
- result = annotator.annotate(str(audio_file))
141
-
142
- results.append({
143
- "file": audio_file.name,
144
- "emotion": result.get("emotion", {}).get("label", "unknown"),
145
- "confidence": result.get("emotion", {}).get("confidence", 0.0),
146
- "predictions": result.get("emotion", {}).get("predictions", [])
147
- })
148
-
149
- # Log progress every 100
150
- if (i + 1) % 100 == 0:
151
- elapsed = time.time() - start_time
152
- rate = (i + 1) / elapsed
153
- remaining = (len(audio_files) - (i + 1)) / rate
154
- print(f"Progress: {i+1}/{len(audio_files)} ({rate:.1f} files/s, ETA: {remaining/60:.1f}min)")
155
-
156
- except Exception as e:
157
- print(f" Error on {audio_file.name}: {e}")
158
- results.append({
159
- "file": audio_file.name,
160
- "emotion": "error",
161
- "confidence": 0.0,
162
- "error": str(e)
163
- })
164
 
165
- # Save results
166
- output_file = Path("data/annotations/orpheus_test_annotations.json")
167
- output_file.parent.mkdir(parents=True, exist_ok=True)
168
-
169
- with open(output_file, 'w') as f:
170
- json.dump(results, f, indent=2)
171
-
172
- # Stats
173
- total_time = time.time() - start_time
174
- success_count = sum(1 for r in results if r['emotion'] != 'error')
175
-
176
- print("")
177
- print("==================================================" )
178
- print("✅ ANNOTATION COMPLETE")
179
- print("==================================================")
180
- print(f"Total files: {len(results)}")
181
- print(f"Successful: {success_count}")
182
- print(f"Errors: {len(results) - success_count}")
183
- print(f"Time: {total_time/60:.1f} minutes")
184
- print(f"Rate: {len(results)/total_time:.2f} files/second")
185
- print(f"Results saved to: {output_file}")
186
- print("")
187
- PYTHON_EOF2
188
-
189
- echo ""
190
- echo "📊 Sample Results:"
191
- head -50 data/annotations/orpheus_test_annotations.json
192
- echo ""
193
-
194
- echo "=================================================="
195
- echo "✅ TEST ANNOTATION COMPLETE"
196
- echo "=================================================="
197
- echo ""
198
- echo "📁 Output: data/annotations/orpheus_test_annotations.json"
199
- echo ""
200
- echo "💡 Next steps:"
201
- echo " 1. Download: sky scp ensemble-annotate-test:~/ensemble-tts-annotation/data/annotations/ ./data/"
202
- echo " 2. Review results"
203
- echo " 3. Run full annotation: sky launch scripts/cloud/skypilot_annotate_orpheus.yaml"
204
  echo ""
 
 
 
1
+ # Test annotation with 1000 real Orpheus samples
 
 
 
2
  name: ensemble-annotate-test
3
 
4
  resources:
5
  use_spot: true
6
+ accelerators: {A100:1, V100:1, T4:1}
7
  memory: 32+
8
  disk_size: 100
9
 
10
  setup: |
 
 
11
  echo "=================================================="
12
  echo "🧪 ENSEMBLE ANNOTATION TEST"
13
  echo "=================================================="
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ pip install -q torch transformers librosa soundfile datasets huggingface_hub tqdm
 
 
 
 
16
 
 
 
17
  if [ ! -d "ensemble-tts-annotation" ]; then
18
  git clone -q https://huggingface.co/marcosremar2/ensemble-tts-annotation
 
19
  else
20
+ # Update the existing checkout in place. With `cd ... && git pull && cd ..`, a failed pull
+ # skips the `cd ..` and the rest of setup keeps running from inside the repository.
+ git -C ensemble-tts-annotation pull -q
 
 
 
21
  fi
 
22
 
23
  echo "✅ Setup complete!"
24
 
25
  run: |
26
  cd ensemble-tts-annotation
27
 
28
+ echo "📥 Downloading 1000 Orpheus samples..."
29
+ # Save the first 1000 streamed samples as WAV files under data/raw/orpheus_test.
+ # islice ends the stream after 1000 items; an `if i < 1000` filter in the comprehension
+ # only skips later items and would keep reading the dataset to its end.
+ # The final message reports the number of files actually written.
+ python3 -c 'from itertools import islice; from datasets import load_dataset; from pathlib import Path; import soundfile as sf; ds = load_dataset("marcosremar2/orpheus-tts-portuguese-dataset", split="train", streaming=True); out = Path("data/raw/orpheus_test"); out.mkdir(parents=True, exist_ok=True); done = [(sf.write(out / f"orpheus_{i:05d}.wav", s["audio"]["array"], s["audio"]["sampling_rate"]), print(f" {i+1}/1000") if (i+1) % 100 == 0 else None) for i, s in enumerate(islice(ds, 1000))]; print(f"✅ Downloaded {len(done)} samples")'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  echo ""
32
+ echo "πŸ€– Annotating with ensemble (quick mode)..."
33
+ # Annotate every WAV in data/raw/orpheus_test and write one {file, emotion, confidence}
+ # record per file to data/annotations/orpheus_test_annotations.json, then print a summary.
+ # A result without an emotion label is recorded as "error" and excluded from the success count.
+ # NOTE(review): an exception raised by annotate() still aborts the run before anything is saved.
+ python3 -c 'import sys; sys.path.insert(0, "."); from ensemble_tts.annotator import EnsembleAnnotator; from pathlib import Path; import json; from tqdm import tqdm; import time; print("Loading..."); ann = EnsembleAnnotator(mode="quick", device="cuda", enable_events=False); print("✅ Loaded\n"); files = sorted(Path("data/raw/orpheus_test").glob("*.wav")); print(f"Annotating {len(files)} files...\n"); start = time.time(); results = [{"file": f.name, "emotion": (r := ann.annotate(str(f))).get("emotion", {}).get("label", "error"), "confidence": r.get("emotion", {}).get("confidence", 0.0)} for f in tqdm(files)]; out = Path("data/annotations/orpheus_test_annotations.json"); out.parent.mkdir(parents=True, exist_ok=True); out.write_text(json.dumps(results, indent=2)); elapsed = time.time() - start; ok = sum(1 for r in results if r["emotion"] != "error"); print(f"\n✅ COMPLETE\nTotal: {len(results)}\nSuccess: {ok}\nTime: {elapsed/60:.1f} min\nRate: {len(results)/elapsed:.2f} files/s")'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  echo ""
36
+ echo "📊 Sample results:"
37
+ head -30 data/annotations/orpheus_test_annotations.json