Spaces:
Running on Zero
Running on Zero
Auto-resume ZeroGPU runs and cap shard size
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import shutil
|
|
| 6 |
import subprocess
|
| 7 |
import sys
|
| 8 |
import tempfile
|
|
|
|
| 9 |
import urllib.request
|
| 10 |
from datetime import datetime
|
| 11 |
from pathlib import Path
|
|
@@ -50,6 +51,13 @@ AUDIO_EXTENSIONS = (".wav", ".mp3", ".flac")
|
|
| 50 |
DEFAULT_TARGET_DBFS = -20.0
|
| 51 |
DEFAULT_MAX_BOOST_DB = 20.0
|
| 52 |
DEFAULT_MAX_ATTEN_DB = 10.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
def log_progress(message: str, level: int = 1, enabled: bool = True) -> None:
|
|
@@ -749,8 +757,21 @@ def default_output_repo(source_id: str, username: str) -> str:
|
|
| 749 |
return f"{username}/{name}"
|
| 750 |
|
| 751 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 752 |
@gpu_decorator(DEFAULT_GPU_DURATION)
|
| 753 |
-
def
|
| 754 |
dataset_id: str,
|
| 755 |
config: str,
|
| 756 |
split: str,
|
|
@@ -794,6 +815,22 @@ def process_dataset_and_push(
|
|
| 794 |
if max_shards_per_run and max_shards_per_run > 0
|
| 795 |
else None
|
| 796 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 797 |
|
| 798 |
api = HfApi(token=token)
|
| 799 |
username = api.whoami()["name"]
|
|
@@ -1090,6 +1127,75 @@ def process_dataset_and_push(
|
|
| 1090 |
)
|
| 1091 |
|
| 1092 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1093 |
def assemble_cached_dataset_and_push(
|
| 1094 |
dataset_id: str,
|
| 1095 |
config: str,
|
|
@@ -1345,6 +1451,10 @@ with gr.Blocks(title="Representation Chizzler") as demo:
|
|
| 1345 |
resume_checkbox = gr.Checkbox(
|
| 1346 |
label="Resume from cached shards", value=True
|
| 1347 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1348 |
cache_to_hub_checkbox = gr.Checkbox(
|
| 1349 |
label="Cache shards on Hub (recommended for ZeroGPU)",
|
| 1350 |
value=DEFAULT_CACHE_TO_HUB,
|
|
@@ -1419,6 +1529,7 @@ with gr.Blocks(title="Representation Chizzler") as demo:
|
|
| 1419 |
max_atten_slider_ds,
|
| 1420 |
max_examples_input,
|
| 1421 |
resume_checkbox,
|
|
|
|
| 1422 |
shard_size_input,
|
| 1423 |
cache_to_hub_checkbox,
|
| 1424 |
max_shards_input,
|
|
|
|
| 6 |
import subprocess
|
| 7 |
import sys
|
| 8 |
import tempfile
|
| 9 |
+
import time
|
| 10 |
import urllib.request
|
| 11 |
from datetime import datetime
|
| 12 |
from pathlib import Path
|
|
|
|
| 51 |
DEFAULT_TARGET_DBFS = -20.0
|
| 52 |
DEFAULT_MAX_BOOST_DB = 20.0
|
| 53 |
DEFAULT_MAX_ATTEN_DB = 10.0
|
| 54 |
+
def _env_positive_int(name: str, default: int) -> int:
    """Read a strictly positive integer override from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the override is unusable.

    Returns:
        The parsed value, or ``default`` when the variable is unset, empty,
        not a valid integer, or not strictly positive.
    """
    raw_value = os.getenv(name)
    if raw_value is None:
        return default
    try:
        parsed = int(raw_value)
    except ValueError:
        # A malformed override must not crash the app at import time.
        return default
    # Zero or negative caps would flow through min() in
    # _apply_zero_gpu_limits and yield a non-positive shard size/count.
    return parsed if parsed > 0 else default


# Auto-resume is on by default only when SPACE_ID is non-empty
# (presumably set by the Hugging Face Spaces runtime -- confirm).
DEFAULT_AUTO_RESUME = bool(os.getenv("SPACE_ID"))
# Upper bounds applied by _apply_zero_gpu_limits; both can be overridden
# through the environment variables named below.
DEFAULT_ZERO_GPU_SHARD_SIZE = _env_positive_int(
    "CHIZZLER_ZERO_GPU_SHARD_SIZE", 10
)
DEFAULT_ZERO_GPU_MAX_SHARDS = _env_positive_int(
    "CHIZZLER_ZERO_GPU_MAX_SHARDS", 1
)
|
| 61 |
|
| 62 |
|
| 63 |
def log_progress(message: str, level: int = 1, enabled: bool = True) -> None:
|
|
|
|
| 757 |
return f"{username}/{name}"
|
| 758 |
|
| 759 |
|
| 760 |
+
def _apply_zero_gpu_limits(
    shard_size: int, max_shards: Optional[int]
) -> Tuple[int, Optional[int]]:
    """Clamp shard settings to the ZeroGPU caps when ``SPACE_ID`` is set.

    Args:
        shard_size: Requested shard size.
        max_shards: Requested maximum shards per run, or ``None`` for no
            explicit limit.

    Returns:
        ``(shard_size, max_shards)`` unchanged when ``SPACE_ID`` is unset or
        empty. Otherwise the shard size is limited to
        ``DEFAULT_ZERO_GPU_SHARD_SIZE`` and the shard count to
        ``DEFAULT_ZERO_GPU_MAX_SHARDS``; a ``None`` shard count becomes the
        cap itself.
    """
    if os.getenv("SPACE_ID"):
        capped_size = min(shard_size, DEFAULT_ZERO_GPU_SHARD_SIZE)
        capped_shards = (
            DEFAULT_ZERO_GPU_MAX_SHARDS
            if max_shards is None
            else min(max_shards, DEFAULT_ZERO_GPU_MAX_SHARDS)
        )
        return capped_size, capped_shards
    # Outside a Space the caller's values pass through untouched.
    return shard_size, max_shards
|
| 771 |
+
|
| 772 |
+
|
| 773 |
@gpu_decorator(DEFAULT_GPU_DURATION)
|
| 774 |
+
def _process_dataset_and_push_gpu(
|
| 775 |
dataset_id: str,
|
| 776 |
config: str,
|
| 777 |
split: str,
|
|
|
|
| 815 |
if max_shards_per_run and max_shards_per_run > 0
|
| 816 |
else None
|
| 817 |
)
|
| 818 |
+
if os.getenv("SPACE_ID"):
|
| 819 |
+
adjusted_shard_size, adjusted_max_shards = _apply_zero_gpu_limits(
|
| 820 |
+
shard_size_int, max_shards_int
|
| 821 |
+
)
|
| 822 |
+
if adjusted_shard_size != shard_size_int:
|
| 823 |
+
log_progress(
|
| 824 |
+
f"ZeroGPU safe mode: shard size capped at {adjusted_shard_size}",
|
| 825 |
+
2,
|
| 826 |
+
)
|
| 827 |
+
shard_size_int = adjusted_shard_size
|
| 828 |
+
if adjusted_max_shards != max_shards_int:
|
| 829 |
+
log_progress(
|
| 830 |
+
f"ZeroGPU safe mode: max shards per run capped at {adjusted_max_shards}",
|
| 831 |
+
2,
|
| 832 |
+
)
|
| 833 |
+
max_shards_int = adjusted_max_shards
|
| 834 |
|
| 835 |
api = HfApi(token=token)
|
| 836 |
username = api.whoami()["name"]
|
|
|
|
| 1127 |
)
|
| 1128 |
|
| 1129 |
|
| 1130 |
+
def process_dataset_and_push(
    dataset_id: str,
    config: str,
    split: str,
    audio_column: str,
    output_repo: str,
    private_repo: bool,
    vad_threshold: float,
    max_silence_gap: float,
    normalize_audio: bool,
    target_dbfs: float,
    max_boost_db: float,
    max_atten_db: float,
    max_examples: Optional[float],
    resume_processing: bool,
    auto_resume: bool,
    shard_size: Optional[float],
    cache_on_hub: bool,
    max_shards_per_run: Optional[float],
    progress=gr.Progress(),
) -> str:
    """Run ``_process_dataset_and_push_gpu`` and optionally re-run it.

    Every argument except ``auto_resume`` is forwarded unchanged to
    ``_process_dataset_and_push_gpu``. With ``auto_resume`` enabled the call
    is repeated, after a two-second pause, when either:

    * it raises an exception whose text contains ``"GPU task aborted"``, or
    * it returns a message containing ``"Resume with cached shards"``.

    NOTE: the loop has no upper bound on the number of attempts.

    Returns:
        The status string from the last GPU call.

    Raises:
        Exception: any error from the GPU call that is not retried (always
            the case when ``auto_resume`` is false).
    """
    # Built once: none of these values change between attempts.
    forwarded = (
        dataset_id,
        config,
        split,
        audio_column,
        output_repo,
        private_repo,
        vad_threshold,
        max_silence_gap,
        normalize_audio,
        target_dbfs,
        max_boost_db,
        max_atten_db,
        max_examples,
        resume_processing,
        shard_size,
        cache_on_hub,
        max_shards_per_run,
    )
    attempt_count = 0
    while True:
        try:
            outcome = _process_dataset_and_push_gpu(
                *forwarded, progress=progress
            )
        except Exception as err:
            error_text = str(err)
            if not (auto_resume and "GPU task aborted" in error_text):
                raise
            attempt_count += 1
            log_progress(
                f"ZeroGPU preempted. Retrying (attempt {attempt_count})...",
                2,
            )
            time.sleep(2)
            continue

        # The result text is only inspected when auto-resume is enabled.
        wants_resume = auto_resume and "Resume with cached shards" in outcome
        if not wants_resume:
            return outcome
        attempt_count += 1
        log_progress(
            f"Auto-resume: continuing (attempt {attempt_count})...",
            2,
        )
        time.sleep(2)
|
| 1197 |
+
|
| 1198 |
+
|
| 1199 |
def assemble_cached_dataset_and_push(
|
| 1200 |
dataset_id: str,
|
| 1201 |
config: str,
|
|
|
|
| 1451 |
resume_checkbox = gr.Checkbox(
|
| 1452 |
label="Resume from cached shards", value=True
|
| 1453 |
)
|
| 1454 |
+
auto_resume_checkbox = gr.Checkbox(
|
| 1455 |
+
label="Auto-resume on ZeroGPU preemption",
|
| 1456 |
+
value=DEFAULT_AUTO_RESUME,
|
| 1457 |
+
)
|
| 1458 |
cache_to_hub_checkbox = gr.Checkbox(
|
| 1459 |
label="Cache shards on Hub (recommended for ZeroGPU)",
|
| 1460 |
value=DEFAULT_CACHE_TO_HUB,
|
|
|
|
| 1529 |
max_atten_slider_ds,
|
| 1530 |
max_examples_input,
|
| 1531 |
resume_checkbox,
|
| 1532 |
+
auto_resume_checkbox,
|
| 1533 |
shard_size_input,
|
| 1534 |
cache_to_hub_checkbox,
|
| 1535 |
max_shards_input,
|