Reza2kn committed on
Commit
65d5dac
·
verified ·
1 Parent(s): 6ca1775

Auto-resume ZeroGPU runs and cap shard size

Browse files
Files changed (1) hide show
  1. app.py +112 -1
app.py CHANGED
@@ -6,6 +6,7 @@ import shutil
6
  import subprocess
7
  import sys
8
  import tempfile
 
9
  import urllib.request
10
  from datetime import datetime
11
  from pathlib import Path
@@ -50,6 +51,13 @@ AUDIO_EXTENSIONS = (".wav", ".mp3", ".flac")
50
  DEFAULT_TARGET_DBFS = -20.0
51
  DEFAULT_MAX_BOOST_DB = 20.0
52
  DEFAULT_MAX_ATTEN_DB = 10.0
 
 
 
 
 
 
 
53
 
54
 
55
  def log_progress(message: str, level: int = 1, enabled: bool = True) -> None:
@@ -749,8 +757,21 @@ def default_output_repo(source_id: str, username: str) -> str:
749
  return f"{username}/{name}"
750
 
751
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752
  @gpu_decorator(DEFAULT_GPU_DURATION)
753
- def process_dataset_and_push(
754
  dataset_id: str,
755
  config: str,
756
  split: str,
@@ -794,6 +815,22 @@ def process_dataset_and_push(
794
  if max_shards_per_run and max_shards_per_run > 0
795
  else None
796
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
797
 
798
  api = HfApi(token=token)
799
  username = api.whoami()["name"]
@@ -1090,6 +1127,75 @@ def process_dataset_and_push(
1090
  )
1091
 
1092
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1093
  def assemble_cached_dataset_and_push(
1094
  dataset_id: str,
1095
  config: str,
@@ -1345,6 +1451,10 @@ with gr.Blocks(title="Representation Chizzler") as demo:
1345
  resume_checkbox = gr.Checkbox(
1346
  label="Resume from cached shards", value=True
1347
  )
 
 
 
 
1348
  cache_to_hub_checkbox = gr.Checkbox(
1349
  label="Cache shards on Hub (recommended for ZeroGPU)",
1350
  value=DEFAULT_CACHE_TO_HUB,
@@ -1419,6 +1529,7 @@ with gr.Blocks(title="Representation Chizzler") as demo:
1419
  max_atten_slider_ds,
1420
  max_examples_input,
1421
  resume_checkbox,
 
1422
  shard_size_input,
1423
  cache_to_hub_checkbox,
1424
  max_shards_input,
 
6
  import subprocess
7
  import sys
8
  import tempfile
9
+ import time
10
  import urllib.request
11
  from datetime import datetime
12
  from pathlib import Path
 
51
  DEFAULT_TARGET_DBFS = -20.0
52
  DEFAULT_MAX_BOOST_DB = 20.0
53
  DEFAULT_MAX_ATTEN_DB = 10.0
54
# On a Hugging Face Space the SPACE_ID env var is set, so default the
# auto-resume behaviour on there (ZeroGPU jobs are routinely preempted).
DEFAULT_AUTO_RESUME = bool(os.getenv("SPACE_ID"))
# Cap applied to the user-chosen shard size in ZeroGPU safe mode;
# overridable through the environment.
DEFAULT_ZERO_GPU_SHARD_SIZE = int(
    os.getenv("CHIZZLER_ZERO_GPU_SHARD_SIZE", "10")
)
# Cap applied to the number of shards processed per GPU run in
# ZeroGPU safe mode; overridable through the environment.
DEFAULT_ZERO_GPU_MAX_SHARDS = int(
    os.getenv("CHIZZLER_ZERO_GPU_MAX_SHARDS", "1")
)
61
 
62
 
63
  def log_progress(message: str, level: int = 1, enabled: bool = True) -> None:
 
757
  return f"{username}/{name}"
758
 
759
 
760
+ def _apply_zero_gpu_limits(
761
+ shard_size: int, max_shards: Optional[int]
762
+ ) -> Tuple[int, Optional[int]]:
763
+ if not os.getenv("SPACE_ID"):
764
+ return shard_size, max_shards
765
+ adjusted_shard_size = min(shard_size, DEFAULT_ZERO_GPU_SHARD_SIZE)
766
+ if max_shards is None:
767
+ adjusted_max_shards = DEFAULT_ZERO_GPU_MAX_SHARDS
768
+ else:
769
+ adjusted_max_shards = min(max_shards, DEFAULT_ZERO_GPU_MAX_SHARDS)
770
+ return adjusted_shard_size, adjusted_max_shards
771
+
772
+
773
  @gpu_decorator(DEFAULT_GPU_DURATION)
774
+ def _process_dataset_and_push_gpu(
775
  dataset_id: str,
776
  config: str,
777
  split: str,
 
815
  if max_shards_per_run and max_shards_per_run > 0
816
  else None
817
  )
818
+ if os.getenv("SPACE_ID"):
819
+ adjusted_shard_size, adjusted_max_shards = _apply_zero_gpu_limits(
820
+ shard_size_int, max_shards_int
821
+ )
822
+ if adjusted_shard_size != shard_size_int:
823
+ log_progress(
824
+ f"ZeroGPU safe mode: shard size capped at {adjusted_shard_size}",
825
+ 2,
826
+ )
827
+ shard_size_int = adjusted_shard_size
828
+ if adjusted_max_shards != max_shards_int:
829
+ log_progress(
830
+ f"ZeroGPU safe mode: max shards per run capped at {adjusted_max_shards}",
831
+ 2,
832
+ )
833
+ max_shards_int = adjusted_max_shards
834
 
835
  api = HfApi(token=token)
836
  username = api.whoami()["name"]
 
1127
  )
1128
 
1129
 
1130
def process_dataset_and_push(
    dataset_id: str,
    config: str,
    split: str,
    audio_column: str,
    output_repo: str,
    private_repo: bool,
    vad_threshold: float,
    max_silence_gap: float,
    normalize_audio: bool,
    target_dbfs: float,
    max_boost_db: float,
    max_atten_db: float,
    max_examples: Optional[float],
    resume_processing: bool,
    auto_resume: bool,
    shard_size: Optional[float],
    cache_on_hub: bool,
    max_shards_per_run: Optional[float],
    progress=gr.Progress(),
) -> str:
    """Run the GPU processing job, auto-resuming across ZeroGPU limits.

    Wraps ``_process_dataset_and_push_gpu`` and, when ``auto_resume`` is
    enabled, transparently re-invokes it after a ZeroGPU preemption
    ("GPU task aborted") or after a partial run that reports
    "Resume with cached shards", until the job completes.

    Returns the final status string from the GPU worker.
    Raises whatever the GPU worker raises once retries are exhausted or
    the failure is not a ZeroGPU preemption.
    """
    # Safety valve: a persistently preempted (or never-finishing) job must
    # not spin forever. 1000 retries is far beyond any realistic shard
    # count, so legitimate long jobs are unaffected.
    max_attempts = 1000
    attempts = 0
    while True:
        try:
            result = _process_dataset_and_push_gpu(
                dataset_id,
                config,
                split,
                audio_column,
                output_repo,
                private_repo,
                vad_threshold,
                max_silence_gap,
                normalize_audio,
                target_dbfs,
                max_boost_db,
                max_atten_db,
                max_examples,
                resume_processing,
                shard_size,
                cache_on_hub,
                max_shards_per_run,
                progress=progress,
            )
        except Exception as exc:
            message = str(exc)
            if (
                auto_resume
                and "GPU task aborted" in message
                and attempts < max_attempts
            ):
                attempts += 1
                log_progress(
                    f"ZeroGPU preempted. Retrying (attempt {attempts})...",
                    2,
                )
                time.sleep(2)
                # Continuation runs must reuse the cached shards, otherwise
                # each retry restarts from scratch and no progress ever
                # accumulates when resume was unchecked.
                resume_processing = True
                continue
            raise

        if not auto_resume:
            return result
        if "Resume with cached shards" in result and attempts < max_attempts:
            attempts += 1
            log_progress(
                f"Auto-resume: continuing (attempt {attempts})...",
                2,
            )
            time.sleep(2)
            # As above: the next pass must pick up where this one stopped.
            resume_processing = True
            continue
        return result
1198
+
1199
  def assemble_cached_dataset_and_push(
1200
  dataset_id: str,
1201
  config: str,
 
1451
  resume_checkbox = gr.Checkbox(
1452
  label="Resume from cached shards", value=True
1453
  )
1454
+ auto_resume_checkbox = gr.Checkbox(
1455
+ label="Auto-resume on ZeroGPU preemption",
1456
+ value=DEFAULT_AUTO_RESUME,
1457
+ )
1458
  cache_to_hub_checkbox = gr.Checkbox(
1459
  label="Cache shards on Hub (recommended for ZeroGPU)",
1460
  value=DEFAULT_CACHE_TO_HUB,
 
1529
  max_atten_slider_ds,
1530
  max_examples_input,
1531
  resume_checkbox,
1532
+ auto_resume_checkbox,
1533
  shard_size_input,
1534
  cache_to_hub_checkbox,
1535
  max_shards_input,