Reza2kn committed on
Commit
65d5dac
·
verified ·
1 Parent(s): 6ca1775

Auto-resume ZeroGPU runs and cap shard size

Browse files
Files changed (1) hide show
  1. app.py +112 -1
app.py CHANGED
@@ -6,6 +6,7 @@ import shutil
6
  import subprocess
7
  import sys
8
  import tempfile
 
9
  import urllib.request
10
  from datetime import datetime
11
  from pathlib import Path
@@ -50,6 +51,13 @@ AUDIO_EXTENSIONS = (".wav", ".mp3", ".flac")
50
  DEFAULT_TARGET_DBFS = -20.0
51
  DEFAULT_MAX_BOOST_DB = 20.0
52
  DEFAULT_MAX_ATTEN_DB = 10.0
 
 
 
 
 
 
 
53
 
54
 
55
  def log_progress(message: str, level: int = 1, enabled: bool = True) -> None:
@@ -749,8 +757,21 @@ def default_output_repo(source_id: str, username: str) -> str:
749
  return f"{username}/{name}"
750
 
751
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752
  @gpu_decorator(DEFAULT_GPU_DURATION)
753
- def process_dataset_and_push(
754
  dataset_id: str,
755
  config: str,
756
  split: str,
@@ -794,6 +815,22 @@ def process_dataset_and_push(
794
  if max_shards_per_run and max_shards_per_run > 0
795
  else None
796
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
797
 
798
  api = HfApi(token=token)
799
  username = api.whoami()["name"]
@@ -1090,6 +1127,75 @@ def process_dataset_and_push(
1090
  )
1091
 
1092
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1093
  def assemble_cached_dataset_and_push(
1094
  dataset_id: str,
1095
  config: str,
@@ -1345,6 +1451,10 @@ with gr.Blocks(title="Representation Chizzler") as demo:
1345
  resume_checkbox = gr.Checkbox(
1346
  label="Resume from cached shards", value=True
1347
  )
 
 
 
 
1348
  cache_to_hub_checkbox = gr.Checkbox(
1349
  label="Cache shards on Hub (recommended for ZeroGPU)",
1350
  value=DEFAULT_CACHE_TO_HUB,
@@ -1419,6 +1529,7 @@ with gr.Blocks(title="Representation Chizzler") as demo:
1419
  max_atten_slider_ds,
1420
  max_examples_input,
1421
  resume_checkbox,
 
1422
  shard_size_input,
1423
  cache_to_hub_checkbox,
1424
  max_shards_input,
 
6
  import subprocess
7
  import sys
8
  import tempfile
9
+ import time
10
  import urllib.request
11
  from datetime import datetime
12
  from pathlib import Path
 
51
  DEFAULT_TARGET_DBFS = -20.0
52
  DEFAULT_MAX_BOOST_DB = 20.0
53
  DEFAULT_MAX_ATTEN_DB = 10.0
54
# On a Hugging Face Space the SPACE_ID env var is set, so default the
# auto-resume behaviour on there (ZeroGPU jobs are routinely preempted).
DEFAULT_AUTO_RESUME = bool(os.getenv("SPACE_ID"))
# Cap applied to the user-chosen shard size in ZeroGPU safe mode;
# overridable through the environment.
DEFAULT_ZERO_GPU_SHARD_SIZE = int(
    os.getenv("CHIZZLER_ZERO_GPU_SHARD_SIZE", "10")
)
# Cap applied to the number of shards processed per GPU run in
# ZeroGPU safe mode; overridable through the environment.
DEFAULT_ZERO_GPU_MAX_SHARDS = int(
    os.getenv("CHIZZLER_ZERO_GPU_MAX_SHARDS", "1")
)
61
 
62
 
63
  def log_progress(message: str, level: int = 1, enabled: bool = True) -> None:
 
757
  return f"{username}/{name}"
758
 
759
 
760
+ def _apply_zero_gpu_limits(
761
+ shard_size: int, max_shards: Optional[int]
762
+ ) -> Tuple[int, Optional[int]]:
763
+ if not os.getenv("SPACE_ID"):
764
+ return shard_size, max_shards
765
+ adjusted_shard_size = min(shard_size, DEFAULT_ZERO_GPU_SHARD_SIZE)
766
+ if max_shards is None:
767
+ adjusted_max_shards = DEFAULT_ZERO_GPU_MAX_SHARDS
768
+ else:
769
+ adjusted_max_shards = min(max_shards, DEFAULT_ZERO_GPU_MAX_SHARDS)
770
+ return adjusted_shard_size, adjusted_max_shards
771
+
772
+
773
  @gpu_decorator(DEFAULT_GPU_DURATION)
774
+ def _process_dataset_and_push_gpu(
775
  dataset_id: str,
776
  config: str,
777
  split: str,
 
815
  if max_shards_per_run and max_shards_per_run > 0
816
  else None
817
  )
818
+ if os.getenv("SPACE_ID"):
819
+ adjusted_shard_size, adjusted_max_shards = _apply_zero_gpu_limits(
820
+ shard_size_int, max_shards_int
821
+ )
822
+ if adjusted_shard_size != shard_size_int:
823
+ log_progress(
824
+ f"ZeroGPU safe mode: shard size capped at {adjusted_shard_size}",
825
+ 2,
826
+ )
827
+ shard_size_int = adjusted_shard_size
828
+ if adjusted_max_shards != max_shards_int:
829
+ log_progress(
830
+ f"ZeroGPU safe mode: max shards per run capped at {adjusted_max_shards}",
831
+ 2,
832
+ )
833
+ max_shards_int = adjusted_max_shards
834
 
835
  api = HfApi(token=token)
836
  username = api.whoami()["name"]
 
1127
  )
1128
 
1129
 
1130
def process_dataset_and_push(
    dataset_id: str,
    config: str,
    split: str,
    audio_column: str,
    output_repo: str,
    private_repo: bool,
    vad_threshold: float,
    max_silence_gap: float,
    normalize_audio: bool,
    target_dbfs: float,
    max_boost_db: float,
    max_atten_db: float,
    max_examples: Optional[float],
    resume_processing: bool,
    auto_resume: bool,
    shard_size: Optional[float],
    cache_on_hub: bool,
    max_shards_per_run: Optional[float],
    progress=gr.Progress(),
) -> str:
    """Run the GPU processing job, auto-resuming across ZeroGPU limits.

    Wraps ``_process_dataset_and_push_gpu`` and, when ``auto_resume`` is
    enabled, transparently re-invokes it after a ZeroGPU preemption
    ("GPU task aborted") or after a partial run that reports
    "Resume with cached shards", until the job completes.

    Returns the final status string from the GPU worker.
    Raises whatever the GPU worker raises once retries are exhausted or
    the failure is not a ZeroGPU preemption.
    """
    # Safety valve: a persistently preempted (or never-finishing) job must
    # not spin forever. 1000 retries is far beyond any realistic shard
    # count, so legitimate long jobs are unaffected.
    max_attempts = 1000
    attempts = 0
    while True:
        try:
            result = _process_dataset_and_push_gpu(
                dataset_id,
                config,
                split,
                audio_column,
                output_repo,
                private_repo,
                vad_threshold,
                max_silence_gap,
                normalize_audio,
                target_dbfs,
                max_boost_db,
                max_atten_db,
                max_examples,
                resume_processing,
                shard_size,
                cache_on_hub,
                max_shards_per_run,
                progress=progress,
            )
        except Exception as exc:
            message = str(exc)
            if (
                auto_resume
                and "GPU task aborted" in message
                and attempts < max_attempts
            ):
                attempts += 1
                log_progress(
                    f"ZeroGPU preempted. Retrying (attempt {attempts})...",
                    2,
                )
                time.sleep(2)
                # Continuation runs must reuse the cached shards, otherwise
                # each retry restarts from scratch and no progress ever
                # accumulates when resume was unchecked.
                resume_processing = True
                continue
            raise

        if not auto_resume:
            return result
        if "Resume with cached shards" in result and attempts < max_attempts:
            attempts += 1
            log_progress(
                f"Auto-resume: continuing (attempt {attempts})...",
                2,
            )
            time.sleep(2)
            # As above: the next pass must pick up where this one stopped.
            resume_processing = True
            continue
        return result
1198
+
1199
  def assemble_cached_dataset_and_push(
1200
  dataset_id: str,
1201
  config: str,
 
1451
  resume_checkbox = gr.Checkbox(
1452
  label="Resume from cached shards", value=True
1453
  )
1454
+ auto_resume_checkbox = gr.Checkbox(
1455
+ label="Auto-resume on ZeroGPU preemption",
1456
+ value=DEFAULT_AUTO_RESUME,
1457
+ )
1458
  cache_to_hub_checkbox = gr.Checkbox(
1459
  label="Cache shards on Hub (recommended for ZeroGPU)",
1460
  value=DEFAULT_CACHE_TO_HUB,
 
1529
  max_atten_slider_ds,
1530
  max_examples_input,
1531
  resume_checkbox,
1532
+ auto_resume_checkbox,
1533
  shard_size_input,
1534
  cache_to_hub_checkbox,
1535
  max_shards_input,