pedroapfilho commited on
Commit
eb72117
·
unverified ·
1 Parent(s): ad1969b

Add max files limit to HF dataset download

Browse files

List repo files first via HfApi, take first N audio files, download
individually with hf_hub_download, and symlink into working dir.
Avoids downloading all 18k files when only 50 are needed for training.

Files changed (2) hide show
  1. app.py +9 -3
  2. src/lora_trainer.py +49 -20
app.py CHANGED
@@ -323,7 +323,7 @@ def lora_upload_and_scan(files, training_state):
323
  return f"Error: {e}", training_state or {}
324
 
325
 
326
- def lora_download_hf(dataset_id, hf_token, training_state):
327
  """Download HuggingFace dataset and scan for audio files."""
328
  try:
329
  if not dataset_id or not dataset_id.strip():
@@ -332,7 +332,7 @@ def lora_download_hf(dataset_id, hf_token, training_state):
332
  token = hf_token.strip() if hf_token else None
333
 
334
  local_dir, dl_status = download_hf_dataset(
335
- dataset_id.strip(), hf_token=token
336
  )
337
 
338
  if not local_dir:
@@ -768,6 +768,12 @@ def create_ui():
768
  label="HF Token (optional, for private repos)",
769
  type="password",
770
  )
 
 
 
 
 
 
771
  lora_hf_btn = gr.Button(
772
  "Download & Scan", variant="primary"
773
  )
@@ -899,7 +905,7 @@ def create_ui():
899
 
900
  lora_hf_btn.click(
901
  fn=lora_download_hf,
902
- inputs=[lora_hf_id, lora_hf_token, training_state],
903
  outputs=[lora_source_status, training_state],
904
  )
905
 
 
323
  return f"Error: {e}", training_state or {}
324
 
325
 
326
+ def lora_download_hf(dataset_id, hf_token, max_files, training_state):
327
  """Download HuggingFace dataset and scan for audio files."""
328
  try:
329
  if not dataset_id or not dataset_id.strip():
 
332
  token = hf_token.strip() if hf_token else None
333
 
334
  local_dir, dl_status = download_hf_dataset(
335
+ dataset_id.strip(), max_files=int(max_files), hf_token=token
336
  )
337
 
338
  if not local_dir:
 
768
  label="HF Token (optional, for private repos)",
769
  type="password",
770
  )
771
+ lora_hf_max = gr.Number(
772
+ label="Max files",
773
+ value=50,
774
+ minimum=1,
775
+ precision=0,
776
+ )
777
  lora_hf_btn = gr.Button(
778
  "Download & Scan", variant="primary"
779
  )
 
905
 
906
  lora_hf_btn.click(
907
  fn=lora_download_hf,
908
+ inputs=[lora_hf_id, lora_hf_token, lora_hf_max, training_state],
909
  outputs=[lora_source_status, training_state],
910
  )
911
 
src/lora_trainer.py CHANGED
@@ -11,48 +11,77 @@ from typing import Optional, Tuple
11
 
12
  logger = logging.getLogger(__name__)
13
 
14
- AUDIO_EXTENSIONS = ["*.wav", "*.mp3", "*.flac", "*.ogg", "*.opus"]
15
 
16
 
17
  def download_hf_dataset(
18
  dataset_id: str,
 
19
  hf_token: Optional[str] = None,
20
  ) -> Tuple[str, str]:
21
  """
22
- Download an audio dataset from HuggingFace Hub.
23
 
24
- Uses snapshot_download without local_dir so HF's built-in cache
25
- handles storage. On HF Spaces this is co-located with HF storage,
26
- avoiding unnecessary file duplication.
27
 
28
  Args:
29
  dataset_id: HuggingFace dataset repo ID (e.g. "pedroapfilho/lofi-tracks")
 
30
  hf_token: Optional HuggingFace token for private repos
31
 
32
  Returns:
33
- Tuple of (cached_dir, status_message)
34
  """
35
  try:
36
- from huggingface_hub import snapshot_download
37
 
38
- logger.info(f"Fetching dataset '{dataset_id}' via HF cache...")
 
39
 
40
- cached_dir = snapshot_download(
41
- repo_id=dataset_id,
42
- repo_type="dataset",
43
- token=hf_token or None,
44
- allow_patterns=AUDIO_EXTENSIONS,
45
- )
 
 
 
 
 
 
 
 
 
 
46
 
47
- audio_count = sum(
48
- 1
49
- for ext in AUDIO_EXTENSIONS
50
- for _ in Path(cached_dir).rglob(ext)
51
  )
52
 
53
- status = f"Loaded {audio_count} audio files from {dataset_id}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  logger.info(status)
55
- return cached_dir, status
56
 
57
  except ImportError:
58
  msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
14
+ AUDIO_SUFFIXES = {".wav", ".mp3", ".flac", ".ogg", ".opus"}
15
 
16
 
17
  def download_hf_dataset(
18
  dataset_id: str,
19
+ max_files: int = 50,
20
  hf_token: Optional[str] = None,
21
  ) -> Tuple[str, str]:
22
  """
23
+ Download a subset of audio files from a HuggingFace dataset repo.
24
 
25
+ Lists repo contents first, picks the first N audio files,
26
+ then downloads them individually to the HF cache.
 
27
 
28
  Args:
29
  dataset_id: HuggingFace dataset repo ID (e.g. "pedroapfilho/lofi-tracks")
30
+ max_files: Maximum number of audio files to download
31
  hf_token: Optional HuggingFace token for private repos
32
 
33
  Returns:
34
+ Tuple of (output_dir, status_message)
35
  """
36
  try:
37
+ from huggingface_hub import HfApi, hf_hub_download
38
 
39
+ api = HfApi()
40
+ token = hf_token or None
41
 
42
+ logger.info(f"Listing files in '{dataset_id}'...")
43
+
44
+ all_files = [
45
+ f.rfilename
46
+ for f in api.list_repo_tree(
47
+ dataset_id, repo_type="dataset", token=token, recursive=True
48
+ )
49
+ if hasattr(f, "rfilename")
50
+ and Path(f.rfilename).suffix.lower() in AUDIO_SUFFIXES
51
+ ]
52
+
53
+ total_available = len(all_files)
54
+ selected = all_files[:max_files]
55
+
56
+ if not selected:
57
+ return "", f"No audio files found in {dataset_id}"
58
 
59
+ logger.info(
60
+ f"Downloading {len(selected)}/{total_available} audio files..."
 
 
61
  )
62
 
63
+ output_dir = Path("lora_training") / "hf" / dataset_id.replace("/", "_")
64
+ output_dir.mkdir(parents=True, exist_ok=True)
65
+
66
+ for i, filename in enumerate(selected):
67
+ logger.info(f" [{i + 1}/{len(selected)}] {filename}")
68
+ cached_path = hf_hub_download(
69
+ repo_id=dataset_id,
70
+ filename=filename,
71
+ repo_type="dataset",
72
+ token=token,
73
+ )
74
+ # Symlink from cache into our working dir so scan_directory finds them
75
+ dest = output_dir / Path(filename).name
76
+ if not dest.exists():
77
+ dest.symlink_to(cached_path)
78
+
79
+ status = (
80
+ f"Downloaded {len(selected)} of {total_available} "
81
+ f"audio files from {dataset_id}"
82
+ )
83
  logger.info(status)
84
+ return str(output_dir), status
85
 
86
  except ImportError:
87
  msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"