Spaces:
Running
Running
Use HF cache for dataset downloads instead of custom local_dir
Browse files
On HF Spaces, snapshot_download without local_dir leverages the
co-located HF cache, avoiding file duplication to ephemeral storage.
- app.py +1 -2
- src/lora_trainer.py +9 -14
app.py
CHANGED
|
@@ -330,10 +330,9 @@ def lora_download_hf(dataset_id, hf_token, training_state):
|
|
| 330 |
return "Enter a dataset ID (e.g. pedroapfilho/lofi-tracks)", training_state
|
| 331 |
|
| 332 |
token = hf_token.strip() if hf_token else None
|
| 333 |
-
output_dir = str(Path("lora_training") / "hf_datasets")
|
| 334 |
|
| 335 |
local_dir, dl_status = download_hf_dataset(
|
| 336 |
-
dataset_id.strip(),
|
| 337 |
)
|
| 338 |
|
| 339 |
if not local_dir:
|
|
|
|
| 330 |
return "Enter a dataset ID (e.g. pedroapfilho/lofi-tracks)", training_state
|
| 331 |
|
| 332 |
token = hf_token.strip() if hf_token else None
|
|
|
|
| 333 |
|
| 334 |
local_dir, dl_status = download_hf_dataset(
|
| 335 |
+
dataset_id.strip(), hf_token=token
|
| 336 |
)
|
| 337 |
|
| 338 |
if not local_dir:
|
src/lora_trainer.py
CHANGED
|
@@ -16,35 +16,30 @@ AUDIO_EXTENSIONS = ["*.wav", "*.mp3", "*.flac", "*.ogg", "*.opus"]
|
|
| 16 |
|
| 17 |
def download_hf_dataset(
|
| 18 |
dataset_id: str,
|
| 19 |
-
output_dir: str,
|
| 20 |
hf_token: Optional[str] = None,
|
| 21 |
) -> Tuple[str, str]:
|
| 22 |
"""
|
| 23 |
Download an audio dataset from HuggingFace Hub.
|
| 24 |
|
| 25 |
-
Uses snapshot_download
|
| 26 |
-
|
|
|
|
| 27 |
|
| 28 |
Args:
|
| 29 |
dataset_id: HuggingFace dataset repo ID (e.g. "pedroapfilho/lofi-tracks")
|
| 30 |
-
output_dir: Local directory to download into
|
| 31 |
hf_token: Optional HuggingFace token for private repos
|
| 32 |
|
| 33 |
Returns:
|
| 34 |
-
Tuple of (
|
| 35 |
"""
|
| 36 |
try:
|
| 37 |
from huggingface_hub import snapshot_download
|
| 38 |
|
| 39 |
-
|
| 40 |
-
output_path.mkdir(parents=True, exist_ok=True)
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
local_dir = snapshot_download(
|
| 45 |
repo_id=dataset_id,
|
| 46 |
repo_type="dataset",
|
| 47 |
-
local_dir=str(output_path / dataset_id.replace("/", "_")),
|
| 48 |
token=hf_token or None,
|
| 49 |
allow_patterns=AUDIO_EXTENSIONS,
|
| 50 |
)
|
|
@@ -52,12 +47,12 @@ def download_hf_dataset(
|
|
| 52 |
audio_count = sum(
|
| 53 |
1
|
| 54 |
for ext in AUDIO_EXTENSIONS
|
| 55 |
-
for _ in Path(
|
| 56 |
)
|
| 57 |
|
| 58 |
-
status = f"
|
| 59 |
logger.info(status)
|
| 60 |
-
return
|
| 61 |
|
| 62 |
except ImportError:
|
| 63 |
msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"
|
|
|
|
| 16 |
|
| 17 |
def download_hf_dataset(
|
| 18 |
dataset_id: str,
|
|
|
|
| 19 |
hf_token: Optional[str] = None,
|
| 20 |
) -> Tuple[str, str]:
|
| 21 |
"""
|
| 22 |
Download an audio dataset from HuggingFace Hub.
|
| 23 |
|
| 24 |
+
Uses snapshot_download without local_dir so HF's built-in cache
|
| 25 |
+
handles storage. On HF Spaces this is co-located with HF storage,
|
| 26 |
+
avoiding unnecessary file duplication.
|
| 27 |
|
| 28 |
Args:
|
| 29 |
dataset_id: HuggingFace dataset repo ID (e.g. "pedroapfilho/lofi-tracks")
|
|
|
|
| 30 |
hf_token: Optional HuggingFace token for private repos
|
| 31 |
|
| 32 |
Returns:
|
| 33 |
+
Tuple of (cached_dir, status_message)
|
| 34 |
"""
|
| 35 |
try:
|
| 36 |
from huggingface_hub import snapshot_download
|
| 37 |
|
| 38 |
+
logger.info(f"Fetching dataset '{dataset_id}' via HF cache...")
|
|
|
|
| 39 |
|
| 40 |
+
cached_dir = snapshot_download(
|
|
|
|
|
|
|
| 41 |
repo_id=dataset_id,
|
| 42 |
repo_type="dataset",
|
|
|
|
| 43 |
token=hf_token or None,
|
| 44 |
allow_patterns=AUDIO_EXTENSIONS,
|
| 45 |
)
|
|
|
|
| 47 |
audio_count = sum(
|
| 48 |
1
|
| 49 |
for ext in AUDIO_EXTENSIONS
|
| 50 |
+
for _ in Path(cached_dir).rglob(ext)
|
| 51 |
)
|
| 52 |
|
| 53 |
+
status = f"Loaded {audio_count} audio files from {dataset_id}"
|
| 54 |
logger.info(status)
|
| 55 |
+
return cached_dir, status
|
| 56 |
|
| 57 |
except ImportError:
|
| 58 |
msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"
|