forensics-grpo / code /dl_explicit.py
sdzt's picture
Add root download helper scripts
fe47d73 verified
Raw
History Blame Contribute Delete
2.98 kB
#!/usr/bin/env python3
"""Download missing files from ActivityForensics/ActivityForensics dataset.
Iterates over every file matching video/* and calls hf_hub_download per file,
which skips files already in the cache. Handles 429 with linear backoff plus jitter.
"""
import fnmatch
import os
import random
import sys
import time
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError
# Hugging Face Hub repository to download from, and its repository type.
REPO_ID = "ActivityForensics/ActivityForensics"
REPO_TYPE = "dataset"
# Directory passed to hf_hub_download as cache_dir.
CACHE_DIR = "/mnt/local-fast/zhangt/.cache/huggingface/hub"
# fnmatch pattern selecting which repo files are fetched.
INCLUDE_GLOB = "video/*"
# Upper bound on total attempts per file (the first try counts), so at most
# MAX_RETRIES_PER_FILE - 1 retries happen after a 429.
MAX_RETRIES_PER_FILE = 8
# Backoff unit: the sleep after the Nth 429 is N * this value plus 0-5 s of jitter.
SESSION_BACKOFF_BASE = 30 # seconds
def fetch_one(filename: str) -> str:
    """Download one repo file into CACHE_DIR, retrying on HTTP 429.

    Args:
        filename: Path of the file inside the repository.

    Returns:
        The local path returned by ``hf_hub_download``.

    Raises:
        HfHubHTTPError: If the error is not a 429, or if the 429 occurred on
            attempt number ``MAX_RETRIES_PER_FILE`` (or later).
    """
    attempt = 0
    while True:
        attempt += 1
        try:
            local_path = hf_hub_download(
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                filename=filename,
                cache_dir=CACHE_DIR,
            )
        except HfHubHTTPError as err:
            # The exception may carry no response object, hence the
            # defensive double getattr.
            response = getattr(err, "response", None)
            code = getattr(response, "status_code", None)
            may_retry = code == 429 and attempt < MAX_RETRIES_PER_FILE
            if not may_retry:
                raise
            # Wait grows linearly with the attempt number; the random
            # 0-5 s offset adds jitter on top of it.
            wait = SESSION_BACKOFF_BASE * attempt + random.uniform(0, 5)
            print(f" [429] attempt {attempt}, sleeping {wait:.1f}s", flush=True)
            time.sleep(wait)
        else:
            return local_path
def main() -> int:
    """Download every repo file matching INCLUDE_GLOB and report the outcome.

    Lists the files of REPO_ID, keeps those that match INCLUDE_GLOB, and calls
    ``fetch_one`` on each in turn. A progress line is printed every 50 files
    and after the last one. If any file failed, the failed names are written
    one per line to the failed-list file.

    Returns:
        0 if every target was fetched, 1 if at least one failed.
    """
    # Defined once so the file that is written and the path that is logged
    # cannot drift apart.
    failed_list_path = "/mnt/local-fast/zhangt/hf_failed.txt"

    api = HfApi()
    print("listing repo files...", flush=True)
    files = api.list_repo_files(REPO_ID, repo_type=REPO_TYPE)
    targets = [f for f in files if fnmatch.fnmatch(f, INCLUDE_GLOB)]
    total = len(targets)
    print(f"target: {total} files match {INCLUDE_GLOB}", flush=True)

    n_done = 0
    failed = []
    # Monotonic clock: rate/ETA must not be skewed by wall-clock adjustments.
    t0 = time.monotonic()
    for i, fn in enumerate(targets, 1):
        try:
            path = fetch_one(fn)
            # hf_hub_download is a no-op if cached, returns the local path
            if os.path.exists(path):
                n_done += 1
            else:
                failed.append(fn)
                print(f" FAILED {fn}: downloaded path missing: {path}", flush=True)
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole run;
            # the failure is logged and recorded for the final report.
            failed.append(fn)
            print(f" FAILED {fn}: {type(e).__name__}: {e}", flush=True)

        if i % 50 == 0 or i == total:
            elapsed = time.monotonic() - t0
            rate = i / elapsed if elapsed > 0 else 0
            eta = (total - i) / rate if rate > 0 else 0
            print(
                f"[{i}/{total}] done={n_done} failed={len(failed)} "
                f"rate={rate:.2f}/s eta={eta/60:.1f}min",
                flush=True,
            )

    print(f"\nSummary: done={n_done} failed={len(failed)}", flush=True)
    if not failed:
        return 0

    # Explicit encoding so non-ASCII file names are written regardless of the
    # process locale.
    with open(failed_list_path, "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in failed)
    print(f"failed list -> {failed_list_path}", flush=True)
    return 1
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())