#!/usr/bin/env python3
"""Download missing files from ActivityForensics/ActivityForensics dataset.

Iterates over every file matching video/* and calls hf_hub_download per file,
which skips files already in the cache. Handles 429 with a linearly growing
backoff plus jitter, waiting at least as long as a numeric Retry-After hint.
"""

import fnmatch
import os
import random
import sys
import time

from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError

REPO_ID = "ActivityForensics/ActivityForensics"
REPO_TYPE = "dataset"
CACHE_DIR = "/mnt/local-fast/zhangt/.cache/huggingface/hub"
INCLUDE_GLOB = "video/*"
MAX_RETRIES_PER_FILE = 8
SESSION_BACKOFF_BASE = 30  # seconds
MAX_RETRY_AFTER = 3600  # seconds; upper bound on a server-requested wait
FAILED_LIST_PATH = "/mnt/local-fast/zhangt/hf_failed.txt"


def _retry_after_seconds(exc: HfHubHTTPError) -> float:
    """Return the delay requested by a ``Retry-After`` header, in seconds.

    Only the delay-seconds form (a non-negative integer) is understood; a
    missing header, an HTTP-date value, or anything unparsable yields 0.0 so
    the caller falls back to its own backoff. The result is capped at
    ``MAX_RETRY_AFTER``.
    """
    response = getattr(exc, "response", None)
    headers = getattr(response, "headers", None)
    if not headers:
        return 0.0
    try:
        seconds = int(headers.get("Retry-After"))
    except (TypeError, ValueError):
        return 0.0
    return float(min(max(seconds, 0), MAX_RETRY_AFTER))


def fetch_one(filename: str) -> str:
    """Download one repo file into ``CACHE_DIR`` and return its local path.

    A 429 response is retried until ``MAX_RETRIES_PER_FILE`` attempts have
    been made, sleeping ``SESSION_BACKOFF_BASE * attempt`` seconds plus up to
    5 s of jitter between attempts (or longer if the server asks for it).

    Raises:
        HfHubHTTPError: for any non-429 HTTP error, or a 429 on the last
            allowed attempt.
    """
    attempt = 0
    while True:
        attempt += 1
        try:
            return hf_hub_download(
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                filename=filename,
                cache_dir=CACHE_DIR,
            )
        except HfHubHTTPError as e:
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status != 429 or attempt >= MAX_RETRIES_PER_FILE:
                raise
            wait = SESSION_BACKOFF_BASE * attempt + random.uniform(0, 5)
            # Never retry sooner than the server explicitly asked us to.
            wait = max(wait, _retry_after_seconds(e))
            print(f" [429] attempt {attempt}, sleeping {wait:.1f}s", flush=True)
            time.sleep(wait)


def main() -> int:
    """Fetch every repo file matching ``INCLUDE_GLOB`` and report progress.

    Files that could not be fetched are written, one per line, to
    ``FAILED_LIST_PATH``.

    Returns:
        0 if every target file is present locally afterwards, 1 otherwise.
    """
    api = HfApi()
    print("listing repo files...", flush=True)
    files = api.list_repo_files(REPO_ID, repo_type=REPO_TYPE)
    targets = [f for f in files if fnmatch.fnmatch(f, INCLUDE_GLOB)]
    total = len(targets)
    print(f"target: {total} files match {INCLUDE_GLOB}", flush=True)

    n_done = 0
    failed: list[str] = []
    # Monotonic clock: rate/ETA must not be skewed by wall-clock adjustments.
    t0 = time.monotonic()
    for i, fn in enumerate(targets, 1):
        try:
            path = fetch_one(fn)
        except Exception as e:
            # Per-file boundary: record the failure and keep going so one bad
            # file does not abort the whole run.
            failed.append(fn)
            print(f" FAILED {fn}: {type(e).__name__}: {e}", flush=True)
        else:
            # hf_hub_download is a no-op if cached, returns the local path
            if os.path.exists(path):
                n_done += 1
            else:
                failed.append(fn)
                print(f" FAILED {fn}: returned path does not exist: {path}", flush=True)

        if i % 50 == 0 or i == total:
            elapsed = time.monotonic() - t0
            rate = i / elapsed if elapsed > 0 else 0
            eta = (total - i) / rate if rate > 0 else 0
            print(
                f"[{i}/{total}] done={n_done} failed={len(failed)} "
                f"rate={rate:.2f}/s eta={eta/60:.1f}min",
                flush=True,
            )

    print(f"\nSummary: done={n_done} failed={len(failed)}", flush=True)
    if failed:
        with open(FAILED_LIST_PATH, "w", encoding="utf-8") as f:
            f.writelines(name + "\n" for name in failed)
        print(f"failed list -> {FAILED_LIST_PATH}", flush=True)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())