| |
"""Download missing files from the ActivityForensics/ActivityForensics dataset.

Iterates over every file matching video/* and calls hf_hub_download per file,
which skips files already in the cache. Handles HTTP 429 responses by retrying
with a linearly increasing backoff plus random jitter.
"""
| import fnmatch |
| import os |
| import random |
| import sys |
| import time |
|
|
| from huggingface_hub import HfApi, hf_hub_download |
| from huggingface_hub.errors import HfHubHTTPError |
|
|
# Hugging Face Hub repository to download from, and its repo type.
REPO_ID = "ActivityForensics/ActivityForensics"
REPO_TYPE = "dataset"
# Cache directory handed to hf_hub_download for every file.
CACHE_DIR = "/mnt/local-fast/zhangt/.cache/huggingface/hub"
# fnmatch pattern selecting which repo files are fetched.
INCLUDE_GLOB = "video/*"
# Total download attempts per file before an HTTP 429 is re-raised.
MAX_RETRIES_PER_FILE = 8
# Seconds of backoff per attempt after a 429 (multiplied by the attempt number).
SESSION_BACKOFF_BASE = 30
|
|
|
|
def fetch_one(filename: str) -> str:
    """Download one repo file into CACHE_DIR and return its local path.

    An HTTP 429 response is retried after sleeping
    ``SESSION_BACKOFF_BASE * attempt`` seconds plus up to 5 seconds of random
    jitter, for at most MAX_RETRIES_PER_FILE attempts in total.

    Raises:
        HfHubHTTPError: for any non-429 HTTP error, or for a 429 once the
            attempt budget is used up.
    """
    tries = 0
    while True:
        tries += 1
        try:
            local_path = hf_hub_download(
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                filename=filename,
                cache_dir=CACHE_DIR,
            )
        except HfHubHTTPError as err:
            # The error may carry no response object, so read the code defensively.
            response = getattr(err, "response", None)
            code = getattr(response, "status_code", None)
            if code != 429 or tries >= MAX_RETRIES_PER_FILE:
                raise
            delay = SESSION_BACKOFF_BASE * tries + random.uniform(0, 5)
            print(f" [429] attempt {tries}, sleeping {delay:.1f}s", flush=True)
            time.sleep(delay)
        else:
            return local_path
|
|
|
|
def main() -> int:
    """Download every repo file matching INCLUDE_GLOB and report the outcome.

    Lists the files of REPO_ID, fetches each match through fetch_one, and
    prints a progress line every 50 files and after the last one. A summary is
    printed at the end; when any file failed, the failed names are written one
    per line to a text file.

    Returns:
        0 when every target was fetched, 1 when at least one failed.
    """
    failed_list_path = "/mnt/local-fast/zhangt/hf_failed.txt"

    api = HfApi()
    print("listing repo files...", flush=True)
    files = api.list_repo_files(REPO_ID, repo_type=REPO_TYPE)
    # fnmatch's "*" also matches "/", so nested paths under the prefix match too.
    targets = [f for f in files if fnmatch.fnmatch(f, INCLUDE_GLOB)]
    total = len(targets)
    print(f"target: {total} files match {INCLUDE_GLOB}", flush=True)

    n_done = 0
    failed = []
    t0 = time.time()
    for i, fn in enumerate(targets, 1):
        try:
            path = fetch_one(fn)
        except Exception as e:
            # Top-level boundary: log the failure and carry on with the next
            # file so one bad download does not abort the whole run.
            failed.append(fn)
            print(f" FAILED {fn}: {type(e).__name__}: {e}", flush=True)
        else:
            if os.path.exists(path):
                n_done += 1
            else:
                # The download returned a path that is not on disk; report it
                # instead of counting the failure silently.
                failed.append(fn)
                print(f" FAILED {fn}: returned path does not exist: {path}", flush=True)

        if i % 50 == 0 or i == total:
            elapsed = time.time() - t0
            rate = i / elapsed if elapsed > 0 else 0
            eta = (total - i) / rate if rate > 0 else 0
            print(
                f"[{i}/{total}] done={n_done} failed={len(failed)} "
                f"rate={rate:.2f}/s eta={eta/60:.1f}min",
                flush=True,
            )

    print(f"\nSummary: done={n_done} failed={len(failed)}", flush=True)
    if failed:
        # Explicit UTF-8 so non-ASCII repo filenames do not depend on the locale.
        with open(failed_list_path, "w", encoding="utf-8") as fh:
            fh.writelines(f"{name}\n" for name in failed)
        print(f"failed list -> {failed_list_path}", flush=True)
        return 1
    return 0
|
|
|
|
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|