#!/usr/bin/env python3
# Snapshot metadata from the file viewer: commit fe47d73, file size 2,979 bytes.
"""Download missing files from ActivityForensics/ActivityForensics dataset.
Iterates over every file matching video/* and calls hf_hub_download per file,
which skips files already in the cache. Handles 429 with exponential backoff.
"""
import fnmatch
import os
import random
import sys
import time
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError
# Hub repository to mirror, and its type as passed to the huggingface_hub calls.
REPO_ID = "ActivityForensics/ActivityForensics"
REPO_TYPE = "dataset"
# Passed as cache_dir to hf_hub_download.
CACHE_DIR = "/mnt/local-fast/zhangt/.cache/huggingface/hub"
# fnmatch pattern selecting which repo files to fetch. Note that fnmatch's "*"
# also matches "/", so nested paths under video/ are selected too.
INCLUDE_GLOB = "video/*"
# Upper bound on download attempts per file in fetch_one (first try included).
MAX_RETRIES_PER_FILE = 8
# Base delay for the HTTP 429 backoff in fetch_one; scaled up with the attempt number.
SESSION_BACKOFF_BASE = 30 # seconds
def fetch_one(filename: str) -> str:
    """Download one repository file into the cache and return its local path.

    HTTP 429 (rate limit) responses are retried with capped exponential
    backoff plus a small random jitter. Any other HTTP error, or a 429 on the
    last permitted attempt, is re-raised to the caller.

    Args:
        filename: Path of the file inside the repository.

    Returns:
        Whatever ``hf_hub_download`` returns (the local path of the file).

    Raises:
        HfHubHTTPError: For non-429 HTTP errors, or when all
            ``MAX_RETRIES_PER_FILE`` attempts were rate-limited.
    """
    # Cap the doubled delay so late attempts do not sleep for hours.
    max_backoff = 600.0
    attempt = 0
    while True:
        attempt += 1
        try:
            return hf_hub_download(
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                filename=filename,
                cache_dir=CACHE_DIR,
            )
        except HfHubHTTPError as e:
            # The error may not carry a response object; treat that as "not a 429".
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status != 429 or attempt >= MAX_RETRIES_PER_FILE:
                raise
            # Exponential schedule: base, 2*base, 4*base, ... up to the cap.
            backoff = min(SESSION_BACKOFF_BASE * 2 ** (attempt - 1), max_backoff)
            wait = backoff + random.uniform(0, 5)
            print(f" [429] attempt {attempt}, sleeping {wait:.1f}s", flush=True)
            time.sleep(wait)
def main() -> int:
    """Fetch every repository file matching ``INCLUDE_GLOB`` and report progress.

    Lists the repository, downloads each matching file through ``fetch_one``,
    prints a progress line every 50 files (and after the last one), and prints
    a final summary. When at least one file failed, the failed file names are
    written one per line to the failed-list file.

    Returns:
        0 if every target file was fetched, 1 if any file failed.
    """
    failed_list_path = "/mnt/local-fast/zhangt/hf_failed.txt"

    api = HfApi()
    print("listing repo files...", flush=True)
    files = api.list_repo_files(REPO_ID, repo_type=REPO_TYPE)
    targets = [f for f in files if fnmatch.fnmatch(f, INCLUDE_GLOB)]
    total = len(targets)
    print(f"target: {total} files match {INCLUDE_GLOB}", flush=True)

    n_done = 0
    failed = []
    # Monotonic clock: rate/ETA must not be skewed by wall-clock adjustments.
    t0 = time.monotonic()
    for i, fn in enumerate(targets, 1):
        error = None
        try:
            path = fetch_one(fn)
            # fetch_one returns the local path; verify the file is really there.
            if not os.path.exists(path):
                error = f"returned path does not exist: {path}"
        except Exception as e:
            # Top-level boundary: record the failure and keep going with the rest.
            error = f"{type(e).__name__}: {e}"

        if error is None:
            n_done += 1
        else:
            failed.append(fn)
            print(f" FAILED {fn}: {error}", flush=True)

        if i % 50 == 0 or i == total:
            elapsed = time.monotonic() - t0
            rate = i / elapsed if elapsed > 0 else 0
            eta = (total - i) / rate if rate > 0 else 0
            print(
                f"[{i}/{total}] done={n_done} failed={len(failed)} "
                f"rate={rate:.2f}/s eta={eta/60:.1f}min",
                flush=True,
            )

    print(f"\nSummary: done={n_done} failed={len(failed)}", flush=True)
    if not failed:
        return 0

    with open(failed_list_path, "w", encoding="utf-8") as f:
        f.writelines(name + "\n" for name in failed)
    print(f"failed list -> {failed_list_path}", flush=True)
    return 1
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())