File size: 2,979 Bytes
fe47d73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
"""Download missing files from ActivityForensics/ActivityForensics dataset.

Iterates over every file matching video/* and calls hf_hub_download per file,
which skips files already in the cache. Retries HTTP 429 responses with a
linearly growing backoff plus random jitter.
"""
import fnmatch
import os
import random
import sys
import time

from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError

# Hub repository to download from, and its repo type.
REPO_ID = "ActivityForensics/ActivityForensics"
REPO_TYPE = "dataset"
# Passed as cache_dir to hf_hub_download.
CACHE_DIR = "/mnt/local-fast/zhangt/.cache/huggingface/hub"
# fnmatch-style pattern selecting which repo files are fetched.
INCLUDE_GLOB = "video/*"
# Upper bound on download attempts per file when the Hub answers 429
# (the first try counts as one attempt).
MAX_RETRIES_PER_FILE = 8
# Base delay; the wait after a 429 is this value times the attempt number,
# plus a few seconds of random jitter.
SESSION_BACKOFF_BASE = 30  # seconds


def fetch_one(filename: str) -> str:
    """Fetch a single repo file via hf_hub_download and return its local path.

    A 429 response is retried after sleeping SESSION_BACKOFF_BASE * attempt
    seconds plus up to 5 seconds of random jitter, for at most
    MAX_RETRIES_PER_FILE attempts in total.

    Args:
        filename: Path of the file inside the repository.

    Returns:
        The local path reported by hf_hub_download.

    Raises:
        HfHubHTTPError: On any non-429 HTTP error, or on a 429 once the
            attempt budget is used up.
    """
    request = {
        "repo_id": REPO_ID,
        "repo_type": REPO_TYPE,
        "filename": filename,
        "cache_dir": CACHE_DIR,
    }
    tries = 0
    while True:
        tries += 1
        try:
            return hf_hub_download(**request)
        except HfHubHTTPError as err:
            # getattr guards against an error that has no response attached.
            response = getattr(err, "response", None)
            code = getattr(response, "status_code", None)
            # Only rate limiting is retried, and only while attempts remain.
            if code != 429 or tries >= MAX_RETRIES_PER_FILE:
                raise
            delay = SESSION_BACKOFF_BASE * tries + random.uniform(0, 5)
            print(f"  [429] attempt {tries}, sleeping {delay:.1f}s", flush=True)
            time.sleep(delay)


def main() -> int:
    """Download every repo file matching INCLUDE_GLOB and report the outcome.

    Lists the files of REPO_ID, keeps those matching INCLUDE_GLOB, and fetches
    each one through fetch_one(). A failure on one file is logged and
    recorded but does not stop the run. A progress line is printed every 50
    files and after the last file.

    Returns:
        0 if every target was fetched; 1 if at least one failed, in which
        case the failed names are written, one per line, to
        /mnt/local-fast/zhangt/hf_failed.txt.
    """
    failed_list_path = "/mnt/local-fast/zhangt/hf_failed.txt"

    api = HfApi()
    print("listing repo files...", flush=True)
    files = api.list_repo_files(REPO_ID, repo_type=REPO_TYPE)
    # Repo paths are case-sensitive, so match without OS case normalisation.
    targets = [f for f in files if fnmatch.fnmatchcase(f, INCLUDE_GLOB)]
    total = len(targets)
    print(f"target: {total} files match {INCLUDE_GLOB}", flush=True)

    n_done = 0
    failed = []
    # Monotonic clock: rate/ETA must not jump if the wall clock is adjusted.
    t0 = time.monotonic()
    for i, fn in enumerate(targets, 1):
        try:
            path = fetch_one(fn)
            # hf_hub_download is a no-op if cached, returns the local path
            if os.path.exists(path):
                n_done += 1
            else:
                failed.append(fn)
                print(f"  FAILED {fn}: returned path does not exist: {path}", flush=True)
        except Exception as e:
            # Per-file boundary: log and record the failure, then keep going
            # so one bad file does not abort the whole run.
            failed.append(fn)
            print(f"  FAILED {fn}: {type(e).__name__}: {e}", flush=True)

        if i % 50 == 0 or i == total:
            elapsed = time.monotonic() - t0
            rate = i / elapsed if elapsed > 0 else 0
            eta = (total - i) / rate if rate > 0 else 0
            print(
                f"[{i}/{total}] done={n_done} failed={len(failed)} "
                f"rate={rate:.2f}/s eta={eta/60:.1f}min",
                flush=True,
            )

    print(f"\nSummary: done={n_done} failed={len(failed)}", flush=True)
    if not failed:
        return 0

    # Explicit encoding so non-ASCII file names are written the same way
    # regardless of the process locale.
    with open(failed_list_path, "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in failed)
    print(f"failed list -> {failed_list_path}", flush=True)
    return 1


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())