File size: 2,369 Bytes
fca9b58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
import json
from pathlib import Path
from urllib.parse import quote

from huggingface_hub import HfApi

# ===== Settings =====
# Hugging Face dataset repository and revision whose files are listed.
REPO_ID = "dghadiya/video_eval_extend"
BRANCH = "main"

# Exact names of the 13 action classes to include in the manifest.
# The original note marks these entries as placeholder examples that should be
# replaced with the 13 classes actually in use.
ACTIONS_13 = [
    "Bowling",
    "CleanAndJerk",
    "GolfSwing",
    "HammerThrow",
    "Hammering",
    "HandStandPushups",
    "JugglingBalls",
    "JumpRope",
    "Lunges",
    "PlayingGuitar",
    "RockClimbingIndoor",
    "RopeClimbing",
    "Surfing",
]


# File the manifest is written to (a relative path, so it lands in the current
# working directory).
OUTPUT_JSON = "videos_extend.json"

# Hub client used by main() to list the repository files.
api = HfApi()

def main():
    """Build the video manifest for the configured Hugging Face dataset.

    Lists every file in ``REPO_ID`` at revision ``BRANCH``, keeps the ``.mp4``
    files laid out as ``<model>/<action>/...`` whose action is one of
    ``ACTIONS_13``, and writes them to ``OUTPUT_JSON`` as a JSON list of
    ``{"url", "id", "action"}`` objects sorted by action and then by id.

    Progress, the final count, and any configured actions that matched no
    video are reported on stdout.
    """
    print(f"Listing files in HF dataset: {REPO_ID} @ {BRANCH}")
    files = api.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        revision=BRANCH,
    )
    print(f"Total files in repo: {len(files)}")

    # Build the lookup set and the URL prefix once, outside the loop.
    allowed_actions = frozenset(ACTIONS_13)
    # The revision is fully percent-encoded so a branch name containing "/"
    # still forms a single URL segment.
    base_url = (
        f"https://huggingface.co/datasets/{REPO_ID}"
        f"/resolve/{quote(BRANCH, safe='')}"
    )

    entries = []
    seen = set()

    for path in files:
        # Only mp4 files are part of the manifest.
        if not path.lower().endswith(".mp4"):
            continue

        # Expected layout, e.g.:
        #   Wan2.2/Bowling/v_Bowling_g01_c06.mp4
        #   RunwayGen4/BodyWeightSquats/xxx.mp4
        parts = path.split("/")
        if len(parts) < 3:
            # Not a model/action/file path: skip it.
            continue

        action_name = parts[1]

        # Keep only the configured action classes.
        if action_name not in allowed_actions:
            continue

        # Guard against the same path being listed more than once.
        if path in seen:
            continue
        seen.add(path)

        entries.append(
            {
                # Percent-encode the path (keeping "/") so names containing
                # spaces, "#", "?" or "%" still yield a working link.
                "url": f"{base_url}/{quote(path)}",
                "id": path,
                "action": action_name,
            }
        )

    # Sort by action name, then by id, so the output is easy to browse.
    entries.sort(key=lambda e: (e["action"], e["id"]))

    print(f"Collected {len(entries)} videos across {len(ACTIONS_13)} actions.")

    # Surface configured actions with no matching video; a typo in ACTIONS_13
    # would otherwise silently shrink the manifest.
    found_actions = {entry["action"] for entry in entries}
    missing_actions = [name for name in ACTIONS_13 if name not in found_actions]
    if missing_actions:
        print(f"Warning: no videos found for actions: {', '.join(missing_actions)}")

    out_path = Path(OUTPUT_JSON)
    out_path.write_text(json.dumps(entries, indent=2), encoding="utf-8")
    print(f"Saved to {out_path.resolve()}")

if __name__ == "__main__":
    # Original note: the HF_TOKEN environment variable should be set beforehand,
    #   export HF_TOKEN=...
    # Nothing in this file reads it directly; presumably huggingface_hub picks
    # it up for authentication -- TODO confirm it is required for this repo.
    main()