#!/usr/bin/env python
import json
from pathlib import Path
from urllib.parse import quote

from huggingface_hub import HfApi

# ===== Configuration =====
# Hugging Face repo to index; main() queries it with repo_type="dataset".
REPO_ID = "dghadiya/video_eval_extend"
# Revision passed to the file listing and embedded in every download URL.
BRANCH = "main"

# Write the exact names of the 13 action classes here.
# The list below is only a placeholder example; replace it with the 13
# classes you actually use.
# main() keeps a video only when the second component of its repo path
# (<model>/<action>/...) matches one of these names exactly (case-sensitive).
ACTIONS_13 = [
    "Bowling",
    "CleanAndJerk",
    "GolfSwing",
    "HammerThrow",
    "Hammering",
    "HandStandPushups",
    "JugglingBalls",
    "JumpRope",
    "Lunges",
    "PlayingGuitar",
    "RockClimbingIndoor",
    "RopeClimbing",
    "Surfing",
]


# File the JSON index is written to; a relative name resolves against the
# current working directory.
OUTPUT_JSON = "videos_extend.json"

# Shared Hub client, created at import time and used by main().
api = HfApi()

def main():
    """Build a JSON index of the videos for the selected action classes.

    Lists every file of the Hugging Face dataset repo ``REPO_ID`` at revision
    ``BRANCH``, keeps the ``.mp4`` files whose repo path has the shape
    ``<model>/<action>/.../<file>.mp4`` with ``<action>`` in ``ACTIONS_13``,
    and writes one record per video to ``OUTPUT_JSON``:

    * ``url``    -- direct download ("resolve") URL, percent-encoded
    * ``id``     -- the raw path of the file inside the repo
    * ``action`` -- the action class (second path component)

    Records are sorted by action and then by id. Progress is printed to
    stdout, including a warning for every configured action that matched no
    video.

    Returns:
        None. The result is the JSON file written to ``OUTPUT_JSON``.
    """
    print(f"Listing files in HF dataset: {REPO_ID} @ {BRANCH}")
    files = api.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        revision=BRANCH,
    )
    print(f"Total files in repo: {len(files)}")

    # Build the lookup set once so the per-file membership test is O(1).
    wanted_actions = frozenset(ACTIONS_13)

    # A revision may contain "/" (e.g. "refs/pr/1"); escape it fully so it
    # stays a single URL segment.
    base_url = (
        f"https://huggingface.co/datasets/{REPO_ID}"
        f"/resolve/{quote(BRANCH, safe='')}"
    )

    entries = []
    seen = set()

    for path in files:
        # Only .mp4 files are indexed (extension check is case-insensitive).
        if not path.lower().endswith(".mp4"):
            continue

        # Expected layout, e.g.:
        #   Wan2.2/Bowling/v_Bowling_g01_c06.mp4
        #   RunwayGen4/BodyWeightSquats/xxx.mp4
        parts = path.split("/")
        if len(parts) < 3:
            # Not a model/action/file layout -> skip.
            continue

        action_name = parts[1]

        # Keep only the configured action classes.
        if action_name not in wanted_actions:
            continue

        # Guard against the same path being listed more than once.
        if path in seen:
            continue
        seen.add(path)

        entries.append(
            {
                # Percent-encode the path (keeping "/") so names containing
                # spaces, "#", "?" or non-ASCII characters still produce a
                # valid URL; plain ASCII names are left unchanged.
                "url": f"{base_url}/{quote(path)}",
                "id": path,
                "action": action_name,
            }
        )

    # Sort by action name, then by id, so the output is easy to browse.
    entries.sort(key=lambda e: (e["action"], e["id"]))

    print(f"Collected {len(entries)} videos across {len(ACTIONS_13)} actions.")

    # A misspelled or missing action would otherwise silently yield no videos.
    found_actions = {e["action"] for e in entries}
    missing_actions = [a for a in ACTIONS_13 if a not in found_actions]
    if missing_actions:
        print(f"WARNING: no videos found for actions: {', '.join(missing_actions)}")

    out_path = Path(OUTPUT_JSON)
    out_path.write_text(json.dumps(entries, indent=2), encoding="utf-8")
    print(f"Saved to {out_path.resolve()}")

if __name__ == "__main__":
    # Run the indexer only when this file is executed as a script.
    # Per the original author, the HF_TOKEN environment variable must be set
    # beforehand:
    #   export HF_TOKEN=...
    # NOTE(review): presumably only needed for private or gated repos -- confirm.
    main()