Spaces:
Sleeping
Sleeping
Youngsun Lim
committed on
Commit
ยท
bd09d55
1
Parent(s):
2e13497
dataset update
Browse files- app_old.py โ app_newbutold.py +0 -0
- make_json.py +19 -0
- make_my_repo_dataset.py +279 -0
- test.py +76 -0
- videos.json +0 -0
app_old.py → app_newbutold.py
RENAMED
|
File without changes
|
make_json.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from huggingface_hub import list_repo_files
import json, os, random

repo = "SGTLIM/ucf101_eval_unified"
base = f"https://huggingface.co/datasets/{repo}/resolve/main/"
files = list_repo_files(repo_id=repo, repo_type="dataset")
mp4s = [f for f in files if f.lower().endswith(".mp4")]


def _action_from_stem(stem: str) -> str:
    """Extract the CamelCase action token from a '<Model>_<Action>_<idx>_<hash8>' stem.

    The model prefix may itself contain underscores (e.g. 'Hunyuan_videos_...'),
    so the action is counted from the right: third token from the end, before
    the 2-digit index and the 8-char hash. Falls back to the old second-token
    heuristic for short names, and '' when there is no action token at all.
    """
    parts = stem.split("_")
    if len(parts) >= 4:
        # FIX: parts[1] returned 'videos' for 'Hunyuan_videos_<Action>_<idx>_<hash>'
        return parts[-3]
    return parts[1] if len(parts) >= 3 else ""


videos = []
for p in mp4s:
    stem = p.rsplit("/", 1)[-1].rsplit(".", 1)[0]
    videos.append({"url": base + p + "?download=1", "id": p, "action": _action_from_stem(stem)})

# shuffle so the viewer doesn't present clips grouped by model/action
random.shuffle(videos)
with open("videos.json", "w", encoding="utf-8") as fp:
    json.dump(videos, fp, ensure_ascii=False, indent=2)
print("wrote videos.json with", len(videos), "items")
make_my_repo_dataset.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
import os, re, csv, hashlib, shutil, subprocess
from pathlib import Path
from collections import defaultdict
from typing import List, Tuple
from huggingface_hub import HfApi, list_repo_files, hf_hub_download, create_repo
from huggingface_hub import whoami
from huggingface_hub import CommitOperationAdd
from huggingface_hub import snapshot_download
import time, glob

# ========= CONFIG =========
SRC_REPO = "XThomasBU/video_evals_ucf101"
SRC_SUBDIRS = ["Hunyuan_videos", "Opensora_768", "RunwayGen4", "wan21_videos"]  # only the folders we need

# also include the local Wan2.2 set (60 clips)
WAN22_LOCAL_ROOT = Path("/projectnb/ivc-ml/youngsun/Video_Eval/Datasets/Wan2p2/Generated_UCF")
WAN22_TARGET_SUBDIR = "Wan2.2"  # folder name inside the destination repo (keep the name we already use)

DEST_REPO = "SGTLIM/ucf101_eval_unified"  # pinned to our account
DEST_REPO_IS_PRIVATE = False

# roomy work directory (use /projectnb instead of /tmp)
WORKDIR = Path("/projectnb/ivc-ml/youngsun/tmp_ucf_unified")
STAGING = WORKDIR / "staging"     # staging root
DL_ROOT = WORKDIR / "downloads"   # raw-download area
MAKE_SILENT = False               # True would strip audio via ffmpeg (see mute_copy)
TOKEN = os.getenv("HF_TOKEN")
# =========================

# point temp/cache at WORKDIR instead of /tmp (cluster /tmp is small)
os.environ.setdefault("TMPDIR", str(WORKDIR / "_tmp"))
os.environ.setdefault("HF_HOME", "/projectnb/ivc-ml/youngsun/.cache/huggingface")

# make sure the work directories exist
(WORKDIR / "_tmp").mkdir(parents=True, exist_ok=True)
STAGING.mkdir(parents=True, exist_ok=True)
DL_ROOT.mkdir(parents=True, exist_ok=True)


# action-name normalization table (extend as needed); keys/values are slugs
ALIAS = {
    "bodyweightsquats":"body_weight_squats",
    "bodysquats":"body_weight_squats",
    "body_weight_squats":"body_weight_squats",
    "hulahoop":"hula_hoop",
    "jumpingjack":"jumping_jack",
    "pullups":"pull_ups",
    "pushups":"push_ups",
    "throwdiscus":"throw_discus",
    "wallpushups":"wall_pushups",
}
def camel_action(s: str) -> str:
    """Convert a snake_case action slug to CamelCase, e.g. 'body_weight_squats' -> 'BodyWeightSquats'."""
    words = [w.capitalize() for w in s.strip("_").split("_") if w]
    return "".join(words)
def extract_action_from_remote(rel_remote: str) -> str:
    """
    Derive the action slug from the *original* HF repo path only.
    e.g. Hunyuan_videos/v_BodyWeightSquats_g05_c01.mp4 -> body_weight_squats
    """
    filename = os.path.basename(rel_remote)
    match = re.match(r"^v_([A-Za-z0-9]+)_", filename)  # must be v_<Action>_...
    if match is not None:
        return slugify_action(match.group(1))
    # fallback (rare: the source repo follows the v_ pattern, so we almost never get here)
    prefix = filename.split("_", 1)[0]
    return slugify_action(prefix)
def slugify_action(s: str) -> str:
    """Normalize an action name to lower snake_case and apply ALIAS corrections."""
    slug = re.sub(r"[^a-z0-9_]+", "_", s.strip().lower().replace(" ", "_"))
    slug = re.sub(r"_+", "_", slug).strip("_")
    return ALIAS.get(slug, slug)
def model_slug(s: str) -> str:
    """Normalize a model name to a lower snake_case slug."""
    slug = s.strip().lower().replace(" ", "_")
    slug = re.sub(r"[^a-z0-9_]+", "_", slug)
    return re.sub(r"_+", "_", slug).strip("_")
def sha1_8(p: Path) -> str:
    """Return the first 8 hex chars of the SHA-1 of the file at *p*, streamed in 1 MiB chunks."""
    digest = hashlib.sha1()
    with open(p, "rb") as fh:
        while True:
            block = fh.read(1024 * 1024)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()[:8]
def ensure_ffmpeg():
    """Fail fast when silent re-encoding is requested but ffmpeg is missing (no-op when MAKE_SILENT is False)."""
    if not MAKE_SILENT:
        return
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found, but MAKE_SILENT=True")
def mute_copy(src: Path, dst: Path):
    """Copy *src* to *dst* with the audio track stripped (kept in case the silent option is enabled later).

    Tries a lossless video stream copy first; if ffmpeg rejects that, falls back
    to re-encoding with libx264.
    """
    quiet = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(src), "-c:v", "copy", "-an", str(dst)],
            check=True, **quiet,
        )
    except subprocess.CalledProcessError:
        fallback = [
            "ffmpeg", "-y", "-i", str(src),
            "-vf", "format=yuv420p", "-movflags", "+faststart",
            "-c:v", "libx264", "-crf", "18", "-preset", "veryfast",
            "-an", str(dst),
        ]
        subprocess.run(fallback, check=True, **quiet)
def extract_action_from_filename(fn: str) -> str:
    """Best-effort action extraction from a file name or path.

    e.g. v_BodyWeightSquats_g05_c01.mp4 -> body_weight_squats; otherwise fall
    back to the parent directory name, then to the stem with one leading
    '<token>_' prefix stripped.
    """
    m = re.match(r"v_([A-Za-z0-9]+)", fn)
    if m:
        return slugify_action(m.group(1))
    segments = fn.split("/")
    if len(segments) >= 2:
        return slugify_action(segments[-2])
    stem = re.sub(r"^\w+_", "", Path(fn).stem)
    return slugify_action(stem)
def stage_from_hf_model(api, model_dir, rows):
    """Download every .mp4 under *model_dir* of SRC_REPO, rename each to
    <Model>_<Action>_<idx>_<hash8>.mp4, and move it into STAGING/<model_dir>.

    Appends one row per file to *rows*:
    [source_uri, model, action_slug, idx, hash8, dest_path].
    (*api* is currently unused but kept for interface stability.)
    """
    # 1) pick only this folder's mp4 files out of the source repo listing
    all_remote = list_repo_files(repo_id=SRC_REPO, repo_type="dataset", token=TOKEN)
    remotes = [p for p in all_remote if p.startswith(model_dir + "/") and p.lower().endswith(".mp4")]
    print(f"[FETCH] {model_dir}: {len(remotes)} remote mp4 files")
    if not remotes:
        print(f"[WARN] no matches under {model_dir}")
        return

    # 2) recreate this model's dedicated download folder from scratch each run
    dl_dir = DL_ROOT / model_dir
    if dl_dir.exists():
        shutil.rmtree(dl_dir)
    dl_dir.mkdir(parents=True, exist_ok=True)

    # 3) keep (remote_path, local_path) pairs — action extraction uses remote_path!
    pairs = []
    for rel_remote in remotes:
        local = hf_hub_download(
            repo_id=SRC_REPO,
            filename=rel_remote,  # keep the original path
            repo_type="dataset",
            token=TOKEN,
            local_dir=str(dl_dir),
            local_dir_use_symlinks=False,  # NOTE(review): deprecated in newer huggingface_hub — confirm version
        )
        pairs.append((rel_remote, local))

    # 4) move into staging under the canonical name
    folder_name = model_dir  # e.g. 'Hunyuan_videos', 'RunwayGen4', ...
    dst_dir = STAGING / folder_name
    dst_dir.mkdir(parents=True, exist_ok=True)

    counters = defaultdict(int)
    moved = 0
    for rel_remote, local in sorted(pairs):
        # the action comes from the *original* remote path only
        action_slug = extract_action_from_remote(rel_remote)  # 'body_weight_squats'
        action_camel = camel_action(action_slug)              # 'BodyWeightSquats'
        # FIX: removed leftover debug prints of action_slug / action_camel
        counters[action_slug] += 1
        idx = counters[action_slug]
        h8 = sha1_8(Path(local))

        # final naming rule: Model_Action_<2-digit idx>_<hash8>.mp4
        new_name = f"{folder_name}_{action_camel}_{idx:02d}_{h8}.mp4"

        dst = dst_dir / new_name
        shutil.move(local, dst)
        moved += 1

        rows.append([f"hf://{SRC_REPO}/{rel_remote}", folder_name, action_slug, idx, h8, f"{folder_name}/{new_name}"])

    print(f"[STAGED] {model_dir}: moved {moved} files to {dst_dir}")
def stage_from_local_wan22(rows: List[List[str]]):
    """Stage the local Wan2.2 clips (one action class per subdirectory of
    WAN22_LOCAL_ROOT) into STAGING/<WAN22_TARGET_SUBDIR>, appending one
    mapping row per file: [source_path, model, action, idx, hash8, dest_path].
    """
    # FIX: the old code set pretty_model twice (dead 'Wan2.2' literal shadowed
    # inside the loop) and re-ran mkdir per file; both are loop-invariant.
    pretty_model = WAN22_TARGET_SUBDIR  # e.g. 'Wan2.2'
    dst_dir = STAGING / pretty_model
    dst_dir.mkdir(parents=True, exist_ok=True)

    counters = defaultdict(int)
    for class_dir in sorted([p for p in WAN22_LOCAL_ROOT.iterdir() if p.is_dir()]):
        action = slugify_action(class_dir.name)
        for mp4 in sorted([p for p in class_dir.iterdir() if p.suffix.lower() == ".mp4"]):
            counters[action] += 1
            idx = counters[action]
            h8 = sha1_8(mp4)
            camel = camel_action(action)  # e.g. 'PushUps'
            new_name = f"{pretty_model}_{camel}_{idx:02d}_{h8}.mp4"
            dst = dst_dir / new_name
            if MAKE_SILENT:
                mute_copy(mp4, dst)       # strip audio
            else:
                shutil.copy2(mp4, dst)    # keep the original as-is
            rows.append([str(mp4), pretty_model, action, idx, h8, f"{pretty_model}/{new_name}"])
def main():
    """Stage all sources into WORKDIR and push them to DEST_REPO.

    Steps: wipe and recreate the staging/download dirs, stage the four HF
    model folders plus the local Wan2.2 set, write mapping.csv, then push
    each staged subdir as one explicit commit (bypasses change detection)
    and finally upload mapping.csv.
    """
    if not TOKEN:
        raise SystemExit("Set HF_TOKEN with write permission.")
    ensure_ffmpeg()

    # reset the work directories — fresh run every time
    if STAGING.exists():
        shutil.rmtree(STAGING)
    if DL_ROOT.exists():
        shutil.rmtree(DL_ROOT)
    STAGING.mkdir(parents=True, exist_ok=True)
    DL_ROOT.mkdir(parents=True, exist_ok=True)

    api = HfApi()
    try:
        create_repo(repo_id=DEST_REPO, repo_type="dataset", private=DEST_REPO_IS_PRIVATE, token=TOKEN, exist_ok=True)
    except Exception as e:
        # FIX: was a silent `pass`. exist_ok=True already covers the
        # "repo exists" case, so anything caught here (auth, network) is worth
        # surfacing — but stay best-effort and keep going.
        print(f"[WARN] create_repo failed ({e!r}); continuing assuming the repo exists")

    rows: List[List[str]] = []

    # 1) stage only the 4 model folders from the source repo
    for sub in SRC_SUBDIRS:
        stage_from_hf_model(api, sub, rows)

    # 2) add the local Wan2.2 set (60 clips)
    stage_from_local_wan22(rows)

    # 3) save the mapping csv
    with open(STAGING / "mapping.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["source","model","action","idx","hash8","dest_path"])
        w.writerows(rows)

    # 4) one explicit add-commit per folder (bypasses change detection)
    # NOTE(review): "Wan2.2" is staged above but deliberately left out of this
    # push list — confirm that is intentional.
    subdirs_to_push = ["Hunyuan_videos", "Opensora_768", "RunwayGen4", "wan21_videos"]  #, "Wan2.2"
    for sd in subdirs_to_push:
        sd_path = STAGING / sd
        if not sd_path.exists():
            print(f"[WARN] skip missing subdir: {sd}")
            continue
        files = sorted(str(p) for p in sd_path.rglob("*.mp4"))
        if not files:
            print(f"[WARN] skip empty subdir: {sd}")
            continue

        print(f"[PUSH] {sd} ... ({len(files)} files)")
        ops = []
        for fp in files:
            rel_in_repo = os.path.relpath(fp, start=STAGING)  # e.g. Hunyuan_videos/xxx.mp4
            ops.append(CommitOperationAdd(path_in_repo=rel_in_repo, path_or_fileobj=fp))

        api.create_commit(
            repo_id=DEST_REPO,
            repo_type="dataset",
            operations=ops,
            commit_message=f"Add {sd} ({len(files)} files)",
            token=TOKEN,
        )

    # upload mapping.csv
    map_csv = STAGING / "mapping.csv"
    if map_csv.exists():
        api.upload_file(
            path_or_fileobj=str(map_csv),
            path_in_repo="mapping.csv",
            repo_id=DEST_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message="Add mapping.csv",
        )

    print(f"[DONE] Pushed to https://huggingface.co/datasets/{DEST_REPO}")


if __name__ == "__main__":
    main()
test.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# prune_runway_keep6.py
import os, re, collections
from typing import List, Tuple
from huggingface_hub import HfApi, list_repo_files, CommitOperationDelete

REPO = "SGTLIM/ucf101_eval_unified"   # <-- our dataset
DIR = "RunwayGen4"                    # target folder to prune
KEEP_PER_ACTION = 6                   # number of clips to keep per action
BATCH = 100                           # files per delete commit
DRY_RUN = False                       # preview first, then set to False to execute
                                      # NOTE(review): already False — flip to True to get the preview pass

HF_TOKEN = os.getenv("HF_TOKEN")      # write-permission token required (deletion needs it even on public repos)

# matches e.g. RunwayGen4/RunwayGen4_PushUps_03_ab12cd34.mp4 (the 8-hex-char hash suffix is optional)
NAME_RE = re.compile(
    r'^' + re.escape(DIR) + r'/RunwayGen4_(?P<action>[A-Za-z][A-Za-z0-9]+)_(?P<idx>\d{2})(?:_[0-9a-f]{8})?\.mp4$',
    re.I
)
def parse(p: str):
    """Return (action, idx) for a repo path matching NAME_RE, or None for non-conforming names."""
    match = NAME_RE.match(p)
    if match is None:
        return None
    return match.group("action"), int(match.group("idx"))
def main():
    """Keep the first KEEP_PER_ACTION clips per action under DIR in REPO and delete the rest.

    Lists the remote files, groups those matching NAME_RE by action, prints the
    deletion plan, then (unless DRY_RUN) deletes in BATCH-sized commits.
    """
    api = HfApi()
    files = list_repo_files(repo_id=REPO, repo_type="dataset", token=HF_TOKEN)
    runway = [p for p in files if p.startswith(DIR + "/") and p.lower().endswith(".mp4")]

    # group (idx, path) by action; collect names that don't match the pattern
    by_action: dict[str, List[Tuple[int,str]]] = collections.defaultdict(list)
    bad = []
    for p in runway:
        r = parse(p)
        if not r:
            bad.append(p); continue
        action, idx = r
        by_action[action].append((idx, p))

    print(f"[INFO] ๋์ ํ์ผ ์ด {len(runway)}๊ฐ, ์ก์ {len(by_action)}์ข ")
    if bad:
        # non-matching names are only reported (first 10), never deleted
        print(f"[WARN] ํจํด ๋ถ์ผ์น {len(bad)}๊ฐ (์ญ์ ๋์์์ ์ ์ธ):")
        for x in bad[:10]:
            print("  ", x)

    # plan: keep the lowest-idx KEEP_PER_ACTION entries per action, drop the rest
    to_delete: List[str] = []
    for action, lst in sorted(by_action.items()):
        lst.sort(key=lambda x: (x[0], x[1]))  # idx -> path
        keep = lst[:KEEP_PER_ACTION]
        drop = lst[KEEP_PER_ACTION:]
        print(f"- {action:16s}: keep {len(keep)} drop {len(drop)}")
        to_delete.extend([p for _, p in drop])

    print(f"\n[PLAN] ์ญ์ ์์ ํ์ผ ์: {len(to_delete)}")
    for p in to_delete[:12]:
        print("  DEL", p)
    if DRY_RUN:
        print("\n[DRY_RUN] ์ค์ ์ญ์ ์ ํจ. DRY_RUN=False๋ก ๋ฐ๊พผ ๋ค ๋ค์ ์คํํ์ธ์.")
        return

    # actual deletion: BATCH files per commit so a failure loses at most one batch
    total = 0
    for i in range(0, len(to_delete), BATCH):
        chunk = to_delete[i:i+BATCH]
        ops = [CommitOperationDelete(path_in_repo=p) for p in chunk]
        api.create_commit(
            repo_id=REPO, repo_type="dataset",
            operations=ops,
            commit_message=f"Prune {DIR}: keep {KEEP_PER_ACTION} per action ({i+len(chunk)}/{len(to_delete)})",
            token=HF_TOKEN,
        )
        total += len(chunk)
        print(f"[COMMIT] deleted {len(chunk)} (progress {i+len(chunk)}/{len(to_delete)})")
    print(f"[DONE] ์ด {total}๊ฐ ์ญ์ ์๋ฃ.")

if __name__ == "__main__":
    main()
videos.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|