Youngsun Lim committed on
Commit
6b5fab3
·
1 Parent(s): 38d5a5a

dataset update

Browse files
Files changed (4) hide show
  1. app.py +0 -58
  2. make_json.py +104 -18
  3. test.py +21 -76
  4. videos.json +0 -0
app.py CHANGED
@@ -90,29 +90,6 @@ def _append(old_bytes, row):
90
  w.writerow(row)
91
  return s.getvalue().encode("utf-8")
92
 
93
- # def push(participant_id, action_name, score, notes=""):
94
- # if not participant_id or not participant_id.strip():
95
 - # return gr.update(visible=True, value="❗ Please enter your Participant ID before proceeding.")
96
- # if not action_name or score is None:
97
- # return gr.update(visible=True, value="โ— Fill out all fields.")
98
- # old = _read_csv_bytes()
99
- # row = [
100
- # datetime.utcnow().isoformat(),
101
- # participant_id.strip(),
102
- # action_name,
103
- # float(score),
104
- # notes or ""
105
- # ]
106
- # newb = _append(old, row)
107
- # api.upload_file(
108
- # path_or_fileobj=io.BytesIO(newb),
109
- # path_in_repo=RESULTS_FILE,
110
- # repo_id=REPO_ID,
111
- # repo_type="dataset",
112
- # token=HF_TOKEN,
113
- # commit_message="append"
114
- # )
115
 - # return gr.update(visible=True, value=f"✅ Saved for {action_name}.")
116
 
117
  def push(participant_id, video_id, score, notes=""):
118
  if not participant_id or not participant_id.strip():
@@ -559,36 +536,6 @@ with gr.Blocks(fill_height=True, css=GLOBAL_CSS) as demo:
559
 
560
  return seq
561
 
562
-
563
- # def _start_and_load_first():
564
- # total = TOTAL_PER_PARTICIPANT
565
- # order = _build_order_with_anchor(
566
- # total=total,
567
- # anchor_idx=ANCHOR_IDX,
568
- # repeats=ANCHOR_REPEATS,
569
- # pool_size=len(V),
570
- # min_gap=1 # ์ธ์ ‘ ๊ธˆ์ง€
571
- # )
572
-
573
- # first_idx = order[0]
574
- # v0 = V[first_idx]
575
- # url0 = v0["url"]
576
- # action0 = _extract_action(v0)
577
- # vid0 = _get_video_id(v0) # โœ… ์—ฌ๊ธฐ์„œ ์›๋ณธ id
578
-
579
- # return (
580
- # gr.update(visible=False), # page_intro off
581
- # gr.update(visible=True), # page_eval on
582
- # url0, # video
583
- # action0, # action_tb (ํ‘œ์‹œ์šฉ)
584
- # 5.0, # score ์ดˆ๊ธฐ๊ฐ’
585
- # gr.update(visible=False, value=""),
586
- # 0, # done_state
587
- # _progress_html(0, TOTAL_PER_PARTICIPANT),
588
- # order, # order_state
589
- # 1, # ptr_state
590
- # vid0 # โœ… cur_video_id
591
- # )
592
  def _start_and_load_first():
593
  total = TOTAL_PER_PARTICIPANT
594
  order = _build_order_least_first_with_anchor(
@@ -665,11 +612,6 @@ with gr.Blocks(fill_height=True, css=GLOBAL_CSS) as demo:
665
  _get_video_id(v) # โœ… ๋‹ค์Œ cur_video_id
666
  )
667
 
668
- # save_next.click(
669
- # save_and_next,
670
- # inputs=[pid, action_tb, score, done_state, order_state, ptr_state],
671
- # outputs=[status, video, action_tb, done_state, progress, score, ptr_state]
672
- # )
673
  save_next.click(
674
  save_and_next,
675
  # โœ… cur_video_id๋ฅผ ๋‘ ๋ฒˆ์งธ ์ธ์ž๋กœ ๋„˜๊น€
 
90
  w.writerow(row)
91
  return s.getvalue().encode("utf-8")
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  def push(participant_id, video_id, score, notes=""):
95
  if not participant_id or not participant_id.strip():
 
536
 
537
  return seq
538
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  def _start_and_load_first():
540
  total = TOTAL_PER_PARTICIPANT
541
  order = _build_order_least_first_with_anchor(
 
612
  _get_video_id(v) # โœ… ๋‹ค์Œ cur_video_id
613
  )
614
 
 
 
 
 
 
615
  save_next.click(
616
  save_and_next,
617
  # โœ… cur_video_id๋ฅผ ๋‘ ๋ฒˆ์งธ ์ธ์ž๋กœ ๋„˜๊น€
make_json.py CHANGED
@@ -1,19 +1,105 @@
 
 
 
 
1
  from huggingface_hub import list_repo_files
2
- import json, os, random
3
-
4
- repo = "SGTLIM/ucf101_eval_unified"
5
- base = f"https://huggingface.co/datasets/{repo}/resolve/main/"
6
- files = list_repo_files(repo_id=repo, repo_type="dataset")
7
- mp4s = [f for f in files if f.lower().endswith(".mp4")]
8
-
9
- videos = []
10
- for p in mp4s:
11
- stem = p.rsplit("/",1)[-1].rsplit(".",1)[0]
12
- parts = stem.split("_")
13
- action = parts[1] if len(parts) >= 3 else ""
14
- videos.append({"url": base + p + "?download=1", "id": p, "action": action})
15
-
16
- random.shuffle(videos)
17
- with open("videos.json","w",encoding="utf-8") as fp:
18
- json.dump(videos, fp, ensure_ascii=False, indent=2)
19
- print("wrote videos.json with", len(videos), "items")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # make_json.py (ํ˜น์€ build_videos_json.py)
2
+
3
+ import json, os, re, random
4
+ from typing import Optional
5
  from huggingface_hub import list_repo_files
6
+
7
+ REPO = os.getenv("VIDEO_DATASET_REPO", "SGTLIM/ucf101_eval_unified")
8
+ INCLUDE_FOLDERS = [s.strip() for s in os.getenv("VIDEO_INCLUDE_FOLDERS", "").split(",") if s.strip()]
9
+ OUT_PATH = "videos.json"
10
+ ADD_DOWNLOAD_PARAM = False # ์›๊ฒฉ ์žฌ์ƒ ๋ฌธ์ œ ์žˆ์œผ๋ฉด False ์œ ์ง€
11
+
12
+ # 10๊ฐœ ์•ก์…˜์˜ ํ‘œ์ค€๋ช…(๋Œ€์†Œ๋ฌธ์ž ํฌํ•จ)
13
+ ALLOWED_ACTIONS = {
14
+ "BodyWeightSquats","HulaHoop","JumpingJack","PullUps","PushUps",
15
+ "Shotput","SoccerJuggling","TennisSwing","ThrowDiscus","WallPushups",
16
+ }
17
+
18
+ # ํ”ํ•œ ์ฒ ์ž/๋Œ€์†Œ๋ฌธ์ž/์–ธ๋”์Šค์ฝ”์–ด ๋ณ€ํ˜• โ†’ ํ‘œ์ค€๋ช…์œผ๋กœ ์น˜ํ™˜
19
+ ALIAS = {
20
+ "bodyweightsquats":"BodyWeightSquats",
21
+ "body_weight_squats":"BodyWeightSquats",
22
+ "bodysquats":"BodyWeightSquats",
23
+
24
+ "hulahoop":"HulaHoop",
25
+
26
+ "jumpingjack":"JumpingJack",
27
+
28
+ "pullups":"PullUps",
29
+
30
+ "pushups":"PushUps",
31
+
32
+ "shotput":"Shotput",
33
+
34
+ "soccerjuggling":"SoccerJuggling",
35
+ "soccer_juggling":"SoccerJuggling",
36
+
37
+ "tennisswing":"TennisSwing",
38
+ "tennis_swing":"TennisSwing",
39
+
40
+ "throwdiscus":"ThrowDiscus",
41
+ "throw_discus":"ThrowDiscus",
42
+
43
+ "wallpushups":"WallPushups",
44
+ "wall_pushups":"WallPushups",
45
+ }
46
+
47
+ def normalize_action(s: str) -> Optional[str]:
48
+ key = re.sub(r"[^A-Za-z0-9]+", "_", s).lower().strip("_")
49
+ canon = ALIAS.get(key)
50
+ return canon if canon in ALLOWED_ACTIONS else None
51
+
52
+ def is_index_token(tok: str) -> bool:
53
+ return bool(re.fullmatch(r"\d{2,3}", tok))
54
+
55
+ def extract_action_from_id(path_in_repo: str) -> Optional[str]:
56
+ """
57
+ <Model>_(...optional tokens...)_<Action>_<Index>_(hash).mp4
58
+ โ†’ ์˜ค๋ฅธ์ชฝ๋ถ€ํ„ฐ ์Šค์บ”ํ•ด ์ฒซ ์ˆซ์ž ํ† ํฐ(2~3์ž๋ฆฌ)์„ 'Index'๋กœ ๊ฐ„์ฃผ,
59
+ ๊ทธ ๋ฐ”๋กœ ์•ž ํ† ํฐ์„ ์•ก์…˜์œผ๋กœ ์‚ฌ์šฉ.
60
+ """
61
+ name = path_in_repo.rsplit("/", 1)[-1]
62
+ stem = name.rsplit(".", 1)[0]
63
+ toks = stem.split("_")
64
+ # ์˜ค๋ฅธ์ชฝโ†’์™ผ์ชฝ ๋ฐฉํ–ฅ์œผ๋กœ ์ˆซ์ž ํ† ํฐ์„ ์ฐพ๋Š”๋‹ค (์˜ˆ: 01, 10, 123)
65
+ for i in range(len(toks)-1, -1, -1):
66
+ t = toks[i]
67
+ if re.fullmatch(r"\d{2,3}", t):
68
+ if i - 1 >= 0:
69
+ action = toks[i - 1]
70
+ return normalize_action(action)
71
+ break
72
+ return None
73
+
74
+
75
+ def main():
76
+ base_url = f"https://huggingface.co/datasets/{REPO}/resolve/main/"
77
+ files = list_repo_files(repo_id=REPO, repo_type="dataset")
78
+ mp4s = [f for f in files if f.lower().endswith(".mp4")]
79
+ if INCLUDE_FOLDERS:
80
+ mp4s = [f for f in mp4s if any(f.startswith(folder + "/") for folder in INCLUDE_FOLDERS)]
81
+
82
+ videos, bad = [], []
83
+ for p in mp4s:
84
+ action = extract_action_from_id(p)
85
+ if action is None or action not in ALLOWED_ACTIONS:
86
+ bad.append(p); continue
87
+ url = base_url + p + ("?download=1" if ADD_DOWNLOAD_PARAM else "")
88
+ videos.append({"url": url, "id": p, "action": action})
89
+
90
+ random.shuffle(videos)
91
+ with open(OUT_PATH, "w", encoding="utf-8") as fp:
92
+ json.dump(videos, fp, ensure_ascii=False, indent=2)
93
+
94
+ print(f"[DONE] wrote {OUT_PATH} with {len(videos)} items from repo={REPO}")
95
+ from collections import Counter
96
+ c = Counter(v["action"] for v in videos)
97
+ for a in sorted(ALLOWED_ACTIONS):
98
+ print(f" {a:16s} = {c.get(a,0)}")
99
+ if bad:
100
+ print(f"[INFO] skipped (not matched to ALLOWED after normalize): {len(bad)}")
101
+ for x in bad[:10]:
102
+ print(" -", x)
103
+
104
+ if __name__ == "__main__":
105
+ main()
test.py CHANGED
@@ -1,76 +1,21 @@
1
- # prune_runway_keep6.py
2
- import os, re, collections
3
- from typing import List, Tuple
4
- from huggingface_hub import HfApi, list_repo_files, CommitOperationDelete
5
-
6
- REPO = "SGTLIM/ucf101_eval_unified" # <-- ๋„ค ๋ฐ์ดํ„ฐ์…‹
7
- DIR = "RunwayGen4" # ๋Œ€์ƒ ํด๋”
8
- KEEP_PER_ACTION = 6 # ์•ก์…˜๋‹น ๋‚จ๊ธธ ๊ฐœ์ˆ˜
9
- BATCH = 100 # ์ปค๋ฐ‹ ๋‹จ์œ„
10
- DRY_RUN = False # ๋จผ์ € ๋ฏธ๋ฆฌ๋ณด๊ธฐ ํ›„ False๋กœ ๋ฐ”๊ฟ” ์‹คํ–‰
11
-
12
- HF_TOKEN = os.getenv("HF_TOKEN") # ์“ฐ๊ธฐ ๊ถŒํ•œ ํ† ํฐ ํ•„์š”(ํผ๋ธ”๋ฆญ์ด๋ผ๋„ ์‚ญ์ œ์—” ํ•„์š”)
13
-
14
- NAME_RE = re.compile(
15
- r'^' + re.escape(DIR) + r'/RunwayGen4_(?P<action>[A-Za-z][A-Za-z0-9]+)_(?P<idx>\d{2})(?:_[0-9a-f]{8})?\.mp4$',
16
- re.I
17
- )
18
-
19
- def parse(p: str):
20
- m = NAME_RE.match(p)
21
- if not m:
22
- return None
23
- return m.group("action"), int(m.group("idx"))
24
-
25
- def main():
26
- api = HfApi()
27
- files = list_repo_files(repo_id=REPO, repo_type="dataset", token=HF_TOKEN)
28
- runway = [p for p in files if p.startswith(DIR + "/") and p.lower().endswith(".mp4")]
29
-
30
- by_action: dict[str, List[Tuple[int,str]]] = collections.defaultdict(list)
31
- bad = []
32
- for p in runway:
33
- r = parse(p)
34
- if not r:
35
- bad.append(p); continue
36
- action, idx = r
37
- by_action[action].append((idx, p))
38
-
39
- print(f"[INFO] ๋Œ€์ƒ ํŒŒ์ผ ์ด {len(runway)}๊ฐœ, ์•ก์…˜ {len(by_action)}์ข…")
40
- if bad:
41
- print(f"[WARN] ํŒจํ„ด ๋ถˆ์ผ์น˜ {len(bad)}๊ฐœ (์‚ญ์ œ ๋Œ€์ƒ์—์„œ ์ œ์™ธ):")
42
- for x in bad[:10]:
43
- print(" ", x)
44
-
45
- to_delete: List[str] = []
46
- for action, lst in sorted(by_action.items()):
47
- lst.sort(key=lambda x: (x[0], x[1])) # idx -> path
48
- keep = lst[:KEEP_PER_ACTION]
49
- drop = lst[KEEP_PER_ACTION:]
50
- print(f"- {action:16s}: keep {len(keep)} drop {len(drop)}")
51
- to_delete.extend([p for _, p in drop])
52
-
53
- print(f"\n[PLAN] ์‚ญ์ œ ์˜ˆ์ • ํŒŒ์ผ ์ˆ˜: {len(to_delete)}")
54
- for p in to_delete[:12]:
55
- print(" DEL", p)
56
- if DRY_RUN:
57
- print("\n[DRY_RUN] ์‹ค์ œ ์‚ญ์ œ ์•ˆ ํ•จ. DRY_RUN=False๋กœ ๋ฐ”๊พผ ๋’ค ๋‹ค์‹œ ์‹คํ–‰ํ•˜์„ธ์š”.")
58
- return
59
-
60
- # ์‹ค์ œ ์‚ญ์ œ ์ปค๋ฐ‹
61
- total = 0
62
- for i in range(0, len(to_delete), BATCH):
63
- chunk = to_delete[i:i+BATCH]
64
- ops = [CommitOperationDelete(path_in_repo=p) for p in chunk]
65
- api.create_commit(
66
- repo_id=REPO, repo_type="dataset",
67
- operations=ops,
68
- commit_message=f"Prune {DIR}: keep {KEEP_PER_ACTION} per action ({i+len(chunk)}/{len(to_delete)})",
69
- token=HF_TOKEN,
70
- )
71
- total += len(chunk)
72
- print(f"[COMMIT] deleted {len(chunk)} (progress {i+len(chunk)}/{len(to_delete)})")
73
- print(f"[DONE] ์ด {total}๊ฐœ ์‚ญ์ œ ์™„๋ฃŒ.")
74
-
75
- if __name__ == "__main__":
76
- main()
 
1
+ from huggingface_hub import list_repo_files
2
+ from collections import Counter
3
+
4
+ REPO="SGTLIM/ucf101_eval_unified"
5
+ files = list_repo_files(repo_id=REPO, repo_type="dataset")
6
+ mp4s = [f for f in files if f.lower().endswith(".mp4")]
7
+ by_top = Counter(f.split("/",1)[0] for f in mp4s)
8
+ print(by_top) # ๊ฐ ์ตœ์ƒ์œ„ ํด๋”๋ณ„ mp4 ๊ฐœ์ˆ˜
9
+
10
+ # ์•ก์…˜ ํŒŒ์‹ฑ(๋‘ ์ž๋ฆฌ ์ธ๋ฑ์Šค ์•ž ํ† ํฐ)
11
+ import re
12
+ def act(p):
13
+ stem = p.rsplit("/",1)[-1].rsplit(".",1)[0].split("_")
14
+ for i,t in enumerate(stem):
15
+ if re.fullmatch(r"\d{2,3}", t):
16
+ return stem[i-1] if i>0 else None
17
+ return None
18
+
19
+ for folder in ["Wan2.2","RunwayGen4","Hunyuan_videos","Opensora_768","wan21_videos"]:
20
+ xs = [f for f in mp4s if f.startswith(folder + "/")]
21
+ print(folder, len(xs), "files")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
videos.json CHANGED
The diff for this file is too large to render. See raw diff