Youngsun Lim committed on
Commit
6b5fab3
·
1 Parent(s): 38d5a5a

dataset update

Browse files
Files changed (4) hide show
  1. app.py +0 -58
  2. make_json.py +104 -18
  3. test.py +21 -76
  4. videos.json +0 -0
app.py CHANGED
@@ -90,29 +90,6 @@ def _append(old_bytes, row):
90
  w.writerow(row)
91
  return s.getvalue().encode("utf-8")
92
 
93
- # def push(participant_id, action_name, score, notes=""):
94
- # if not participant_id or not participant_id.strip():
95
 - # return gr.update(visible=True, value="❗ Please enter your Participant ID before proceeding.")
96
- # if not action_name or score is None:
97
- # return gr.update(visible=True, value="โ— Fill out all fields.")
98
- # old = _read_csv_bytes()
99
- # row = [
100
- # datetime.utcnow().isoformat(),
101
- # participant_id.strip(),
102
- # action_name,
103
- # float(score),
104
- # notes or ""
105
- # ]
106
- # newb = _append(old, row)
107
- # api.upload_file(
108
- # path_or_fileobj=io.BytesIO(newb),
109
- # path_in_repo=RESULTS_FILE,
110
- # repo_id=REPO_ID,
111
- # repo_type="dataset",
112
- # token=HF_TOKEN,
113
- # commit_message="append"
114
- # )
115
 - # return gr.update(visible=True, value=f"✅ Saved for {action_name}.")
116
 
117
  def push(participant_id, video_id, score, notes=""):
118
  if not participant_id or not participant_id.strip():
@@ -559,36 +536,6 @@ with gr.Blocks(fill_height=True, css=GLOBAL_CSS) as demo:
559
 
560
  return seq
561
 
562
-
563
- # def _start_and_load_first():
564
- # total = TOTAL_PER_PARTICIPANT
565
- # order = _build_order_with_anchor(
566
- # total=total,
567
- # anchor_idx=ANCHOR_IDX,
568
- # repeats=ANCHOR_REPEATS,
569
- # pool_size=len(V),
570
- # min_gap=1 # ์ธ์ ‘ ๊ธˆ์ง€
571
- # )
572
-
573
- # first_idx = order[0]
574
- # v0 = V[first_idx]
575
- # url0 = v0["url"]
576
- # action0 = _extract_action(v0)
577
- # vid0 = _get_video_id(v0) # โœ… ์—ฌ๊ธฐ์„œ ์›๋ณธ id
578
-
579
- # return (
580
- # gr.update(visible=False), # page_intro off
581
- # gr.update(visible=True), # page_eval on
582
- # url0, # video
583
- # action0, # action_tb (ํ‘œ์‹œ์šฉ)
584
- # 5.0, # score ์ดˆ๊ธฐ๊ฐ’
585
- # gr.update(visible=False, value=""),
586
- # 0, # done_state
587
- # _progress_html(0, TOTAL_PER_PARTICIPANT),
588
- # order, # order_state
589
- # 1, # ptr_state
590
- # vid0 # โœ… cur_video_id
591
- # )
592
  def _start_and_load_first():
593
  total = TOTAL_PER_PARTICIPANT
594
  order = _build_order_least_first_with_anchor(
@@ -665,11 +612,6 @@ with gr.Blocks(fill_height=True, css=GLOBAL_CSS) as demo:
665
  _get_video_id(v) # โœ… ๋‹ค์Œ cur_video_id
666
  )
667
 
668
- # save_next.click(
669
- # save_and_next,
670
- # inputs=[pid, action_tb, score, done_state, order_state, ptr_state],
671
- # outputs=[status, video, action_tb, done_state, progress, score, ptr_state]
672
- # )
673
  save_next.click(
674
  save_and_next,
675
  # โœ… cur_video_id๋ฅผ ๋‘ ๋ฒˆ์งธ ์ธ์ž๋กœ ๋„˜๊น€
 
90
  w.writerow(row)
91
  return s.getvalue().encode("utf-8")
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  def push(participant_id, video_id, score, notes=""):
95
  if not participant_id or not participant_id.strip():
 
536
 
537
  return seq
538
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  def _start_and_load_first():
540
  total = TOTAL_PER_PARTICIPANT
541
  order = _build_order_least_first_with_anchor(
 
612
  _get_video_id(v) # โœ… ๋‹ค์Œ cur_video_id
613
  )
614
 
 
 
 
 
 
615
  save_next.click(
616
  save_and_next,
617
  # โœ… cur_video_id๋ฅผ ๋‘ ๋ฒˆ์งธ ์ธ์ž๋กœ ๋„˜๊น€
make_json.py CHANGED
@@ -1,19 +1,105 @@
 
 
 
 
1
  from huggingface_hub import list_repo_files
2
- import json, os, random
3
-
4
- repo = "SGTLIM/ucf101_eval_unified"
5
- base = f"https://huggingface.co/datasets/{repo}/resolve/main/"
6
- files = list_repo_files(repo_id=repo, repo_type="dataset")
7
- mp4s = [f for f in files if f.lower().endswith(".mp4")]
8
-
9
- videos = []
10
- for p in mp4s:
11
- stem = p.rsplit("/",1)[-1].rsplit(".",1)[0]
12
- parts = stem.split("_")
13
- action = parts[1] if len(parts) >= 3 else ""
14
- videos.append({"url": base + p + "?download=1", "id": p, "action": action})
15
-
16
- random.shuffle(videos)
17
- with open("videos.json","w",encoding="utf-8") as fp:
18
- json.dump(videos, fp, ensure_ascii=False, indent=2)
19
- print("wrote videos.json with", len(videos), "items")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # make_json.py (ํ˜น์€ build_videos_json.py)
2
+
3
+ import json, os, re, random
4
+ from typing import Optional
5
  from huggingface_hub import list_repo_files
6
+
7
+ REPO = os.getenv("VIDEO_DATASET_REPO", "SGTLIM/ucf101_eval_unified")
8
+ INCLUDE_FOLDERS = [s.strip() for s in os.getenv("VIDEO_INCLUDE_FOLDERS", "").split(",") if s.strip()]
9
+ OUT_PATH = "videos.json"
10
+ ADD_DOWNLOAD_PARAM = False # ์›๊ฒฉ ์žฌ์ƒ ๋ฌธ์ œ ์žˆ์œผ๋ฉด False ์œ ์ง€
11
+
12
+ # 10๊ฐœ ์•ก์…˜์˜ ํ‘œ์ค€๋ช…(๋Œ€์†Œ๋ฌธ์ž ํฌํ•จ)
13
+ ALLOWED_ACTIONS = {
14
+ "BodyWeightSquats","HulaHoop","JumpingJack","PullUps","PushUps",
15
+ "Shotput","SoccerJuggling","TennisSwing","ThrowDiscus","WallPushups",
16
+ }
17
+
18
+ # ํ”ํ•œ ์ฒ ์ž/๋Œ€์†Œ๋ฌธ์ž/์–ธ๋”์Šค์ฝ”์–ด ๋ณ€ํ˜• โ†’ ํ‘œ์ค€๋ช…์œผ๋กœ ์น˜ํ™˜
19
+ ALIAS = {
20
+ "bodyweightsquats":"BodyWeightSquats",
21
+ "body_weight_squats":"BodyWeightSquats",
22
+ "bodysquats":"BodyWeightSquats",
23
+
24
+ "hulahoop":"HulaHoop",
25
+
26
+ "jumpingjack":"JumpingJack",
27
+
28
+ "pullups":"PullUps",
29
+
30
+ "pushups":"PushUps",
31
+
32
+ "shotput":"Shotput",
33
+
34
+ "soccerjuggling":"SoccerJuggling",
35
+ "soccer_juggling":"SoccerJuggling",
36
+
37
+ "tennisswing":"TennisSwing",
38
+ "tennis_swing":"TennisSwing",
39
+
40
+ "throwdiscus":"ThrowDiscus",
41
+ "throw_discus":"ThrowDiscus",
42
+
43
+ "wallpushups":"WallPushups",
44
+ "wall_pushups":"WallPushups",
45
+ }
46
+
47
+ def normalize_action(s: str) -> Optional[str]:
48
+ key = re.sub(r"[^A-Za-z0-9]+", "_", s).lower().strip("_")
49
+ canon = ALIAS.get(key)
50
+ return canon if canon in ALLOWED_ACTIONS else None
51
+
52
+ def is_index_token(tok: str) -> bool:
53
+ return bool(re.fullmatch(r"\d{2,3}", tok))
54
+
55
+ def extract_action_from_id(path_in_repo: str) -> Optional[str]:
56
+ """
57
+ <Model>_(...optional tokens...)_<Action>_<Index>_(hash).mp4
58
+ โ†’ ์˜ค๋ฅธ์ชฝ๋ถ€ํ„ฐ ์Šค์บ”ํ•ด ์ฒซ ์ˆซ์ž ํ† ํฐ(2~3์ž๋ฆฌ)์„ 'Index'๋กœ ๊ฐ„์ฃผ,
59
+ ๊ทธ ๋ฐ”๋กœ ์•ž ํ† ํฐ์„ ์•ก์…˜์œผ๋กœ ์‚ฌ์šฉ.
60
+ """
61
+ name = path_in_repo.rsplit("/", 1)[-1]
62
+ stem = name.rsplit(".", 1)[0]
63
+ toks = stem.split("_")
64
+ # ์˜ค๋ฅธ์ชฝโ†’์™ผ์ชฝ ๋ฐฉํ–ฅ์œผ๋กœ ์ˆซ์ž ํ† ํฐ์„ ์ฐพ๋Š”๋‹ค (์˜ˆ: 01, 10, 123)
65
+ for i in range(len(toks)-1, -1, -1):
66
+ t = toks[i]
67
+ if re.fullmatch(r"\d{2,3}", t):
68
+ if i - 1 >= 0:
69
+ action = toks[i - 1]
70
+ return normalize_action(action)
71
+ break
72
+ return None
73
+
74
+
75
+ def main():
76
+ base_url = f"https://huggingface.co/datasets/{REPO}/resolve/main/"
77
+ files = list_repo_files(repo_id=REPO, repo_type="dataset")
78
+ mp4s = [f for f in files if f.lower().endswith(".mp4")]
79
+ if INCLUDE_FOLDERS:
80
+ mp4s = [f for f in mp4s if any(f.startswith(folder + "/") for folder in INCLUDE_FOLDERS)]
81
+
82
+ videos, bad = [], []
83
+ for p in mp4s:
84
+ action = extract_action_from_id(p)
85
+ if action is None or action not in ALLOWED_ACTIONS:
86
+ bad.append(p); continue
87
+ url = base_url + p + ("?download=1" if ADD_DOWNLOAD_PARAM else "")
88
+ videos.append({"url": url, "id": p, "action": action})
89
+
90
+ random.shuffle(videos)
91
+ with open(OUT_PATH, "w", encoding="utf-8") as fp:
92
+ json.dump(videos, fp, ensure_ascii=False, indent=2)
93
+
94
+ print(f"[DONE] wrote {OUT_PATH} with {len(videos)} items from repo={REPO}")
95
+ from collections import Counter
96
+ c = Counter(v["action"] for v in videos)
97
+ for a in sorted(ALLOWED_ACTIONS):
98
+ print(f" {a:16s} = {c.get(a,0)}")
99
+ if bad:
100
+ print(f"[INFO] skipped (not matched to ALLOWED after normalize): {len(bad)}")
101
+ for x in bad[:10]:
102
+ print(" -", x)
103
+
104
+ if __name__ == "__main__":
105
+ main()
test.py CHANGED
@@ -1,76 +1,21 @@
1
- # prune_runway_keep6.py
2
- import os, re, collections
3
- from typing import List, Tuple
4
- from huggingface_hub import HfApi, list_repo_files, CommitOperationDelete
5
-
6
- REPO = "SGTLIM/ucf101_eval_unified" # <-- ๋„ค ๋ฐ์ดํ„ฐ์…‹
7
- DIR = "RunwayGen4" # ๋Œ€์ƒ ํด๋”
8
- KEEP_PER_ACTION = 6 # ์•ก์…˜๋‹น ๋‚จ๊ธธ ๊ฐœ์ˆ˜
9
- BATCH = 100 # ์ปค๋ฐ‹ ๋‹จ์œ„
10
- DRY_RUN = False # ๋จผ์ € ๋ฏธ๋ฆฌ๋ณด๊ธฐ ํ›„ False๋กœ ๋ฐ”๊ฟ” ์‹คํ–‰
11
-
12
- HF_TOKEN = os.getenv("HF_TOKEN") # ์“ฐ๊ธฐ ๊ถŒํ•œ ํ† ํฐ ํ•„์š”(ํผ๋ธ”๋ฆญ์ด๋ผ๋„ ์‚ญ์ œ์—” ํ•„์š”)
13
-
14
- NAME_RE = re.compile(
15
- r'^' + re.escape(DIR) + r'/RunwayGen4_(?P<action>[A-Za-z][A-Za-z0-9]+)_(?P<idx>\d{2})(?:_[0-9a-f]{8})?\.mp4$',
16
- re.I
17
- )
18
-
19
- def parse(p: str):
20
- m = NAME_RE.match(p)
21
- if not m:
22
- return None
23
- return m.group("action"), int(m.group("idx"))
24
-
25
- def main():
26
- api = HfApi()
27
- files = list_repo_files(repo_id=REPO, repo_type="dataset", token=HF_TOKEN)
28
- runway = [p for p in files if p.startswith(DIR + "/") and p.lower().endswith(".mp4")]
29
-
30
- by_action: dict[str, List[Tuple[int,str]]] = collections.defaultdict(list)
31
- bad = []
32
- for p in runway:
33
- r = parse(p)
34
- if not r:
35
- bad.append(p); continue
36
- action, idx = r
37
- by_action[action].append((idx, p))
38
-
39
- print(f"[INFO] ๋Œ€์ƒ ํŒŒ์ผ ์ด {len(runway)}๊ฐœ, ์•ก์…˜ {len(by_action)}์ข…")
40
- if bad:
41
- print(f"[WARN] ํŒจํ„ด ๋ถˆ์ผ์น˜ {len(bad)}๊ฐœ (์‚ญ์ œ ๋Œ€์ƒ์—์„œ ์ œ์™ธ):")
42
- for x in bad[:10]:
43
- print(" ", x)
44
-
45
- to_delete: List[str] = []
46
- for action, lst in sorted(by_action.items()):
47
- lst.sort(key=lambda x: (x[0], x[1])) # idx -> path
48
- keep = lst[:KEEP_PER_ACTION]
49
- drop = lst[KEEP_PER_ACTION:]
50
- print(f"- {action:16s}: keep {len(keep)} drop {len(drop)}")
51
- to_delete.extend([p for _, p in drop])
52
-
53
- print(f"\n[PLAN] ์‚ญ์ œ ์˜ˆ์ • ํŒŒ์ผ ์ˆ˜: {len(to_delete)}")
54
- for p in to_delete[:12]:
55
- print(" DEL", p)
56
- if DRY_RUN:
57
- print("\n[DRY_RUN] ์‹ค์ œ ์‚ญ์ œ ์•ˆ ํ•จ. DRY_RUN=False๋กœ ๋ฐ”๊พผ ๋’ค ๋‹ค์‹œ ์‹คํ–‰ํ•˜์„ธ์š”.")
58
- return
59
-
60
- # ์‹ค์ œ ์‚ญ์ œ ์ปค๋ฐ‹
61
- total = 0
62
- for i in range(0, len(to_delete), BATCH):
63
- chunk = to_delete[i:i+BATCH]
64
- ops = [CommitOperationDelete(path_in_repo=p) for p in chunk]
65
- api.create_commit(
66
- repo_id=REPO, repo_type="dataset",
67
- operations=ops,
68
- commit_message=f"Prune {DIR}: keep {KEEP_PER_ACTION} per action ({i+len(chunk)}/{len(to_delete)})",
69
- token=HF_TOKEN,
70
- )
71
- total += len(chunk)
72
- print(f"[COMMIT] deleted {len(chunk)} (progress {i+len(chunk)}/{len(to_delete)})")
73
- print(f"[DONE] ์ด {total}๊ฐœ ์‚ญ์ œ ์™„๋ฃŒ.")
74
-
75
- if __name__ == "__main__":
76
- main()
 
1
+ from huggingface_hub import list_repo_files
2
+ from collections import Counter
3
+
4
+ REPO="SGTLIM/ucf101_eval_unified"
5
+ files = list_repo_files(repo_id=REPO, repo_type="dataset")
6
+ mp4s = [f for f in files if f.lower().endswith(".mp4")]
7
+ by_top = Counter(f.split("/",1)[0] for f in mp4s)
8
+ print(by_top) # ๊ฐ ์ตœ์ƒ์œ„ ํด๋”๋ณ„ mp4 ๊ฐœ์ˆ˜
9
+
10
+ # ์•ก์…˜ ํŒŒ์‹ฑ(๋‘ ์ž๋ฆฌ ์ธ๋ฑ์Šค ์•ž ํ† ํฐ)
11
+ import re
12
+ def act(p):
13
+ stem = p.rsplit("/",1)[-1].rsplit(".",1)[0].split("_")
14
+ for i,t in enumerate(stem):
15
+ if re.fullmatch(r"\d{2,3}", t):
16
+ return stem[i-1] if i>0 else None
17
+ return None
18
+
19
+ for folder in ["Wan2.2","RunwayGen4","Hunyuan_videos","Opensora_768","wan21_videos"]:
20
+ xs = [f for f in mp4s if f.startswith(folder + "/")]
21
+ print(folder, len(xs), "files")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
videos.json CHANGED
The diff for this file is too large to render. See raw diff