# check_eval_coverage.py
"""Check rating coverage for one participant.

Compares the set of expected video ids (from a JSON manifest) against the
rows a participant has actually rated in a results CSV, then reports
missing / extra / duplicate-rated videos and per-action progress.
"""
import argparse
import json
import re
from pathlib import Path

import pandas as pd


def normalize_video_id(s: str) -> str:
    """Normalize a video_id coming from CSV/JSON/URL into a repo-relative path.

    Handles: None -> "", surrounding whitespace, Hugging Face style
    ".../resolve/<revision>/<path>" URLs (keeps only <path>), leading
    slashes, and Windows backslashes.
    """
    if s is None:
        return ""
    s = str(s).strip()
    # If it is a URL, keep only the part after "/resolve/<revision>/".
    m = re.search(r"/resolve/[^/]+/(.+)$", s)
    if m:
        s = m.group(1)
    # Drop any accidental leading slash.
    s = s.lstrip("/")
    # Guard against Windows backslash separators.
    s = s.replace("\\", "/")
    return s


def action_from_video_id(video_id: str) -> str:
    """Extract the action segment from an id shaped like "Model/Action/file.mp4".

    Returns "UNKNOWN" when the id has fewer than two path segments.
    """
    parts = normalize_video_id(video_id).split("/")
    if len(parts) >= 2:
        return parts[1]
    return "UNKNOWN"


def load_expected_ids(json_path: Path) -> list:
    """Load and normalize the expected video ids from the JSON manifest.

    The manifest is a list of dicts; each entry may carry the id under
    "id" or "url". Empty values are dropped. Returns a list (order
    preserved, duplicates kept).
    """
    data = json.loads(json_path.read_text(encoding="utf-8"))
    expected = []
    for v in data:
        vid = v.get("id") or v.get("url") or ""
        expected.append(normalize_video_id(vid))
    # Drop empty entries.
    expected = [x for x in expected if x]
    return expected


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--json", required=True, help="videos json (list of dicts with id/url)")
    ap.add_argument("--csv", required=True, help="results_extend.csv")
    ap.add_argument("--pid", default="YS", help="participant_id to check")
    ap.add_argument("--save_lists", action="store_true", help="save missing/extra to txt")
    args = ap.parse_args()

    json_path = Path(args.json)
    csv_path = Path(args.csv)

    expected_list = load_expected_ids(json_path)
    expected_set = set(expected_list)

    df = pd.read_csv(csv_path)
    # Defend against stray whitespace in column headers.
    df.columns = [c.strip() for c in df.columns]
    if "participant_id" not in df.columns or "video_id" not in df.columns:
        # NOTE: fixed — the original message contained a literal newline
        # inside a single-quoted f-string (a SyntaxError).
        raise SystemExit(
            f"CSV columns must include participant_id and video_id. Got: {list(df.columns)}"
        )

    df_pid = df[df["participant_id"].astype(str).str.strip() == args.pid].copy()
    df_pid["video_id_norm"] = df_pid["video_id"].map(normalize_video_id)
    rated_list = [x for x in df_pid["video_id_norm"].tolist() if x]
    rated_set = set(rated_list)

    missing = sorted(expected_set - rated_set)
    extra = sorted(rated_set - expected_set)

    # Duplicates: the same video_id saved more than once for this participant.
    dup_counts = (
        pd.Series(rated_list)
        .value_counts()
        .loc[lambda s: s > 1]
        .sort_values(ascending=False)
    )

    print("\n=== SUMMARY ===")
    print(f"PID: {args.pid}")
    print(f"Expected videos (from JSON): {len(expected_list)} (unique={len(expected_set)})")
    print(f"Rated rows in CSV (for PID): {len(df_pid)}")
    print(f"Rated unique videos: {len(rated_set)}")
    print(f"Missing (expected - rated): {len(missing)}")
    print(f"Extra (rated - expected): {len(extra)}")
    print(f"Duplicate-rated videos: {len(dup_counts)}")

    # Per-action progress (expected counts vs rated row counts).
    exp_actions = pd.Series([action_from_video_id(x) for x in expected_list]).value_counts()
    rated_actions = pd.Series([action_from_video_id(x) for x in rated_list]).value_counts()
    action_table = (
        pd.DataFrame({"expected": exp_actions, "rated_rows": rated_actions})
        .fillna(0).astype(int)
        .sort_values(["expected", "rated_rows"], ascending=False)
    )
    print("\n=== ACTION COUNTS (expected vs rated rows) ===")
    print(action_table.to_string())

    if missing:
        print("\n=== MISSING (first 50) ===")
        for x in missing[:50]:
            print(x)

    if extra:
        print("\n=== EXTRA (first 50) ===")
        for x in extra[:50]:
            print(x)

    if len(dup_counts) > 0:
        print("\n=== DUPLICATES (top 50) ===")
        print(dup_counts.head(50).to_string())

    if args.save_lists:
        Path("missing.txt").write_text("\n".join(missing) + ("\n" if missing else ""), encoding="utf-8")
        Path("extra.txt").write_text("\n".join(extra) + ("\n" if extra else ""), encoding="utf-8")
        Path("duplicates.txt").write_text(dup_counts.to_string() + "\n", encoding="utf-8")
        print("\nSaved: missing.txt, extra.txt, duplicates.txt")


if __name__ == "__main__":
    main()