# video_eval / test2.py
# Youngsun Lim
# videoscore test
# fca9b58
# check_eval_coverage.py
import argparse, json, re
from pathlib import Path
import pandas as pd
def normalize_video_id(s: str) -> str:
    """Normalize a video_id coming from CSV/JSON/URL into a repo-relative path.

    None becomes "". For hub-style URLs, everything after ``/resolve/<branch>/``
    is kept; leading slashes are dropped and backslashes become forward slashes.
    """
    if s is None:
        return ""
    text = str(s).strip()
    # URL case: keep only the repo-relative part after /resolve/<branch>/.
    hit = re.search(r"/resolve/[^/]+/(.+)$", text)
    if hit is not None:
        text = hit.group(1)
    # Drop any leading slash and guard against Windows-style backslashes.
    return text.lstrip("/").replace("\\", "/")
def action_from_video_id(video_id: str) -> str:
    """Return the action segment of a video_id shaped like "Model/Action/file.mp4".

    Falls back to "UNKNOWN" when the id has fewer than two path segments.
    """
    segments = normalize_video_id(video_id).split("/")
    return segments[1] if len(segments) >= 2 else "UNKNOWN"
def load_expected_ids(json_path: Path):
    """Read the videos JSON (a list of dicts) and return normalized video ids.

    Each entry may carry its video under "id" or "url" ("id" wins when both
    are present); entries that normalize to the empty string are dropped.
    """
    records = json.loads(json_path.read_text(encoding="utf-8"))
    normalized = (normalize_video_id(r.get("id") or r.get("url") or "") for r in records)
    return [vid for vid in normalized if vid]
def main():
    """Check rating coverage: compare expected videos (JSON) vs rated rows (CSV).

    For one participant (--pid) this reports missing, extra, and duplicate
    video ids, plus per-action counts, and optionally saves the lists to
    missing.txt / extra.txt / duplicates.txt (--save_lists).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--json", required=True, help="videos json (list of dicts with id/url)")
    ap.add_argument("--csv", required=True, help="results_extend.csv")
    ap.add_argument("--pid", default="YS", help="participant_id to check")
    ap.add_argument("--save_lists", action="store_true", help="save missing/extra to txt")
    args = ap.parse_args()
    json_path = Path(args.json)
    csv_path = Path(args.csv)
    expected_list = load_expected_ids(json_path)
    expected_set = set(expected_list)
    df = pd.read_csv(csv_path)
    # Strip stray whitespace from column names before checking them.
    df.columns = [c.strip() for c in df.columns]
    if "participant_id" not in df.columns or "video_id" not in df.columns:
        raise SystemExit(f"CSV columns must include participant_id and video_id. Got: {list(df.columns)}")
    # Keep only this participant's rows; normalize ids so they match the JSON side.
    df_pid = df[df["participant_id"].astype(str).str.strip() == args.pid].copy()
    df_pid["video_id_norm"] = df_pid["video_id"].map(normalize_video_id)
    rated_list = [x for x in df_pid["video_id_norm"].tolist() if x]
    rated_set = set(rated_list)
    missing = sorted(expected_set - rated_set)
    extra = sorted(rated_set - expected_set)
    # Duplicates: the same video_id saved more than once by this participant.
    dup_counts = (
        pd.Series(rated_list)
        .value_counts()
        .loc[lambda s: s > 1]
        .sort_values(ascending=False)
    )
    print("\n=== SUMMARY ===")
    print(f"PID: {args.pid}")
    print(f"Expected videos (from JSON): {len(expected_list)} (unique={len(expected_set)})")
    print(f"Rated rows in CSV (for PID): {len(df_pid)}")
    print(f"Rated unique videos: {len(rated_set)}")
    print(f"Missing (expected - rated): {len(missing)}")
    print(f"Extra (rated - expected): {len(extra)}")
    print(f"Duplicate-rated videos: {len(dup_counts)}")
    # Per-action progress table (useful for tracking partially-rated actions).
    exp_actions = pd.Series([action_from_video_id(x) for x in expected_list]).value_counts()
    rated_actions = pd.Series([action_from_video_id(x) for x in rated_list]).value_counts()
    action_table = (
        pd.DataFrame({"expected": exp_actions, "rated_rows": rated_actions})
        .fillna(0).astype(int)
        .sort_values(["expected", "rated_rows"], ascending=False)
    )
    print("\n=== ACTION COUNTS (expected vs rated rows) ===")
    print(action_table.to_string())
    if missing:
        print("\n=== MISSING (first 50) ===")
        for x in missing[:50]:
            print(x)
    if extra:
        print("\n=== EXTRA (first 50) ===")
        for x in extra[:50]:
            print(x)
    if len(dup_counts) > 0:
        print("\n=== DUPLICATES (top 50) ===")
        print(dup_counts.head(50).to_string())
    if args.save_lists:
        # NOTE(review): files are written to the current working directory, not
        # next to the CSV/JSON inputs — confirm that is the intended location.
        Path("missing.txt").write_text("\n".join(missing) + ("\n" if missing else ""), encoding="utf-8")
        Path("extra.txt").write_text("\n".join(extra) + ("\n" if extra else ""), encoding="utf-8")
        Path("duplicates.txt").write_text(dup_counts.to_string() + "\n", encoding="utf-8")
        print("\nSaved: missing.txt, extra.txt, duplicates.txt")
if __name__ == "__main__":
    main()