Spaces:
Sleeping
Sleeping
| import json | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |
# Streamlit page setup: wide layout and the title rendered at the top of the page.
st.set_page_config(page_title="VideoEval Explorer", layout="wide")
st.title("VideoEval — Tables + Viewer with Human Scores")
# =========================
# Paths / Config
# =========================
# Mapping JSON passed to build_maps_from_mapping (video_id -> model / base_name lookups).
mapping_json_path = "src/YOUTUBE_DATA/id_map.json"
# Directory holding the video files.
# NOTE(review): not referenced in the visible part of this file — presumably used by the viewer; confirm.
video_dir = "src/YOUTUBE_DATA"
def uniq_sorted(s) -> list:
    """Return the distinct non-null values of ``s`` as a list in ascending order."""
    distinct = pd.Series(s).dropna().unique().tolist()
    distinct.sort()
    return distinct
# --- Metric & video-id normalization for raters ---
# Short metric names found in rater files -> the human_* column names used by the app.
ALIASES = dict(
    action="human_action",
    anatomy="human_anatomy",
    appearance="human_appearance",
    motion="human_motion",
    overall="overall",
)


def normalize_metric_name(name: str) -> str:
    """
    Map a rater-file metric name onto the app's metric name.

    The lookup ignores surrounding whitespace and letter case. Unknown names are
    returned exactly as given (not stripped); non-string input is returned as ``str(name)``.
    """
    if isinstance(name, str):
        canonical = ALIASES.get(name.strip().lower())
        return name if canonical is None else canonical
    return str(name)
def normalize_video_id(v) -> str:
    """Return ``str(v)`` with a lowercase ".mp4" suffix appended unless it already ends with one."""
    text = str(v)
    if text.endswith(".mp4"):
        return text
    # enforce ".mp4" suffix to match main df
    return text + ".mp4"
# Model metric JSONs: column name -> path of the score file read with load_json.
REQ = {
    "action_mean_intra": "src/action_mean_intra.json",
    "frame_diff_ord2": "src/frame_diff_ord2.json",
}
# Human JSONs (aggregate): column name -> path of the score file read with load_json.
HUMAN = {
    "human_action": "src/human_scores_analysis_action_mos_centered.json",
    "human_anatomy": "src/human_scores_analysis_anatomy_mos_centered.json",
    "human_appearance": "src/human_scores_analysis_appearance_mos_centered.json",
    "human_motion": "src/human_scores_analysis_motion_mos_centered.json",
}
# Rater JSONs glob patterns (support both): flat src/raters*.json files and files inside src/raters/.
# Passed to load_raters, which globs them relative to the current working directory.
RATER_GLOBS = ["src/raters*.json", "src/raters/*.json"]
| # ========================= | |
| # Helpers | |
| # ========================= | |
def load_json(p: Path) -> dict:
    """
    Read a flat JSON object and return it as ``{str(key): float(value)}``.

    Values that cannot be converted to float (None, non-numeric strings, nested
    containers, integers too large for a float) become NaN instead of raising.

    Raises whatever ``open`` / ``json.load`` raise for a missing or malformed file,
    and AttributeError if the top-level JSON value is not an object.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(p, "r", encoding="utf-8") as f:
        d = json.load(f)
    out = {}
    for k, v in d.items():
        try:
            out[str(k)] = float(v)
        except (TypeError, ValueError, OverflowError):
            # These are the only errors float() raises for JSON-decoded values.
            out[str(k)] = np.nan
    return out
def build_maps_from_mapping(p: Path) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Load the id-mapping JSON and derive per-video lookups.

    Returns:
    - v2m: video_id -> model
    - v2base: video_id -> base_name (original 'videoName', e.g. 'v_JumpingJack_g13_c02')
    Accepts either 'video_to_model' or 'model_to_videoName_to_id' schema.
    With the 'video_to_model' schema v2base is always empty (that schema carries no base names).
    Both maps are empty when ``p`` is falsy, does not exist, or matches neither schema.
    """
    v2m: Dict[str, str] = {}
    v2base: Dict[str, str] = {}
    if not p or not p.exists():
        return v2m, v2base
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(p, "r", encoding="utf-8") as f:
        m = json.load(f)
    if "video_to_model" in m and isinstance(m["video_to_model"], dict):
        v2m = dict(m["video_to_model"])
        return v2m, v2base
    if "model_to_videoName_to_id" in m and isinstance(m["model_to_videoName_to_id"], dict):
        for model, name_to_id in m["model_to_videoName_to_id"].items():
            for base_name, cls_id in name_to_id.items():
                vid = f"{cls_id}.mp4"  # filenames like Class__HASH.mp4
                v2m[vid] = model
                v2base[vid] = base_name
    return v2m, v2base
def minmax_normalize(series: pd.Series) -> pd.Series:
    """
    Rescale a series to [0, 1] after coercing it to numeric (bad values -> NaN).

    - No usable values: the coerced series is returned unchanged.
    - Non-finite or equal min/max: every non-null entry becomes 0.0, nulls stay NaN.
    - Otherwise: ``(x - min) / (max - min)``.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    if not numeric.notna().any():
        return numeric
    lo = numeric.min(skipna=True)
    hi = numeric.max(skipna=True)
    if lo == hi or not (np.isfinite(lo) and np.isfinite(hi)):
        # Degenerate range: flatten to 0.0 while keeping missing entries missing.
        return numeric.notna().map({True: 0.0, False: np.nan})
    return (numeric - lo) / (hi - lo)
def fmt3(x):
    """Format ``x`` with three decimals; return "" for non-numeric, NaN or infinite input."""
    try:
        number = float(x)
    except (TypeError, ValueError):
        return ""
    return f"{number:.3f}" if np.isfinite(number) else ""
def filter_by(df: pd.DataFrame, cls: str, mdl: str) -> pd.DataFrame:
    """Single-select filter (used by the two tables); the sentinel "(All)" disables a filter."""
    result = df
    for column, choice in (("class", cls), ("model", mdl)):
        if choice == "(All)":
            continue
        result = result[result[column] == choice]
    return result
| def filter_by_multi(df: pd.DataFrame, classes: List[str] | None, models: List[str] | None) -> pd.DataFrame: | |
| """Multi-select filter used in pairwise tabs. Empty/None means 'all'.""" | |
| out = df | |
| if classes: | |
| out = out[out["class"].isin(classes)] | |
| if models: | |
| out = out[out["model"].isin(models)] | |
| return out | |
def pairwise_agreement(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Score how often a model metric and a human metric rank pairs of videos the same way.

    Every unordered pair of rows in ``df`` is considered. A pair is skipped when any
    of its four values is non-finite, or when either metric is tied.

    Directions:
    - Model metric: LOWER is better. 'action_mean_intra' is read from the
      'action_mean_intra_orig' column; any other name is read as given.
    - Human metric: HIGHER is better.

    Returns: (accuracy, disagree_pairs_df, total_pairs)
    ``accuracy`` is NaN when no pair is comparable; ``disagree_pairs_df`` has one
    row per disagreeing pair (value columns are labelled with ``model_col``).
    """
    source_col = "action_mean_intra_orig" if model_col == "action_mean_intra" else model_col
    video_ids = df["video_id"].tolist()
    model_vals = pd.to_numeric(df[source_col], errors="coerce").values
    human_vals = pd.to_numeric(df[human_col], errors="coerce").values

    disagreements = []
    comparable = 0
    matches = 0
    for a, vid_a in enumerate(video_ids):
        for b in range(a + 1, len(video_ids)):
            m_a, m_b = model_vals[a], model_vals[b]
            h_a, h_b = human_vals[a], human_vals[b]
            if not all(np.isfinite(x) for x in (m_a, m_b, h_a, h_b)):
                continue
            if m_a == m_b or h_a == h_b:
                continue
            comparable += 1
            model_prefers_a = m_a < m_b  # lower model score wins
            human_prefers_a = h_a > h_b  # higher human score wins
            if model_prefers_a == human_prefers_a:
                matches += 1
                continue
            disagreements.append({
                "video_A": vid_a,
                "video_B": video_ids[b],
                f"{model_col}_A": m_a,
                f"{model_col}_B": m_b,
                f"{human_col}_A": h_a,
                f"{human_col}_B": h_b,
                "model_order": "A>B" if model_prefers_a else "B>A",
                "human_order": "A>B" if human_prefers_a else "B>A",
            })

    accuracy = matches / comparable if comparable > 0 else np.nan
    return accuracy, pd.DataFrame(disagreements), comparable
def cross_model_pairwise(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Pairwise agreement restricted to pairs that share a base video but come from DIFFERENT models.

    Rows without a 'base_name' are ignored. Within each base_name group every
    unordered pair is considered; a pair is skipped when both rows have the same
    model, when any of its four values is non-finite, or when either metric is tied.
    Requires df to have: ['video_id', 'model', 'base_name', human_col] plus the model
    column ('action_mean_intra' is read from 'action_mean_intra_orig').

    Directions:
    - Model metrics: LOWER is better
    - Human metrics: HIGHER is better

    Returns: (accuracy, disagree_pairs_df, total_pairs); accuracy is NaN with no comparable pair.
    """
    source_col = "action_mean_intra_orig" if model_col == "action_mean_intra" else model_col

    disagreements = []
    comparable = 0
    matches = 0
    with_base = df.dropna(subset=["base_name"])
    for base_name, group in with_base.groupby("base_name"):
        video_ids = group["video_id"].tolist()
        producers = group["model"].tolist()
        model_vals = pd.to_numeric(group[source_col], errors="coerce").values
        human_vals = pd.to_numeric(group[human_col], errors="coerce").values
        for a in range(len(video_ids)):
            for b in range(a + 1, len(video_ids)):
                if producers[a] == producers[b]:
                    continue  # cross-model only
                m_a, m_b = model_vals[a], model_vals[b]
                h_a, h_b = human_vals[a], human_vals[b]
                if not all(np.isfinite(x) for x in (m_a, m_b, h_a, h_b)):
                    continue
                if m_a == m_b or h_a == h_b:
                    continue
                comparable += 1
                model_prefers_a = m_a < m_b  # LOWER better
                human_prefers_a = h_a > h_b  # HIGHER better
                if model_prefers_a == human_prefers_a:
                    matches += 1
                    continue
                disagreements.append({
                    "base_name": base_name,
                    "video_A": video_ids[a],
                    "model_A": producers[a],
                    f"{model_col}_A": m_a,
                    f"{human_col}_A": h_a,
                    "video_B": video_ids[b],
                    "model_B": producers[b],
                    f"{model_col}_B": m_b,
                    f"{human_col}_B": h_b,
                    "model_order": "A>B" if model_prefers_a else "B>A",
                    "human_order": "A>B" if human_prefers_a else "B>A",
                })

    accuracy = matches / comparable if comparable > 0 else np.nan
    return accuracy, pd.DataFrame(disagreements), comparable
def spearman_all(df: pd.DataFrame, model_cols: list, human_cols: list) -> pd.DataFrame:
    """
    Spearman rho between each model metric and each human metric over all videos.

    For every (model_metric, human_metric) pair only rows where both values are
    present are used; ``n`` is that row count and ``rho`` is NaN when n < 2.

    Returns a tidy df with columns [model_metric, human_metric, rho, n]. The columns
    are present even when there are no pairs, so callers can index 'rho' safely.
    """
    records = []
    for m in model_cols:
        for h in human_cols:
            sub = df[[m, h]].dropna()
            n = len(sub)
            rho = sub.corr(method="spearman").iloc[0, 1] if n >= 2 else np.nan
            records.append({"model_metric": m, "human_metric": h, "rho": rho, "n": n})
    # Explicit columns: an empty record list would otherwise give a column-less frame.
    return pd.DataFrame(records, columns=["model_metric", "human_metric", "rho", "n"])
def spearman_by_group(df: pd.DataFrame, model_cols: list, human_cols: list, group_col: str, groups: List[str]) -> pd.DataFrame:
    """
    Spearman rho per group (group_col ∈ {'class','model'}). If groups is empty -> all unique values.

    Groups with no rows in ``df`` are omitted. Within a group, each
    (model_metric, human_metric) pair uses only rows where both values are present;
    ``n`` is that row count and ``rho`` is NaN when n < 2.

    Returns tidy df with columns: [group_col, model_metric, human_metric, rho, n].
    The columns are present even when nothing matched, so callers can index 'rho' safely.
    """
    if not groups:
        groups = sorted(df[group_col].dropna().unique().tolist())
    rows = []
    for g in groups:
        subdf = df[df[group_col] == g]
        if subdf.empty:
            continue
        for m in model_cols:
            for h in human_cols:
                sub = subdf[[m, h]].dropna()
                n = len(sub)
                rho = sub.corr(method="spearman").iloc[0, 1] if n >= 2 else np.nan
                rows.append({group_col: g, "model_metric": m, "human_metric": h, "rho": rho, "n": n})
    # Explicit columns: an empty row list would otherwise give a column-less frame.
    return pd.DataFrame(rows, columns=[group_col, "model_metric", "human_metric", "rho", "n"])
| # ---------- Rater loading & inter-rater correlations ---------- | |
def _filename_to_rater_name(p: Path) -> str:
    """Derive the rater name from a file path: its stem without a trailing "_scores"."""
    # e.g. "chvskch_scores.json" -> "chvskch"
    return p.stem.removesuffix("_scores")
def _looks_like_video_key(k: str) -> bool:
    """Heuristic: a key is a video id if it contains "__" or ends with ".mp4" / ".MP4"."""
    text = str(k)
    if "__" in text:
        return True
    return text.endswith((".mp4", ".MP4"))
def _detect_rater_structure(obj) -> str:
    """
    Classify the layout of a decoded rater JSON. Returns one of:
    - 'metric_to_video'       : {metric: {video_id: score}, ...}
    - 'video_to_score'        : {video_id: score, ...}
    - 'video_to_metric_score' : {video_id: {metric: score}, ...}
    - 'unknown'               : not a dict, or an empty dict

    Only the first value is inspected to tell flat from nested layouts; nested
    layouts are then told apart by whether any outer key looks like a video id.
    """
    if not (isinstance(obj, dict) and obj):
        return "unknown"
    sample_value = next(iter(obj.values()))
    if not isinstance(sample_value, dict):
        # Flat mapping -> {video_id: score}
        return "video_to_score"
    # Nested mapping: decide by looking at the OUTER keys.
    for outer_key in obj:
        if _looks_like_video_key(outer_key):
            return "video_to_metric_score"
    return "metric_to_video"
def load_raters(globs: List[str]) -> Dict[str, pd.DataFrame]:
    """
    Load per-rater score files and pivot them into one wide table per metric.

    Returns: dict metric_name -> wide DataFrame with index=video_id ('.mp4' enforced)
    and columns=rater_name (derived from the file name).
    Supports shapes:
    - {metric: {video_id: score}}
    - {video_id: score}            -> metric 'overall'
    - {video_id: {metric: score}}
    Metric names are normalized to the app's human_* names.

    Best-effort loading:
    - Files that cannot be read or parsed are skipped.
    - Scores that are not numeric become NaN instead of aborting the load.
    - If two files yield the same rater name for a metric, the first file (in
      sorted path order) wins.
    """

    def _as_float(value) -> float:
        # One bad cell must not discard a whole rater file.
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    def _to_frame(vid_scores: dict, rater: str) -> pd.DataFrame:
        cleaned = {normalize_video_id(vid): _as_float(val) for vid, val in vid_scores.items()}
        return pd.Series(cleaned, name=rater, dtype="float").to_frame()

    # A set removes files matched by more than one pattern; sorting makes the
    # column order (and the duplicate-name tie-break) deterministic.
    files = sorted({path for pat in globs for path in Path(".").glob(pat)})

    metric_to_frames: Dict[str, List[pd.DataFrame]] = {}
    for p in files:
        try:
            data = json.loads(Path(p).read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable file or invalid JSON / encoding: skip it, keep the rest.
            continue
        rater = _filename_to_rater_name(p)
        shape = _detect_rater_structure(data)
        if shape == "metric_to_video":
            # {metric: {video_id: score}}
            for metric, vid_scores in data.items():
                if not isinstance(vid_scores, dict):
                    continue  # the shape was inferred from the first value only
                metric_norm = normalize_metric_name(metric)
                metric_to_frames.setdefault(metric_norm, []).append(_to_frame(vid_scores, rater))
        elif shape == "video_to_score":
            # {video_id: score} -> 'overall'
            metric_to_frames.setdefault("overall", []).append(_to_frame(data, rater))
        elif shape == "video_to_metric_score":
            # {video_id: {metric: score}}
            bucket: Dict[str, Dict[str, float]] = {}
            for vid, mdict in data.items():
                if not isinstance(mdict, dict):
                    continue
                for metric, val in mdict.items():
                    bucket.setdefault(normalize_metric_name(metric), {})[vid] = val
            for metric_norm, vid_scores in bucket.items():
                metric_to_frames.setdefault(metric_norm, []).append(_to_frame(vid_scores, rater))
        # else: unknown; skip

    # Merge per metric. concat tolerates repeated column names (DataFrame.join
    # raises on them), so the de-duplication below can actually take effect.
    metric_to_wide: Dict[str, pd.DataFrame] = {}
    for metric, frames in metric_to_frames.items():
        if not frames:
            continue
        wide = pd.concat(frames, axis=1, join="outer")
        wide = wide.loc[:, ~wide.columns.duplicated()].sort_index()
        metric_to_wide[metric] = wide
    return metric_to_wide
def inter_rater_pairs_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Per-rater *pairwise* Spearman correlations within each group.

    Built on inter_rater_corr_grouped; pairs with NaN rho or with fewer than
    ``min_overlap`` shared videos are dropped. Each surviving pair is emitted twice
    (once per direction) so every rater has a row for each counterpart.

    Rows: [by, rater, other_rater, rho, n], sorted by group, rater, then rho descending.
    """
    out_cols = [by, "rater", "other_rater", "rho", "n"]

    pairs = inter_rater_corr_grouped(rater_wide, video_meta, by=by, values=values)
    if pairs.empty:
        return pd.DataFrame(columns=out_cols)

    # Ensure integer overlaps
    pairs["n"] = pairs["n"].astype(int)
    enough_overlap = pairs["n"] >= min_overlap
    has_rho = pairs["rho"].notna()
    pairs = pairs[enough_overlap & has_rho]
    if pairs.empty:
        return pd.DataFrame(columns=out_cols)

    # Expand to directed rows: (i -> j) and (j -> i).
    forward = pairs.rename(columns={"rater_i": "rater", "rater_j": "other_rater"})[out_cols]
    backward = pairs.rename(columns={"rater_j": "rater", "rater_i": "other_rater"})[out_cols]
    directed = pd.concat([forward, backward], ignore_index=True)

    # sort for nicer reading
    directed = directed.sort_values([by, "rater", "rho"], ascending=[True, True, False])
    return directed.reset_index(drop=True)
def inter_rater_avg_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Unweighted mean inter-rater Spearman rho per rater within each group.

    Averages the directed rows produced by inter_rater_pairs_by_group (same filters).
    Rows: [by, rater, mean_rho, num_pairs_used]
    """
    directed = inter_rater_pairs_by_group(
        rater_wide, video_meta, by=by, values=values, min_overlap=min_overlap
    )
    if directed.empty:
        return pd.DataFrame(columns=[by, "rater", "mean_rho", "num_pairs_used"])

    per_rater = directed.groupby([by, "rater"])
    summary = per_rater.agg(mean_rho=("rho", "mean"), num_pairs_used=("rho", "count"))
    summary = summary.reset_index()
    summary["mean_rho"] = summary["mean_rho"].astype(float)
    return summary
| def inter_rater_corr_grouped( | |
| rater_wide: pd.DataFrame, | |
| video_meta: pd.DataFrame, | |
| by: str, | |
| values: List[str] | None = None | |
| ) -> pd.DataFrame: | |
| """ | |
| Spearman inter-rater correlation matrices grouped by `by` ∈ {'class','model'}. | |
| Requires ≥2 raters with ≥2 overlapping videos inside each group. | |
| Returns tidy df: [by, rater_i, rater_j, rho, n] | |
| """ | |
| assert by in ("class", "model") | |
| def corr_to_long(df_corr: pd.DataFrame, counts: pd.DataFrame, group_label: str): | |
| out = [] | |
| for i in df_corr.index: | |
| for j in df_corr.columns: | |
| if i >= j: | |
| continue | |
| out.append({by: group_label, "rater_i": i, "rater_j": j, | |
| "rho": df_corr.loc[i, j], "n": int(counts.loc[i, j])}) | |
| return out | |
| meta = video_meta[["video_id", "class", "model"]].drop_duplicates().set_index("video_id") | |
| X = rater_wide.copy() | |
| X.index = X.index.map(str) # should already be normalized to '.mp4' | |
| X = X.join(meta, how="left") | |
| if not values: | |
| values = sorted([v for v in X[by].dropna().unique().tolist()]) | |
| results = [] | |
| def corr_with_counts(M: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: | |
| rho = M.corr(method="spearman", min_periods=2) | |
| mask = ~M.isna() | |
| counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="float") | |
| for a in M.columns: | |
| for b in M.columns: | |
| counts.loc[a, b] = float((mask[a] & mask[b]).sum()) | |
| return rho, counts | |
| for g in values: | |
| sub = X[X[by] == g] | |
| if sub.empty: | |
| continue | |
| # keep only rater columns with at least 2 ratings | |
| cols = [c for c in sub.columns if c not in ("class", "model")] | |
| usable = [c for c in cols if sub[c].notna().sum() >= 2] | |
| if len(usable) < 2: | |
| continue | |
| M = sub[usable] | |
| rho, counts = corr_with_counts(M) | |
| # if every pair has <2 overlaps, skip | |
| if (counts.values < 2).all(): | |
| continue | |
| results.extend(corr_to_long(rho, counts, g)) | |
| out = pd.DataFrame(results) | |
| if not out.empty: | |
| out["rho"] = out["rho"].astype(float) | |
| return out | |
def _corr_with_counts_matrix(M: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Spearman rho matrix + overlap counts matrix for a wide (videos x raters) frame.

    Returns (rho, counts), both indexed and labelled by M's columns:
    - rho: pairwise Spearman correlation; NaN where a pair shares fewer than 2 rows.
    - counts: int64 number of rows where both columns are non-null (the diagonal
      is each column's own non-null count).
    """
    rho = M.corr(method="spearman", min_periods=2)
    # present[r, c] == 1 where rater c scored row r, so present.T @ present counts
    # co-rated rows for every pair at once and yields a real int64 frame.
    present = M.notna().to_numpy(dtype="int64")
    counts = pd.DataFrame(present.T @ present, index=M.columns, columns=M.columns)
    return rho, counts
def inter_rater_pairs_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Pairwise inter-rater Spearman over the entire dataset (no grouping).

    Raters with fewer than 2 ratings are ignored. A pair is kept when it shares at
    least ``min_overlap`` videos and its rho is not NaN; kept pairs are emitted in
    both directions.

    Returns columns: [rater, other_rater, rho, n], sorted by rater then rho descending.
    n = overlapping videos used for that pair.
    """
    result_cols = ["rater", "other_rater", "rho", "n"]
    if rater_wide is None or rater_wide.empty:
        return pd.DataFrame(columns=result_cols)

    rated_enough = [col for col in rater_wide.columns if rater_wide[col].notna().sum() >= 2]
    M = rater_wide[rated_enough]
    if M.shape[1] < 2:
        return pd.DataFrame(columns=result_cols)

    rho, counts = _corr_with_counts_matrix(M)
    directed = []
    for first in rho.index:
        for second in rho.columns:
            if first >= second:
                continue  # visit each unordered pair once
            shared = int(counts.loc[first, second])
            value = rho.loc[first, second]
            if shared < min_overlap or pd.isna(value):
                continue
            for source, target in ((first, second), (second, first)):
                directed.append({"rater": source, "other_rater": target, "rho": float(value), "n": shared})

    out = pd.DataFrame(directed)
    if out.empty:
        return out
    return out.sort_values(["rater", "rho"], ascending=[True, False]).reset_index(drop=True)
def inter_rater_avg_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Overall (no grouping) average Spearman rho per rater.

    Averages the directed rows from inter_rater_pairs_overall.
    Returns columns: [rater, mean_rho, num_pairs_used], highest mean_rho first.
    """
    directed = inter_rater_pairs_overall(rater_wide, min_overlap=min_overlap)
    if directed.empty:
        return pd.DataFrame(columns=["rater", "mean_rho", "num_pairs_used"])

    per_rater = directed.groupby("rater")
    summary = per_rater.agg(mean_rho=("rho", "mean"), num_pairs_used=("rho", "count"))
    summary = summary.reset_index()
    summary = summary.sort_values("mean_rho", ascending=False)
    return summary.reset_index(drop=True)
| # ========================= | |
| # Load aggregate scores | |
| # ========================= | |
# Per-metric {video_id: float} dicts for the model metrics (REQ) and the human metrics (HUMAN).
scores = {k: load_json(Path(p)) for k, p in REQ.items()}
human_scores = {k: load_json(Path(p)) for k, p in HUMAN.items()}
# Union of every video id that appears in any metric file.
all_keys = set()
for d in scores.values():
    all_keys |= set(d.keys())
for d in human_scores.values():
    all_keys |= set(d.keys())
# One row per video id; a metric missing for that id becomes NaN.
rows = []
for vid in sorted(all_keys):
    rows.append({
        "video_id": vid,
        # Class = text before the first "__" in the id; ids without "__" are labelled "UNK".
        "class": vid.split("__")[0] if "__" in vid else "UNK",
        "action_mean_intra": scores["action_mean_intra"].get(vid, np.nan),
        "frame_diff_ord2": scores["frame_diff_ord2"].get(vid, np.nan),
        "human_action": human_scores["human_action"].get(vid, np.nan),
        "human_anatomy": human_scores["human_anatomy"].get(vid, np.nan),
        "human_appearance": human_scores["human_appearance"].get(vid, np.nan),
        "human_motion": human_scores["human_motion"].get(vid, np.nan),
    })
df_raw = pd.DataFrame(rows)
# Keep a copy of action_mean_intra as 'action_mean_intra_orig'; the pairwise functions read that column.
# NOTE(review): earlier comments described a sign flip for display, but no negation is applied here —
# both columns hold the same values. Confirm whether a flip is intended.
df_raw["action_mean_intra"] = pd.to_numeric(df_raw["action_mean_intra"], errors="coerce")
df_raw["action_mean_intra_orig"] = df_raw["action_mean_intra"].copy()
df_raw["action_mean_intra"] = df_raw["action_mean_intra_orig"]  # same values as *_orig (see note above)
# Map models + base_name via mapping.json
v2m, v2base = build_maps_from_mapping(Path(mapping_json_path))
df_raw["model"] = df_raw["video_id"].map(v2m).fillna("UNK")  # ids absent from the mapping -> "UNK"
df_raw["base_name"] = df_raw["video_id"].map(v2base)  # may be NaN
# Metric columns (display + normalization)
metric_cols = [
    "action_mean_intra",  # same values as *_orig (no flip applied here); pairwise uses *_orig
    "frame_diff_ord2",  # raw; lower-better
    "human_action",
    "human_anatomy",
    "human_appearance",
    "human_motion",
]
human_cols = ["human_action", "human_anatomy", "human_appearance", "human_motion"]
# Drop rows with NO human ratings at all
df_raw = df_raw.dropna(subset=human_cols, how="all")
# Normalized view for tables: each metric column min-max scaled over the remaining rows.
df_norm = df_raw.copy()
for c in metric_cols:
    df_norm[c] = minmax_normalize(df_norm[c])
| # ========================= | |
| # UI Layout | |
| # ========================= | |
| left, right = st.columns([2, 1]) | |
| with left: | |
| tab_tables, tab_agree, tab_cross, tab_spear, tab_ir = st.tabs( | |
| ["Tables", "Agreement", "Cross-Model (same videoName)", "Spearman", "Inter-Rater"] | |
| ) | |
| # --------------------- TABLES (A/B) --------------------- | |
# Two independent comparison tables (A and B), each with its own class/model
# filter and raw-vs-normalized toggle.
# NOTE(review): indentation was lost in the source; this block presumably sits inside
# `with left:` — re-indent accordingly when splicing.
with tab_tables:
    st.subheader("Comparison Tables")
    # Table A
    st.markdown("**Table A**")
    use_norm_a = st.checkbox("Show normalized scores (0–1) — A", value=False, key="norm_a")
    classes = ["(All)"] + uniq_sorted(df_raw["class"])
    models = ["(All)"] + uniq_sorted(df_raw["model"])
    ca, ma = st.columns(2)
    with ca:
        chosen_c_a = st.selectbox("Class (A)", classes, key="class_a")
    with ma:
        chosen_m_a = st.selectbox("Model (A)", models, key="model_a")
    df_view_a = df_norm if use_norm_a else df_raw
    filt_a = filter_by(df_view_a, chosen_c_a, chosen_m_a).copy()
    disp_a = filt_a.copy()
    # Format metrics column by column with Series.map: DataFrame.applymap is
    # deprecated since pandas 2.1, while Series.map works on every version.
    for metric_col in metric_cols:
        disp_a[metric_col] = disp_a[metric_col].map(fmt3)
    st.dataframe(disp_a, use_container_width=True, hide_index=True)
    st.markdown("---")
    # Table B
    st.markdown("**Table B**")
    use_norm_b = st.checkbox("Show normalized scores (0–1) — B", value=False, key="norm_b")
    cb, mb = st.columns(2)
    with cb:
        chosen_c_b = st.selectbox("Class (B)", classes, key="class_b")
    with mb:
        chosen_m_b = st.selectbox("Model (B)", models, key="model_b")
    df_view_b = df_norm if use_norm_b else df_raw
    filt_b = filter_by(df_view_b, chosen_c_b, chosen_m_b).copy()
    disp_b = filt_b.copy()
    for metric_col in metric_cols:
        disp_b[metric_col] = disp_b[metric_col].map(fmt3)
    st.dataframe(disp_b, use_container_width=True, hide_index=True)
| # --------------------- AGREEMENT (multi-class) --------------------- | |
# Agreement tab: pairwise ranking agreement between one model metric and one human
# metric over the videos left after the class/model multiselect filters.
# NOTE(review): indentation was lost in the source; this block presumably sits inside
# `with left:` — re-indent accordingly when splicing.
with tab_agree:
    st.subheader("Agreement (Model vs Human) — multi-class")
    all_models = uniq_sorted(df_raw["model"])
    all_classes = uniq_sorted(df_raw["class"])
    c1, c2 = st.columns(2)
    with c1:
        chosen_classes = st.multiselect("Classes (empty = All)", all_classes, default=[], key="agree_classes")
    with c2:
        chosen_models = st.multiselect("Models (empty = All)", all_models, default=[], key="agree_models")
    model_metric = st.selectbox(
        "Model metric",
        ["action_mean_intra", "frame_diff_ord2"],
        index=0,
        key="agree_model_metric",
        help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
    )
    human_metric = st.selectbox("Human metric", human_cols, index=0, key="agree_human_metric")
    scope_df = filter_by_multi(df_raw, chosen_classes, chosen_models).copy()
    # Defensive: re-attach the *_orig column if the filtered frame lacks it.
    if "action_mean_intra_orig" not in scope_df.columns:
        scope_df = scope_df.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
    # Column that pairwise_agreement will actually read for the chosen model metric.
    req_model_col = "action_mean_intra_orig" if model_metric == "action_mean_intra" else model_metric
    scope_df = scope_df.dropna(subset=[req_model_col, human_metric], how="any")
    if len(scope_df) < 2:
        st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.")
    else:
        acc, disagree_df, total_pairs = pairwise_agreement(scope_df, model_metric, human_metric)
        st.markdown(f"**Pairwise accuracy:** {fmt3(acc)} (over {total_pairs} comparable pairs).")
        if not disagree_df.empty:
            df_show = disagree_df.copy()
            # Render the four value columns with three decimals.
            numeric_cols = [f"{model_metric}_A", f"{model_metric}_B", f"{human_metric}_A", f"{human_metric}_B"]
            for col in numeric_cols:
                if col in df_show.columns:
                    df_show[col] = pd.to_numeric(df_show[col], errors="coerce").map(fmt3)
            st.markdown("**Disagreeing pairs**")
            st.dataframe(df_show, use_container_width=True, hide_index=True)
        else:
            st.success("All comparable pairs agree. 🎉")
| # # --------------------- GLOBAL PAIRWISE (any videos, multi-class) --------------------- | |
| # with tab_global: | |
| # st.subheader("Global Pairwise (any videos) — multi-class") | |
| # all_models = uniq_sorted(df_raw["model"]) | |
| # all_classes = uniq_sorted(df_raw["class"]) | |
| # c1, c2 = st.columns(2) | |
| # with c1: | |
| # chosen_classes_g = st.multiselect("Classes (empty = All)", all_classes, default=[], key="global_classes") | |
| # with c2: | |
| # chosen_models_g = st.multiselect("Models (empty = All)", all_models, default=[], key="global_models") | |
| # model_metric_g = st.selectbox( | |
| # "Model metric", | |
| # ["action_mean_intra", "frame_diff_ord2"], | |
| # index=0, | |
| # key="global_model_metric", | |
| # help="Agreement rule: BOTH model metrics use LOWER-is-better (action_mean_intra uses original sign)." | |
| # ) | |
| # human_metric_g = st.selectbox( | |
| # "Human metric", | |
| # human_cols, | |
| # index=0, | |
| # key="global_human_metric" | |
| # ) | |
| # scope_df_g = filter_by_multi(df_raw, chosen_classes_g, chosen_models_g).copy() | |
| # if "action_mean_intra_orig" not in scope_df_g.columns: | |
| # scope_df_g = scope_df_g.merge( | |
| # df_raw[["video_id", "action_mean_intra_orig"]], | |
| # on="video_id", | |
| # how="left" | |
| # ) | |
| # req_model_col_g = "action_mean_intra_orig" if model_metric_g == "action_mean_intra" else model_metric_g | |
| # scope_df_g = scope_df_g.dropna(subset=[req_model_col_g, human_metric_g], how="any") | |
| # if len(scope_df_g) < 2: | |
| # st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.") | |
| # else: | |
| # acc_g, disagree_df_g, total_pairs_g = pairwise_agreement(scope_df_g, model_metric_g, human_metric_g) | |
| # st.markdown(f"**Global pairwise accuracy:** {fmt3(acc_g)} (over {total_pairs_g} comparable pairs).") | |
| # if not disagree_df_g.empty: | |
| # df_show_g = disagree_df_g.copy() | |
| # numeric_cols_g = [f"{model_metric_g}_A", f"{model_metric_g}_B", f"{human_metric_g}_A", f"{human_metric_g}_B"] | |
| # for col in numeric_cols_g: | |
| # if col in df_show_g.columns: | |
| # df_show_g[col] = pd.to_numeric(df_show_g[col], errors="coerce").map(fmt3) | |
| # st.markdown("**Disagreeing global pairs**") | |
| # st.dataframe(df_show_g, use_container_width=True, hide_index=True) | |
| # else: | |
| # st.success("All comparable global pairs agree. 🎉") | |
| # --------------------- CROSS-MODEL (same base video, multi-class) --------------------- | |
# Cross-model tab: pairwise agreement restricted to videos that share a base_name
# and were produced by at least two different models.
# NOTE(review): indentation was lost in the source; this block presumably sits inside
# `with left:` — re-indent accordingly when splicing.
with tab_cross:
    st.subheader("Cross-Model Agreement — same original video (base_name), multi-class")
    all_classes = uniq_sorted(df_raw["class"])
    all_models = uniq_sorted(df_raw["model"])
    c1, c2 = st.columns(2)
    with c1:
        chosen_classes2 = st.multiselect("Classes (empty = All)", all_classes, default=[], key="cross_classes")
    with c2:
        chosen_models2 = st.multiselect("Models (empty = All)", all_models, default=[], key="cross_models")
    model_metric2 = st.selectbox(
        "Model metric",
        ["action_mean_intra", "frame_diff_ord2"],
        index=0,
        key="cross_model_metric",
        help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
    )
    human_metric2 = st.selectbox(
        "Human metric",
        human_cols,
        index=0,
        key="cross_human_metric"
    )
    scope_df2 = filter_by_multi(df_raw, chosen_classes2, chosen_models2).copy()
    # Defensive: re-attach the *_orig column if the filtered frame lacks it.
    if "action_mean_intra_orig" not in scope_df2.columns:
        scope_df2 = scope_df2.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
    # Column that cross_model_pairwise will actually read for the chosen model metric.
    req_model_col2 = "action_mean_intra_orig" if model_metric2 == "action_mean_intra" else model_metric2
    scope_df2 = scope_df2.dropna(subset=["base_name", req_model_col2, human_metric2], how="any")
    # Keep only base videos that still have rows from at least two distinct models.
    eligible = scope_df2.groupby("base_name")["model"].nunique().reset_index(name="n_models")
    eligible_names = set(eligible[eligible["n_models"] >= 2]["base_name"].tolist())
    scope_df2 = scope_df2[scope_df2["base_name"].isin(eligible_names)]
    if scope_df2.empty:
        st.info("No base videos with at least two different models in the current filters.")
    else:
        acc2, disagree_df2, total_pairs2 = cross_model_pairwise(scope_df2, model_metric2, human_metric2)
        st.markdown(f"**Cross-model pairwise accuracy:** {fmt3(acc2)} (over {total_pairs2} comparable cross-model pairs).")
        if not disagree_df2.empty:
            df_show2 = disagree_df2.copy()
            # Render the four value columns with three decimals.
            numeric_cols2 = [f"{model_metric2}_A", f"{model_metric2}_B", f"{human_metric2}_A", f"{human_metric2}_B"]
            for col in numeric_cols2:
                if col in df_show2.columns:
                    df_show2[col] = pd.to_numeric(df_show2[col], errors="coerce").map(fmt3)
            st.markdown("**Disagreeing cross-model pairs (same base video)**")
            st.dataframe(df_show2, use_container_width=True, hide_index=True)
        else:
            st.success("All comparable cross-model pairs agree. 🎉")
| # --------------------- SPEARMAN (separate views per human metric) --------------------- | |
# Spearman tab: one sub-tab per human metric, each showing the correlation of both
# model metrics with that human metric overall, by class, and by model.
# NOTE(review): indentation was lost in the source; this block presumably sits inside
# `with left:` — re-indent accordingly when splicing.
with tab_spear:
    st.subheader("Spearman correlations (separate by human metric)")
    model_metrics_all = ["action_mean_intra", "frame_diff_ord2"]
    # one sub-tab per human metric
    sub_tabs = st.tabs(human_cols)
    for tab_obj, hmetric in zip(sub_tabs, human_cols):
        with tab_obj:
            st.caption(f"Human metric: **{hmetric}**")
            # Overall (only rows where the specific human metric is present)
            spear_overall = spearman_all(df_raw, model_metrics_all, [hmetric])
            show = spear_overall.copy()
            show["rho"] = show["rho"].map(fmt3)
            st.markdown("**Overall**")
            st.dataframe(show, use_container_width=True, hide_index=True)
            st.markdown("---")
            st.markdown("**By Class**")
            classes_all = uniq_sorted(df_raw["class"])
            # Widget keys embed the human metric so each sub-tab keeps its own selection.
            chosen_cls = st.multiselect("Classes (empty = All)", classes_all, default=[], key=f"spear_cls_{hmetric}")
            spc = spearman_by_group(df_raw, model_metrics_all, [hmetric], "class", chosen_cls)
            spc["rho"] = spc["rho"].map(fmt3)
            st.dataframe(spc, use_container_width=True, hide_index=True)
            st.markdown("---")
            st.markdown("**By Model**")
            models_all = uniq_sorted(df_raw["model"])
            chosen_mdl = st.multiselect("Models (empty = All)", models_all, default=[], key=f"spear_mdl_{hmetric}")
            spm = spearman_by_group(df_raw, model_metrics_all, [hmetric], "model", chosen_mdl)
            spm["rho"] = spm["rho"].map(fmt3)
            st.dataframe(spm, use_container_width=True, hide_index=True)
| # --------------------- INTER-RATER (by class / by model ONLY) --------------------- | |
with tab_ir:
    # Inter-rater tab: an overall per-metric summary, then pairwise and
    # averaged rater correlations broken down by class or by model.
    st.subheader("Inter-Rater Correlations (Spearman) — by Class / by Model")
    metric_to_wide = load_raters(RATER_GLOBS)

    if not metric_to_wide:
        st.info("No rater files found. Expected patterns: 'raters*.json' or 'raters/*.json'.")
    else:
        # --- Overall averages (no grouping), stacked across all rater metrics ---
        st.caption("Overall inter-rater averages (across all classes/models)")
        per_metric_frames = []
        for metric_name, ratings in metric_to_wide.items():
            overall_avg = inter_rater_avg_overall(ratings, min_overlap=2)
            if overall_avg.empty:
                continue
            labelled = overall_avg.copy()
            labelled.insert(0, "metric", metric_name)
            per_metric_frames.append(labelled)

        if per_metric_frames:
            overall_tbl = pd.concat(per_metric_frames, ignore_index=True)
            overall_tbl["mean_rho"] = overall_tbl["mean_rho"].map(fmt3)
            st.dataframe(overall_tbl, use_container_width=True, hide_index=True)
        else:
            st.info("No rater pairs with enough overlapping videos to compute overall averages.")

        # --- Grouped views for a single, user-selected rater metric ---
        meta = df_raw[["video_id", "class", "model"]].drop_duplicates()
        chosen_metric = st.selectbox(
            "Rater metric", sorted(metric_to_wide.keys()), key="ir_metric"
        )
        wide = metric_to_wide.get(chosen_metric)

        if wide is None or wide.empty:
            st.info("Selected metric has no rater data.")
        else:
            # (tab title, grouping column, picker label, widget key,
            #  message when no pairwise result, message when no average result)
            group_views = (
                (
                    "By Class",
                    "class",
                    "Classes (empty = All)",
                    "ir_classes",
                    "Not enough overlap to compute class-wise inter-rater correlations.",
                    "Not enough overlap to compute class-wise inter-rater averages.",
                ),
                (
                    "By Model",
                    "model",
                    "Models (empty = All)",
                    "ir_models",
                    "Not enough overlap to compute model-wise inter-rater correlations.",
                    "Not enough overlap to compute model-wise inter-rater averages.",
                ),
            )
            group_tabs = st.tabs([view[0] for view in group_views])

            for group_tab, view in zip(group_tabs, group_views):
                _, group_col, picker_label, picker_key, no_pairs_msg, no_avg_msg = view
                with group_tab:
                    picked = st.multiselect(
                        picker_label,
                        uniq_sorted(meta[group_col]),
                        default=[],
                        key=picker_key,
                    )

                    # Rater-vs-rater pairwise correlations within each group.
                    pair_tbl = inter_rater_pairs_by_group(
                        wide, meta, by=group_col, values=picked, min_overlap=2
                    )
                    if pair_tbl.empty:
                        st.info(no_pairs_msg)
                    else:
                        show_pairs = pair_tbl.copy()
                        show_pairs["rho"] = show_pairs["rho"].map(fmt3)
                        st.markdown("**Per-rater pairwise correlations**")
                        st.dataframe(show_pairs, use_container_width=True, hide_index=True)

                    # Per-rater average correlation within each group.
                    avg_tbl = inter_rater_avg_by_group(
                        wide, meta, by=group_col, values=picked, min_overlap=2
                    )
                    if avg_tbl.empty:
                        st.info(no_avg_msg)
                    else:
                        show_avg = avg_tbl.copy()
                        show_avg["mean_rho"] = show_avg["mean_rho"].map(fmt3)
                        st.markdown("**Per-rater average correlation**")
                        st.dataframe(show_avg, use_container_width=True, hide_index=True)
with right:
    # Video viewer: pick a video id, play the file if it exists on disk, and
    # list whichever model / human scores are available for that row.
    st.subheader("Video Viewer")

    # uniq_sorted drops NaN and duplicate ids, so the picker never lists the
    # same video twice and sorting cannot fail on mixed NaN/str values.
    all_vids = uniq_sorted(df_raw["video_id"])
    if not all_vids:
        st.info("No videos available after filtering rows with no human ratings.")
    else:
        selected_vid = st.selectbox("Choose a video id", all_vids)
        if selected_vid:
            vid_path = Path(video_dir) / str(selected_vid)
            # st.video raises when handed a local path that does not exist,
            # which would abort the script before the scores are rendered.
            if vid_path.is_file():
                st.video(str(vid_path))
            else:
                st.warning(f"Video file not found: {vid_path}")

            # Several rows may share a video id; the first one is displayed.
            row_view = df_raw[df_raw["video_id"] == selected_vid].iloc[0]
            st.markdown("### Scores")

            # Series.get returns None for a column that is absent from df_raw,
            # so a missing metric is simply omitted instead of raising KeyError.
            model_metrics = {
                name: round(float(row_view.get(name)), 3)
                for name in ("action_mean_intra", "frame_diff_ord2")
                if pd.notna(row_view.get(name))
            }
            human_metrics = {
                name: round(float(row_view.get(name)), 3)
                for name in (
                    "human_action",
                    "human_anatomy",
                    "human_appearance",
                    "human_motion",
                )
                if pd.notna(row_view.get(name))
            }

            st.write("**Model metrics:**")
            st.json(model_metrics)
            st.write("**Human scores:**")
            st.json(human_metrics)