import json
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import streamlit as st

st.set_page_config(page_title="VideoEval Explorer", layout="wide")
st.title("VideoEval — Tables + Viewer with Human Scores")

# =========================
# Paths / Config
# =========================
mapping_json_path = "src/YOUTUBE_DATA/id_map.json"
video_dir = "src/YOUTUBE_DATA"


def uniq_sorted(s) -> list:
    """Return the sorted unique non-null values of any sequence/Series."""
    return sorted(pd.Series(s).dropna().unique().tolist())


# --- Metric & video-id normalization for raters ---
# Short metric names found in rater JSONs -> the app's canonical column names.
ALIASES = {
    "action": "human_action",
    "anatomy": "human_anatomy",
    "appearance": "human_appearance",
    "motion": "human_motion",
    "overall": "overall",
}


def normalize_metric_name(name: str) -> str:
    """Canonicalize a rater metric name via ALIASES (whitespace/case-insensitive)."""
    if not isinstance(name, str):
        return str(name)
    key = name.strip().lower()
    return ALIASES.get(key, name)  # fall back to original if unknown


def normalize_video_id(v) -> str:
    """Enforce a '.mp4' suffix so rater keys match the main df's video ids.

    The suffix check is case-insensitive: ids already ending in '.MP4' are
    left untouched instead of being double-suffixed (e.g. 'x.MP4.mp4'),
    matching _looks_like_video_key, which accepts both casings.
    """
    s = str(v)
    return s if s.lower().endswith(".mp4") else f"{s}.mp4"


# Model metric JSONs
REQ = {
    "action_mean_intra": "src/action_mean_intra.json",
    "frame_diff_ord2": "src/frame_diff_ord2.json",
}

# Human JSONs (aggregate)
HUMAN = {
    "human_action": "src/human_scores_analysis_action_mos_centered.json",
    "human_anatomy": "src/human_scores_analysis_anatomy_mos_centered.json",
    "human_appearance": "src/human_scores_analysis_appearance_mos_centered.json",
    "human_motion": "src/human_scores_analysis_motion_mos_centered.json",
}

# Rater JSONs glob patterns (support both layouts)
RATER_GLOBS = ["src/raters*.json", "src/raters/*.json"]


# =========================
# Helpers
# =========================
def load_json(p: Path) -> dict:
    """Load a flat {key: score} JSON file.

    Keys are coerced to str; values that cannot be parsed as float become NaN
    so downstream numeric code never sees raw strings.
    """
    with open(p, "r", encoding="utf-8") as f:
        d = json.load(f)
    out = {}
    for k, v in d.items():
        try:
            out[str(k)] = float(v)
        except (TypeError, ValueError):  # non-numeric payload -> NaN
            out[str(k)] = np.nan
    return out


def build_maps_from_mapping(p: Path) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Returns:
    - v2m: video_id -> model
    - v2base: video_id -> base_name (original 'videoName',
      e.g. 'v_JumpingJack_g13_c02')
    Accepts either 'video_to_model' or 'model_to_videoName_to_id' schema.
    Missing/absent mapping file yields two empty dicts.
    """
    v2m, v2base = {}, {}
    if not p or not p.exists():
        return v2m, v2base
    with open(p, "r", encoding="utf-8") as f:
        m = json.load(f)
    if "video_to_model" in m and isinstance(m["video_to_model"], dict):
        # Flat schema: no base-name information is available in this shape.
        v2m = dict(m["video_to_model"])
        return v2m, v2base
    if "model_to_videoName_to_id" in m and isinstance(m["model_to_videoName_to_id"], dict):
        for model, name_to_id in m["model_to_videoName_to_id"].items():
            for base_name, cls_id in name_to_id.items():
                vid = f"{cls_id}.mp4"  # filenames like Class__HASH.mp4
                v2m[vid] = model
                v2base[vid] = base_name
    return v2m, v2base


def minmax_normalize(series: pd.Series) -> pd.Series:
    """Min-max scale a Series to [0, 1].

    All-NaN input is returned unchanged; a constant (or non-finite-ranged)
    series maps every non-null entry to 0.0 while preserving NaNs.
    """
    s = pd.to_numeric(series, errors="coerce")
    if s.notna().sum() == 0:
        return s
    smin, smax = s.min(skipna=True), s.max(skipna=True)
    if not np.isfinite(smin) or not np.isfinite(smax) or smin == smax:
        return s.apply(lambda x: 0.0 if pd.notna(x) else np.nan)
    return (s - smin) / (smax - smin)


def fmt3(x):
    """Format a number with 3 decimals; '' for non-numeric or non-finite input."""
    try:
        x = float(x)
    except (TypeError, ValueError):
        return ""
    if not np.isfinite(x):
        return ""
    return f"{x:.3f}"


def filter_by(df: pd.DataFrame, cls: str, mdl: str) -> pd.DataFrame:
    """Single-select filter (used by the two tables); '(All)' disables a filter."""
    out = df
    if cls != "(All)":
        out = out[out["class"] == cls]
    if mdl != "(All)":
        out = out[out["model"] == mdl]
    return out


def filter_by_multi(df: pd.DataFrame, classes: List[str] | None, models: List[str] | None) -> pd.DataFrame:
    """Multi-select filter used in pairwise tabs. Empty/None means 'all'."""
    out = df
    if classes:
        out = out[out["class"].isin(classes)]
    if models:
        out = out[out["model"].isin(models)]
    return out


def _model_value_column(model_col: str) -> str:
    """Resolve which column is actually compared for a model metric.

    Both model metrics are LOWER-is-better in pairwise comparisons:
    - action_mean_intra -> 'action_mean_intra_orig' (pre-negation values)
    - frame_diff_ord2   -> 'frame_diff_ord2' (raw)
    Unknown names fall through unchanged.
    """
    if model_col == "action_mean_intra":
        return "action_mean_intra_orig"
    if model_col == "frame_diff_ord2":
        return "frame_diff_ord2"
    return model_col  # fallback


def pairwise_agreement(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Pairwise ranking agreement between model_col and human_col over all rows in df.
    Returns: (accuracy, disagree_pairs_df, total_pairs)
    Directions:
    - Model metrics: LOWER is better (see _model_value_column)
    - Human metrics: HIGHER is better
    Pairs with any NaN, or a tie on either side, are skipped entirely.
    """
    use_model_col = _model_value_column(model_col)
    vids = df["video_id"].tolist()
    mvals = pd.to_numeric(df[use_model_col], errors="coerce").values
    hvals = pd.to_numeric(df[human_col], errors="coerce").values
    rows = []
    total = 0
    agree = 0
    n = len(vids)
    for i in range(n):
        for j in range(i + 1, n):
            mi, mj = mvals[i], mvals[j]
            hi, hj = hvals[i], hvals[j]
            if not (np.isfinite(mi) and np.isfinite(mj) and np.isfinite(hi) and np.isfinite(hj)):
                continue
            if mi == mj or hi == hj:
                continue  # ties are not comparable
            model_order = "A>B" if mi < mj else "B>A"  # model: LOWER is better
            human_order = "A>B" if hi > hj else "B>A"  # human: HIGHER is better
            total += 1
            if model_order == human_order:
                agree += 1
            else:
                rows.append({
                    "video_A": vids[i],
                    "video_B": vids[j],
                    f"{model_col}_A": mi,
                    f"{model_col}_B": mj,
                    f"{human_col}_A": hi,
                    f"{human_col}_B": hj,
                    "model_order": model_order,
                    "human_order": human_order,
                })
    acc = (agree / total) if total > 0 else np.nan
    disagree_df = pd.DataFrame(rows)
    return acc, disagree_df, total


def cross_model_pairwise(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Pairwise agreement only across DIFFERENT MODELS but the SAME base video.
    Requires df to have:
    ['video_id','model','base_name', model_col, human_col, 'action_mean_intra_orig'].
    Returns: (accuracy, disagree_pairs_df, total_pairs)
    Directions are the same as pairwise_agreement (model LOWER-better,
    human HIGHER-better); see _model_value_column.
    """
    use_model_col = _model_value_column(model_col)
    rows = []
    total = 0
    agree = 0
    scope = df.dropna(subset=["base_name"])
    for base_name, g in scope.groupby("base_name"):
        vids = g["video_id"].tolist()
        models = g["model"].tolist()
        mvals = pd.to_numeric(g[use_model_col], errors="coerce").values
        hvals = pd.to_numeric(g[human_col], errors="coerce").values
        n = len(vids)
        for i in range(n):
            for j in range(i + 1, n):
                if models[i] == models[j]:
                    continue  # cross-model only
                mi, mj = mvals[i], mvals[j]
                hi, hj = hvals[i], hvals[j]
                if not (np.isfinite(mi) and np.isfinite(mj) and np.isfinite(hi) and np.isfinite(hj)):
                    continue
                if mi == mj or hi == hj:
                    continue
                model_order = "A>B" if mi < mj else "B>A"  # LOWER better
                human_order = "A>B" if hi > hj else "B>A"  # HIGHER better
                total += 1
                if model_order == human_order:
                    agree += 1
                else:
                    rows.append({
                        "base_name": base_name,
                        "video_A": vids[i],
                        "model_A": models[i],
                        f"{model_col}_A": mi,
                        f"{human_col}_A": hi,
                        "video_B": vids[j],
                        "model_B": models[j],
                        f"{model_col}_B": mj,
                        f"{human_col}_B": hj,
                        "model_order": model_order,
                        "human_order": human_order,
                    })
    acc = (agree / total) if total > 0 else np.nan
    disagree_df = pd.DataFrame(rows)
    return acc, disagree_df, total


def spearman_all(df: pd.DataFrame, model_cols: list, human_cols: list) -> pd.DataFrame:
    """Spearman rho between each model metric and each human metric over all videos."""
    records = []
    for m in model_cols:
        for h in human_cols:
            sub = df[[m, h]].dropna()
            n = len(sub)
            rho = sub.corr(method="spearman").iloc[0, 1] if n >= 2 else np.nan
            records.append({"model_metric": m, "human_metric": h, "rho": rho, "n": n})
    return pd.DataFrame(records)
def spearman_by_group(df: pd.DataFrame, model_cols: list, human_cols: list, group_col: str, groups: List[str]) -> pd.DataFrame:
    """
    Spearman rho per group (group_col ∈ {'class','model'}).
    If groups is empty -> all unique values.
    Returns tidy df with columns: [group_col, model_metric, human_metric, rho, n]
    """
    if not groups:
        groups = sorted([g for g in df[group_col].dropna().unique().tolist()])
    rows = []
    for g in groups:
        subdf = df[df[group_col] == g]
        if subdf.empty:
            continue
        for m in model_cols:
            for h in human_cols:
                # rho needs >= 2 complete (m, h) pairs; otherwise NaN.
                sub = subdf[[m, h]].dropna()
                n = len(sub)
                rho = sub.corr(method="spearman").iloc[0, 1] if n >= 2 else np.nan
                rows.append({group_col: g, "model_metric": m, "human_metric": h, "rho": rho, "n": n})
    return pd.DataFrame(rows)


# ---------- Rater loading & inter-rater correlations ----------
def _filename_to_rater_name(p: Path) -> str:
    """Derive a rater name from a file path by stripping a trailing '_scores'."""
    name = p.stem  # e.g., "chvskch_scores"
    if name.endswith("_scores"):
        name = name[:-7]  # len("_scores") == 7
    return name


def _looks_like_video_key(k: str) -> bool:
    """Heuristic: treat keys containing '__' or ending in .mp4/.MP4 as video ids."""
    s = str(k)
    return "__" in s or s.endswith(".mp4") or s.endswith(".MP4")


def _detect_rater_structure(obj) -> str:
    """
    Returns one of:
    - 'metric_to_video'        : {metric: {video_id: score}, ...}
    - 'video_to_score'         : {video_id: score, ...}
    - 'video_to_metric_score'  : {video_id: {metric: score}, ...}
    - 'unknown'
    """
    if not isinstance(obj, dict) or not obj:
        return "unknown"
    # If values are not dicts -> {video_id: score}
    first_val = next(iter(obj.values()))
    if not isinstance(first_val, dict):
        return "video_to_score"
    # Values are dicts. Decide by looking at the OUTER KEYS.
    outer_keys = list(obj.keys())
    if any(_looks_like_video_key(k) for k in outer_keys):
        return "video_to_metric_score"
    else:
        return "metric_to_video"


def load_raters(globs: List[str]) -> Dict[str, pd.DataFrame]:
    """
    Returns: dict metric_name -> wide DataFrame with index=video_id
    ('.mp4' enforced) and columns=rater_name.
    Supports shapes:
    - {metric: {video_id: score}}
    - {video_id: score}          -> metric 'overall'
    - {video_id: {metric: score}}
    Metric names normalized to app's human_* names.
    Unreadable/unparseable files and unknown shapes are silently skipped.
    """
    metric_to_frames: Dict[str, List[pd.DataFrame]] = {}
    files = []
    for pat in globs:
        files.extend(Path(".").glob(pat))
    for p in files:
        try:
            data = json.loads(Path(p).read_text())
        except Exception:
            # best-effort: a bad rater file should not break the app
            continue
        rater = _filename_to_rater_name(p)
        shape = _detect_rater_structure(data)
        if shape == "metric_to_video":
            # {metric: {video_id: score}}
            for metric, vid_scores in data.items():
                metric_norm = normalize_metric_name(metric)
                s = pd.Series({normalize_video_id(k): v for k, v in vid_scores.items()}, name=rater, dtype="float")
                metric_to_frames.setdefault(metric_norm, []).append(s.to_frame())
        elif shape == "video_to_score":
            # {video_id: score} -> 'overall'
            s = pd.Series({normalize_video_id(k): v for k, v in data.items()}, name=rater, dtype="float")
            metric_to_frames.setdefault("overall", []).append(s.to_frame())
        elif shape == "video_to_metric_score":
            # {video_id: {metric: score}} -> pivot into per-metric buckets first
            bucket: Dict[str, Dict[str, float]] = {}
            for vid, mdict in data.items():
                if not isinstance(mdict, dict):
                    continue
                for metric, val in mdict.items():
                    metric_norm = normalize_metric_name(metric)
                    bucket.setdefault(metric_norm, {})[normalize_video_id(vid)] = val
            for metric_norm, vid_scores in bucket.items():
                s = pd.Series(vid_scores, name=rater, dtype="float")
                metric_to_frames.setdefault(metric_norm, []).append(s.to_frame())
        # else: unknown; skip

    # Merge per metric: outer-join the one-column-per-rater frames.
    metric_to_wide: Dict[str, pd.DataFrame] = {}
    for metric, frames in metric_to_frames.items():
        if not frames:
            continue
        wide = frames[0]
        for f in frames[1:]:
            wide = wide.join(f, how="outer")
        # drop duplicate rater columns (same rater contributing twice)
        wide = wide.loc[:, ~wide.columns.duplicated()]
        metric_to_wide[metric] = wide
    return metric_to_wide


def inter_rater_pairs_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Return per-rater *pairwise* correlations within each group.
    Rows: [by, rater, other_rater, rho, n]
    Filters: drop NaN rho; keep only pairs with n >= min_overlap.
    """
    pairs = inter_rater_corr_grouped(rater_wide, video_meta, by=by, values=values)
    if pairs.empty:
        return pd.DataFrame(columns=[by, "rater", "other_rater", "rho", "n"])
    # Ensure integer overlaps
    pairs["n"] = pairs["n"].astype(int)
    # keep only pairs with enough overlap and non-null rho
    pairs = pairs[(pairs["n"] >= min_overlap) & pairs["rho"].notna()]
    if pairs.empty:
        return pd.DataFrame(columns=[by, "rater", "other_rater", "rho", "n"])
    # expand to directed rows so each rater has their own row per counterpart
    a = pairs.rename(columns={"rater_i": "rater", "rater_j": "other_rater"})[[by, "rater", "other_rater", "rho", "n"]]
    b = pairs.rename(columns={"rater_j": "rater", "rater_i": "other_rater"})[[by, "rater", "other_rater", "rho", "n"]]
    out = pd.concat([a, b], ignore_index=True)
    # sort for nicer reading
    out = out.sort_values([by, "rater", "rho"], ascending=[True, True, False]).reset_index(drop=True)
    return out


def inter_rater_avg_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Average (unweighted) inter-rater Spearman rho per rater within each group.
    Uses same filters as inter_rater_pairs_by_group.
    Rows: [by, rater, mean_rho, num_pairs_used]
    """
    pairs_long = inter_rater_pairs_by_group(
        rater_wide, video_meta, by=by, values=values, min_overlap=min_overlap
    )
    if pairs_long.empty:
        return pd.DataFrame(columns=[by, "rater", "mean_rho", "num_pairs_used"])
    agg = (
        pairs_long.groupby([by, "rater"])
        .agg(mean_rho=("rho", "mean"), num_pairs_used=("rho", "count"))
        .reset_index()
    )
    agg["mean_rho"] = agg["mean_rho"].astype(float)
    return agg


def inter_rater_corr_grouped(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None
) -> pd.DataFrame:
    """
    Spearman inter-rater correlation matrices grouped by `by` ∈ {'class','model'}.
    Requires ≥2 raters with ≥2 overlapping videos inside each group.
    Returns tidy df: [by, rater_i, rater_j, rho, n]
    """
    assert by in ("class", "model")

    def corr_to_long(df_corr: pd.DataFrame, counts: pd.DataFrame, group_label: str):
        # Upper triangle only (i < j): one undirected row per rater pair.
        out = []
        for i in df_corr.index:
            for j in df_corr.columns:
                if i >= j:
                    continue
                out.append({by: group_label, "rater_i": i, "rater_j": j,
                            "rho": df_corr.loc[i, j], "n": int(counts.loc[i, j])})
        return out

    meta = video_meta[["video_id", "class", "model"]].drop_duplicates().set_index("video_id")
    X = rater_wide.copy()
    X.index = X.index.map(str)  # should already be normalized to '.mp4'
    X = X.join(meta, how="left")
    if not values:
        values = sorted([v for v in X[by].dropna().unique().tolist()])
    results = []

    def corr_with_counts(M: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        # Pairwise-complete Spearman plus a matrix of overlap counts.
        rho = M.corr(method="spearman", min_periods=2)
        mask = ~M.isna()
        counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="float")
        for a in M.columns:
            for b in M.columns:
                counts.loc[a, b] = float((mask[a] & mask[b]).sum())
        return rho, counts

    for g in values:
        sub = X[X[by] == g]
        if sub.empty:
            continue
        # keep only rater columns with at least 2 ratings
        cols = [c for c in sub.columns if c not in ("class", "model")]
        usable = [c for c in cols if sub[c].notna().sum() >= 2]
        if len(usable) < 2:
            continue
        M = sub[usable]
        rho, counts = corr_with_counts(M)
        # if every pair has <2 overlaps, skip
        if (counts.values < 2).all():
            continue
        results.extend(corr_to_long(rho, counts, g))
    out = pd.DataFrame(results)
    if not out.empty:
        out["rho"] = out["rho"].astype(float)
    return out


def _corr_with_counts_matrix(M: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Spearman rho matrix + overlap counts matrix for a wide (videos x raters) frame."""
    rho = M.corr(method="spearman", min_periods=2)
    mask = ~M.isna()
    counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="int64")
    for a in M.columns:
        for b in M.columns:
            counts.loc[a, b] = int((mask[a] & mask[b]).sum())
    return rho, counts


def inter_rater_pairs_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Pairwise inter-rater Spearman over the entire dataset (no grouping).
    Returns columns: [rater, other_rater, rho, n]
    n = overlapping videos used for that pair.
    """
    if rater_wide is None or rater_wide.empty:
        return pd.DataFrame(columns=["rater", "other_rater", "rho", "n"])
    # keep only rater columns with at least 2 ratings
    usable = [c for c in rater_wide.columns if rater_wide[c].notna().sum() >= 2]
    M = rater_wide[usable]
    if M.shape[1] < 2:
        return pd.DataFrame(columns=["rater", "other_rater", "rho", "n"])
    rho, counts = _corr_with_counts_matrix(M)
    rows = []
    for i in rho.index:
        for j in rho.columns:
            if i >= j:
                continue
            n_overlap = int(counts.loc[i, j])
            r = rho.loc[i, j]
            if n_overlap >= min_overlap and pd.notna(r):
                # emit both directions so each rater gets a row per counterpart
                rows.append({"rater": i, "other_rater": j, "rho": float(r), "n": n_overlap})
                rows.append({"rater": j, "other_rater": i, "rho": float(r), "n": n_overlap})  # directed
    out = pd.DataFrame(rows)
    if not out.empty:
        out = out.sort_values(["rater", "rho"], ascending=[True, False]).reset_index(drop=True)
    return out


def inter_rater_avg_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Overall (no grouping) average Spearman rho per rater.
    Returns columns: [rater, mean_rho, num_pairs_used]
    """
    pairs = inter_rater_pairs_overall(rater_wide, min_overlap=min_overlap)
    if pairs.empty:
        return pd.DataFrame(columns=["rater", "mean_rho", "num_pairs_used"])
    agg = (pairs.groupby("rater")
           .agg(mean_rho=("rho", "mean"), num_pairs_used=("rho", "count"))
           .reset_index()
           .sort_values("mean_rho", ascending=False)
           .reset_index(drop=True))
    return agg


# =========================
# Load aggregate scores
# =========================
scores = {k: load_json(Path(p)) for k, p in REQ.items()}
human_scores = {k: load_json(Path(p)) for k, p in HUMAN.items()}

# Union of every video id seen in any model or human score file.
all_keys = set()
for d in scores.values():
    all_keys |= set(d.keys())
for d in human_scores.values():
    all_keys |= set(d.keys())

rows = []
for vid in sorted(all_keys):
    rows.append({
        "video_id": vid,
        # class is encoded as the prefix before '__' in the filename
        "class": vid.split("__")[0] if "__" in vid else "UNK",
        "action_mean_intra": scores["action_mean_intra"].get(vid, np.nan),
        "frame_diff_ord2": scores["frame_diff_ord2"].get(vid, np.nan),
        "human_action": human_scores["human_action"].get(vid, np.nan),
        "human_anatomy": human_scores["human_anatomy"].get(vid, np.nan),
        "human_appearance": human_scores["human_appearance"].get(vid, np.nan),
        "human_motion": human_scores["human_motion"].get(vid, np.nan),
    })
df_raw = pd.DataFrame(rows)

# Keep original action_mean_intra for pairwise; flip for display ONCE (your JSON is negative)
# NOTE(review): despite the comment above, no negation is applied below —
# the display column is assigned the original values unchanged. Confirm
# whether the flip was intentionally disabled or is a missing `-`.
df_raw["action_mean_intra"] = pd.to_numeric(df_raw["action_mean_intra"], errors="coerce")
df_raw["action_mean_intra_orig"] = df_raw["action_mean_intra"].copy()
df_raw["action_mean_intra"] = df_raw["action_mean_intra_orig"]  # positive / higher-better in tables

# Map models + base_name via mapping.json
v2m, v2base = build_maps_from_mapping(Path(mapping_json_path))
df_raw["model"] = df_raw["video_id"].map(v2m).fillna("UNK")
df_raw["base_name"] = df_raw["video_id"].map(v2base)  # may be NaN

# Metric columns (display + normalization)
metric_cols = [
    "action_mean_intra",  # positive (after flip) for display; pairwise uses *_orig
    "frame_diff_ord2",    # raw; lower-better
    "human_action", "human_anatomy", "human_appearance", "human_motion",
]
human_cols = ["human_action", "human_anatomy", "human_appearance", "human_motion"]

# Drop rows with NO human ratings at all
df_raw = df_raw.dropna(subset=human_cols, how="all")

# Normalized view for tables
df_norm = df_raw.copy()
for c in metric_cols:
    df_norm[c] = minmax_normalize(df_norm[c])

# =========================
# UI Layout
# =========================
left, right = st.columns([2, 1])

with left:
    tab_tables, tab_agree, tab_cross, tab_spear, tab_ir = st.tabs(
        ["Tables", "Agreement", "Cross-Model (same videoName)", "Spearman", "Inter-Rater"]
    )

    # --------------------- TABLES (A/B) ---------------------
    with tab_tables:
        st.subheader("Comparison Tables")

        # Table A: independently filterable view of raw or normalized scores.
        st.markdown("**Table A**")
        use_norm_a = st.checkbox("Show normalized scores (0–1) — A", value=False, key="norm_a")
        classes = ["(All)"] + uniq_sorted(df_raw["class"])
        models = ["(All)"] + uniq_sorted(df_raw["model"])
        ca, ma = st.columns(2)
        with ca:
            chosen_c_a = st.selectbox("Class (A)", classes, key="class_a")
        with ma:
            chosen_m_a = st.selectbox("Model (A)", models, key="model_a")
        df_view_a = df_norm if use_norm_a else df_raw
        filt_a = filter_by(df_view_a, chosen_c_a, chosen_m_a).copy()
        disp_a = filt_a.copy()
        disp_a[metric_cols] = disp_a[metric_cols].applymap(fmt3)
        st.dataframe(disp_a, use_container_width=True, hide_index=True)

        st.markdown("---")

        # Table B: same controls, separate widget keys so both can coexist.
        st.markdown("**Table B**")
        use_norm_b = st.checkbox("Show normalized scores (0–1) — B", value=False, key="norm_b")
        cb, mb = st.columns(2)
        with cb:
            chosen_c_b = st.selectbox("Class (B)", classes, key="class_b")
        with mb:
            chosen_m_b = st.selectbox("Model (B)", models, key="model_b")
        df_view_b = df_norm if use_norm_b else df_raw
        filt_b = filter_by(df_view_b, chosen_c_b, chosen_m_b).copy()
        disp_b = filt_b.copy()
        disp_b[metric_cols] = disp_b[metric_cols].applymap(fmt3)
        st.dataframe(disp_b, use_container_width=True, hide_index=True)

    # --------------------- AGREEMENT (multi-class) ---------------------
    with tab_agree:
        st.subheader("Agreement (Model vs Human) — multi-class")
        all_models = uniq_sorted(df_raw["model"])
        all_classes = uniq_sorted(df_raw["class"])
        c1, c2 = st.columns(2)
        with c1:
            chosen_classes = st.multiselect("Classes (empty = All)", all_classes, default=[], key="agree_classes")
        with c2:
            chosen_models = st.multiselect("Models (empty = All)", all_models, default=[], key="agree_models")
        model_metric = st.selectbox(
            "Model metric",
            ["action_mean_intra", "frame_diff_ord2"],
            index=0,
            key="agree_model_metric",
            help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
        )
        human_metric = st.selectbox("Human metric", human_cols, index=0, key="agree_human_metric")
        scope_df = filter_by_multi(df_raw, chosen_classes, chosen_models).copy()
        # Pairwise needs the pre-negation column; re-attach it if lost by filtering.
        if "action_mean_intra_orig" not in scope_df.columns:
            scope_df = scope_df.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
        req_model_col = "action_mean_intra_orig" if model_metric == "action_mean_intra" else model_metric
        scope_df = scope_df.dropna(subset=[req_model_col, human_metric], how="any")
        if len(scope_df) < 2:
            st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.")
        else:
            acc, disagree_df, total_pairs = pairwise_agreement(scope_df, model_metric, human_metric)
            st.markdown(f"**Pairwise accuracy:** {fmt3(acc)} (over {total_pairs} comparable pairs).")
            if not disagree_df.empty:
                df_show = disagree_df.copy()
                numeric_cols = [f"{model_metric}_A", f"{model_metric}_B", f"{human_metric}_A", f"{human_metric}_B"]
                for col in numeric_cols:
                    if col in df_show.columns:
                        df_show[col] = pd.to_numeric(df_show[col], errors="coerce").map(fmt3)
                st.markdown("**Disagreeing pairs**")
                st.dataframe(df_show, use_container_width=True, hide_index=True)
            else:
                st.success("All comparable pairs agree. 🎉")

    # (A disabled "GLOBAL PAIRWISE" tab — a near-duplicate of the Agreement tab
    #  above — used to live here as commented-out code and was removed.)

    # --------------------- CROSS-MODEL (same base video, multi-class) ---------------------
    with tab_cross:
        st.subheader("Cross-Model Agreement — same original video (base_name), multi-class")
        all_classes = uniq_sorted(df_raw["class"])
        all_models = uniq_sorted(df_raw["model"])
        c1, c2 = st.columns(2)
        with c1:
            chosen_classes2 = st.multiselect("Classes (empty = All)", all_classes, default=[], key="cross_classes")
        with c2:
            chosen_models2 = st.multiselect("Models (empty = All)", all_models, default=[], key="cross_models")
        model_metric2 = st.selectbox(
            "Model metric",
            ["action_mean_intra", "frame_diff_ord2"],
            index=0,
            key="cross_model_metric",
            help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
        )
        human_metric2 = st.selectbox(
            "Human metric",
            human_cols,
            index=0,
            key="cross_human_metric"
        )
        scope_df2 = filter_by_multi(df_raw, chosen_classes2, chosen_models2).copy()
        if "action_mean_intra_orig" not in scope_df2.columns:
            scope_df2 = scope_df2.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
        req_model_col2 = "action_mean_intra_orig" if model_metric2 == "action_mean_intra" else model_metric2
        scope_df2 = scope_df2.dropna(subset=["base_name", req_model_col2, human_metric2], how="any")
        # Only base videos rendered by >= 2 distinct models can form cross-model pairs.
        eligible = scope_df2.groupby("base_name")["model"].nunique().reset_index(name="n_models")
        eligible_names = set(eligible[eligible["n_models"] >= 2]["base_name"].tolist())
        scope_df2 = scope_df2[scope_df2["base_name"].isin(eligible_names)]
        if scope_df2.empty:
            st.info("No base videos with at least two different models in the current filters.")
        else:
            acc2, disagree_df2, total_pairs2 = cross_model_pairwise(scope_df2, model_metric2, human_metric2)
            st.markdown(f"**Cross-model pairwise accuracy:** {fmt3(acc2)} (over {total_pairs2} comparable cross-model pairs).")
            if not disagree_df2.empty:
                df_show2 = disagree_df2.copy()
                numeric_cols2 = [f"{model_metric2}_A", f"{model_metric2}_B", f"{human_metric2}_A", f"{human_metric2}_B"]
                for col in numeric_cols2:
                    if col in df_show2.columns:
                        df_show2[col] = pd.to_numeric(df_show2[col], errors="coerce").map(fmt3)
                st.markdown("**Disagreeing cross-model pairs (same base video)**")
                st.dataframe(df_show2, use_container_width=True, hide_index=True)
            else:
                st.success("All comparable cross-model pairs agree. 🎉")

    # --------------------- SPEARMAN (separate views per human metric) ---------------------
    with tab_spear:
        st.subheader("Spearman correlations (separate by human metric)")
        model_metrics_all = ["action_mean_intra", "frame_diff_ord2"]
        # one sub-tab per human metric
        sub_tabs = st.tabs(human_cols)
        for tab_obj, hmetric in zip(sub_tabs, human_cols):
            with tab_obj:
                st.caption(f"Human metric: **{hmetric}**")
                # Overall (only rows where the specific human metric is present)
                spear_overall = spearman_all(df_raw, model_metrics_all, [hmetric])
                show = spear_overall.copy()
                show["rho"] = show["rho"].map(fmt3)
                st.markdown("**Overall**")
                st.dataframe(show, use_container_width=True, hide_index=True)

                st.markdown("---")
                st.markdown("**By Class**")
                classes_all = uniq_sorted(df_raw["class"])
                # widget keys are parameterized by hmetric so each sub-tab is independent
                chosen_cls = st.multiselect("Classes (empty = All)", classes_all, default=[], key=f"spear_cls_{hmetric}")
                spc = spearman_by_group(df_raw, model_metrics_all, [hmetric], "class", chosen_cls)
                spc["rho"] = spc["rho"].map(fmt3)
                st.dataframe(spc, use_container_width=True, hide_index=True)

                st.markdown("---")
                st.markdown("**By Model**")
                models_all = uniq_sorted(df_raw["model"])
                chosen_mdl = st.multiselect("Models (empty = All)", models_all, default=[], key=f"spear_mdl_{hmetric}")
                spm = spearman_by_group(df_raw, model_metrics_all, [hmetric], "model", chosen_mdl)
                spm["rho"] = spm["rho"].map(fmt3)
                st.dataframe(spm, use_container_width=True, hide_index=True)

    # --------------------- INTER-RATER (by class / by model ONLY) ---------------------
    with tab_ir:
        st.subheader("Inter-Rater Correlations (Spearman) — by Class / by Model")
        metric_to_wide = load_raters(RATER_GLOBS)
        # Quick diagnostics so you can see coverage
        # Overall average inter-rater correlation (no grouping) per rater, per metric
        if metric_to_wide:
            st.caption("Overall inter-rater averages (across all classes/models)")
            rows = []
            for metric, wide in metric_to_wide.items():
                avg = inter_rater_avg_overall(wide, min_overlap=2)
                if not avg.empty:
                    tmp = avg.copy()
                    tmp.insert(0, "metric", metric)
                    rows.append(tmp)
            if rows:
                overall_tbl = pd.concat(rows, ignore_index=True)
                overall_tbl["mean_rho"] = overall_tbl["mean_rho"].map(fmt3)
                st.dataframe(overall_tbl, use_container_width=True, hide_index=True)
            else:
                st.info("No rater pairs with enough overlapping videos to compute overall averages.")
        if not metric_to_wide:
            st.info("No rater files found. Expected patterns: 'raters*.json' or 'raters/*.json'.")
        else:
            meta = df_raw[["video_id", "class", "model"]].drop_duplicates()
            metrics_available = sorted(metric_to_wide.keys())
            chosen_metric = st.selectbox("Rater metric", metrics_available, key="ir_metric")
            wide = metric_to_wide.get(chosen_metric)
            if wide is None or wide.empty:
                st.info("Selected metric has no rater data.")
            else:
                sub_by_cls, sub_by_mdl = st.tabs(["By Class", "By Model"])
                with sub_by_cls:
                    classes_all = uniq_sorted(meta["class"])
                    chosen_cls_ir = st.multiselect("Classes (empty = All)", classes_all, default=[], key="ir_classes")
                    # Per-rater PAIRWISE
                    pairs_cls = inter_rater_pairs_by_group(wide, meta, by="class", values=chosen_cls_ir, min_overlap=2)
                    if pairs_cls.empty:
                        st.info("Not enough overlap to compute class-wise inter-rater correlations.")
                    else:
                        show_pairs = pairs_cls.copy()
                        show_pairs["rho"] = show_pairs["rho"].map(fmt3)
                        st.markdown("**Per-rater pairwise correlations**")
                        st.dataframe(show_pairs, use_container_width=True, hide_index=True)
                        # Per-rater AVERAGE
                        avg_cls = inter_rater_avg_by_group(wide, meta, by="class", values=chosen_cls_ir, min_overlap=2)
                        if avg_cls.empty:
                            st.info("Not enough overlap to compute class-wise inter-rater averages.")
                        else:
                            show_avg = avg_cls.copy()
                            show_avg["mean_rho"] = show_avg["mean_rho"].map(fmt3)
                            st.markdown("**Per-rater average correlation**")
                            st.dataframe(show_avg, use_container_width=True, hide_index=True)
                with sub_by_mdl:
                    models_all = uniq_sorted(meta["model"])
                    chosen_mdl_ir = st.multiselect("Models (empty = All)", models_all, default=[], key="ir_models")
                    # Per-rater PAIRWISE
                    pairs_mdl = inter_rater_pairs_by_group(wide, meta, by="model", values=chosen_mdl_ir, min_overlap=2)
                    if pairs_mdl.empty:
                        st.info("Not enough overlap to compute model-wise inter-rater correlations.")
                    else:
                        show_pairs = pairs_mdl.copy()
                        show_pairs["rho"] = show_pairs["rho"].map(fmt3)
                        st.markdown("**Per-rater pairwise correlations**")
                        st.dataframe(show_pairs, use_container_width=True, hide_index=True)
                        # Per-rater AVERAGE
                        avg_mdl = inter_rater_avg_by_group(wide, meta, by="model", values=chosen_mdl_ir, min_overlap=2)
                        if avg_mdl.empty:
                            st.info("Not enough overlap to compute model-wise inter-rater averages.")
                        else:
                            show_avg = avg_mdl.copy()
                            show_avg["mean_rho"] = show_avg["mean_rho"].map(fmt3)
                            st.markdown("**Per-rater average correlation**")
                            st.dataframe(show_avg, use_container_width=True, hide_index=True)

with right:
    st.subheader("Video Viewer")
    all_vids = df_raw["video_id"].tolist()
    if len(all_vids) == 0:
        st.info("No videos available after filtering rows with no human ratings.")
    else:
        selected_vid = st.selectbox("Choose a video id", sorted(all_vids))
        if selected_vid:
            vid_path = str(Path(video_dir) / selected_vid)
            st.video(vid_path)
            row_view = df_raw[df_raw["video_id"] == selected_vid].iloc[0]
            st.markdown("### Scores")
            # Only metrics with a non-missing value for this video are shown.
            model_metrics = {
                k: round(float(row_view[k]), 3)
                for k in ["action_mean_intra", "frame_diff_ord2"]
                if pd.notna(row_view[k])
            }
            human_metrics = {
                k: round(float(row_view[k]), 3)
                for k in ["human_action", "human_anatomy", "human_appearance", "human_motion"]
                if pd.notna(row_view[k])
            }
            st.write("**Model metrics:**")
            st.json(model_metrics)
            st.write("**Human scores:**")
            st.json(human_metrics)