Spaces:

XThomasBU
/

VideoEvals

Sleeping

File size: 40,580 Bytes

import json
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import streamlit as st

# Streamlit page setup: wide layout plus the page title shown at the top.
st.set_page_config(page_title="VideoEval Explorer", layout="wide")
st.title("VideoEval — Tables + Viewer with Human Scores")

# =========================
# Paths / Config
# =========================
# JSON file passed to build_maps_from_mapping() to resolve model / base video names.
mapping_json_path = "src/YOUTUBE_DATA/id_map.json"
# NOTE(review): presumably the directory holding the .mp4 files for the viewer;
# it is not referenced in the portion of the file visible here — confirm usage.
video_dir = "src/YOUTUBE_DATA"

def uniq_sorted(s) -> list:
    """Return the distinct non-null values of ``s`` as an ascending list."""
    distinct = pd.Series(s).dropna().unique()
    values = distinct.tolist()
    values.sort()
    return values

# --- Metric & video-id normalization for raters ---
# Short rater-file metric names -> canonical column names used by the app.
ALIASES = {
    "action": "human_action",
    "anatomy": "human_anatomy",
    "appearance": "human_appearance",
    "motion": "human_motion",
    "overall": "overall",
}


def normalize_metric_name(name: str) -> str:
    """Map a rater-file metric name onto the app's canonical metric name.

    Matching is case-insensitive and ignores surrounding whitespace, both for
    the short aliases ("action") and for names that are already canonical
    ("Human_Action "), so that spelling variants of one metric do not end up
    as separate metrics.

    Args:
        name: Metric name as found in a rater JSON. Non-strings are
            stringified and returned without further normalization.

    Returns:
        The canonical metric name, or the original ``name`` unchanged when it
        is not recognised.
    """
    if not isinstance(name, str):
        return str(name)
    key = name.strip().lower()
    if key in ALIASES:
        return ALIASES[key]
    # Already-canonical names may still differ in case/whitespace.
    if key in ALIASES.values():
        return key
    return name  # fall back to original if unknown

def normalize_video_id(v) -> str:
    """Return ``v`` as a string that ends with ``.mp4``.

    The suffix check is case-sensitive; anything not already ending in the
    lowercase ``.mp4`` gets the suffix appended so ids line up with the main
    dataframe's ``video_id`` values.
    """
    text = str(v)
    if text.endswith(".mp4"):
        return text
    return text + ".mp4"

# Model metric JSONs
# Column name -> path of the JSON file holding that model metric.
# Each file is read with load_json(), i.e. treated as {video_id: score}.
REQ = {
    "action_mean_intra": "src/action_mean_intra.json",
    "frame_diff_ord2": "src/frame_diff_ord2.json",
}

# Human JSONs (aggregate)
# Column name -> path of the aggregated human-score JSON, also read with load_json().
HUMAN = {
    "human_action": "src/human_scores_analysis_action_mos_centered.json",
    "human_anatomy": "src/human_scores_analysis_anatomy_mos_centered.json",
    "human_appearance": "src/human_scores_analysis_appearance_mos_centered.json",
    "human_motion": "src/human_scores_analysis_motion_mos_centered.json",
}

# Rater JSONs glob patterns (support both)
# Patterns are in the form load_raters() expects, which globs them relative to ".":
# flat files named raters*.json, or any *.json inside a raters/ folder.
RATER_GLOBS = ["src/raters*.json", "src/raters/*.json"]

# =========================
# Helpers
# =========================
def load_json(p: Path) -> dict:
    """Load a flat ``{key: score}`` JSON file as ``{str: float}``.

    Args:
        p: Path of the JSON file. The top-level value must be an object.

    Returns:
        A dict whose keys are stringified and whose values are floats. Values
        that cannot be converted (``null``, non-numeric strings, nested
        containers, out-of-range integers) become ``NaN`` instead of aborting
        the load.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(p, "r", encoding="utf-8") as f:
        d = json.load(f)
    out = {}
    for k, v in d.items():
        try:
            out[str(k)] = float(v)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are tolerated; anything else is a bug.
            out[str(k)] = np.nan
    return out

def build_maps_from_mapping(p: Path) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Read the id-mapping JSON and derive per-video lookup tables.

    Two schemas are understood, checked in this order:

    * ``{"video_to_model": {video_id: model}}`` — used as-is; no base names
      are available, so the second map stays empty.
    * ``{"model_to_videoName_to_id": {model: {videoName: id}}}`` — each id is
      turned into the file name ``"<id>.mp4"``.

    Returns:
        ``(video_id -> model, video_id -> base_name)``. Both are empty when
        ``p`` is falsy, does not exist, or matches neither schema.
    """
    video_to_model: Dict[str, str] = {}
    video_to_base: Dict[str, str] = {}
    if not (p and p.exists()):
        return video_to_model, video_to_base

    with open(p, "r") as fh:
        mapping = json.load(fh)

    flat_key = "video_to_model"
    if flat_key in mapping and isinstance(mapping[flat_key], dict):
        return dict(mapping[flat_key]), video_to_base

    nested_key = "model_to_videoName_to_id"
    if nested_key in mapping and isinstance(mapping[nested_key], dict):
        for model_name, ids_by_name in mapping[nested_key].items():
            for original_name, hashed_id in ids_by_name.items():
                file_name = f"{hashed_id}.mp4"  # filenames like Class__HASH.mp4
                video_to_model[file_name] = model_name
                video_to_base[file_name] = original_name
    return video_to_model, video_to_base

def minmax_normalize(series: pd.Series) -> pd.Series:
    """Rescale a series to the 0–1 range, leaving missing values missing.

    Values are first coerced to numbers (unparsable entries become NaN).
    When nothing is numeric the coerced series is returned as-is. When the
    range is degenerate (all equal, or a non-finite min/max) every present
    value maps to 0.0.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    if not numeric.notna().any():
        return numeric

    lowest = numeric.min(skipna=True)
    highest = numeric.max(skipna=True)
    bounds_finite = np.isfinite(lowest) and np.isfinite(highest)
    if not bounds_finite or lowest == highest:
        return numeric.map(lambda value: np.nan if pd.isna(value) else 0.0)

    span = highest - lowest
    return (numeric - lowest) / span

def fmt3(x):
    """Format ``x`` with three decimals; return "" for anything non-finite or non-numeric."""
    try:
        number = float(x)
    except (TypeError, ValueError):
        return ""
    return f"{number:.3f}" if np.isfinite(number) else ""

def filter_by(df: pd.DataFrame, cls: str, mdl: str) -> pd.DataFrame:
    """Single-select filter (used by the two tables).

    The sentinel "(All)" disables the corresponding filter; otherwise rows
    must match ``cls`` in the "class" column and ``mdl`` in the "model" column.
    """
    everything = "(All)"
    selected = df if cls == everything else df[df["class"] == cls]
    if mdl == everything:
        return selected
    return selected[selected["model"] == mdl]

def filter_by_multi(df: pd.DataFrame, classes: List[str] | None, models: List[str] | None) -> pd.DataFrame:
    """Multi-select filter used in pairwise tabs. Empty/None means 'all'."""
    out = df
    if classes:
        out = out[out["class"].isin(classes)]
    if models:
        out = out[out["model"].isin(models)]
    return out

def pairwise_agreement(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Compare how a model metric and a human metric rank every pair of rows.

    Ranking directions:
    - model metric: LOWER is better. For 'action_mean_intra' the values are
      read from the 'action_mean_intra_orig' column; any other metric is read
      from the column of the same name.
    - human metric: HIGHER is better.

    A pair is comparable only when all four values are finite and neither
    metric is tied. Returns (accuracy, disagree_pairs_df, total_pairs), where
    accuracy is NaN if no pair is comparable and disagree_pairs_df has one row
    per comparable pair on which the two rankings differ.
    """
    source_col = "action_mean_intra_orig" if model_col == "action_mean_intra" else model_col

    video_ids = df["video_id"].tolist()
    model_vals = pd.to_numeric(df[source_col], errors="coerce").values
    human_vals = pd.to_numeric(df[human_col], errors="coerce").values

    disagreements = []
    comparable = 0
    matching = 0
    count = len(video_ids)

    for a in range(count - 1):
        for b in range(a + 1, count):
            m_a, m_b = model_vals[a], model_vals[b]
            h_a, h_b = human_vals[a], human_vals[b]

            all_finite = np.isfinite(m_a) and np.isfinite(m_b) and np.isfinite(h_a) and np.isfinite(h_b)
            if not all_finite or m_a == m_b or h_a == h_b:
                continue

            # model: smaller value wins; human: larger value wins
            if m_a < m_b:
                model_order = "A>B"
            else:
                model_order = "B>A"
            if h_a > h_b:
                human_order = "A>B"
            else:
                human_order = "B>A"

            comparable += 1
            if model_order == human_order:
                matching += 1
                continue

            disagreements.append({
                "video_A": video_ids[a],
                "video_B": video_ids[b],
                f"{model_col}_A": m_a,
                f"{model_col}_B": m_b,
                f"{human_col}_A": h_a,
                f"{human_col}_B": h_b,
                "model_order": model_order,
                "human_order": human_order,
            })

    accuracy = np.nan if comparable == 0 else matching / comparable
    return accuracy, pd.DataFrame(disagreements), comparable

def cross_model_pairwise(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Pairwise agreement only across DIFFERENT MODELS but the SAME base video.
    Requires df to have: ['video_id','model','base_name', model_col, human_col, 'action_mean_intra_orig'].
    Returns: (accuracy, disagree_pairs_df, total_pairs)

    Directions:
    - Model metrics: BOTH lower is better (action_mean_intra_orig, frame_diff_ord2)
    - Human metrics: higher is better
    """
    if model_col == "action_mean_intra":
        use_model_col = "action_mean_intra_orig"
    elif model_col == "frame_diff_ord2":
        use_model_col = "frame_diff_ord2"
    else:
        use_model_col = model_col

    rows = []
    total = 0
    agree = 0

    scope = df.dropna(subset=["base_name"])

    for base_name, g in scope.groupby("base_name"):
        vids = g["video_id"].tolist()
        models = g["model"].tolist()
        mvals = pd.to_numeric(g[use_model_col], errors="coerce").values
        hvals = pd.to_numeric(g[human_col], errors="coerce").values
        n = len(vids)
        for i in range(n):
            for j in range(i + 1, n):
                if models[i] == models[j]:
                    continue  # cross-model only
                mi, mj = mvals[i], mvals[j]
                hi, hj = hvals[i], hvals[j]
                if not (np.isfinite(mi) and np.isfinite(mj) and np.isfinite(hi) and np.isfinite(hj)):
                    continue
                if mi == mj or hi == hj:
                    continue

                model_order = "A>B" if mi < mj else "B>A"  # LOWER better
                human_order = "A>B" if hi > hj else "B>A"  # HIGHER better

                is_agree = (model_order == human_order)
                total += 1
                if is_agree:
                    agree += 1
                else:
                    rows.append({
                        "base_name": base_name,
                        "video_A": vids[i],
                        "model_A": models[i],
                        f"{model_col}_A": mi,
                        f"{human_col}_A": hi,
                        "video_B": vids[j],
                        "model_B": models[j],
                        f"{model_col}_B": mj,
                        f"{human_col}_B": hj,
                        "model_order": model_order,
                        "human_order": human_order,
                        # "agree": is_agree,
                    })

    acc = (agree / total) if total > 0 else np.nan
    disagree_df = pd.DataFrame(rows)
    return acc, disagree_df, total

def spearman_all(df: pd.DataFrame, model_cols: list, human_cols: list) -> pd.DataFrame:
    """Spearman rho for every (model metric, human metric) combination over all rows.

    Each combination uses only rows where both values are present; ``n`` is
    that row count and ``rho`` is NaN when fewer than two rows remain.
    Returns one record per combination with keys
    model_metric, human_metric, rho, n.
    """
    def _rho_and_count(model_metric, human_metric):
        paired = df[[model_metric, human_metric]].dropna()
        count = len(paired)
        if count < 2:
            return np.nan, count
        return paired.corr(method="spearman").iloc[0, 1], count

    records = []
    for model_metric in model_cols:
        for human_metric in human_cols:
            rho, count = _rho_and_count(model_metric, human_metric)
            records.append({
                "model_metric": model_metric,
                "human_metric": human_metric,
                "rho": rho,
                "n": count,
            })
    return pd.DataFrame(records)

def spearman_by_group(df: pd.DataFrame, model_cols: list, human_cols: list, group_col: str, groups: List[str]) -> pd.DataFrame:
    """
    Spearman rho per group (group_col ∈ {'class','model'}).

    Args:
        df: Frame holding group_col plus every column in model_cols/human_cols.
        model_cols: Model-metric column names.
        human_cols: Human-metric column names.
        group_col: Column whose values define the groups.
        groups: Group labels to report; empty -> all non-null unique values,
            sorted. Labels with no rows in df are skipped.

    Returns:
        Tidy df with columns [group_col, model_metric, human_metric, rho, n].
        The columns are present even when there are no rows, so callers can
        always index result["rho"]. ``rho`` is NaN when a group has fewer
        than two complete pairs.
    """
    out_cols = [group_col, "model_metric", "human_metric", "rho", "n"]
    if not groups:
        groups = sorted(df[group_col].dropna().unique().tolist())
    rows = []
    for g in groups:
        subdf = df[df[group_col] == g]
        if subdf.empty:
            continue
        for m in model_cols:
            for h in human_cols:
                sub = subdf[[m, h]].dropna()
                n = len(sub)
                rho = sub.corr(method="spearman").iloc[0, 1] if n >= 2 else np.nan
                rows.append({group_col: g, "model_metric": m, "human_metric": h, "rho": rho, "n": n})
    # Passing columns keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=out_cols)

# ---------- Rater loading & inter-rater correlations ----------
def _filename_to_rater_name(p: Path) -> str:
    name = p.stem  # e.g., "chvskch_scores"
    if name.endswith("_scores"):
        name = name[:-7]
    return name

def _looks_like_video_key(k: str) -> bool:
    s = str(k)
    return "__" in s or s.endswith(".mp4") or s.endswith(".MP4")

def _detect_rater_structure(obj) -> str:
    """
    Returns one of:
      - 'metric_to_video'       : {metric: {video_id: score}, ...}
      - 'video_to_score'        : {video_id: score, ...}
      - 'video_to_metric_score' : {video_id: {metric: score}, ...}
      - 'unknown'
    """
    if not isinstance(obj, dict) or not obj:
        return "unknown"

    # If values are not dicts -> {video_id: score}
    first_val = next(iter(obj.values()))
    if not isinstance(first_val, dict):
        return "video_to_score"

    # Values are dicts. Decide by looking at the OUTER KEYS.
    outer_keys = list(obj.keys())
    if any(_looks_like_video_key(k) for k in outer_keys):
        return "video_to_metric_score"        # <-- your case
    else:
        return "metric_to_video"

def load_raters(globs: List[str]) -> Dict[str, pd.DataFrame]:
    """
    Load per-rater score files and pivot them into one wide table per metric.

    Supported JSON shapes (see _detect_rater_structure):
      - {metric: {video_id: score}}
      - {video_id: score}                    -> stored under metric 'overall'
      - {video_id: {metric: score}}
    Metric names go through normalize_metric_name and video ids through
    normalize_video_id ('.mp4' enforced).

    Args:
        globs: Glob patterns, resolved relative to the current directory.

    Returns:
        dict metric_name -> DataFrame with index=video_id and one float column
        per rater (column name derived from the file name).

    Robustness notes:
      - Files are de-duplicated and processed in sorted path order, so the
        column order and the "first file wins" rule below are deterministic.
      - Files that cannot be read or parsed are skipped (best effort).
      - Scores that are not numeric become NaN instead of aborting the load.
      - If two files yield the same rater name for a metric, the first one is
        kept; joining both would raise on the overlapping column name.
    """
    metric_to_frames: Dict[str, List[pd.DataFrame]] = {}

    def _to_float(value) -> float:
        # Same tolerance as the aggregate-score loader: bad values -> NaN.
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    def _add_scores(metric_norm: str, rater: str, vid_scores: dict) -> None:
        cleaned = {normalize_video_id(k): _to_float(v) for k, v in vid_scores.items()}
        column = pd.Series(cleaned, name=rater, dtype="float")
        metric_to_frames.setdefault(metric_norm, []).append(column.to_frame())

    # A set removes paths matched by more than one pattern; sorting fixes the order.
    files = sorted({p for pat in globs for p in Path(".").glob(pat)})

    for p in files:
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # unreadable file or invalid JSON/encoding: skip this rater
            continue

        rater = _filename_to_rater_name(p)
        shape = _detect_rater_structure(data)

        if shape == "metric_to_video":
            # {metric: {video_id: score}}
            for metric, vid_scores in data.items():
                if not isinstance(vid_scores, dict):
                    continue  # shape detection only looked at the first value
                _add_scores(normalize_metric_name(metric), rater, vid_scores)

        elif shape == "video_to_score":
            # {video_id: score} -> 'overall'
            _add_scores("overall", rater, data)

        elif shape == "video_to_metric_score":
            # {video_id: {metric: score}}
            bucket: Dict[str, Dict[str, float]] = {}
            for vid, mdict in data.items():
                if not isinstance(mdict, dict):
                    continue
                for metric, val in mdict.items():
                    bucket.setdefault(normalize_metric_name(metric), {})[vid] = val
            for metric_norm, vid_scores in bucket.items():
                _add_scores(metric_norm, rater, vid_scores)

        # else: unknown; skip

    # Merge per metric (outer join on video_id)
    metric_to_wide: Dict[str, pd.DataFrame] = {}
    for metric, frames in metric_to_frames.items():
        wide = frames[0]
        for f in frames[1:]:
            fresh = [c for c in f.columns if c not in wide.columns]
            if fresh:
                wide = wide.join(f[fresh], how="outer")
        metric_to_wide[metric] = wide

    return metric_to_wide

def inter_rater_pairs_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Directed per-rater pairwise correlations inside each group.

    Takes the undirected pairs from inter_rater_corr_grouped, keeps those with
    a non-null rho and at least ``min_overlap`` shared videos, and emits each
    surviving pair twice (once from each rater's point of view).

    Rows: [by, rater, other_rater, rho, n], sorted by group, rater and
    descending rho. An empty frame with those columns is returned when
    nothing qualifies.
    """
    out_cols = [by, "rater", "other_rater", "rho", "n"]

    undirected = inter_rater_corr_grouped(rater_wide, video_meta, by=by, values=values)
    if undirected.empty:
        return pd.DataFrame(columns=out_cols)

    # overlap counts as plain integers before thresholding
    undirected["n"] = undirected["n"].astype(int)
    enough_overlap = undirected["n"] >= min_overlap
    rho_defined = undirected["rho"].notna()
    undirected = undirected[enough_overlap & rho_defined]
    if undirected.empty:
        return pd.DataFrame(columns=out_cols)

    # one copy per direction so every rater owns a row for each counterpart
    directed = []
    for own, counterpart in (("rater_i", "rater_j"), ("rater_j", "rater_i")):
        relabelled = undirected.rename(columns={own: "rater", counterpart: "other_rater"})
        directed.append(relabelled[out_cols])
    combined = pd.concat(directed, ignore_index=True)

    ordered = combined.sort_values([by, "rater", "rho"], ascending=[True, True, False])
    return ordered.reset_index(drop=True)


def inter_rater_avg_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Unweighted mean inter-rater Spearman rho for each rater within each group.

    Built on inter_rater_pairs_by_group, so the same overlap / non-null
    filters apply.
    Rows: [by, rater, mean_rho, num_pairs_used]
    """
    directed = inter_rater_pairs_by_group(
        rater_wide,
        video_meta,
        by=by,
        values=values,
        min_overlap=min_overlap,
    )
    if directed.empty:
        return pd.DataFrame(columns=[by, "rater", "mean_rho", "num_pairs_used"])

    per_rater = directed.groupby([by, "rater"])
    summary = per_rater.agg(
        mean_rho=("rho", "mean"),
        num_pairs_used=("rho", "count"),
    )
    summary = summary.reset_index()
    summary["mean_rho"] = summary["mean_rho"].astype(float)
    return summary

def inter_rater_corr_grouped(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None
) -> pd.DataFrame:
    """
    Spearman inter-rater correlation matrices grouped by `by` ∈ {'class','model'}.
    Requires ≥2 raters with ≥2 overlapping videos inside each group.
    Returns tidy df: [by, rater_i, rater_j, rho, n]
    """
    assert by in ("class", "model")

    def corr_to_long(df_corr: pd.DataFrame, counts: pd.DataFrame, group_label: str):
        out = []
        for i in df_corr.index:
            for j in df_corr.columns:
                if i >= j:
                    continue
                out.append({by: group_label, "rater_i": i, "rater_j": j,
                            "rho": df_corr.loc[i, j], "n": int(counts.loc[i, j])})
        return out

    meta = video_meta[["video_id", "class", "model"]].drop_duplicates().set_index("video_id")
    X = rater_wide.copy()
    X.index = X.index.map(str)  # should already be normalized to '.mp4'
    X = X.join(meta, how="left")

    if not values:
        values = sorted([v for v in X[by].dropna().unique().tolist()])

    results = []

    def corr_with_counts(M: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        rho = M.corr(method="spearman", min_periods=2)
        mask = ~M.isna()
        counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="float")
        for a in M.columns:
            for b in M.columns:
                counts.loc[a, b] = float((mask[a] & mask[b]).sum())
        return rho, counts

    for g in values:
        sub = X[X[by] == g]
        if sub.empty:
            continue
        # keep only rater columns with at least 2 ratings
        cols = [c for c in sub.columns if c not in ("class", "model")]
        usable = [c for c in cols if sub[c].notna().sum() >= 2]
        if len(usable) < 2:
            continue
        M = sub[usable]
        rho, counts = corr_with_counts(M)
        # if every pair has <2 overlaps, skip
        if (counts.values < 2).all():
            continue
        results.extend(corr_to_long(rho, counts, g))

    out = pd.DataFrame(results)
    if not out.empty:
        out["rho"] = out["rho"].astype(float)
    return out


def _corr_with_counts_matrix(M: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Spearman rho matrix + overlap counts matrix for a wide (videos x raters) frame."""
    rho = M.corr(method="spearman", min_periods=2)
    mask = ~M.isna()
    counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="int64")
    for a in M.columns:
        for b in M.columns:
            counts.loc[a, b] = int((mask[a] & mask[b]).sum())
    return rho, counts

def inter_rater_pairs_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Directed pairwise inter-rater Spearman correlations over all videos.

    Raters with fewer than two ratings are ignored. A pair is reported (once
    per direction) when it shares at least ``min_overlap`` videos and its rho
    is defined.
    Returns columns: [rater, other_rater, rho, n], where n is the number of
    videos both raters scored; sorted by rater and descending rho.
    """
    out_cols = ["rater", "other_rater", "rho", "n"]
    if rater_wide is None or rater_wide.empty:
        return pd.DataFrame(columns=out_cols)

    # a rater needs two or more ratings to be correlated with anyone
    eligible = [name for name in rater_wide.columns if rater_wide[name].notna().sum() >= 2]
    matrix = rater_wide[eligible]
    if matrix.shape[1] < 2:
        return pd.DataFrame(columns=out_cols)

    rho, counts = _corr_with_counts_matrix(matrix)

    records = []
    for i in rho.index:
        for j in rho.columns:
            if i >= j:
                continue  # visit each unordered pair once
            shared = int(counts.loc[i, j])
            value = rho.loc[i, j]
            if shared < min_overlap or pd.isna(value):
                continue
            records.extend([
                {"rater": i, "other_rater": j, "rho": float(value), "n": shared},
                {"rater": j, "other_rater": i, "rho": float(value), "n": shared},
            ])

    result = pd.DataFrame(records)
    if result.empty:
        return result
    return result.sort_values(["rater", "rho"], ascending=[True, False]).reset_index(drop=True)

def inter_rater_avg_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Mean Spearman rho of each rater against all of their counterparts,
    computed over the whole dataset (no grouping).
    Returns columns: [rater, mean_rho, num_pairs_used], best-agreeing rater first.
    """
    directed = inter_rater_pairs_overall(rater_wide, min_overlap=min_overlap)
    if directed.empty:
        return pd.DataFrame(columns=["rater", "mean_rho", "num_pairs_used"])

    summary = directed.groupby("rater").agg(
        mean_rho=("rho", "mean"),
        num_pairs_used=("rho", "count"),
    )
    summary = summary.reset_index()
    summary = summary.sort_values("mean_rho", ascending=False)
    return summary.reset_index(drop=True)

# =========================
# Load aggregate scores
# =========================
# Read every model-metric and human-score JSON into {column_name: {video_id: float}}.
scores = {k: load_json(Path(p)) for k, p in REQ.items()}
human_scores = {k: load_json(Path(p)) for k, p in HUMAN.items()}

# Union of all video ids seen in any of the files.
all_keys = set()
for d in scores.values():
    all_keys |= set(d.keys())
for d in human_scores.values():
    all_keys |= set(d.keys())

# One row per video id; metrics missing for a video are NaN.
rows = []
for vid in sorted(all_keys):
    rows.append({
        "video_id": vid,
        # class = text before the first "__" in the id; "UNK" when there is no "__"
        "class": vid.split("__")[0] if "__" in vid else "UNK",
        "action_mean_intra": scores["action_mean_intra"].get(vid, np.nan),
        "frame_diff_ord2": scores["frame_diff_ord2"].get(vid, np.nan),
        "human_action": human_scores["human_action"].get(vid, np.nan),
        "human_anatomy": human_scores["human_anatomy"].get(vid, np.nan),
        "human_appearance": human_scores["human_appearance"].get(vid, np.nan),
        "human_motion": human_scores["human_motion"].get(vid, np.nan),
    })

df_raw = pd.DataFrame(rows)

# Keep a copy of action_mean_intra as 'action_mean_intra_orig'; the pairwise
# functions read that column.
# NOTE(review): earlier comments here described a sign flip "for display", but
# the third assignment below copies *_orig back unchanged, so both columns hold
# identical values. Confirm whether a negation was intended or deliberately removed.
df_raw["action_mean_intra"] = pd.to_numeric(df_raw["action_mean_intra"], errors="coerce")
df_raw["action_mean_intra_orig"] = df_raw["action_mean_intra"].copy()
df_raw["action_mean_intra"] = df_raw["action_mean_intra_orig"]  # NOTE(review): no negation applied — see above

# Map models + base_name via mapping.json
v2m, v2base = build_maps_from_mapping(Path(mapping_json_path))
df_raw["model"] = df_raw["video_id"].map(v2m).fillna("UNK")  # ids absent from the mapping -> "UNK"
df_raw["base_name"] = df_raw["video_id"].map(v2base)  # may be NaN

# Metric columns (display + normalization)
metric_cols = [
    "action_mean_intra",   # same values as *_orig (see NOTE above); pairwise uses *_orig
    "frame_diff_ord2",     # raw; lower-better
    "human_action",
    "human_anatomy",
    "human_appearance",
    "human_motion",
]
human_cols = ["human_action", "human_anatomy", "human_appearance", "human_motion"]

# Drop rows with NO human ratings at all
df_raw = df_raw.dropna(subset=human_cols, how="all")

# Normalized view for tables: each metric column min-max scaled over the
# remaining rows. 'action_mean_intra_orig' is not in metric_cols and stays raw.
df_norm = df_raw.copy()
for c in metric_cols:
    df_norm[c] = minmax_normalize(df_norm[c])

# =========================
# UI Layout
# =========================
left, right = st.columns([2, 1])

with left:
    tab_tables, tab_agree, tab_cross, tab_spear, tab_ir = st.tabs(
        ["Tables", "Agreement", "Cross-Model (same videoName)", "Spearman", "Inter-Rater"]
    )

    # --------------------- TABLES (A/B) ---------------------
    # Two independent, identically built tables (A and B) so that different
    # class/model selections can be viewed together.
    with tab_tables:
        st.subheader("Comparison Tables")

        # Table A
        st.markdown("**Table A**")
        use_norm_a = st.checkbox("Show normalized scores (0–1) — A", value=False, key="norm_a")
        # Option lists are built once here and reused by Table B below.
        classes = ["(All)"] + uniq_sorted(df_raw["class"])
        models = ["(All)"] + uniq_sorted(df_raw["model"])
        ca, ma = st.columns(2)
        with ca:
            chosen_c_a = st.selectbox("Class (A)", classes, key="class_a")
        with ma:
            chosen_m_a = st.selectbox("Model (A)", models, key="model_a")
        df_view_a = df_norm if use_norm_a else df_raw
        filt_a = filter_by(df_view_a, chosen_c_a, chosen_m_a).copy()
        disp_a = filt_a.copy()
        # Only metric_cols are turned into 3-decimal strings; the other columns
        # (including action_mean_intra_orig) are displayed as stored.
        # NOTE(review): DataFrame.applymap is deprecated in recent pandas
        # releases in favour of DataFrame.map — confirm the pinned pandas version.
        disp_a[metric_cols] = disp_a[metric_cols].applymap(fmt3)
        st.dataframe(disp_a, use_container_width=True, hide_index=True)

        st.markdown("---")

        # Table B
        st.markdown("**Table B**")
        use_norm_b = st.checkbox("Show normalized scores (0–1) — B", value=False, key="norm_b")
        cb, mb = st.columns(2)
        with cb:
            chosen_c_b = st.selectbox("Class (B)", classes, key="class_b")
        with mb:
            chosen_m_b = st.selectbox("Model (B)", models, key="model_b")
        df_view_b = df_norm if use_norm_b else df_raw
        filt_b = filter_by(df_view_b, chosen_c_b, chosen_m_b).copy()
        disp_b = filt_b.copy()
        # Same formatting as Table A (see the applymap note there).
        disp_b[metric_cols] = disp_b[metric_cols].applymap(fmt3)
        st.dataframe(disp_b, use_container_width=True, hide_index=True)

    # --------------------- AGREEMENT (multi-class) ---------------------
    # Pairwise ranking agreement between one model metric and one human metric
    # over all videos that pass the class/model filters.
    with tab_agree:
        st.subheader("Agreement (Model vs Human) — multi-class")

        all_models  = uniq_sorted(df_raw["model"])
        all_classes = uniq_sorted(df_raw["class"])
        c1, c2 = st.columns(2)
        with c1:
            chosen_classes = st.multiselect("Classes (empty = All)", all_classes, default=[], key="agree_classes")
        with c2:
            chosen_models = st.multiselect("Models (empty = All)", all_models, default=[], key="agree_models")

        model_metric = st.selectbox(
            "Model metric",
            ["action_mean_intra", "frame_diff_ord2"],
            index=0,
            key="agree_model_metric",
            help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
        )
        human_metric = st.selectbox("Human metric", human_cols, index=0, key="agree_human_metric")

        scope_df = filter_by_multi(df_raw, chosen_classes, chosen_models).copy()
        # NOTE(review): df_raw already carries 'action_mean_intra_orig' and
        # filter_by_multi only selects rows, so this fallback merge looks
        # unreachable — confirm before removing.
        if "action_mean_intra_orig" not in scope_df.columns:
            scope_df = scope_df.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
        # pairwise_agreement reads the *_orig column for action_mean_intra,
        # so that is the column that must be non-missing.
        req_model_col = "action_mean_intra_orig" if model_metric == "action_mean_intra" else model_metric
        scope_df = scope_df.dropna(subset=[req_model_col, human_metric], how="any")

        if len(scope_df) < 2:
            st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.")
        else:
            acc, disagree_df, total_pairs = pairwise_agreement(scope_df, model_metric, human_metric)
            st.markdown(f"**Pairwise accuracy:** {fmt3(acc)} (over {total_pairs} comparable pairs).")
            if not disagree_df.empty:
                # Format the four score columns as 3-decimal strings for display.
                df_show = disagree_df.copy()
                numeric_cols = [f"{model_metric}_A", f"{model_metric}_B", f"{human_metric}_A", f"{human_metric}_B"]
                for col in numeric_cols:
                    if col in df_show.columns:
                        df_show[col] = pd.to_numeric(df_show[col], errors="coerce").map(fmt3)
                st.markdown("**Disagreeing pairs**")
                st.dataframe(df_show, use_container_width=True, hide_index=True)
            else:
                st.success("All comparable pairs agree. 🎉")

    # # --------------------- GLOBAL PAIRWISE (any videos, multi-class) ---------------------
    # with tab_global:
    #     st.subheader("Global Pairwise (any videos) — multi-class")

    #     all_models  = uniq_sorted(df_raw["model"])
    #     all_classes = uniq_sorted(df_raw["class"])

    #     c1, c2 = st.columns(2)
    #     with c1:
    #         chosen_classes_g = st.multiselect("Classes (empty = All)", all_classes, default=[], key="global_classes")
    #     with c2:
    #         chosen_models_g = st.multiselect("Models (empty = All)", all_models, default=[], key="global_models")

    #     model_metric_g = st.selectbox(
    #         "Model metric",
    #         ["action_mean_intra", "frame_diff_ord2"],
    #         index=0,
    #         key="global_model_metric",
    #         help="Agreement rule: BOTH model metrics use LOWER-is-better (action_mean_intra uses original sign)."
    #     )
    #     human_metric_g = st.selectbox(
    #         "Human metric",
    #         human_cols,
    #         index=0,
    #         key="global_human_metric"
    #     )

    #     scope_df_g = filter_by_multi(df_raw, chosen_classes_g, chosen_models_g).copy()
    #     if "action_mean_intra_orig" not in scope_df_g.columns:
    #         scope_df_g = scope_df_g.merge(
    #             df_raw[["video_id", "action_mean_intra_orig"]],
    #             on="video_id",
    #             how="left"
    #         )
    #     req_model_col_g = "action_mean_intra_orig" if model_metric_g == "action_mean_intra" else model_metric_g
    #     scope_df_g = scope_df_g.dropna(subset=[req_model_col_g, human_metric_g], how="any")

    #     if len(scope_df_g) < 2:
    #         st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.")
    #     else:
    #         acc_g, disagree_df_g, total_pairs_g = pairwise_agreement(scope_df_g, model_metric_g, human_metric_g)
    #         st.markdown(f"**Global pairwise accuracy:** {fmt3(acc_g)} (over {total_pairs_g} comparable pairs).")

    #         if not disagree_df_g.empty:
    #             df_show_g = disagree_df_g.copy()
    #             numeric_cols_g = [f"{model_metric_g}_A", f"{model_metric_g}_B", f"{human_metric_g}_A", f"{human_metric_g}_B"]
    #             for col in numeric_cols_g:
    #                 if col in df_show_g.columns:
    #                     df_show_g[col] = pd.to_numeric(df_show_g[col], errors="coerce").map(fmt3)
    #             st.markdown("**Disagreeing global pairs**")
    #             st.dataframe(df_show_g, use_container_width=True, hide_index=True)
    #         else:
    #             st.success("All comparable global pairs agree. 🎉")

    # --------------------- CROSS-MODEL (same base video, multi-class) ---------------------
    # Same agreement measure as the Agreement tab, but only between videos that
    # share a base_name and were produced by different models.
    with tab_cross:
        st.subheader("Cross-Model Agreement — same original video (base_name), multi-class")

        all_classes = uniq_sorted(df_raw["class"])
        all_models = uniq_sorted(df_raw["model"])

        c1, c2 = st.columns(2)
        with c1:
            chosen_classes2 = st.multiselect("Classes (empty = All)", all_classes, default=[], key="cross_classes")
        with c2:
            chosen_models2 = st.multiselect("Models (empty = All)", all_models, default=[], key="cross_models")

        model_metric2 = st.selectbox(
            "Model metric",
            ["action_mean_intra", "frame_diff_ord2"],
            index=0,
            key="cross_model_metric",
            help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
        )
        human_metric2 = st.selectbox(
            "Human metric",
            human_cols,
            index=0,
            key="cross_human_metric"
        )

        scope_df2 = filter_by_multi(df_raw, chosen_classes2, chosen_models2).copy()
        # NOTE(review): df_raw already carries 'action_mean_intra_orig', so this
        # fallback merge looks unreachable — confirm before removing.
        if "action_mean_intra_orig" not in scope_df2.columns:
            scope_df2 = scope_df2.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
        req_model_col2 = "action_mean_intra_orig" if model_metric2 == "action_mean_intra" else model_metric2
        # Rows need a base_name plus both selected metrics.
        scope_df2 = scope_df2.dropna(subset=["base_name", req_model_col2, human_metric2], how="any")

        # Keep only base videos that still have outputs from >= 2 distinct models.
        eligible = scope_df2.groupby("base_name")["model"].nunique().reset_index(name="n_models")
        eligible_names = set(eligible[eligible["n_models"] >= 2]["base_name"].tolist())
        scope_df2 = scope_df2[scope_df2["base_name"].isin(eligible_names)]

        if scope_df2.empty:
            st.info("No base videos with at least two different models in the current filters.")
        else:
            acc2, disagree_df2, total_pairs2 = cross_model_pairwise(scope_df2, model_metric2, human_metric2)
            st.markdown(f"**Cross-model pairwise accuracy:** {fmt3(acc2)} (over {total_pairs2} comparable cross-model pairs).")
            if not disagree_df2.empty:
                # Format the four score columns as 3-decimal strings for display.
                df_show2 = disagree_df2.copy()
                numeric_cols2 = [f"{model_metric2}_A", f"{model_metric2}_B", f"{human_metric2}_A", f"{human_metric2}_B"]
                for col in numeric_cols2:
                    if col in df_show2.columns:
                        df_show2[col] = pd.to_numeric(df_show2[col], errors="coerce").map(fmt3)
                st.markdown("**Disagreeing cross-model pairs (same base video)**")
                st.dataframe(df_show2, use_container_width=True, hide_index=True)
            else:
                st.success("All comparable cross-model pairs agree. 🎉")

    # --------------------- SPEARMAN (separate views per human metric) ---------------------
    # For every human metric: an overall correlation table followed by one
    # filterable table per grouping column (class, then model).
    with tab_spear:
        st.subheader("Spearman correlations (separate by human metric)")

        spear_model_metrics = ["action_mean_intra", "frame_diff_ord2"]
        # (section heading, grouping column, multiselect label, widget-key prefix)
        breakdowns = (
            ("**By Class**", "class", "Classes (empty = All)", "spear_cls_"),
            ("**By Model**", "model", "Models (empty = All)", "spear_mdl_"),
        )

        # One sub-tab per human metric.
        for metric_tab, hmetric in zip(st.tabs(human_cols), human_cols):
            with metric_tab:
                st.caption(f"Human metric: **{hmetric}**")

                # Overall table, computed against this single human metric.
                overall_view = spearman_all(df_raw, spear_model_metrics, [hmetric]).copy()
                overall_view["rho"] = overall_view["rho"].map(fmt3)
                st.markdown("**Overall**")
                st.dataframe(overall_view, use_container_width=True, hide_index=True)

                for heading, group_col, label, key_prefix in breakdowns:
                    st.markdown("---")
                    st.markdown(heading)
                    # The widget key embeds the human metric so each sub-tab owns its filter state.
                    picked = st.multiselect(
                        label,
                        uniq_sorted(df_raw[group_col]),
                        default=[],
                        key=f"{key_prefix}{hmetric}",
                    )
                    grouped_view = spearman_by_group(
                        df_raw, spear_model_metrics, [hmetric], group_col, picked
                    )
                    grouped_view["rho"] = grouped_view["rho"].map(fmt3)
                    st.dataframe(grouped_view, use_container_width=True, hide_index=True)

    # --------------------- INTER-RATER (by class / by model ONLY) ---------------------
    # Shows an overall per-metric summary first, then pairwise and averaged
    # correlations for one selected rater metric, grouped by class or by model.
    with tab_ir:
        st.subheader("Inter-Rater Correlations (Spearman) — by Class / by Model")

        metric_to_wide = load_raters(RATER_GLOBS)

        if not metric_to_wide:
            st.info("No rater files found. Expected patterns: 'raters*.json' or 'raters/*.json'.")
        else:
            # Overall (ungrouped) inter-rater averages, stacked across metrics.
            st.caption("Overall inter-rater averages (across all classes/models)")
            overall_parts = []
            for metric_name, ratings_wide in metric_to_wide.items():
                rater_avg = inter_rater_avg_overall(ratings_wide, min_overlap=2)
                if rater_avg.empty:
                    continue
                labelled = rater_avg.copy()
                labelled.insert(0, "metric", metric_name)
                overall_parts.append(labelled)

            if overall_parts:
                overall_view = pd.concat(overall_parts, ignore_index=True)
                overall_view["mean_rho"] = overall_view["mean_rho"].map(fmt3)
                st.dataframe(overall_view, use_container_width=True, hide_index=True)
            else:
                st.info("No rater pairs with enough overlapping videos to compute overall averages.")

            # Video-level class/model labels passed to the grouped helpers.
            meta = df_raw[["video_id", "class", "model"]].drop_duplicates()

            chosen_metric = st.selectbox("Rater metric", sorted(metric_to_wide.keys()), key="ir_metric")
            wide = metric_to_wide.get(chosen_metric)

            if wide is None or wide.empty:
                st.info("Selected metric has no rater data.")
            else:
                # (grouping column, multiselect label, widget key) for each sub-tab, in tab order.
                group_specs = (
                    ("class", "Classes (empty = All)", "ir_classes"),
                    ("model", "Models (empty = All)", "ir_models"),
                )
                group_tabs = st.tabs(["By Class", "By Model"])

                for group_tab, (by, label, widget_key) in zip(group_tabs, group_specs):
                    with group_tab:
                        picked = st.multiselect(label, uniq_sorted(meta[by]), default=[], key=widget_key)

                        # Per-rater PAIRWISE
                        pair_view = inter_rater_pairs_by_group(
                            wide, meta, by=by, values=picked, min_overlap=2
                        )
                        if pair_view.empty:
                            st.info(f"Not enough overlap to compute {by}-wise inter-rater correlations.")
                        else:
                            pair_view = pair_view.copy()
                            pair_view["rho"] = pair_view["rho"].map(fmt3)
                            st.markdown("**Per-rater pairwise correlations**")
                            st.dataframe(pair_view, use_container_width=True, hide_index=True)

                        # Per-rater AVERAGE
                        avg_view = inter_rater_avg_by_group(
                            wide, meta, by=by, values=picked, min_overlap=2
                        )
                        if avg_view.empty:
                            st.info(f"Not enough overlap to compute {by}-wise inter-rater averages.")
                        else:
                            avg_view = avg_view.copy()
                            avg_view["mean_rho"] = avg_view["mean_rho"].map(fmt3)
                            st.markdown("**Per-rater average correlation**")
                            st.dataframe(avg_view, use_container_width=True, hide_index=True)

# Video viewer column: pick a video id, play the matching file from video_dir,
# and list the model / human scores stored on that video's first df_raw row.
with right:
    st.subheader("Video Viewer")

    # uniq_sorted drops missing ids and duplicates, so the selectbox never lists
    # the same video twice and sorting never has to compare NaN with strings.
    all_vids = uniq_sorted(df_raw["video_id"])
    if not all_vids:
        st.info("No videos available after filtering rows with no human ratings.")
    else:
        selected_vid = st.selectbox("Choose a video id", all_vids)
        if selected_vid:
            video_file = Path(video_dir) / selected_vid
            vid_path = str(video_file)
            # st.video raises when handed a local path that does not exist, which
            # would abort the page before the scores below are rendered.
            if video_file.is_file():
                st.video(vid_path)
            else:
                st.warning(f"Video file not found: {vid_path}")

            # Several rows may share a video id; the first one is displayed.
            row_view = df_raw[df_raw["video_id"] == selected_vid].iloc[0]

            st.markdown("### Scores")
            # NOTE(review): a disabled caption here stated that action_mean_intra is
            # flipped once for display while pairwise uses the original sign — confirm
            # against the code that builds df_raw before re-enabling it.

            # Skip columns that are absent from df_raw as well as missing values,
            # so a metric that was never loaded does not raise KeyError.
            model_metrics = {
                k: round(float(row_view[k]), 3)
                for k in ["action_mean_intra", "frame_diff_ord2"]
                if k in row_view.index and pd.notna(row_view[k])
            }
            human_metrics = {
                k: round(float(row_view[k]), 3)
                for k in ["human_action", "human_anatomy", "human_appearance", "human_motion"]
                if k in row_view.index and pd.notna(row_view[k])
            }

            st.write("**Model metrics:**")
            st.json(model_metrics)
            st.write("**Human scores:**")
            st.json(human_metrics)