# VideoEvals / src/streamlit_app.py
import json
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
import streamlit as st
# Streamlit requires set_page_config to be the first st.* call in the script.
st.set_page_config(page_title="VideoEval Explorer", layout="wide")
st.title("VideoEval — Tables + Viewer with Human Scores")
# =========================
# Paths / Config
# =========================
# id_map.json maps video ids to models/base names; the same directory holds the .mp4 files.
mapping_json_path = "src/YOUTUBE_DATA/id_map.json"
video_dir = "src/YOUTUBE_DATA"
def uniq_sorted(s) -> list:
    """Distinct non-null values of *s*, sorted ascending."""
    distinct = pd.Series(s).dropna().unique().tolist()
    return sorted(distinct)
# --- Metric & video-id normalization for raters ---
# Canonical names for rater metric labels; unknown labels pass through untouched.
ALIASES = {
    "action": "human_action",
    "anatomy": "human_anatomy",
    "appearance": "human_appearance",
    "motion": "human_motion",
    "overall": "overall",
}

def normalize_metric_name(name: str) -> str:
    """Map a raw rater metric label onto the app's canonical human_* name."""
    if isinstance(name, str):
        return ALIASES.get(name.strip().lower(), name)
    # Non-string labels are stringified as-is.
    return str(name)
def normalize_video_id(v) -> str:
    """Coerce *v* to str and guarantee the '.mp4' suffix used by the main table."""
    vid = str(v)
    if vid.endswith(".mp4"):
        return vid
    return vid + ".mp4"
# Model metric JSONs: {video_id: score}; pairwise comparisons treat both as lower-is-better.
REQ = {
    "action_mean_intra": "src/action_mean_intra.json",
    "frame_diff_ord2": "src/frame_diff_ord2.json",
}
# Human JSONs (aggregate scores; file names indicate MOS-centered values). Higher is better.
HUMAN = {
    "human_action": "src/human_scores_analysis_action_mos_centered.json",
    "human_anatomy": "src/human_scores_analysis_anatomy_mos_centered.json",
    "human_appearance": "src/human_scores_analysis_appearance_mos_centered.json",
    "human_motion": "src/human_scores_analysis_motion_mos_centered.json",
}
# Rater JSONs glob patterns (support both flat 'raters*.json' files and a raters/ folder)
RATER_GLOBS = ["src/raters*.json", "src/raters/*.json"]
# =========================
# Helpers
# =========================
def load_json(p: Path) -> dict:
    """Read a {key: number} JSON file; values that cannot be floated become NaN."""
    with open(p, "r") as fh:
        raw = json.load(fh)
    result = {}
    for key, value in raw.items():
        try:
            result[str(key)] = float(value)
        except Exception:
            # Non-numeric payload (e.g. a string) -> missing score.
            result[str(key)] = np.nan
    return result
def build_maps_from_mapping(p: Path) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Build lookup maps from the id-mapping JSON.

    Returns:
        - v2m: video_id -> model
        - v2base: video_id -> base_name (original 'videoName', e.g. 'v_JumpingJack_g13_c02')
    Accepts either the 'video_to_model' or the 'model_to_videoName_to_id' schema.
    A missing/None path yields two empty dicts.
    """
    vid_to_model: Dict[str, str] = {}
    vid_to_base: Dict[str, str] = {}
    if not p or not p.exists():
        return vid_to_model, vid_to_base
    with open(p, "r") as fh:
        mapping = json.load(fh)
    direct = mapping.get("video_to_model")
    if isinstance(direct, dict):
        # Flat schema: ids map straight to models; no base names available.
        return dict(direct), vid_to_base
    nested = mapping.get("model_to_videoName_to_id")
    if isinstance(nested, dict):
        for model_name, name_to_id in nested.items():
            for base_name, cls_id in name_to_id.items():
                vid = f"{cls_id}.mp4"  # stored filenames look like Class__HASH.mp4
                vid_to_model[vid] = model_name
                vid_to_base[vid] = base_name
    return vid_to_model, vid_to_base
def minmax_normalize(series: pd.Series) -> pd.Series:
    """
    Min-max scale a series to [0, 1].

    Behavior:
      - all-NaN input is returned unchanged (after numeric coercion);
      - a constant series, or one whose min/max is non-finite, maps every
        non-NaN entry to 0.0 while preserving NaNs;
      - otherwise returns (x - min) / (max - min).

    Improvement: the degenerate branch uses vectorized Series.where instead
    of a per-element apply (same result, no row-wise Python loop).
    """
    s = pd.to_numeric(series, errors="coerce")
    if s.notna().sum() == 0:
        return s
    smin, smax = s.min(skipna=True), s.max(skipna=True)
    if not np.isfinite(smin) or not np.isfinite(smax) or smin == smax:
        # 0.0 wherever a value exists, NaN stays NaN.
        return s.where(s.isna(), 0.0)
    return (s - smin) / (smax - smin)
def fmt3(x):
    """Render x with 3 decimals; empty string for non-numeric or non-finite values."""
    try:
        val = float(x)
    except (TypeError, ValueError):
        return ""
    return f"{val:.3f}" if np.isfinite(val) else ""
def filter_by(df: pd.DataFrame, cls: str, mdl: str) -> pd.DataFrame:
    """Apply the two single-select dropdown filters; '(All)' disables a filter."""
    result = df
    if mdl != "(All)":
        result = result[result["model"] == mdl]
    if cls != "(All)":
        result = result[result["class"] == cls]
    return result
def filter_by_multi(df: pd.DataFrame, classes: List[str] | None, models: List[str] | None) -> pd.DataFrame:
"""Multi-select filter used in pairwise tabs. Empty/None means 'all'."""
out = df
if classes:
out = out[out["class"].isin(classes)]
if models:
out = out[out["model"].isin(models)]
return out
def pairwise_agreement(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Pairwise ranking agreement between model_col and human_col over all rows.

    Returns: (accuracy, disagree_pairs_df, total_pairs)
    Directions:
      - Model metrics: LOWER is better. For 'action_mean_intra' the
        pre-negation column 'action_mean_intra_orig' is used; other metrics
        (e.g. 'frame_diff_ord2') are used raw.
      - Human metrics: HIGHER is better.
    Ties or non-finite values in either metric make a pair non-comparable.
    """
    # Pick the column the comparison actually reads.
    src_col = "action_mean_intra_orig" if model_col == "action_mean_intra" else model_col
    ids = df["video_id"].tolist()
    mv = pd.to_numeric(df[src_col], errors="coerce").values
    hv = pd.to_numeric(df[human_col], errors="coerce").values
    disagreements = []
    total = agree = 0
    count = len(ids)
    for a in range(count):
        for b in range(a + 1, count):
            finite = (np.isfinite(mv[a]) and np.isfinite(mv[b])
                      and np.isfinite(hv[a]) and np.isfinite(hv[b]))
            if not finite:
                continue
            if mv[a] == mv[b] or hv[a] == hv[b]:
                continue  # tied pairs are not comparable
            pred = "A>B" if mv[a] < mv[b] else "B>A"   # model: lower is better
            truth = "A>B" if hv[a] > hv[b] else "B>A"  # human: higher is better
            total += 1
            if pred == truth:
                agree += 1
            else:
                disagreements.append({
                    "video_A": ids[a],
                    "video_B": ids[b],
                    f"{model_col}_A": mv[a],
                    f"{model_col}_B": mv[b],
                    f"{human_col}_A": hv[a],
                    f"{human_col}_B": hv[b],
                    "model_order": pred,
                    "human_order": truth,
                })
    accuracy = agree / total if total > 0 else np.nan
    return accuracy, pd.DataFrame(disagreements), total
def cross_model_pairwise(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Pairwise agreement restricted to pairs from DIFFERENT models that share
    the SAME base video (base_name).

    Requires df columns: ['video_id','model','base_name', model_col, human_col,
    'action_mean_intra_orig'].
    Returns: (accuracy, disagree_pairs_df, total_pairs)
    Directions:
      - Model metrics: LOWER is better ('action_mean_intra' uses the
        pre-negation 'action_mean_intra_orig' column).
      - Human metrics: HIGHER is better.
    """
    src_col = "action_mean_intra_orig" if model_col == "action_mean_intra" else model_col
    disagreements = []
    total = agree = 0
    # Rows without a base_name cannot be matched across models.
    for base_name, grp in df.dropna(subset=["base_name"]).groupby("base_name"):
        ids = grp["video_id"].tolist()
        mdl = grp["model"].tolist()
        mv = pd.to_numeric(grp[src_col], errors="coerce").values
        hv = pd.to_numeric(grp[human_col], errors="coerce").values
        size = len(ids)
        for a in range(size):
            for b in range(a + 1, size):
                if mdl[a] == mdl[b]:
                    continue  # cross-model pairs only
                finite = (np.isfinite(mv[a]) and np.isfinite(mv[b])
                          and np.isfinite(hv[a]) and np.isfinite(hv[b]))
                if not finite:
                    continue
                if mv[a] == mv[b] or hv[a] == hv[b]:
                    continue  # ties are not comparable
                pred = "A>B" if mv[a] < mv[b] else "B>A"   # model: lower is better
                truth = "A>B" if hv[a] > hv[b] else "B>A"  # human: higher is better
                total += 1
                if pred == truth:
                    agree += 1
                else:
                    disagreements.append({
                        "base_name": base_name,
                        "video_A": ids[a],
                        "model_A": mdl[a],
                        f"{model_col}_A": mv[a],
                        f"{human_col}_A": hv[a],
                        "video_B": ids[b],
                        "model_B": mdl[b],
                        f"{model_col}_B": mv[b],
                        f"{human_col}_B": hv[b],
                        "model_order": pred,
                        "human_order": truth,
                    })
    accuracy = agree / total if total > 0 else np.nan
    return accuracy, pd.DataFrame(disagreements), total
def spearman_all(df: pd.DataFrame, model_cols: list, human_cols: list) -> pd.DataFrame:
    """Spearman rho between every (model metric, human metric) pair over all videos."""
    records = []
    for m_col in model_cols:
        for h_col in human_cols:
            pair = df[[m_col, h_col]].dropna()
            count = len(pair)
            # rho is undefined with fewer than 2 complete observations
            rho = pair.corr(method="spearman").iloc[0, 1] if count >= 2 else np.nan
            records.append({"model_metric": m_col, "human_metric": h_col,
                            "rho": rho, "n": count})
    return pd.DataFrame(records)
def spearman_by_group(df: pd.DataFrame, model_cols: list, human_cols: list, group_col: str, groups: List[str]) -> pd.DataFrame:
    """
    Spearman rho per group value (group_col ∈ {'class','model'}).

    An empty `groups` list means every distinct non-null value of group_col.
    Returns tidy df with columns: [group_col, model_metric, human_metric, rho, n]
    """
    group_values = groups or sorted(df[group_col].dropna().unique().tolist())
    records = []
    for value in group_values:
        part = df[df[group_col] == value]
        if part.empty:
            continue
        for m_col in model_cols:
            for h_col in human_cols:
                pair = part[[m_col, h_col]].dropna()
                count = len(pair)
                rho = pair.corr(method="spearman").iloc[0, 1] if count >= 2 else np.nan
                records.append({group_col: value, "model_metric": m_col,
                                "human_metric": h_col, "rho": rho, "n": count})
    return pd.DataFrame(records)
# ---------- Rater loading & inter-rater correlations ----------
def _filename_to_rater_name(p: Path) -> str:
name = p.stem # e.g., "chvskch_scores"
if name.endswith("_scores"):
name = name[:-7]
return name
def _looks_like_video_key(k: str) -> bool:
s = str(k)
return "__" in s or s.endswith(".mp4") or s.endswith(".MP4")
def _detect_rater_structure(obj) -> str:
"""
Returns one of:
- 'metric_to_video' : {metric: {video_id: score}, ...}
- 'video_to_score' : {video_id: score, ...}
- 'video_to_metric_score' : {video_id: {metric: score}, ...}
- 'unknown'
"""
if not isinstance(obj, dict) or not obj:
return "unknown"
# If values are not dicts -> {video_id: score}
first_val = next(iter(obj.values()))
if not isinstance(first_val, dict):
return "video_to_score"
# Values are dicts. Decide by looking at the OUTER KEYS.
outer_keys = list(obj.keys())
if any(_looks_like_video_key(k) for k in outer_keys):
return "video_to_metric_score" # <-- your case
else:
return "metric_to_video"
def load_raters(globs: List[str]) -> Dict[str, pd.DataFrame]:
    """
    Load every rater score file matched by `globs`.

    Returns: dict metric_name -> wide DataFrame with index=video_id ('.mp4'
    enforced) and columns=rater_name.
    Supports three file shapes:
      - {metric: {video_id: score}}
      - {video_id: score}            -> treated as the single metric 'overall'
      - {video_id: {metric: score}}
    Metric names are normalized to the app's human_* names.
    """
    metric_to_frames: Dict[str, List[pd.DataFrame]] = {}
    files = []
    for pat in globs:
        files.extend(Path(".").glob(pat))
    for p in files:
        try:
            data = json.loads(Path(p).read_text())
        except Exception:
            # Unreadable / malformed file: skipped silently (best-effort load).
            continue
        rater = _filename_to_rater_name(p)
        shape = _detect_rater_structure(data)
        if shape == "metric_to_video":
            # {metric: {video_id: score}}
            for metric, vid_scores in data.items():
                metric_norm = normalize_metric_name(metric)
                s = pd.Series({normalize_video_id(k): v for k, v in vid_scores.items()},
                              name=rater, dtype="float")
                metric_to_frames.setdefault(metric_norm, []).append(s.to_frame())
        elif shape == "video_to_score":
            # {video_id: score} -> single 'overall' metric
            s = pd.Series({normalize_video_id(k): v for k, v in data.items()},
                          name=rater, dtype="float")
            metric_to_frames.setdefault("overall", []).append(s.to_frame())
        elif shape == "video_to_metric_score":
            # {video_id: {metric: score}} -- pivot into per-metric buckets first
            bucket: Dict[str, Dict[str, float]] = {}
            for vid, mdict in data.items():
                if not isinstance(mdict, dict):
                    continue
                for metric, val in mdict.items():
                    metric_norm = normalize_metric_name(metric)
                    bucket.setdefault(metric_norm, {})[normalize_video_id(vid)] = val
            for metric_norm, vid_scores in bucket.items():
                s = pd.Series(vid_scores, name=rater, dtype="float")
                metric_to_frames.setdefault(metric_norm, []).append(s.to_frame())
        # else: unknown structure; file is skipped
    # Merge the per-rater single-column frames into one wide frame per metric.
    metric_to_wide: Dict[str, pd.DataFrame] = {}
    for metric, frames in metric_to_frames.items():
        if not frames:
            continue
        wide = frames[0]
        for f in frames[1:]:
            wide = wide.join(f, how="outer")
        # NOTE(review): if the same rater name appears twice for a metric,
        # only the first occurrence is kept.
        wide = wide.loc[:, ~wide.columns.duplicated()]
        metric_to_wide[metric] = wide
    return metric_to_wide
def inter_rater_pairs_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Per-rater *pairwise* Spearman correlations within each group.

    Rows: [by, rater, other_rater, rho, n]. Pairs with NaN rho or fewer than
    `min_overlap` shared videos are dropped, and each undirected pair is
    expanded into two directed rows (one per rater).
    """
    out_cols = [by, "rater", "other_rater", "rho", "n"]
    pairs = inter_rater_corr_grouped(rater_wide, video_meta, by=by, values=values)
    if pairs.empty:
        return pd.DataFrame(columns=out_cols)
    pairs["n"] = pairs["n"].astype(int)
    # keep only pairs with adequate overlap and a defined rho
    pairs = pairs[(pairs["n"] >= min_overlap) & pairs["rho"].notna()]
    if pairs.empty:
        return pd.DataFrame(columns=out_cols)
    # Expand to directed rows so each rater gets a row per counterpart.
    forward = pairs.rename(columns={"rater_i": "rater", "rater_j": "other_rater"})[out_cols]
    backward = pairs.rename(columns={"rater_j": "rater", "rater_i": "other_rater"})[out_cols]
    directed = pd.concat([forward, backward], ignore_index=True)
    # sorted for nicer reading: best correlations first within each rater
    return directed.sort_values([by, "rater", "rho"], ascending=[True, True, False]).reset_index(drop=True)
def inter_rater_avg_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Unweighted mean inter-rater Spearman rho per rater within each group,
    using the same overlap/NaN filters as inter_rater_pairs_by_group.

    Rows: [by, rater, mean_rho, num_pairs_used]
    """
    pairs = inter_rater_pairs_by_group(
        rater_wide, video_meta, by=by, values=values, min_overlap=min_overlap
    )
    if pairs.empty:
        return pd.DataFrame(columns=[by, "rater", "mean_rho", "num_pairs_used"])
    summary = (
        pairs.groupby([by, "rater"])
        .agg(mean_rho=("rho", "mean"), num_pairs_used=("rho", "count"))
        .reset_index()
    )
    summary["mean_rho"] = summary["mean_rho"].astype(float)
    return summary
def inter_rater_corr_grouped(
rater_wide: pd.DataFrame,
video_meta: pd.DataFrame,
by: str,
values: List[str] | None = None
) -> pd.DataFrame:
"""
Spearman inter-rater correlation matrices grouped by `by` ∈ {'class','model'}.
Requires ≥2 raters with ≥2 overlapping videos inside each group.
Returns tidy df: [by, rater_i, rater_j, rho, n]
"""
assert by in ("class", "model")
def corr_to_long(df_corr: pd.DataFrame, counts: pd.DataFrame, group_label: str):
out = []
for i in df_corr.index:
for j in df_corr.columns:
if i >= j:
continue
out.append({by: group_label, "rater_i": i, "rater_j": j,
"rho": df_corr.loc[i, j], "n": int(counts.loc[i, j])})
return out
meta = video_meta[["video_id", "class", "model"]].drop_duplicates().set_index("video_id")
X = rater_wide.copy()
X.index = X.index.map(str) # should already be normalized to '.mp4'
X = X.join(meta, how="left")
if not values:
values = sorted([v for v in X[by].dropna().unique().tolist()])
results = []
def corr_with_counts(M: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
rho = M.corr(method="spearman", min_periods=2)
mask = ~M.isna()
counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="float")
for a in M.columns:
for b in M.columns:
counts.loc[a, b] = float((mask[a] & mask[b]).sum())
return rho, counts
for g in values:
sub = X[X[by] == g]
if sub.empty:
continue
# keep only rater columns with at least 2 ratings
cols = [c for c in sub.columns if c not in ("class", "model")]
usable = [c for c in cols if sub[c].notna().sum() >= 2]
if len(usable) < 2:
continue
M = sub[usable]
rho, counts = corr_with_counts(M)
# if every pair has <2 overlaps, skip
if (counts.values < 2).all():
continue
results.extend(corr_to_long(rho, counts, g))
out = pd.DataFrame(results)
if not out.empty:
out["rho"] = out["rho"].astype(float)
return out
def _corr_with_counts_matrix(M: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Spearman rho matrix + overlap counts matrix for a wide (videos x raters) frame."""
rho = M.corr(method="spearman", min_periods=2)
mask = ~M.isna()
counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="int64")
for a in M.columns:
for b in M.columns:
counts.loc[a, b] = int((mask[a] & mask[b]).sum())
return rho, counts
def inter_rater_pairs_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Pairwise inter-rater Spearman over the entire dataset (no grouping).

    Returns columns: [rater, other_rater, rho, n] where n is the number of
    overlapping videos for that pair; each undirected pair yields two
    directed rows. Pairs below `min_overlap` or with NaN rho are dropped.
    """
    out_cols = ["rater", "other_rater", "rho", "n"]
    if rater_wide is None or rater_wide.empty:
        return pd.DataFrame(columns=out_cols)
    # keep only rater columns with at least 2 ratings
    usable = [c for c in rater_wide.columns if rater_wide[c].notna().sum() >= 2]
    M = rater_wide[usable]
    if M.shape[1] < 2:
        return pd.DataFrame(columns=out_cols)
    rho, counts = _corr_with_counts_matrix(M)
    records = []
    for a in rho.index:
        for b in rho.columns:
            if a >= b:
                continue  # upper triangle only
            overlap = int(counts.loc[a, b])
            r = rho.loc[a, b]
            if overlap < min_overlap or pd.isna(r):
                continue
            records.append({"rater": a, "other_rater": b, "rho": float(r), "n": overlap})
            records.append({"rater": b, "other_rater": a, "rho": float(r), "n": overlap})  # directed
    out = pd.DataFrame(records)
    if not out.empty:
        out = out.sort_values(["rater", "rho"], ascending=[True, False]).reset_index(drop=True)
    return out
def inter_rater_avg_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Average (no grouping) Spearman rho per rater across all counterparts.

    Returns columns: [rater, mean_rho, num_pairs_used], sorted best-first.
    """
    pairs = inter_rater_pairs_overall(rater_wide, min_overlap=min_overlap)
    if pairs.empty:
        return pd.DataFrame(columns=["rater", "mean_rho", "num_pairs_used"])
    return (
        pairs.groupby("rater")
        .agg(mean_rho=("rho", "mean"), num_pairs_used=("rho", "count"))
        .reset_index()
        .sort_values("mean_rho", ascending=False)
        .reset_index(drop=True)
    )
# =========================
# Load aggregate scores
# =========================
scores = {k: load_json(Path(p)) for k, p in REQ.items()}
human_scores = {k: load_json(Path(p)) for k, p in HUMAN.items()}
# Union of all video ids appearing in any model or human metric file.
all_keys = set()
for d in scores.values():
    all_keys |= set(d.keys())
for d in human_scores.values():
    all_keys |= set(d.keys())
rows = []
for vid in sorted(all_keys):
    rows.append({
        "video_id": vid,
        # class is encoded as the filename prefix before '__'
        "class": vid.split("__")[0] if "__" in vid else "UNK",
        "action_mean_intra": scores["action_mean_intra"].get(vid, np.nan),
        "frame_diff_ord2": scores["frame_diff_ord2"].get(vid, np.nan),
        "human_action": human_scores["human_action"].get(vid, np.nan),
        "human_anatomy": human_scores["human_anatomy"].get(vid, np.nan),
        "human_appearance": human_scores["human_appearance"].get(vid, np.nan),
        "human_motion": human_scores["human_motion"].get(vid, np.nan),
    })
df_raw = pd.DataFrame(rows)
# Keep original action_mean_intra for pairwise; flip for display ONCE (your JSON is negative)
df_raw["action_mean_intra"] = pd.to_numeric(df_raw["action_mean_intra"], errors="coerce")
df_raw["action_mean_intra_orig"] = df_raw["action_mean_intra"].copy()
# NOTE(review): despite the comment above, no sign flip happens here — the
# display column is re-assigned the original values unchanged. Confirm intent.
df_raw["action_mean_intra"] = df_raw["action_mean_intra_orig"]  # positive / higher-better in tables
# Map models + base_name via mapping.json
v2m, v2base = build_maps_from_mapping(Path(mapping_json_path))
df_raw["model"] = df_raw["video_id"].map(v2m).fillna("UNK")
df_raw["base_name"] = df_raw["video_id"].map(v2base)  # may be NaN for unmapped ids
# Metric columns (display + normalization)
metric_cols = [
    "action_mean_intra",  # display column; pairwise uses *_orig
    "frame_diff_ord2",    # raw; lower-better
    "human_action",
    "human_anatomy",
    "human_appearance",
    "human_motion",
]
human_cols = ["human_action", "human_anatomy", "human_appearance", "human_motion"]
# Drop rows with NO human ratings at all
df_raw = df_raw.dropna(subset=human_cols, how="all")
# Normalized (0-1 min-max) view used by the tables' "normalized" toggle
df_norm = df_raw.copy()
for c in metric_cols:
    df_norm[c] = minmax_normalize(df_norm[c])
# =========================
# UI Layout
# =========================
# Left column (2/3 width): all analysis tabs. Right column (1/3): video viewer.
left, right = st.columns([2, 1])
with left:
    tab_tables, tab_agree, tab_cross, tab_spear, tab_ir = st.tabs(
        ["Tables", "Agreement", "Cross-Model (same videoName)", "Spearman", "Inter-Rater"]
    )
    # --------------------- TABLES (A/B) ---------------------
    # Two independently filtered views of the same data, for side-by-side comparison.
    with tab_tables:
        st.subheader("Comparison Tables")
        # Table A
        st.markdown("**Table A**")
        use_norm_a = st.checkbox("Show normalized scores (0–1) — A", value=False, key="norm_a")
        # Shared dropdown option lists ('(All)' disables a filter); reused by Table B below.
        classes = ["(All)"] + uniq_sorted(df_raw["class"])
        models = ["(All)"] + uniq_sorted(df_raw["model"])
        ca, ma = st.columns(2)
        with ca:
            chosen_c_a = st.selectbox("Class (A)", classes, key="class_a")
        with ma:
            chosen_m_a = st.selectbox("Model (A)", models, key="model_a")
        df_view_a = df_norm if use_norm_a else df_raw
        filt_a = filter_by(df_view_a, chosen_c_a, chosen_m_a).copy()
        disp_a = filt_a.copy()
        # NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 (use DataFrame.map).
        disp_a[metric_cols] = disp_a[metric_cols].applymap(fmt3)
        st.dataframe(disp_a, use_container_width=True, hide_index=True)
        st.markdown("---")
        # Table B (own normalization toggle and filters)
        st.markdown("**Table B**")
        use_norm_b = st.checkbox("Show normalized scores (0–1) — B", value=False, key="norm_b")
        cb, mb = st.columns(2)
        with cb:
            chosen_c_b = st.selectbox("Class (B)", classes, key="class_b")
        with mb:
            chosen_m_b = st.selectbox("Model (B)", models, key="model_b")
        df_view_b = df_norm if use_norm_b else df_raw
        filt_b = filter_by(df_view_b, chosen_c_b, chosen_m_b).copy()
        disp_b = filt_b.copy()
        disp_b[metric_cols] = disp_b[metric_cols].applymap(fmt3)
        st.dataframe(disp_b, use_container_width=True, hide_index=True)
    # --------------------- AGREEMENT (multi-class) ---------------------
    # Pairwise ranking agreement between one model metric and one human metric
    # over the filtered set of videos.
    with tab_agree:
        st.subheader("Agreement (Model vs Human) — multi-class")
        all_models = uniq_sorted(df_raw["model"])
        all_classes = uniq_sorted(df_raw["class"])
        c1, c2 = st.columns(2)
        with c1:
            chosen_classes = st.multiselect("Classes (empty = All)", all_classes, default=[], key="agree_classes")
        with c2:
            chosen_models = st.multiselect("Models (empty = All)", all_models, default=[], key="agree_models")
        model_metric = st.selectbox(
            "Model metric",
            ["action_mean_intra", "frame_diff_ord2"],
            index=0,
            key="agree_model_metric",
            help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
        )
        human_metric = st.selectbox("Human metric", human_cols, index=0, key="agree_human_metric")
        scope_df = filter_by_multi(df_raw, chosen_classes, chosen_models).copy()
        # Defensive: re-attach the pre-negation column if it went missing.
        if "action_mean_intra_orig" not in scope_df.columns:
            scope_df = scope_df.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
        req_model_col = "action_mean_intra_orig" if model_metric == "action_mean_intra" else model_metric
        # Only rows with both metrics present can form comparable pairs.
        scope_df = scope_df.dropna(subset=[req_model_col, human_metric], how="any")
        if len(scope_df) < 2:
            st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.")
        else:
            acc, disagree_df, total_pairs = pairwise_agreement(scope_df, model_metric, human_metric)
            st.markdown(f"**Pairwise accuracy:** {fmt3(acc)} (over {total_pairs} comparable pairs).")
            if not disagree_df.empty:
                df_show = disagree_df.copy()
                # Format the four score columns to 3 decimals for display.
                numeric_cols = [f"{model_metric}_A", f"{model_metric}_B", f"{human_metric}_A", f"{human_metric}_B"]
                for col in numeric_cols:
                    if col in df_show.columns:
                        df_show[col] = pd.to_numeric(df_show[col], errors="coerce").map(fmt3)
                st.markdown("**Disagreeing pairs**")
                st.dataframe(df_show, use_container_width=True, hide_index=True)
            else:
                st.success("All comparable pairs agree. 🎉")
# # --------------------- GLOBAL PAIRWISE (any videos, multi-class) ---------------------
# with tab_global:
# st.subheader("Global Pairwise (any videos) — multi-class")
# all_models = uniq_sorted(df_raw["model"])
# all_classes = uniq_sorted(df_raw["class"])
# c1, c2 = st.columns(2)
# with c1:
# chosen_classes_g = st.multiselect("Classes (empty = All)", all_classes, default=[], key="global_classes")
# with c2:
# chosen_models_g = st.multiselect("Models (empty = All)", all_models, default=[], key="global_models")
# model_metric_g = st.selectbox(
# "Model metric",
# ["action_mean_intra", "frame_diff_ord2"],
# index=0,
# key="global_model_metric",
# help="Agreement rule: BOTH model metrics use LOWER-is-better (action_mean_intra uses original sign)."
# )
# human_metric_g = st.selectbox(
# "Human metric",
# human_cols,
# index=0,
# key="global_human_metric"
# )
# scope_df_g = filter_by_multi(df_raw, chosen_classes_g, chosen_models_g).copy()
# if "action_mean_intra_orig" not in scope_df_g.columns:
# scope_df_g = scope_df_g.merge(
# df_raw[["video_id", "action_mean_intra_orig"]],
# on="video_id",
# how="left"
# )
# req_model_col_g = "action_mean_intra_orig" if model_metric_g == "action_mean_intra" else model_metric_g
# scope_df_g = scope_df_g.dropna(subset=[req_model_col_g, human_metric_g], how="any")
# if len(scope_df_g) < 2:
# st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.")
# else:
# acc_g, disagree_df_g, total_pairs_g = pairwise_agreement(scope_df_g, model_metric_g, human_metric_g)
# st.markdown(f"**Global pairwise accuracy:** {fmt3(acc_g)} (over {total_pairs_g} comparable pairs).")
# if not disagree_df_g.empty:
# df_show_g = disagree_df_g.copy()
# numeric_cols_g = [f"{model_metric_g}_A", f"{model_metric_g}_B", f"{human_metric_g}_A", f"{human_metric_g}_B"]
# for col in numeric_cols_g:
# if col in df_show_g.columns:
# df_show_g[col] = pd.to_numeric(df_show_g[col], errors="coerce").map(fmt3)
# st.markdown("**Disagreeing global pairs**")
# st.dataframe(df_show_g, use_container_width=True, hide_index=True)
# else:
# st.success("All comparable global pairs agree. 🎉")
    # --------------------- CROSS-MODEL (same base video, multi-class) ---------------------
    # Same agreement computation, but restricted to pairs of generations of the
    # SAME base video by DIFFERENT models.
    with tab_cross:
        st.subheader("Cross-Model Agreement — same original video (base_name), multi-class")
        all_classes = uniq_sorted(df_raw["class"])
        all_models = uniq_sorted(df_raw["model"])
        c1, c2 = st.columns(2)
        with c1:
            chosen_classes2 = st.multiselect("Classes (empty = All)", all_classes, default=[], key="cross_classes")
        with c2:
            chosen_models2 = st.multiselect("Models (empty = All)", all_models, default=[], key="cross_models")
        model_metric2 = st.selectbox(
            "Model metric",
            ["action_mean_intra", "frame_diff_ord2"],
            index=0,
            key="cross_model_metric",
            help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
        )
        human_metric2 = st.selectbox(
            "Human metric",
            human_cols,
            index=0,
            key="cross_human_metric"
        )
        scope_df2 = filter_by_multi(df_raw, chosen_classes2, chosen_models2).copy()
        # Defensive: re-attach the pre-negation column if it went missing.
        if "action_mean_intra_orig" not in scope_df2.columns:
            scope_df2 = scope_df2.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
        req_model_col2 = "action_mean_intra_orig" if model_metric2 == "action_mean_intra" else model_metric2
        scope_df2 = scope_df2.dropna(subset=["base_name", req_model_col2, human_metric2], how="any")
        # Keep only base videos covered by >= 2 distinct models; otherwise no cross-model pair exists.
        eligible = scope_df2.groupby("base_name")["model"].nunique().reset_index(name="n_models")
        eligible_names = set(eligible[eligible["n_models"] >= 2]["base_name"].tolist())
        scope_df2 = scope_df2[scope_df2["base_name"].isin(eligible_names)]
        if scope_df2.empty:
            st.info("No base videos with at least two different models in the current filters.")
        else:
            acc2, disagree_df2, total_pairs2 = cross_model_pairwise(scope_df2, model_metric2, human_metric2)
            st.markdown(f"**Cross-model pairwise accuracy:** {fmt3(acc2)} (over {total_pairs2} comparable cross-model pairs).")
            if not disagree_df2.empty:
                df_show2 = disagree_df2.copy()
                # Format the four score columns to 3 decimals for display.
                numeric_cols2 = [f"{model_metric2}_A", f"{model_metric2}_B", f"{human_metric2}_A", f"{human_metric2}_B"]
                for col in numeric_cols2:
                    if col in df_show2.columns:
                        df_show2[col] = pd.to_numeric(df_show2[col], errors="coerce").map(fmt3)
                st.markdown("**Disagreeing cross-model pairs (same base video)**")
                st.dataframe(df_show2, use_container_width=True, hide_index=True)
            else:
                st.success("All comparable cross-model pairs agree. 🎉")
    # --------------------- SPEARMAN (separate views per human metric) ---------------------
    with tab_spear:
        st.subheader("Spearman correlations (separate by human metric)")
        model_metrics_all = ["action_mean_intra", "frame_diff_ord2"]
        # one sub-tab per human metric
        sub_tabs = st.tabs(human_cols)
        for tab_obj, hmetric in zip(sub_tabs, human_cols):
            with tab_obj:
                st.caption(f"Human metric: **{hmetric}**")
                # Overall rho (rows missing either metric are dropped inside spearman_all)
                spear_overall = spearman_all(df_raw, model_metrics_all, [hmetric])
                show = spear_overall.copy()
                show["rho"] = show["rho"].map(fmt3)
                st.markdown("**Overall**")
                st.dataframe(show, use_container_width=True, hide_index=True)
                st.markdown("---")
                st.markdown("**By Class**")
                classes_all = uniq_sorted(df_raw["class"])
                chosen_cls = st.multiselect("Classes (empty = All)", classes_all, default=[], key=f"spear_cls_{hmetric}")
                spc = spearman_by_group(df_raw, model_metrics_all, [hmetric], "class", chosen_cls)
                spc["rho"] = spc["rho"].map(fmt3)
                st.dataframe(spc, use_container_width=True, hide_index=True)
                st.markdown("---")
                st.markdown("**By Model**")
                models_all = uniq_sorted(df_raw["model"])
                chosen_mdl = st.multiselect("Models (empty = All)", models_all, default=[], key=f"spear_mdl_{hmetric}")
                spm = spearman_by_group(df_raw, model_metrics_all, [hmetric], "model", chosen_mdl)
                spm["rho"] = spm["rho"].map(fmt3)
                st.dataframe(spm, use_container_width=True, hide_index=True)
    # --------------------- INTER-RATER (by class / by model ONLY) ---------------------
    with tab_ir:
        st.subheader("Inter-Rater Correlations (Spearman) — by Class / by Model")
        metric_to_wide = load_raters(RATER_GLOBS)
        # Quick diagnostics so you can see coverage:
        # overall average inter-rater correlation (no grouping) per rater, per metric
        if metric_to_wide:
            st.caption("Overall inter-rater averages (across all classes/models)")
            # NOTE(review): this `rows` shadows the module-level `rows` list used
            # while building df_raw earlier; harmless here but worth renaming.
            rows = []
            for metric, wide in metric_to_wide.items():
                avg = inter_rater_avg_overall(wide, min_overlap=2)
                if not avg.empty:
                    tmp = avg.copy()
                    tmp.insert(0, "metric", metric)
                    rows.append(tmp)
            if rows:
                overall_tbl = pd.concat(rows, ignore_index=True)
                overall_tbl["mean_rho"] = overall_tbl["mean_rho"].map(fmt3)
                st.dataframe(overall_tbl, use_container_width=True, hide_index=True)
            else:
                st.info("No rater pairs with enough overlapping videos to compute overall averages.")
        if not metric_to_wide:
            st.info("No rater files found. Expected patterns: 'raters*.json' or 'raters/*.json'.")
        else:
            meta = df_raw[["video_id", "class", "model"]].drop_duplicates()
            metrics_available = sorted(metric_to_wide.keys())
            chosen_metric = st.selectbox("Rater metric", metrics_available, key="ir_metric")
            wide = metric_to_wide.get(chosen_metric)
            if wide is None or wide.empty:
                st.info("Selected metric has no rater data.")
            else:
                sub_by_cls, sub_by_mdl = st.tabs(["By Class", "By Model"])
                with sub_by_cls:
                    classes_all = uniq_sorted(meta["class"])
                    chosen_cls_ir = st.multiselect("Classes (empty = All)", classes_all, default=[], key="ir_classes")
                    # Per-rater PAIRWISE correlations within each class
                    pairs_cls = inter_rater_pairs_by_group(wide, meta, by="class", values=chosen_cls_ir, min_overlap=2)
                    if pairs_cls.empty:
                        st.info("Not enough overlap to compute class-wise inter-rater correlations.")
                    else:
                        show_pairs = pairs_cls.copy()
                        show_pairs["rho"] = show_pairs["rho"].map(fmt3)
                        st.markdown("**Per-rater pairwise correlations**")
                        st.dataframe(show_pairs, use_container_width=True, hide_index=True)
                    # Per-rater AVERAGE correlation within each class
                    avg_cls = inter_rater_avg_by_group(wide, meta, by="class", values=chosen_cls_ir, min_overlap=2)
                    if avg_cls.empty:
                        st.info("Not enough overlap to compute class-wise inter-rater averages.")
                    else:
                        show_avg = avg_cls.copy()
                        show_avg["mean_rho"] = show_avg["mean_rho"].map(fmt3)
                        st.markdown("**Per-rater average correlation**")
                        st.dataframe(show_avg, use_container_width=True, hide_index=True)
                with sub_by_mdl:
                    models_all = uniq_sorted(meta["model"])
                    chosen_mdl_ir = st.multiselect("Models (empty = All)", models_all, default=[], key="ir_models")
                    # Per-rater PAIRWISE correlations within each model
                    pairs_mdl = inter_rater_pairs_by_group(wide, meta, by="model", values=chosen_mdl_ir, min_overlap=2)
                    if pairs_mdl.empty:
                        st.info("Not enough overlap to compute model-wise inter-rater correlations.")
                    else:
                        show_pairs = pairs_mdl.copy()
                        show_pairs["rho"] = show_pairs["rho"].map(fmt3)
                        st.markdown("**Per-rater pairwise correlations**")
                        st.dataframe(show_pairs, use_container_width=True, hide_index=True)
                    # Per-rater AVERAGE correlation within each model
                    avg_mdl = inter_rater_avg_by_group(wide, meta, by="model", values=chosen_mdl_ir, min_overlap=2)
                    if avg_mdl.empty:
                        st.info("Not enough overlap to compute model-wise inter-rater averages.")
                    else:
                        show_avg = avg_mdl.copy()
                        show_avg["mean_rho"] = show_avg["mean_rho"].map(fmt3)
                        st.markdown("**Per-rater average correlation**")
                        st.dataframe(show_avg, use_container_width=True, hide_index=True)
# Right column: pick any surviving video id, play it, and show its scores.
with right:
    st.subheader("Video Viewer")
    all_vids = df_raw["video_id"].tolist()
    if len(all_vids) == 0:
        st.info("No videos available after filtering rows with no human ratings.")
    else:
        selected_vid = st.selectbox("Choose a video id", sorted(all_vids))
        if selected_vid:
            # Video files live under video_dir, named exactly by video_id.
            vid_path = str(Path(video_dir) / selected_vid)
            st.video(vid_path)
            row_view = df_raw[df_raw["video_id"] == selected_vid].iloc[0]
            st.markdown("### Scores")
            # st.caption("Display: action_mean_intra is flipped once for readability; pairwise uses original (lower is better).")
            # Only non-missing metrics are shown, rounded to 3 decimals.
            model_metrics = {
                k: round(float(row_view[k]), 3)
                for k in ["action_mean_intra", "frame_diff_ord2"]
                if pd.notna(row_view[k])
            }
            human_metrics = {
                k: round(float(row_view[k]), 3)
                for k in ["human_action", "human_anatomy", "human_appearance", "human_motion"]
                if pd.notna(row_view[k])
            }
            st.write("**Model metrics:**")
            st.json(model_metrics)
            st.write("**Human scores:**")
            st.json(human_metrics)