# VideoEvals / src/streamlit_app.py
import json
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
import streamlit as st
# Streamlit requires set_page_config to be the first st.* call in the script.
st.set_page_config(page_title="VideoEval Explorer", layout="wide")
st.title("VideoEval — Tables + Viewer with Human Scores")
# =========================
# Paths / Config
# =========================
# id_map.json maps video ids to models/base names; the same directory holds the .mp4 files.
mapping_json_path = "src/YOUTUBE_DATA/id_map.json"
video_dir = "src/YOUTUBE_DATA"
def uniq_sorted(s) -> list:
    """Distinct non-null values of *s*, sorted ascending."""
    distinct = pd.Series(s).dropna().unique().tolist()
    return sorted(distinct)
# --- Metric & video-id normalization for raters ---
# Canonical names for rater metric labels; unknown labels pass through untouched.
ALIASES = {
    "action": "human_action",
    "anatomy": "human_anatomy",
    "appearance": "human_appearance",
    "motion": "human_motion",
    "overall": "overall",
}

def normalize_metric_name(name: str) -> str:
    """Map a raw rater metric label onto the app's canonical human_* name."""
    if isinstance(name, str):
        return ALIASES.get(name.strip().lower(), name)
    # Non-string labels are stringified as-is.
    return str(name)
def normalize_video_id(v) -> str:
    """Coerce *v* to str and guarantee the '.mp4' suffix used by the main table."""
    vid = str(v)
    if vid.endswith(".mp4"):
        return vid
    return vid + ".mp4"
# Model metric JSONs: {video_id: score}; pairwise comparisons treat both as lower-is-better.
REQ = {
    "action_mean_intra": "src/action_mean_intra.json",
    "frame_diff_ord2": "src/frame_diff_ord2.json",
}
# Human JSONs (aggregate scores; file names indicate MOS-centered values). Higher is better.
HUMAN = {
    "human_action": "src/human_scores_analysis_action_mos_centered.json",
    "human_anatomy": "src/human_scores_analysis_anatomy_mos_centered.json",
    "human_appearance": "src/human_scores_analysis_appearance_mos_centered.json",
    "human_motion": "src/human_scores_analysis_motion_mos_centered.json",
}
# Rater JSONs glob patterns (support both flat 'raters*.json' files and a raters/ folder)
RATER_GLOBS = ["src/raters*.json", "src/raters/*.json"]
# =========================
# Helpers
# =========================
def load_json(p: Path) -> dict:
    """Read a {key: number} JSON file; values that cannot be floated become NaN."""
    with open(p, "r") as fh:
        raw = json.load(fh)
    result = {}
    for key, value in raw.items():
        try:
            result[str(key)] = float(value)
        except Exception:
            # Non-numeric payload (e.g. a string) -> missing score.
            result[str(key)] = np.nan
    return result
def build_maps_from_mapping(p: Path) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Build lookup maps from the id-mapping JSON.

    Returns:
        - v2m: video_id -> model
        - v2base: video_id -> base_name (original 'videoName', e.g. 'v_JumpingJack_g13_c02')
    Accepts either the 'video_to_model' or the 'model_to_videoName_to_id' schema.
    A missing/None path yields two empty dicts.
    """
    vid_to_model: Dict[str, str] = {}
    vid_to_base: Dict[str, str] = {}
    if not p or not p.exists():
        return vid_to_model, vid_to_base
    with open(p, "r") as fh:
        mapping = json.load(fh)
    direct = mapping.get("video_to_model")
    if isinstance(direct, dict):
        # Flat schema: ids map straight to models; no base names available.
        return dict(direct), vid_to_base
    nested = mapping.get("model_to_videoName_to_id")
    if isinstance(nested, dict):
        for model_name, name_to_id in nested.items():
            for base_name, cls_id in name_to_id.items():
                vid = f"{cls_id}.mp4"  # stored filenames look like Class__HASH.mp4
                vid_to_model[vid] = model_name
                vid_to_base[vid] = base_name
    return vid_to_model, vid_to_base
def minmax_normalize(series: pd.Series) -> pd.Series:
    """
    Min-max scale a series to [0, 1].

    Behavior:
      - all-NaN input is returned unchanged (after numeric coercion);
      - a constant series, or one whose min/max is non-finite, maps every
        non-NaN entry to 0.0 while preserving NaNs;
      - otherwise returns (x - min) / (max - min).

    Improvement: the degenerate branch uses vectorized Series.where instead
    of a per-element apply (same result, no row-wise Python loop).
    """
    s = pd.to_numeric(series, errors="coerce")
    if s.notna().sum() == 0:
        return s
    smin, smax = s.min(skipna=True), s.max(skipna=True)
    if not np.isfinite(smin) or not np.isfinite(smax) or smin == smax:
        # 0.0 wherever a value exists, NaN stays NaN.
        return s.where(s.isna(), 0.0)
    return (s - smin) / (smax - smin)
def fmt3(x):
    """Render x with 3 decimals; empty string for non-numeric or non-finite values."""
    try:
        val = float(x)
    except (TypeError, ValueError):
        return ""
    return f"{val:.3f}" if np.isfinite(val) else ""
def filter_by(df: pd.DataFrame, cls: str, mdl: str) -> pd.DataFrame:
    """Apply the two single-select dropdown filters; '(All)' disables a filter."""
    result = df
    if mdl != "(All)":
        result = result[result["model"] == mdl]
    if cls != "(All)":
        result = result[result["class"] == cls]
    return result
def filter_by_multi(df: pd.DataFrame, classes: List[str] | None, models: List[str] | None) -> pd.DataFrame:
"""Multi-select filter used in pairwise tabs. Empty/None means 'all'."""
out = df
if classes:
out = out[out["class"].isin(classes)]
if models:
out = out[out["model"].isin(models)]
return out
def pairwise_agreement(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Pairwise ranking agreement between model_col and human_col over all rows.

    Returns: (accuracy, disagree_pairs_df, total_pairs)
    Directions:
      - Model metrics: LOWER is better. For 'action_mean_intra' the
        pre-negation column 'action_mean_intra_orig' is used; other metrics
        (e.g. 'frame_diff_ord2') are used raw.
      - Human metrics: HIGHER is better.
    Ties or non-finite values in either metric make a pair non-comparable.
    """
    # Pick the column the comparison actually reads.
    src_col = "action_mean_intra_orig" if model_col == "action_mean_intra" else model_col
    ids = df["video_id"].tolist()
    mv = pd.to_numeric(df[src_col], errors="coerce").values
    hv = pd.to_numeric(df[human_col], errors="coerce").values
    disagreements = []
    total = agree = 0
    count = len(ids)
    for a in range(count):
        for b in range(a + 1, count):
            finite = (np.isfinite(mv[a]) and np.isfinite(mv[b])
                      and np.isfinite(hv[a]) and np.isfinite(hv[b]))
            if not finite:
                continue
            if mv[a] == mv[b] or hv[a] == hv[b]:
                continue  # tied pairs are not comparable
            pred = "A>B" if mv[a] < mv[b] else "B>A"   # model: lower is better
            truth = "A>B" if hv[a] > hv[b] else "B>A"  # human: higher is better
            total += 1
            if pred == truth:
                agree += 1
            else:
                disagreements.append({
                    "video_A": ids[a],
                    "video_B": ids[b],
                    f"{model_col}_A": mv[a],
                    f"{model_col}_B": mv[b],
                    f"{human_col}_A": hv[a],
                    f"{human_col}_B": hv[b],
                    "model_order": pred,
                    "human_order": truth,
                })
    accuracy = agree / total if total > 0 else np.nan
    return accuracy, pd.DataFrame(disagreements), total
def cross_model_pairwise(df: pd.DataFrame, model_col: str, human_col: str):
    """
    Pairwise agreement restricted to pairs from DIFFERENT models that share
    the SAME base video (base_name).

    Requires df columns: ['video_id','model','base_name', model_col, human_col,
    'action_mean_intra_orig'].
    Returns: (accuracy, disagree_pairs_df, total_pairs)
    Directions:
      - Model metrics: LOWER is better ('action_mean_intra' uses the
        pre-negation 'action_mean_intra_orig' column).
      - Human metrics: HIGHER is better.
    """
    src_col = "action_mean_intra_orig" if model_col == "action_mean_intra" else model_col
    disagreements = []
    total = agree = 0
    # Rows without a base_name cannot be matched across models.
    for base_name, grp in df.dropna(subset=["base_name"]).groupby("base_name"):
        ids = grp["video_id"].tolist()
        mdl = grp["model"].tolist()
        mv = pd.to_numeric(grp[src_col], errors="coerce").values
        hv = pd.to_numeric(grp[human_col], errors="coerce").values
        size = len(ids)
        for a in range(size):
            for b in range(a + 1, size):
                if mdl[a] == mdl[b]:
                    continue  # cross-model pairs only
                finite = (np.isfinite(mv[a]) and np.isfinite(mv[b])
                          and np.isfinite(hv[a]) and np.isfinite(hv[b]))
                if not finite:
                    continue
                if mv[a] == mv[b] or hv[a] == hv[b]:
                    continue  # ties are not comparable
                pred = "A>B" if mv[a] < mv[b] else "B>A"   # model: lower is better
                truth = "A>B" if hv[a] > hv[b] else "B>A"  # human: higher is better
                total += 1
                if pred == truth:
                    agree += 1
                else:
                    disagreements.append({
                        "base_name": base_name,
                        "video_A": ids[a],
                        "model_A": mdl[a],
                        f"{model_col}_A": mv[a],
                        f"{human_col}_A": hv[a],
                        "video_B": ids[b],
                        "model_B": mdl[b],
                        f"{model_col}_B": mv[b],
                        f"{human_col}_B": hv[b],
                        "model_order": pred,
                        "human_order": truth,
                    })
    accuracy = agree / total if total > 0 else np.nan
    return accuracy, pd.DataFrame(disagreements), total
def spearman_all(df: pd.DataFrame, model_cols: list, human_cols: list) -> pd.DataFrame:
    """Spearman rho between every (model metric, human metric) pair over all videos."""
    records = []
    for m_col in model_cols:
        for h_col in human_cols:
            pair = df[[m_col, h_col]].dropna()
            count = len(pair)
            # rho is undefined with fewer than 2 complete observations
            rho = pair.corr(method="spearman").iloc[0, 1] if count >= 2 else np.nan
            records.append({"model_metric": m_col, "human_metric": h_col,
                            "rho": rho, "n": count})
    return pd.DataFrame(records)
def spearman_by_group(df: pd.DataFrame, model_cols: list, human_cols: list, group_col: str, groups: List[str]) -> pd.DataFrame:
    """
    Spearman rho per group value (group_col ∈ {'class','model'}).

    An empty `groups` list means every distinct non-null value of group_col.
    Returns tidy df with columns: [group_col, model_metric, human_metric, rho, n]
    """
    group_values = groups or sorted(df[group_col].dropna().unique().tolist())
    records = []
    for value in group_values:
        part = df[df[group_col] == value]
        if part.empty:
            continue
        for m_col in model_cols:
            for h_col in human_cols:
                pair = part[[m_col, h_col]].dropna()
                count = len(pair)
                rho = pair.corr(method="spearman").iloc[0, 1] if count >= 2 else np.nan
                records.append({group_col: value, "model_metric": m_col,
                                "human_metric": h_col, "rho": rho, "n": count})
    return pd.DataFrame(records)
# ---------- Rater loading & inter-rater correlations ----------
def _filename_to_rater_name(p: Path) -> str:
name = p.stem # e.g., "chvskch_scores"
if name.endswith("_scores"):
name = name[:-7]
return name
def _looks_like_video_key(k: str) -> bool:
s = str(k)
return "__" in s or s.endswith(".mp4") or s.endswith(".MP4")
def _detect_rater_structure(obj) -> str:
"""
Returns one of:
- 'metric_to_video' : {metric: {video_id: score}, ...}
- 'video_to_score' : {video_id: score, ...}
- 'video_to_metric_score' : {video_id: {metric: score}, ...}
- 'unknown'
"""
if not isinstance(obj, dict) or not obj:
return "unknown"
# If values are not dicts -> {video_id: score}
first_val = next(iter(obj.values()))
if not isinstance(first_val, dict):
return "video_to_score"
# Values are dicts. Decide by looking at the OUTER KEYS.
outer_keys = list(obj.keys())
if any(_looks_like_video_key(k) for k in outer_keys):
return "video_to_metric_score" # <-- your case
else:
return "metric_to_video"
def load_raters(globs: List[str]) -> Dict[str, pd.DataFrame]:
    """
    Load every rater score file matched by `globs`.

    Returns: dict metric_name -> wide DataFrame with index=video_id ('.mp4'
    enforced) and columns=rater_name.
    Supports three file shapes:
      - {metric: {video_id: score}}
      - {video_id: score}            -> treated as the single metric 'overall'
      - {video_id: {metric: score}}
    Metric names are normalized to the app's human_* names.
    """
    metric_to_frames: Dict[str, List[pd.DataFrame]] = {}
    files = []
    for pat in globs:
        files.extend(Path(".").glob(pat))
    for p in files:
        try:
            data = json.loads(Path(p).read_text())
        except Exception:
            # Unreadable / malformed file: skipped silently (best-effort load).
            continue
        rater = _filename_to_rater_name(p)
        shape = _detect_rater_structure(data)
        if shape == "metric_to_video":
            # {metric: {video_id: score}}
            for metric, vid_scores in data.items():
                metric_norm = normalize_metric_name(metric)
                s = pd.Series({normalize_video_id(k): v for k, v in vid_scores.items()},
                              name=rater, dtype="float")
                metric_to_frames.setdefault(metric_norm, []).append(s.to_frame())
        elif shape == "video_to_score":
            # {video_id: score} -> single 'overall' metric
            s = pd.Series({normalize_video_id(k): v for k, v in data.items()},
                          name=rater, dtype="float")
            metric_to_frames.setdefault("overall", []).append(s.to_frame())
        elif shape == "video_to_metric_score":
            # {video_id: {metric: score}} -- pivot into per-metric buckets first
            bucket: Dict[str, Dict[str, float]] = {}
            for vid, mdict in data.items():
                if not isinstance(mdict, dict):
                    continue
                for metric, val in mdict.items():
                    metric_norm = normalize_metric_name(metric)
                    bucket.setdefault(metric_norm, {})[normalize_video_id(vid)] = val
            for metric_norm, vid_scores in bucket.items():
                s = pd.Series(vid_scores, name=rater, dtype="float")
                metric_to_frames.setdefault(metric_norm, []).append(s.to_frame())
        # else: unknown structure; file is skipped
    # Merge the per-rater single-column frames into one wide frame per metric.
    metric_to_wide: Dict[str, pd.DataFrame] = {}
    for metric, frames in metric_to_frames.items():
        if not frames:
            continue
        wide = frames[0]
        for f in frames[1:]:
            wide = wide.join(f, how="outer")
        # NOTE(review): if the same rater name appears twice for a metric,
        # only the first occurrence is kept.
        wide = wide.loc[:, ~wide.columns.duplicated()]
        metric_to_wide[metric] = wide
    return metric_to_wide
def inter_rater_pairs_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Per-rater *pairwise* Spearman correlations within each group.

    Rows: [by, rater, other_rater, rho, n]. Pairs with NaN rho or fewer than
    `min_overlap` shared videos are dropped, and each undirected pair is
    expanded into two directed rows (one per rater).
    """
    out_cols = [by, "rater", "other_rater", "rho", "n"]
    pairs = inter_rater_corr_grouped(rater_wide, video_meta, by=by, values=values)
    if pairs.empty:
        return pd.DataFrame(columns=out_cols)
    pairs["n"] = pairs["n"].astype(int)
    # keep only pairs with adequate overlap and a defined rho
    pairs = pairs[(pairs["n"] >= min_overlap) & pairs["rho"].notna()]
    if pairs.empty:
        return pd.DataFrame(columns=out_cols)
    # Expand to directed rows so each rater gets a row per counterpart.
    forward = pairs.rename(columns={"rater_i": "rater", "rater_j": "other_rater"})[out_cols]
    backward = pairs.rename(columns={"rater_j": "rater", "rater_i": "other_rater"})[out_cols]
    directed = pd.concat([forward, backward], ignore_index=True)
    # sorted for nicer reading: best correlations first within each rater
    return directed.sort_values([by, "rater", "rho"], ascending=[True, True, False]).reset_index(drop=True)
def inter_rater_avg_by_group(
    rater_wide: pd.DataFrame,
    video_meta: pd.DataFrame,
    by: str,
    values: List[str] | None = None,
    min_overlap: int = 2,
) -> pd.DataFrame:
    """
    Unweighted mean inter-rater Spearman rho per rater within each group,
    using the same overlap/NaN filters as inter_rater_pairs_by_group.

    Rows: [by, rater, mean_rho, num_pairs_used]
    """
    pairs = inter_rater_pairs_by_group(
        rater_wide, video_meta, by=by, values=values, min_overlap=min_overlap
    )
    if pairs.empty:
        return pd.DataFrame(columns=[by, "rater", "mean_rho", "num_pairs_used"])
    summary = (
        pairs.groupby([by, "rater"])
        .agg(mean_rho=("rho", "mean"), num_pairs_used=("rho", "count"))
        .reset_index()
    )
    summary["mean_rho"] = summary["mean_rho"].astype(float)
    return summary
def inter_rater_corr_grouped(
rater_wide: pd.DataFrame,
video_meta: pd.DataFrame,
by: str,
values: List[str] | None = None
) -> pd.DataFrame:
"""
Spearman inter-rater correlation matrices grouped by `by` ∈ {'class','model'}.
Requires ≥2 raters with ≥2 overlapping videos inside each group.
Returns tidy df: [by, rater_i, rater_j, rho, n]
"""
assert by in ("class", "model")
def corr_to_long(df_corr: pd.DataFrame, counts: pd.DataFrame, group_label: str):
out = []
for i in df_corr.index:
for j in df_corr.columns:
if i >= j:
continue
out.append({by: group_label, "rater_i": i, "rater_j": j,
"rho": df_corr.loc[i, j], "n": int(counts.loc[i, j])})
return out
meta = video_meta[["video_id", "class", "model"]].drop_duplicates().set_index("video_id")
X = rater_wide.copy()
X.index = X.index.map(str) # should already be normalized to '.mp4'
X = X.join(meta, how="left")
if not values:
values = sorted([v for v in X[by].dropna().unique().tolist()])
results = []
def corr_with_counts(M: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
rho = M.corr(method="spearman", min_periods=2)
mask = ~M.isna()
counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="float")
for a in M.columns:
for b in M.columns:
counts.loc[a, b] = float((mask[a] & mask[b]).sum())
return rho, counts
for g in values:
sub = X[X[by] == g]
if sub.empty:
continue
# keep only rater columns with at least 2 ratings
cols = [c for c in sub.columns if c not in ("class", "model")]
usable = [c for c in cols if sub[c].notna().sum() >= 2]
if len(usable) < 2:
continue
M = sub[usable]
rho, counts = corr_with_counts(M)
# if every pair has <2 overlaps, skip
if (counts.values < 2).all():
continue
results.extend(corr_to_long(rho, counts, g))
out = pd.DataFrame(results)
if not out.empty:
out["rho"] = out["rho"].astype(float)
return out
def _corr_with_counts_matrix(M: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Spearman rho matrix + overlap counts matrix for a wide (videos x raters) frame."""
rho = M.corr(method="spearman", min_periods=2)
mask = ~M.isna()
counts = pd.DataFrame(index=M.columns, columns=M.columns, dtype="int64")
for a in M.columns:
for b in M.columns:
counts.loc[a, b] = int((mask[a] & mask[b]).sum())
return rho, counts
def inter_rater_pairs_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Pairwise inter-rater Spearman over the entire dataset (no grouping).

    Returns columns: [rater, other_rater, rho, n] where n is the number of
    overlapping videos for that pair; each undirected pair yields two
    directed rows. Pairs below `min_overlap` or with NaN rho are dropped.
    """
    out_cols = ["rater", "other_rater", "rho", "n"]
    if rater_wide is None or rater_wide.empty:
        return pd.DataFrame(columns=out_cols)
    # keep only rater columns with at least 2 ratings
    usable = [c for c in rater_wide.columns if rater_wide[c].notna().sum() >= 2]
    M = rater_wide[usable]
    if M.shape[1] < 2:
        return pd.DataFrame(columns=out_cols)
    rho, counts = _corr_with_counts_matrix(M)
    records = []
    for a in rho.index:
        for b in rho.columns:
            if a >= b:
                continue  # upper triangle only
            overlap = int(counts.loc[a, b])
            r = rho.loc[a, b]
            if overlap < min_overlap or pd.isna(r):
                continue
            records.append({"rater": a, "other_rater": b, "rho": float(r), "n": overlap})
            records.append({"rater": b, "other_rater": a, "rho": float(r), "n": overlap})  # directed
    out = pd.DataFrame(records)
    if not out.empty:
        out = out.sort_values(["rater", "rho"], ascending=[True, False]).reset_index(drop=True)
    return out
def inter_rater_avg_overall(rater_wide: pd.DataFrame, min_overlap: int = 2) -> pd.DataFrame:
    """
    Average (no grouping) Spearman rho per rater across all counterparts.

    Returns columns: [rater, mean_rho, num_pairs_used], sorted best-first.
    """
    pairs = inter_rater_pairs_overall(rater_wide, min_overlap=min_overlap)
    if pairs.empty:
        return pd.DataFrame(columns=["rater", "mean_rho", "num_pairs_used"])
    return (
        pairs.groupby("rater")
        .agg(mean_rho=("rho", "mean"), num_pairs_used=("rho", "count"))
        .reset_index()
        .sort_values("mean_rho", ascending=False)
        .reset_index(drop=True)
    )
# =========================
# Load aggregate scores
# =========================
scores = {k: load_json(Path(p)) for k, p in REQ.items()}
human_scores = {k: load_json(Path(p)) for k, p in HUMAN.items()}
# Union of all video ids appearing in any model or human metric file.
all_keys = set()
for d in scores.values():
    all_keys |= set(d.keys())
for d in human_scores.values():
    all_keys |= set(d.keys())
rows = []
for vid in sorted(all_keys):
    rows.append({
        "video_id": vid,
        # class is encoded as the filename prefix before '__'
        "class": vid.split("__")[0] if "__" in vid else "UNK",
        "action_mean_intra": scores["action_mean_intra"].get(vid, np.nan),
        "frame_diff_ord2": scores["frame_diff_ord2"].get(vid, np.nan),
        "human_action": human_scores["human_action"].get(vid, np.nan),
        "human_anatomy": human_scores["human_anatomy"].get(vid, np.nan),
        "human_appearance": human_scores["human_appearance"].get(vid, np.nan),
        "human_motion": human_scores["human_motion"].get(vid, np.nan),
    })
df_raw = pd.DataFrame(rows)
# Keep original action_mean_intra for pairwise; flip for display ONCE (your JSON is negative)
df_raw["action_mean_intra"] = pd.to_numeric(df_raw["action_mean_intra"], errors="coerce")
df_raw["action_mean_intra_orig"] = df_raw["action_mean_intra"].copy()
# NOTE(review): despite the comment above, no sign flip happens here — the
# display column is re-assigned the original values unchanged. Confirm intent.
df_raw["action_mean_intra"] = df_raw["action_mean_intra_orig"]  # positive / higher-better in tables
# Map models + base_name via mapping.json
v2m, v2base = build_maps_from_mapping(Path(mapping_json_path))
df_raw["model"] = df_raw["video_id"].map(v2m).fillna("UNK")
df_raw["base_name"] = df_raw["video_id"].map(v2base)  # may be NaN for unmapped ids
# Metric columns (display + normalization)
metric_cols = [
    "action_mean_intra",  # display column; pairwise uses *_orig
    "frame_diff_ord2",    # raw; lower-better
    "human_action",
    "human_anatomy",
    "human_appearance",
    "human_motion",
]
human_cols = ["human_action", "human_anatomy", "human_appearance", "human_motion"]
# Drop rows with NO human ratings at all
df_raw = df_raw.dropna(subset=human_cols, how="all")
# Normalized (0-1 min-max) view used by the tables' "normalized" toggle
df_norm = df_raw.copy()
for c in metric_cols:
    df_norm[c] = minmax_normalize(df_norm[c])
# =========================
# UI Layout
# =========================
# Left column (2/3 width): all analysis tabs. Right column (1/3): video viewer.
left, right = st.columns([2, 1])
with left:
    tab_tables, tab_agree, tab_cross, tab_spear, tab_ir = st.tabs(
        ["Tables", "Agreement", "Cross-Model (same videoName)", "Spearman", "Inter-Rater"]
    )
    # --------------------- TABLES (A/B) ---------------------
    # Two independently filtered views of the same data, for side-by-side comparison.
    with tab_tables:
        st.subheader("Comparison Tables")
        # Table A
        st.markdown("**Table A**")
        use_norm_a = st.checkbox("Show normalized scores (0–1) — A", value=False, key="norm_a")
        # Shared dropdown option lists ('(All)' disables a filter); reused by Table B below.
        classes = ["(All)"] + uniq_sorted(df_raw["class"])
        models = ["(All)"] + uniq_sorted(df_raw["model"])
        ca, ma = st.columns(2)
        with ca:
            chosen_c_a = st.selectbox("Class (A)", classes, key="class_a")
        with ma:
            chosen_m_a = st.selectbox("Model (A)", models, key="model_a")
        df_view_a = df_norm if use_norm_a else df_raw
        filt_a = filter_by(df_view_a, chosen_c_a, chosen_m_a).copy()
        disp_a = filt_a.copy()
        # NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 (use DataFrame.map).
        disp_a[metric_cols] = disp_a[metric_cols].applymap(fmt3)
        st.dataframe(disp_a, use_container_width=True, hide_index=True)
        st.markdown("---")
        # Table B (own normalization toggle and filters)
        st.markdown("**Table B**")
        use_norm_b = st.checkbox("Show normalized scores (0–1) — B", value=False, key="norm_b")
        cb, mb = st.columns(2)
        with cb:
            chosen_c_b = st.selectbox("Class (B)", classes, key="class_b")
        with mb:
            chosen_m_b = st.selectbox("Model (B)", models, key="model_b")
        df_view_b = df_norm if use_norm_b else df_raw
        filt_b = filter_by(df_view_b, chosen_c_b, chosen_m_b).copy()
        disp_b = filt_b.copy()
        disp_b[metric_cols] = disp_b[metric_cols].applymap(fmt3)
        st.dataframe(disp_b, use_container_width=True, hide_index=True)
    # --------------------- AGREEMENT (multi-class) ---------------------
    # Pairwise ranking agreement between one model metric and one human metric
    # over the filtered set of videos.
    with tab_agree:
        st.subheader("Agreement (Model vs Human) — multi-class")
        all_models = uniq_sorted(df_raw["model"])
        all_classes = uniq_sorted(df_raw["class"])
        c1, c2 = st.columns(2)
        with c1:
            chosen_classes = st.multiselect("Classes (empty = All)", all_classes, default=[], key="agree_classes")
        with c2:
            chosen_models = st.multiselect("Models (empty = All)", all_models, default=[], key="agree_models")
        model_metric = st.selectbox(
            "Model metric",
            ["action_mean_intra", "frame_diff_ord2"],
            index=0,
            key="agree_model_metric",
            help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
        )
        human_metric = st.selectbox("Human metric", human_cols, index=0, key="agree_human_metric")
        scope_df = filter_by_multi(df_raw, chosen_classes, chosen_models).copy()
        # Defensive: re-attach the pre-negation column if it went missing.
        if "action_mean_intra_orig" not in scope_df.columns:
            scope_df = scope_df.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
        req_model_col = "action_mean_intra_orig" if model_metric == "action_mean_intra" else model_metric
        # Only rows with both metrics present can form comparable pairs.
        scope_df = scope_df.dropna(subset=[req_model_col, human_metric], how="any")
        if len(scope_df) < 2:
            st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.")
        else:
            acc, disagree_df, total_pairs = pairwise_agreement(scope_df, model_metric, human_metric)
            st.markdown(f"**Pairwise accuracy:** {fmt3(acc)} (over {total_pairs} comparable pairs).")
            if not disagree_df.empty:
                df_show = disagree_df.copy()
                # Format the four score columns to 3 decimals for display.
                numeric_cols = [f"{model_metric}_A", f"{model_metric}_B", f"{human_metric}_A", f"{human_metric}_B"]
                for col in numeric_cols:
                    if col in df_show.columns:
                        df_show[col] = pd.to_numeric(df_show[col], errors="coerce").map(fmt3)
                st.markdown("**Disagreeing pairs**")
                st.dataframe(df_show, use_container_width=True, hide_index=True)
            else:
                st.success("All comparable pairs agree. 🎉")
# # --------------------- GLOBAL PAIRWISE (any videos, multi-class) ---------------------
# with tab_global:
# st.subheader("Global Pairwise (any videos) — multi-class")
# all_models = uniq_sorted(df_raw["model"])
# all_classes = uniq_sorted(df_raw["class"])
# c1, c2 = st.columns(2)
# with c1:
# chosen_classes_g = st.multiselect("Classes (empty = All)", all_classes, default=[], key="global_classes")
# with c2:
# chosen_models_g = st.multiselect("Models (empty = All)", all_models, default=[], key="global_models")
# model_metric_g = st.selectbox(
# "Model metric",
# ["action_mean_intra", "frame_diff_ord2"],
# index=0,
# key="global_model_metric",
# help="Agreement rule: BOTH model metrics use LOWER-is-better (action_mean_intra uses original sign)."
# )
# human_metric_g = st.selectbox(
# "Human metric",
# human_cols,
# index=0,
# key="global_human_metric"
# )
# scope_df_g = filter_by_multi(df_raw, chosen_classes_g, chosen_models_g).copy()
# if "action_mean_intra_orig" not in scope_df_g.columns:
# scope_df_g = scope_df_g.merge(
# df_raw[["video_id", "action_mean_intra_orig"]],
# on="video_id",
# how="left"
# )
# req_model_col_g = "action_mean_intra_orig" if model_metric_g == "action_mean_intra" else model_metric_g
# scope_df_g = scope_df_g.dropna(subset=[req_model_col_g, human_metric_g], how="any")
# if len(scope_df_g) < 2:
# st.info("Need at least 2 videos (with non-missing metrics) to compute pairwise agreement.")
# else:
# acc_g, disagree_df_g, total_pairs_g = pairwise_agreement(scope_df_g, model_metric_g, human_metric_g)
# st.markdown(f"**Global pairwise accuracy:** {fmt3(acc_g)} (over {total_pairs_g} comparable pairs).")
# if not disagree_df_g.empty:
# df_show_g = disagree_df_g.copy()
# numeric_cols_g = [f"{model_metric_g}_A", f"{model_metric_g}_B", f"{human_metric_g}_A", f"{human_metric_g}_B"]
# for col in numeric_cols_g:
# if col in df_show_g.columns:
# df_show_g[col] = pd.to_numeric(df_show_g[col], errors="coerce").map(fmt3)
# st.markdown("**Disagreeing global pairs**")
# st.dataframe(df_show_g, use_container_width=True, hide_index=True)
# else:
# st.success("All comparable global pairs agree. 🎉")
    # --------------------- CROSS-MODEL (same base video, multi-class) ---------------------
    # Same agreement computation, but restricted to pairs of generations of the
    # SAME base video by DIFFERENT models.
    with tab_cross:
        st.subheader("Cross-Model Agreement — same original video (base_name), multi-class")
        all_classes = uniq_sorted(df_raw["class"])
        all_models = uniq_sorted(df_raw["model"])
        c1, c2 = st.columns(2)
        with c1:
            chosen_classes2 = st.multiselect("Classes (empty = All)", all_classes, default=[], key="cross_classes")
        with c2:
            chosen_models2 = st.multiselect("Models (empty = All)", all_models, default=[], key="cross_models")
        model_metric2 = st.selectbox(
            "Model metric",
            ["action_mean_intra", "frame_diff_ord2"],
            index=0,
            key="cross_model_metric",
            help="Pairwise uses LOWER-is-better for both model metrics (action_mean_intra uses original sign)."
        )
        human_metric2 = st.selectbox(
            "Human metric",
            human_cols,
            index=0,
            key="cross_human_metric"
        )
        scope_df2 = filter_by_multi(df_raw, chosen_classes2, chosen_models2).copy()
        # Defensive: re-attach the pre-negation column if it went missing.
        if "action_mean_intra_orig" not in scope_df2.columns:
            scope_df2 = scope_df2.merge(df_raw[["video_id", "action_mean_intra_orig"]], on="video_id", how="left")
        req_model_col2 = "action_mean_intra_orig" if model_metric2 == "action_mean_intra" else model_metric2
        scope_df2 = scope_df2.dropna(subset=["base_name", req_model_col2, human_metric2], how="any")
        # Keep only base videos covered by >= 2 distinct models; otherwise no cross-model pair exists.
        eligible = scope_df2.groupby("base_name")["model"].nunique().reset_index(name="n_models")
        eligible_names = set(eligible[eligible["n_models"] >= 2]["base_name"].tolist())
        scope_df2 = scope_df2[scope_df2["base_name"].isin(eligible_names)]
        if scope_df2.empty:
            st.info("No base videos with at least two different models in the current filters.")
        else:
            acc2, disagree_df2, total_pairs2 = cross_model_pairwise(scope_df2, model_metric2, human_metric2)
            st.markdown(f"**Cross-model pairwise accuracy:** {fmt3(acc2)} (over {total_pairs2} comparable cross-model pairs).")
            if not disagree_df2.empty:
                df_show2 = disagree_df2.copy()
                # Format the four score columns to 3 decimals for display.
                numeric_cols2 = [f"{model_metric2}_A", f"{model_metric2}_B", f"{human_metric2}_A", f"{human_metric2}_B"]
                for col in numeric_cols2:
                    if col in df_show2.columns:
                        df_show2[col] = pd.to_numeric(df_show2[col], errors="coerce").map(fmt3)
                st.markdown("**Disagreeing cross-model pairs (same base video)**")
                st.dataframe(df_show2, use_container_width=True, hide_index=True)
            else:
                st.success("All comparable cross-model pairs agree. 🎉")
    # --------------------- SPEARMAN (separate views per human metric) ---------------------
    with tab_spear:
        st.subheader("Spearman correlations (separate by human metric)")
        model_metrics_all = ["action_mean_intra", "frame_diff_ord2"]
        # one sub-tab per human metric
        sub_tabs = st.tabs(human_cols)
        for tab_obj, hmetric in zip(sub_tabs, human_cols):
            with tab_obj:
                st.caption(f"Human metric: **{hmetric}**")
                # Overall rho (rows missing either metric are dropped inside spearman_all)
                spear_overall = spearman_all(df_raw, model_metrics_all, [hmetric])
                show = spear_overall.copy()
                show["rho"] = show["rho"].map(fmt3)
                st.markdown("**Overall**")
                st.dataframe(show, use_container_width=True, hide_index=True)
                st.markdown("---")
                st.markdown("**By Class**")
                classes_all = uniq_sorted(df_raw["class"])
                chosen_cls = st.multiselect("Classes (empty = All)", classes_all, default=[], key=f"spear_cls_{hmetric}")
                spc = spearman_by_group(df_raw, model_metrics_all, [hmetric], "class", chosen_cls)
                spc["rho"] = spc["rho"].map(fmt3)
                st.dataframe(spc, use_container_width=True, hide_index=True)
                st.markdown("---")
                st.markdown("**By Model**")
                models_all = uniq_sorted(df_raw["model"])
                chosen_mdl = st.multiselect("Models (empty = All)", models_all, default=[], key=f"spear_mdl_{hmetric}")
                spm = spearman_by_group(df_raw, model_metrics_all, [hmetric], "model", chosen_mdl)
                spm["rho"] = spm["rho"].map(fmt3)
                st.dataframe(spm, use_container_width=True, hide_index=True)
    # --------------------- INTER-RATER (by class / by model ONLY) ---------------------
    with tab_ir:
        st.subheader("Inter-Rater Correlations (Spearman) — by Class / by Model")
        metric_to_wide = load_raters(RATER_GLOBS)
        # Quick diagnostics so you can see coverage:
        # overall average inter-rater correlation (no grouping) per rater, per metric
        if metric_to_wide:
            st.caption("Overall inter-rater averages (across all classes/models)")
            # NOTE(review): this `rows` shadows the module-level `rows` list used
            # while building df_raw earlier; harmless here but worth renaming.
            rows = []
            for metric, wide in metric_to_wide.items():
                avg = inter_rater_avg_overall(wide, min_overlap=2)
                if not avg.empty:
                    tmp = avg.copy()
                    tmp.insert(0, "metric", metric)
                    rows.append(tmp)
            if rows:
                overall_tbl = pd.concat(rows, ignore_index=True)
                overall_tbl["mean_rho"] = overall_tbl["mean_rho"].map(fmt3)
                st.dataframe(overall_tbl, use_container_width=True, hide_index=True)
            else:
                st.info("No rater pairs with enough overlapping videos to compute overall averages.")
        if not metric_to_wide:
            st.info("No rater files found. Expected patterns: 'raters*.json' or 'raters/*.json'.")
        else:
            meta = df_raw[["video_id", "class", "model"]].drop_duplicates()
            metrics_available = sorted(metric_to_wide.keys())
            chosen_metric = st.selectbox("Rater metric", metrics_available, key="ir_metric")
            wide = metric_to_wide.get(chosen_metric)
            if wide is None or wide.empty:
                st.info("Selected metric has no rater data.")
            else:
                sub_by_cls, sub_by_mdl = st.tabs(["By Class", "By Model"])
                with sub_by_cls:
                    classes_all = uniq_sorted(meta["class"])
                    chosen_cls_ir = st.multiselect("Classes (empty = All)", classes_all, default=[], key="ir_classes")
                    # Per-rater PAIRWISE correlations within each class
                    pairs_cls = inter_rater_pairs_by_group(wide, meta, by="class", values=chosen_cls_ir, min_overlap=2)
                    if pairs_cls.empty:
                        st.info("Not enough overlap to compute class-wise inter-rater correlations.")
                    else:
                        show_pairs = pairs_cls.copy()
                        show_pairs["rho"] = show_pairs["rho"].map(fmt3)
                        st.markdown("**Per-rater pairwise correlations**")
                        st.dataframe(show_pairs, use_container_width=True, hide_index=True)
                    # Per-rater AVERAGE correlation within each class
                    avg_cls = inter_rater_avg_by_group(wide, meta, by="class", values=chosen_cls_ir, min_overlap=2)
                    if avg_cls.empty:
                        st.info("Not enough overlap to compute class-wise inter-rater averages.")
                    else:
                        show_avg = avg_cls.copy()
                        show_avg["mean_rho"] = show_avg["mean_rho"].map(fmt3)
                        st.markdown("**Per-rater average correlation**")
                        st.dataframe(show_avg, use_container_width=True, hide_index=True)
                with sub_by_mdl:
                    models_all = uniq_sorted(meta["model"])
                    chosen_mdl_ir = st.multiselect("Models (empty = All)", models_all, default=[], key="ir_models")
                    # Per-rater PAIRWISE correlations within each model
                    pairs_mdl = inter_rater_pairs_by_group(wide, meta, by="model", values=chosen_mdl_ir, min_overlap=2)
                    if pairs_mdl.empty:
                        st.info("Not enough overlap to compute model-wise inter-rater correlations.")
                    else:
                        show_pairs = pairs_mdl.copy()
                        show_pairs["rho"] = show_pairs["rho"].map(fmt3)
                        st.markdown("**Per-rater pairwise correlations**")
                        st.dataframe(show_pairs, use_container_width=True, hide_index=True)
                    # Per-rater AVERAGE correlation within each model
                    avg_mdl = inter_rater_avg_by_group(wide, meta, by="model", values=chosen_mdl_ir, min_overlap=2)
                    if avg_mdl.empty:
                        st.info("Not enough overlap to compute model-wise inter-rater averages.")
                    else:
                        show_avg = avg_mdl.copy()
                        show_avg["mean_rho"] = show_avg["mean_rho"].map(fmt3)
                        st.markdown("**Per-rater average correlation**")
                        st.dataframe(show_avg, use_container_width=True, hide_index=True)
# Right column: pick any surviving video id, play it, and show its scores.
with right:
    st.subheader("Video Viewer")
    all_vids = df_raw["video_id"].tolist()
    if len(all_vids) == 0:
        st.info("No videos available after filtering rows with no human ratings.")
    else:
        selected_vid = st.selectbox("Choose a video id", sorted(all_vids))
        if selected_vid:
            # Video files live under video_dir, named exactly by video_id.
            vid_path = str(Path(video_dir) / selected_vid)
            st.video(vid_path)
            row_view = df_raw[df_raw["video_id"] == selected_vid].iloc[0]
            st.markdown("### Scores")
            # st.caption("Display: action_mean_intra is flipped once for readability; pairwise uses original (lower is better).")
            # Only non-missing metrics are shown, rounded to 3 decimals.
            model_metrics = {
                k: round(float(row_view[k]), 3)
                for k in ["action_mean_intra", "frame_diff_ord2"]
                if pd.notna(row_view[k])
            }
            human_metrics = {
                k: round(float(row_view[k]), 3)
                for k in ["human_action", "human_anatomy", "human_appearance", "human_motion"]
                if pd.notna(row_view[k])
            }
            st.write("**Model metrics:**")
            st.json(model_metrics)
            st.write("**Human scores:**")
            st.json(human_metrics)