AttrLLM

Sleeping

File size: 198,314 Bytes

import base64
import copy
import inspect
import io
import json
import os
import random
import re
from itertools import combinations
from pathlib import Path
from typing import Dict, Any, List, Tuple, Optional
import requests  # new

BACKEND_URL = os.getenv("ATTRLLM_BACKEND_URL", "http://127.0.0.1:8000")

_DEFAULT_GRADIO_DIR = Path(os.environ.get("GRADIO_TEMP_DIR", Path.cwd() / ".gradio_tmp"))
os.environ.setdefault("GRADIO_TEMP_DIR", str(_DEFAULT_GRADIO_DIR))
_DEFAULT_GRADIO_DIR.mkdir(parents=True, exist_ok=True)


def _get_request_timeout() -> float:
    value = os.getenv("ATTRLLM_REQUEST_TIMEOUT")
    if not value:
        return 900.0
    try:
        return float(value)
    except ValueError:
        return 900.0


def _env_flag(name: str, default: bool = False) -> bool:
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in {"1", "true", "yes", "y", "on"}


def _is_hf_spaces() -> bool:
    return bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE"))


def _supports_kwarg(callable_obj, kwarg_name: str) -> bool:
    """Return whether a callable appears to accept a named keyword argument."""
    try:
        return kwarg_name in inspect.signature(callable_obj).parameters
    except (TypeError, ValueError):
        return False


def _public_only_mode() -> bool:
    # Keep the text tab visible on Spaces unless explicitly overridden.
    return _env_flag("ATTRLLM_PUBLIC_ONLY", False)


def _mm_only_mode() -> bool:
    return _env_flag("ATTRLLM_MM_ONLY", False)


def _show_auxiliary_tabs() -> bool:
    return _env_flag("ATTRLLM_SHOW_AUX_TABS", False)


def _public_results_file(
    dataset_key: str,
    ex_id: str,
    scalarizer: str,
    level: str,
    method: str,
) -> Path:
    results_dir = _get_results_dir()
    return (
        results_dir
        / "public"
        / dataset_key
        / ex_id
        / scalarizer
        / level
        / f"{method}.json"
    )


def _reference_results_file(
    model_size: str,
    dataset_key: str,
    ex_id: str,
    scalarizer: str,
    level: str,
) -> Path:
    results_dir = _get_results_dir()
    return (
        results_dir
        / "reference_answer"
        / model_size
        / dataset_key
        / ex_id
        / scalarizer
        / f"{level}.json"
    )


def _find_available_model_size(
    dataset_key: str,
    ex_id: str,
    scalarizer: str,
    level: str,
) -> Optional[str]:
    for size in ("large", "medium", "small"):
        if _reference_results_file(size, dataset_key, ex_id, scalarizer, level).exists():
            return size
    return None


# Fallback order when requested (scalarizer, level) is not present (e.g. on HF Space with partial results).
_FALLBACK_SCALARIZER_LEVELS: List[Tuple[str, str]] = [
    ("geomean_jointprob", "word"),
    ("semantic_similarity", "word"),
    ("geomean_jointprob", "sentence"),
    ("semantic_similarity", "sentence"),
    ("geomean_jointprob", "paragraph"),
    ("semantic_similarity", "paragraph"),
]


def _find_any_available_result(
    dataset_key: str,
    ex_id: str,
    get_res: Any,
    method: str = "shapley",
) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[Dict]]:
    """Try (model_size, scalarizer, level) fallbacks; return (size, scalarizer, level, result_dict) or (None,)*4."""
    for size in ("small", "medium", "large"):
        for scalarizer, level in _FALLBACK_SCALARIZER_LEVELS:
            try:
                result = get_res(size, dataset_key, ex_id, scalarizer=scalarizer, feature_level=level) or {}
                payload = result.get(method, {})
                if payload and (payload.get("features") or payload.get("heatmap")):
                    return (size, scalarizer, level, result)
            except Exception:
                continue
    return (None, None, None, None)


def _parse_sparse_key(raw_key: str) -> Tuple[int, ...]:
    key = str(raw_key).strip()
    if not key:
        return ()
    return tuple(int(part) for part in key.split(",") if part != "")


def _normalize_public_payload_fallback(data: Dict[str, Any], method: str, top_k: int = 10) -> Dict[str, Any]:
    """Convert your JSON (features list + meta + mobius_dict) to UI display format. mobius_dict can be empty."""
    if not isinstance(data, dict):
        return {}
    features = data.get("features")
    mobius_raw = data.get("mobius_dict") if isinstance(data.get("mobius_dict"), dict) else {}
    if not isinstance(features, list) or not features:
        return {}
    method = (method or "shapley").lower()
    if method not in {"shapley", "banzhaf", "influence"}:
        method = "shapley"
    mobius_sparse: Dict[Tuple[int, ...], float] = {}
    for key, raw_val in mobius_raw.items():
        try:
            val = float(raw_val)
        except Exception:
            continue
        try:
            loc = _parse_sparse_key(str(key))
        except Exception:
            continue
        mobius_sparse[tuple(sorted(loc))] = val
    token_scores: Dict[str, float] = {}
    index_scores: Dict[int, float] = {}
    pairwise_acc: Dict[Tuple[int, int], float] = {}
    if mobius_sparse and mobius_to_shapley is not None:
        if method == "shapley":
            singleton_dict = mobius_to_shapley(mobius_sparse)
            pair_list = shapley_interactions(mobius_sparse, order=2, top_k=top_k) or []
        elif method == "banzhaf":
            singleton_dict = mobius_to_banzhaf(mobius_sparse)
            pair_list = banzhaf_interactions(mobius_sparse, order=2, top_k=top_k) or []
        elif mobius_to_influence is not None and influence_interactions is not None:
            singleton_dict = mobius_to_influence(mobius_sparse)
            pair_list = influence_interactions(mobius_sparse, order=2, top_k=top_k) or []
        else:
            singleton_dict = {}
            pair_list = []

        for loc, val in singleton_dict.items():
            if len(loc) != 1:
                continue
            idx = int(loc[0])
            if 0 <= idx < len(features):
                feat_name = str(features[idx])
                val_f = float(val)
                token_scores[feat_name] = token_scores.get(feat_name, 0.0) + val_f
                index_scores[idx] = index_scores.get(idx, 0.0) + val_f

        for loc, val in pair_list:
            if len(loc) != 2:
                continue
            i, j = int(loc[0]), int(loc[1])
            if 0 <= i < len(features) and 0 <= j < len(features):
                key = (i, j) if i <= j else (j, i)
                pairwise_acc[key] = float(val)
    else:
        # Best-effort fallback when attribution helpers are unavailable.
        for loc, val in mobius_sparse.items():
            k = len(loc)
            if k == 0:
                continue
            if method == "shapley":
                sw = 1.0 / float(k)
            elif method == "banzhaf":
                sw = 1.0 / float(2 ** (k - 1))
            else:
                sw = 1.0 / float(k)
            for idx in loc:
                if 0 <= idx < len(features):
                    feat_name = str(features[idx])
                    token_scores[feat_name] = token_scores.get(feat_name, 0.0) + sw * val
                    index_scores[idx] = index_scores.get(idx, 0.0) + sw * val
            if k >= 2:
                if method == "shapley":
                    pw = 1.0 / float(k - 1)
                elif method == "banzhaf":
                    pw = 1.0 / float(2 ** (k - 2))
                else:
                    pw = 1.0 / float(k - 1)
                for i, j in combinations(sorted(loc), 2):
                    pairwise_acc[(i, j)] = pairwise_acc.get((i, j), 0.0) + pw * val
    unique_feature_labels = [str(x) for x in features]
    sorted_pairs = sorted(pairwise_acc.items(), key=lambda kv: abs(kv[1]), reverse=True)
    if top_k and top_k > 0:
        sorted_pairs = sorted_pairs[:top_k]
    pairwise = {
        "%s|%s" % (unique_feature_labels[i], unique_feature_labels[j]): float(v)
        for (i, j), v in sorted_pairs
        if 0 <= i < len(unique_feature_labels) and 0 <= j < len(unique_feature_labels)
    }
    pairwise_interactions = [
        {"features": [unique_feature_labels[i], unique_feature_labels[j]], "value": float(v)}
        for (i, j), v in sorted_pairs
        if 0 <= i < len(unique_feature_labels) and 0 <= j < len(unique_feature_labels)
    ]
    normalized = dict(data)
    normalized["token_scores"] = token_scores
    normalized["pairwise"] = pairwise
    normalized["pairwise_interactions"] = pairwise_interactions
    normalized["features"] = [
        {"feature": str(features[i]), "value": float(index_scores.get(i, 0.0)), "index": i}
        for i in range(len(features))
    ]
    normalized["feature_texts"] = [str(x) for x in features]
    return normalized


def _public_get_model_answer_short_from_file(
    model_size: str,
    dataset: str,
    ex_id: str,
    scalarizer: str = "geomean_jointprob",
    feature_level: str = "word",
) -> Dict[str, Any]:
    """Load model_answer_short payload (the wrong-answer attribution) for the
    Public Mode dual-heatmap branch. Returns a per-method dict shaped like
    `_public_get_result_from_file`, or {} when the file is missing.
    """
    results_dir = _get_results_dir()
    path = (
        results_dir
        / "model_answer_short"
        / model_size
        / dataset
        / ex_id
        / scalarizer
        / f"{feature_level}.json"
    )
    if not path.exists():
        return {}
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        return {}
    if not isinstance(data, dict):
        return {}
    norm_s = _normalize_public_payload_fallback(copy.deepcopy(data), "shapley")
    norm_b = _normalize_public_payload_fallback(copy.deepcopy(data), "banzhaf")
    norm_i = _normalize_public_payload_fallback(copy.deepcopy(data), "influence")
    if not norm_s and not norm_b and not norm_i:
        return {}
    return {
        "shapley": norm_s,
        "banzhaf": norm_b,
        "influence": norm_i,
        "meta": {
            "dataset": dataset,
            "example_id": ex_id,
            "model_size": model_size,
            "target_mode": "model_answer_short",
            "source_layout": "results/model_answer_short/{model_size}/{dataset}/{example_id}/{scalarizer}/{feature_level}.json",
        },
    }


def _public_get_result_from_file(
    model_size: str,
    dataset: str,
    ex_id: str,
    scalarizer: Optional[str] = None,
    feature_level: Optional[str] = None,
) -> Dict[str, Any]:
    """Load reference_answer result from disk when loader.results.get_result_by_id is unavailable (e.g. on Space)."""
    scalarizer = (scalarizer or "").strip()
    feature_level = (feature_level or "").strip()
    if not scalarizer or not feature_level:
        return {}
    levels_to_try = [feature_level] + [l for l in ("word", "sentence", "paragraph") if l != feature_level]
    for lvl in levels_to_try:
        path = _reference_results_file(model_size, dataset, ex_id, scalarizer, lvl)
        if not path.exists() or os.getenv("SPACE_ID"):
            try:
                from loader.results import _maybe_download_from_space
                path = _maybe_download_from_space(path, force_download=True) or path
            except Exception:
                pass
        if not path.exists():
            continue
        try:
            with path.open("r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception:
            continue
        if not isinstance(data, dict):
            continue
        # Your JSON: features (list) + meta + mobius_dict (can be empty). Always convert to UI format.
        norm_s = _normalize_public_payload_fallback(copy.deepcopy(data), "shapley")
        norm_b = _normalize_public_payload_fallback(copy.deepcopy(data), "banzhaf")
        norm_i = _normalize_public_payload_fallback(copy.deepcopy(data), "influence")
        if not norm_s and not norm_b and not norm_i:
            continue
        return {
            "shapley": norm_s,
            "banzhaf": norm_b,
            "influence": norm_i,
            "meta": {
                "dataset": dataset,
                "example_id": ex_id,
                "model_size": model_size,
                "source_layout": "results/reference_answer/{model_size}/{dataset}/{example_id}/{scalarizer}/{feature_level}.json",
            },
        }
    return {}


_FALLBACK_DATASET_FILES: Dict[str, str] = {
    "bar_exam": "BarExam_qa.csv",
    "causal_judgment": "bbh_causal_judgement.csv",
    "snarks": "bbh_snarks.csv",
    "bbq_disamb": "BBQ_disamb.csv",
    "cnn_dailymail": "CNN_dailymail.csv",
    "drop": "drop.csv",
    "esnli": "eSNLI.csv",
    "fever": "fever.csv",
    "hotpot_qa": "hotpot_qa.csv",
    "medical_qa": "medical_qa.csv",
}


def _fallback_datasets_dir() -> Path:
    return (_REPO_ROOT / "datasets").resolve()


def _fallback_pick_first_nonempty(raw: Dict[str, str], candidates: List[str]) -> str:
    for c in candidates:
        val = raw.get(c)
        if val is not None and str(val).strip() != "":
            return str(val)
    return ""


def _fallback_load_dataset(dataset_key: str, max_rows: int = 10) -> List[Dict[str, str]]:
    import csv

    filename = _FALLBACK_DATASET_FILES.get(dataset_key)
    if not filename:
        return []
    path = _fallback_datasets_dir() / filename
    if not path.exists():
        return []

    rows: List[Dict[str, str]] = []
    with path.open("r", encoding="utf-8", errors="replace", newline="") as f:
        reader = csv.DictReader(f)
        for i, raw in enumerate(reader, start=1):
            ex_id = raw.get("id") or raw.get("example_id") or raw.get("uid") or f"example_{i}"
            context = _fallback_pick_first_nonempty(raw, [
                "Context", "context",
                "passage", "article", "story", "premise",
                "paragraph", "document", "sentence1", "sent1", "background",
            ])
            prompt = _fallback_pick_first_nonempty(raw, [
                "Prompt", "prompt",
                "question", "input", "query",
                "sentence2", "sent2", "hypothesis",
                "qa_question", "title",
            ])
            answer = _fallback_pick_first_nonempty(raw, [
                "Answer", "answer",
                "target", "gold", "label", "output", "reference",
                "highlights",
            ])
            ex = {
                "id": str(ex_id),
                "context": context,
                "prompt": prompt,
            }
            if answer:
                ex["answer"] = answer
            rows.append(ex)
            if len(rows) >= max_rows:
                break
    return rows


REQUEST_TIMEOUT = _get_request_timeout()

SCALARIZER_CHOICES = [
    ("Semantic Similarity (y vs y_S)", "semantic_similarity"),
    ("LogProb", "logprob"),
    ("JointProb", "jointprob"),
    ("GeoMean JointProb", "geomean_jointprob"),
    ("Half SimLog", "half_simlog"),
]

PUBLIC_SCALARIZER_CHOICES = [
    ("Semantic Similarity", "semantic_similarity"),
    ("Perplexity", "geomean_jointprob"),
]

DATASET_DISPLAY_LABELS = {
    "bar_exam": "Bar Exam Questions",
    "bbq_disamb": "BBQ Disambiguation",
    "causal_judgment": "Causal Judgment",
    "cnn_dailymail": "CNN / DailyMail Summaries",
    "drop": "DROP Reading Comprehension",
    "esnli": "e-SNLI Natural Language Inference",
    "fever": "FEVER Fact Checking",
    "hotpot_qa": "HotpotQA Multi-hop Questions",
    "medical_qa": "Medical Questions",
    "snarks": "Snarks",
}

import sys
_REPO_ROOT = Path(__file__).resolve().parents[1]
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))


def _get_results_dir() -> Path:
    """Resolve results directory: env, repo root, or on HF Space fallback to cwd/results."""
    env_dir = os.getenv("ATTRLLM_RESULTS_DIR")
    if env_dir:
        return Path(env_dir).resolve()
    default = (_REPO_ROOT / "results").resolve()
    if default.exists():
        return default
    if _is_hf_spaces():
        cwd_results = (Path.cwd() / "results").resolve()
        if cwd_results.exists():
            return cwd_results
    return default


import gradio as gr
from PIL import Image

from .components.model_selector import (
    create_model_selector,
    create_multimodal_model_selector,
    create_feature_level_selector,
    create_attribution_method_toggle,
)
from .components.example_browser import create_dataset_selector, create_example_browser
from .components.results_display import create_results_display, update
from .plotting.heatmap import create_interactive_text_heatmap
from .plotting.interactions import (
    plot_top_interactions,
    plot_interaction_matrix,
    create_interaction_token_view,
)
from .plotting.text_interactions import create_text_interaction_html
from .plotting.mm_interactions import create_multimodal_interaction_html
from .plotting.coalition_viewer import compute_coalition_viewer_data, render_coalition_viewer_html
from .build_info import BUILD_ID, BUILD_TS

# Medical image precomputed results (optional)
try:
    from .medical_loader import (
        MEDICAL_EXAMPLES,
        load_medical_example,
        get_masked_image_path,
        BENCHMARK_EXAMPLES,
        get_examples_by_modality,
        list_available_modalities,
        load_benchmark_example,
        extract_segment_regions,
    )
    from .plotting.medical_charts import (
        create_shapley_bar_chart,
        create_influence_heatmap,
        create_cross_modal_bar_chart,
        draw_grid_overlay,
        draw_segment_labels,
        generate_interpretation_text,
        rename_patch_labels,
        align_segments_to_reference,
        remap_region_values,
        merge_subword_token_values,
        _tok_to_word,
    )
    from .plotting.benchmark_interaction import create_benchmark_interaction_html
    _MEDICAL_AVAILABLE = True
except ImportError:
    _MEDICAL_AVAILABLE = False
    MEDICAL_EXAMPLES = {}
    BENCHMARK_EXAMPLES = {}

# MIMIC-CXR precomputed results (optional)
try:
    from .mimic_loader import (
        MIMIC_EXAMPLES,
        load_mimic_example,
        get_mimic_image_path,
    )
    _MIMIC_AVAILABLE = bool(MIMIC_EXAMPLES)
except ImportError:
    _MIMIC_AVAILABLE = False
    MIMIC_EXAMPLES = {}

# Dermoscopy ISIC precomputed results (optional)
try:
    from .isic_loader import (
        ISIC_EXAMPLES,
        load_isic_example,
        get_isic_image_path,
    )
    _ISIC_AVAILABLE = bool(ISIC_EXAMPLES)
except ImportError:
    _ISIC_AVAILABLE = False
    ISIC_EXAMPLES = {}

# MS-COCO precomputed results (optional)
try:
    from .coco_loader import COCO_EXAMPLES, load_coco_example, get_coco_masked_image_path
    _COCO_AVAILABLE = True
except ImportError:
    _COCO_AVAILABLE = False
    COCO_EXAMPLES = {}

# CLIP cross-modal pipeline for live compute (optional — runs on CPU)
try:
    from attribution.set_mm import (
        PipelineConfig,
        CrossModalCLIPScorer,
        ImageRegion,
        TokenPlayer,
        featurise,
        tokenise_caption,
        build_cross_modal_set_function,
        run_proxyspex,
        mobius_to_shapley,
        mobius_to_banzhaf,
        extract_interactions,
        extract_cross_per_token,
        apply_image_mask,
        render_overlay,
        render_segmentation_map,
        mask_token_ids,
    )
    _CLIP_PIPELINE_AVAILABLE = True
except ImportError:
    _CLIP_PIPELINE_AVAILABLE = False

# Module-level cache for CLIP scorers (keyed by model name)
_clip_scorer_cache: Dict[str, Any] = {}


def _raise_backend_error(resp: requests.Response, label: str) -> None:
    detail = resp.text
    try:
        detail = resp.json().get("detail", detail)
    except Exception:
        pass
    raise gr.Error(f"{label} failed ({resp.status_code}). {detail}")

# backend API imports
try:  # loader data APIs are required for public mode
    from loader.data import (
        get_example_by_id,
        get_examples,
        list_datasets,
        list_datasets_with_display_names,
        list_dataset_display_names,
        get_dataset_display_name,
        get_dataset_key_from_display_name,
    )
except Exception:  # pragma: no cover - optional at runtime
    get_example_by_id = None
    get_examples = None
    list_datasets = None
    list_datasets_with_display_names = None
    list_dataset_display_names = None
    get_dataset_display_name = None
    get_dataset_key_from_display_name = None

try:
    from loader.results import get_result_by_id
except Exception:  # pragma: no cover
    get_result_by_id = None

try:
    from loader.models import get_model
except Exception:  # pragma: no cover
    get_model = None

try:  # attribution stack is optional (dev mode)
    from attribution.masker import get_masker, mask_text
    from attribution.proxyspex import run_proxyspex
    from attribution.image_masker import supports_superpixel
    from attribution.utils import (
        influence_interactions,
        mobius_to_influence,
        mobius_to_shapley,
        shapley_interactions,
        mobius_to_banzhaf,
        banzhaf_interactions,
    )
except Exception:  # pragma: no cover
    get_masker = None
    mask_text = None
    run_proxyspex = None
    supports_superpixel = None
    influence_interactions = None
    mobius_to_influence = None
    mobius_to_shapley = None
    shapley_interactions = None
    mobius_to_banzhaf = None
    banzhaf_interactions = None


_ANSWER_FIELDS = (
    "correct_answer",
    "answer",
    "target",
    "completion",
    "label",
)
_ALLOWED_METHODS = {"shapley", "banzhaf", "influence"}
_ALLOWED_LEVELS = {"word", "sentence", "paragraph"}


def _ensure_backend(name: str, fn: Optional[Any]):
    if fn is None:
        raise RuntimeError(
            f"{name} is unavailable. Ensure the backend modules are installed and importable."
        )
    return fn


def _html_component(label: str, visible: bool = True) -> gr.HTML:
    try:
        return gr.HTML(label=label, sanitize_html=False, visible=visible)
    except TypeError:
        return gr.HTML(label=label, visible=visible)


def _encode_image_to_b64(image: Image.Image) -> str:
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def _extract_answer(record: Dict[str, Any]) -> str:
    for field in _ANSWER_FIELDS:
        val = record.get(field)
        if val:
            return str(val)
    return ""


def _coerce_feature_tuple(raw_key: Any) -> Tuple[str, ...]:
    if isinstance(raw_key, tuple):
        return tuple(str(item) for item in raw_key)
    if isinstance(raw_key, list):
        return tuple(str(item) for item in raw_key)
    if isinstance(raw_key, str):
        for sep in ("·", "|", ",", "×"):
            if sep in raw_key:
                parts = [chunk.strip() for chunk in raw_key.split(sep) if chunk.strip()]
                if parts:
                    return tuple(parts)
        return (raw_key.strip(),)
    return (str(raw_key),)


# def _normalize_interactions(raw: Any) -> List[Tuple[Tuple[str, ...], float]]:
#     items: List[Any]
#     if raw is None:
#         return []
#     if isinstance(raw, dict):
#         items = list(raw.items())
#     else:
#         items = list(raw)

#     normalized: List[Tuple[Tuple[str, ...], float]] = []
#     for feats, value in items:
#         try:
#             numeric = float(value)
#         except Exception:
#             continue
#         normalized.append((_coerce_feature_tuple(feats), numeric))
#     return normalized
def _normalize_interactions(raw: Any) -> List[Tuple[Tuple[str, ...], float]]:
    """
    Make a best-effort guess at interaction structure.

    Supported shapes:
      - { key: float }
      - { key: {"value": float, "score": ...} }
      - [ (key, float), ... ]
      - [ (key, {"value": float}), ... ]
      - [ {"features": [...], "value": float}, ... ]  (this is mostly handled elsewhere)
    """
    if raw is None:
        return []

    items: List[Any] = []

    if isinstance(raw, dict):
        # e.g. { key: float } or { key: {"value": ...} }
        for k, v in raw.items():
            items.append((k, v))
    elif isinstance(raw, list):
        items = list(raw)
    else:
        return []

    normalized: List[Tuple[Tuple[str, ...], float]] = []

    for item in items:
        # Case 1: dict-style item with explicit fields
        if isinstance(item, dict):
            feats = item.get("features") or item.get("indices") or item.get("pair") or item.get("key")
            val = item.get("value", item.get("score", 0.0))
        else:
            # Case 2: tuple/list pair (feats, value)
            try:
                feats, val = item
            except Exception:
                continue

        # If value itself is a dict, dig out "value" / "score"
        if isinstance(val, dict):
            val = val.get("value", val.get("score", 0.0))

        try:
            numeric = float(val)
        except Exception:
            continue

        feats_tuple = _coerce_feature_tuple(feats)
        if feats_tuple:
            normalized.append((feats_tuple, numeric))

    return normalized

def _resolve_marginals(payload: Dict[str, Any]) -> Dict[str, float]:
    for key in ("marginals", "token_scores", "values", "scores"):
        data = payload.get(key)
        if isinstance(data, dict):
            normalized: Dict[str, float] = {}
            for k, v in data.items():
                try:
                    normalized[str(k)] = float(v)
                except Exception:
                    continue
            return normalized
    return {}


def _resolve_features(payload: Dict[str, Any], marginals: Dict[str, float]) -> List[str]:
    features = payload.get("features")
    if isinstance(features, list):
        return [str(f) for f in features]
    if marginals:
        return list(marginals.keys())
    return []


def _extract_interactions_from_response(
    data_int: Dict[str, Any],
    method: str,
    features: List[str],
) -> List[Tuple[Tuple[str, ...], float]]:
    inter_list: List[Tuple[Tuple[str, ...], float]] = []

    method_key = (method or "shapley").lower()
    method_block = data_int.get(method_key) or data_int

    raw_interactions = None
    if isinstance(method_block, dict):
        for key in ("interactions", "pairwise_interactions", "interactions_2", "pairwise", "data"):
            if key in method_block:
                raw_interactions = method_block.get(key)
                break
        if raw_interactions is None:
            raw_interactions = method_block
    else:
        raw_interactions = method_block

    # List-of-dicts or list-of-pairs shape
    if isinstance(raw_interactions, list) and raw_interactions:
        if isinstance(raw_interactions[0], dict):
            for item in raw_interactions:
                feats = (
                    item.get("feature_list")
                    or item.get("features")
                    or item.get("indices")
                    or item.get("pair")
                    or []
                )
                val = None
                for key_val in ("value", "score", "attribution", "weight"):
                    if key_val in item:
                        try:
                            val = float(item[key_val])
                            break
                        except Exception:
                            continue
                if val is None:
                    continue

                if isinstance(feats, list) and feats and isinstance(feats[0], int):
                    feat_names = tuple(
                        features[i] for i in feats
                        if isinstance(i, int) and 0 <= i < len(features)
                    )
                else:
                    feat_names = _coerce_feature_tuple(feats)

                if feat_names:
                    inter_list.append((feat_names, val))
        elif (
            isinstance(raw_interactions[0], (list, tuple))
            and len(raw_interactions[0]) == 2
        ):
            for item in raw_interactions:
                if not isinstance(item, (list, tuple)) or len(item) != 2:
                    continue
                feats_raw, val_raw = item
                try:
                    val = float(val_raw)
                except Exception:
                    continue

                feat_names: Tuple[str, ...] = ()
                if isinstance(feats_raw, (list, tuple)) and feats_raw:
                    if all(isinstance(i, int) for i in feats_raw):
                        feat_names = tuple(
                            features[i] for i in feats_raw
                            if 0 <= i < len(features)
                        )
                    else:
                        feat_names = _coerce_feature_tuple(feats_raw)
                elif isinstance(feats_raw, str):
                    feat_names = _coerce_feature_tuple(feats_raw)

                if feat_names:
                    inter_list.append((feat_names, val))

    # Dict shape, e.g. {"(0,2)": 528.0, ...}
    if not inter_list and isinstance(raw_interactions, dict):
        metadata_keys = {"method", "order", "scalarizer", "embedding_model"}
        for k, v in raw_interactions.items():
            if str(k) in metadata_keys:
                continue
            val = None
            if isinstance(v, (int, float)):
                val = float(v)
            elif isinstance(v, dict):
                for key_val in ("value", "score", "attribution", "weight"):
                    if key_val in v:
                        try:
                            val = float(v[key_val])
                            break
                        except Exception:
                            continue
            if val is None:
                continue

            k_str = str(k)
            idxs = []
            try:
                import re as _re
                idxs = [int(x) for x in _re.findall(r"\d+", k_str)]
            except Exception:
                idxs = []

            if idxs:
                names: List[str] = []
                for idx in idxs:
                    if 0 <= idx < len(features):
                        names.append(features[idx])
                if names:
                    feat_names = tuple(names)
                else:
                    feat_names = _coerce_feature_tuple(k_str)
            else:
                feat_names = _coerce_feature_tuple(k_str)

            inter_list.append((feat_names, val))

    # Flatten numerics arbitrarily (last resort)
    if not inter_list and raw_interactions is not None:
        flat: List[Tuple[Tuple[str, ...], float]] = []

        def _collect(obj: Any, prefix: Tuple[str, ...] = ()) -> None:
            if isinstance(obj, (int, float)):
                flat.append((prefix or ("<interaction>",), float(obj)))
            elif isinstance(obj, list):
                for i, item in enumerate(obj):
                    _collect(item, prefix + (f"[{i}]",))
            elif isinstance(obj, dict):
                for kk, vv in obj.items():
                    _collect(vv, prefix + (str(kk),))

        _collect(raw_interactions)
        inter_list = flat

    return inter_list


def _labels_from_regions(regions: List[Dict[str, Any]]) -> List[str]:
    labels: List[str] = [""] * len(regions)
    for region in regions:
        try:
            idx = int(region.get("index", 0))
        except Exception:
            continue
        if idx < 0 or idx >= len(labels):
            continue
        labels[idx] = str(region.get("label") or f"Region {idx + 1}")
    for idx, label in enumerate(labels):
        if not label:
            labels[idx] = f"Region {idx + 1}"
    return labels


def _interaction_dicts_to_pairs(
    interactions: List[Dict[str, Any]],
    labels: List[str],
    *,
    order: int | None = None,
) -> List[Tuple[Tuple[str, ...], float]]:
    pairs: List[Tuple[Tuple[str, ...], float]] = []
    for item in interactions:
        indices = item.get("indices")
        if not indices:
            continue
        if order is not None and len(indices) != order:
            continue
        try:
            value = float(item.get("value", 0.0))
        except Exception:
            continue
        feats = tuple(labels[int(i)] for i in indices if int(i) < len(labels))
        if feats:
            pairs.append((feats, value))
    return pairs


def _interaction_dicts_to_table(
    interactions: List[Dict[str, Any]],
    labels: List[str],
) -> List[List[Any]]:
    rows: List[List[Any]] = []
    for item in interactions:
        indices = item.get("indices")
        if not indices:
            continue
        try:
            value = float(item.get("value", 0.0))
        except Exception:
            continue
        feats = [labels[int(i)] for i in indices if int(i) < len(labels)]
        if feats:
            rows.append([" × ".join(feats), value, len(indices)])
    return rows


def _feature_display_label(
    feature: Dict[str, Any],
    region_labels: List[str],
) -> str:
    raw = str(feature.get("feature", ""))
    modality = feature.get("modality") or ""
    ref_index = int(feature.get("ref_index", 0))
    label = raw.split(":", 1)[1] if ":" in raw else raw
    if modality == "image":
        if 0 <= ref_index < len(region_labels):
            return region_labels[ref_index]
    return label or raw


def _extract_feature_series(payload: Dict[str, Any]) -> Tuple[List[str], List[float]]:
    """
    Try to recover an ordered pair of (feature labels, values) from a backend payload.
    This keeps duplicates in order (appending suffixes later) so word-level tokens
    don't collapse to a single entry.
    """
    features: List[str] = []
    values: List[float] = []

    feature_entries = payload.get("features")
    if isinstance(feature_entries, list) and feature_entries and isinstance(feature_entries[0], dict):
        for idx, entry in enumerate(feature_entries, start=1):
            raw_feat = (
                entry.get("feature")
                or entry.get("token")
                or entry.get("text")
                or entry.get("label")
                or ""
            )
            if not raw_feat:
                raw_feat = f"feature_{idx}"

            val = entry.get("value")
            if val is None:
                for key in ("score", "attribution", "weight"):
                    if key in entry:
                        val = entry[key]
                        break
            try:
                values.append(float(val if val is not None else 0.0))
            except Exception:
                values.append(0.0)
            features.append(str(raw_feat))

    if not features:
        heat = payload.get("heatmap") or {}
        tokens = heat.get("tokens") or heat.get("features")
        scores = heat.get("values") or heat.get("scores")
        if isinstance(tokens, list) and isinstance(scores, list) and len(tokens) == len(scores):
            features = [str(token if token is not None else f"feature_{idx + 1}") for idx, token in enumerate(tokens)]
            tmp_vals: List[float] = []
            for score in scores:
                try:
                    tmp_vals.append(float(score))
                except Exception:
                    tmp_vals.append(0.0)
            values = tmp_vals

    if not features:
        marginals = _resolve_marginals(payload)
        if marginals:
            features = list(marginals.keys())
            values = [float(marginals[key]) for key in features]

    if not features:
        return [], []

    unique_features = _assign_unique_labels(features)
    return unique_features, values


def _resolve_interactions(payload: Dict[str, Any], order: int) -> List[Tuple[Tuple[str, ...], float]]:
    candidates = [f"interactions_{order}"]
    if order == 2:
        candidates += ["pairwise", "pairwise_interactions", "interactions2"]
    elif order == 3:
        candidates += ["higher_order", "triple_interactions", "interactions3"]

    for key in candidates:
        raw = payload.get(key)
        normalized = _normalize_interactions(raw)
        if normalized:
            return normalized
    return []


def _fallback_pairwise_from_values(
    features: List[str],
    values: List[float],
    max_edges: int = 40,
) -> List[Tuple[Tuple[str, ...], float]]:
    """
    Generate synthetic pairwise links by connecting neighboring tokens.
    Used when the backend provides no explicit interactions.
    """
    n = min(len(features), len(values))
    if n < 2:
        return []
    edges: List[Tuple[Tuple[str, ...], float]] = []
    for idx in range(n - 1):
        weight = 0.5 * (values[idx] + values[idx + 1])
        edges.append(((features[idx], features[idx + 1]), weight))
    edges.sort(key=lambda item: abs(item[1]), reverse=True)
    return edges[:max_edges]


def _resolve_pairwise(
    payload: Dict[str, Any],
    features: Optional[List[str]] = None,
    feature_values: Optional[List[float]] = None,
) -> List[Tuple[Tuple[str, ...], float]]:
    """Convenience helper to always pull order-2 interactions if present."""
    pairwise = _resolve_interactions(payload, 2)
    if pairwise:
        return pairwise
    # Some payloads store generic "interactions" lists that mix orders.
    mixed = payload.get("interactions")
    normalized = _normalize_interactions(mixed)
    if normalized:
        return [item for item in normalized if len(item[0]) == 2]
    if features and feature_values:
        return _fallback_pairwise_from_values(features, feature_values)
    return []


def _normalize_method(method: Optional[str]) -> str:
    method = (method or "shapley").lower()
    return method if method in _ALLOWED_METHODS else "shapley"


def _normalize_level(level: Optional[str]) -> str:
    level = (level or "sentence").lower()
    return level if level in _ALLOWED_LEVELS else "sentence"


def _normalize_model_size(model_size: Optional[str]) -> str:
    raw = (model_size or "small").strip()
    lowered = raw.lower()
    if lowered in {"small", "medium", "large"}:
        return lowered
    if "small" in lowered:
        return "small"
    if "medium" in lowered:
        return "medium"
    if "large" in lowered:
        return "large"
    return "small"


def _assign_unique_labels(chunks: List[str]) -> List[str]:
    counts: Dict[str, int] = {}
    labels: List[str] = []
    for idx, chunk in enumerate(chunks):
        normalized = " ".join((chunk or "").split())
        if not normalized:
            normalized = f"<chunk {idx + 1}>"
        counts[normalized] = counts.get(normalized, 0) + 1
        suffix = f" ({counts[normalized]})" if counts[normalized] > 1 else ""
        labels.append(f"{normalized}{suffix}")
    return labels


def _strip_occurrence_suffix(text: str) -> str:
    text = text or ""
    if text.endswith(")") and " (" in text:
        base, _, tail = text.rpartition(" (")
        if tail[:-1].isdigit():
            return base
    return text


def _pairwise_to_index_interactions(
    pairwise: List[Tuple[Tuple[str, ...], float]],
    features: List[str],
) -> List[Dict[str, Any]]:
    feature_index = {feat: idx for idx, feat in enumerate(features)}
    base_index: Dict[str, int] = {}
    for idx, feat in enumerate(features):
        base_index.setdefault(_strip_occurrence_suffix(feat), idx)

    interactions: List[Dict[str, Any]] = []
    for feats, val in pairwise:
        if len(feats) != 2:
            continue
        a, b = feats
        a_idx = None
        b_idx = None
        if isinstance(a, (int, float)) and isinstance(b, (int, float)):
            a_idx = int(a)
            b_idx = int(b)
        else:
            try:
                a_idx = int(str(a))
                b_idx = int(str(b))
            except ValueError:
                a_idx = feature_index.get(a) or base_index.get(_strip_occurrence_suffix(str(a)))
                b_idx = feature_index.get(b) or base_index.get(_strip_occurrence_suffix(str(b)))
        if a_idx is None or b_idx is None:
            continue
        if a_idx < 0 or b_idx < 0 or a_idx >= len(features) or b_idx >= len(features):
            continue
        interactions.append({"indices": [a_idx, b_idx], "value": float(val)})
    return interactions


def _locate_spans(text: str, segments: List[str]) -> List[Tuple[int, int]]:
    spans: List[Tuple[int, int]] = []
    cursor = 0
    for segment in segments:
        if not segment:
            continue
        idx = text.find(segment, cursor)
        if idx == -1:
            idx = cursor
        end = idx + len(segment)
        spans.append((idx, end))
        cursor = end
    return spans


def _chunk_text_for_visualization(
    context: str,
    level: str,
) -> Tuple[List[str], List[Tuple[int, int]], str]:
    """
    Split input text into feature chunks and spans for visualization.
    Falls back to the demo text if context is empty.
    """
    text = context or _DEMO_TEXT
    level = _normalize_level(level)

    if level == "word":
        matches = list(re.finditer(r"\S+", text))
        chunks = [m.group(0) for m in matches]
        spans = [(m.start(), m.end()) for m in matches]
    elif level == "paragraph":
        parts = [seg for seg in re.split(r"\n\s*\n+", text) if seg.strip()]
        spans = _locate_spans(text, parts)
        chunks = parts[: len(spans)]
    else:  # sentence-level default
        parts = [seg for seg in re.split(r"(?<=[.!?])\s+", text) if seg.strip()]
        spans = _locate_spans(text, parts)
        chunks = parts[: len(spans)]

    if not chunks:
        chunks = [text]
        spans = [(0, len(text))]

    features = _assign_unique_labels(chunks)
    return features, spans, text


def _generate_synthetic_marginals(
    features: List[str],
    rng: random.Random,
) -> Dict[str, float]:
    if not features:
        return {}
    max_len = max(len(f) for f in features) or 1
    marginals: Dict[str, float] = {}
    denom = max(1, len(features) - 1)
    for idx, feat in enumerate(features):
        length_factor = len(feat) / max_len
        position_factor = 1 - (idx / denom if denom else 0)
        noise = rng.uniform(-0.25, 0.25)
        value = (length_factor - 0.5) * 0.6 + (position_factor - 0.5) * 0.4 + noise
        marginals[feat] = round(value, 4)
    return marginals


def _generate_synthetic_interactions(
    features: List[str],
    marginals: Dict[str, float],
    rng: random.Random,
) -> Dict[int, List[Tuple[Tuple[str, ...], float]]]:
    interactions: Dict[int, List[Tuple[Tuple[str, ...], float]]] = {2: [], 3: []}
    for i in range(len(features) - 1):
        pair = (features[i], features[i + 1])
        base = (marginals.get(pair[0], 0.0) + marginals.get(pair[1], 0.0)) / 2
        interactions[2].append((pair, round(base + rng.uniform(-0.1, 0.1), 4)))
    for i in range(len(features) - 2):
        triple = (features[i], features[i + 1], features[i + 2])
        base = sum(marginals.get(feat, 0.0) for feat in triple) / 3
        interactions[3].append((triple, round(base + rng.uniform(-0.1, 0.1), 4)))
    return interactions


def _synthetic_attribution_pipeline(
    context: str,
    prompt: str,
    answer: str,
    *,
    method: str,
    level: str,
    order: int,
    reason: Optional[str] = None,
) -> Tuple[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]:
    text_source = context or prompt or answer or _DEMO_TEXT
    features, spans, text = _chunk_text_for_visualization(text_source, level)
    seed = hash((text_source, method, level, order)) & 0xFFFFFFFF
    rng = random.Random(seed)
    marginals = _generate_synthetic_marginals(features, rng)
    interactions = _generate_synthetic_interactions(features, marginals, rng)

    html = None
    if len(spans) == len(features):
        html = create_interactive_text_heatmap(
            text,
            spans,
            [marginals.get(f, 0.0) for f in features],
            method=method,
        )

    meta = {
        "mode": "synthetic",
        "reason": reason or "Attribution backend unavailable; showing mock data.",
        "method": method,
        "feature_level": level,
        "interaction_order": order,
        "feature_count": len(features),
    }

    inter_list = interactions.get(order, [])
    pairwise_for_tokens = interactions.get(2, []) if order != 2 else inter_list
    if not pairwise_for_tokens:
        pairwise_for_tokens = _fallback_pairwise_from_values(
            features,
            [marginals.get(f, 0.0) for f in features],
        )
    text_interaction_html = create_text_interaction_html(
        features,
        [marginals.get(f, 0.0) for f in features],
        _pairwise_to_index_interactions(pairwise_for_tokens, features),
        method=method,
        top_k=20,
        threshold=0.0,
    )
    figs = {
        "interactions": plot_top_interactions(inter_list, order=order, method=method),
    }

    return update(
        figs=figs,
        meta=meta,
        html=html,
        interaction_text_html=text_interaction_html,
        scoring_target_source="answer_input" if answer else "model_output",
        scoring_target_text=answer or "",
        reference_answer=answer or "",
        unmasked_answer="",
        debug_scores=None,
        scalarizer_used="logprob",
        score_full=None,
        score_empty=None,
        y_len_tokens=None,
    )


# def _compute_live_attributions(**kwargs) -> Tuple[Any, Any, Any, Any, Any]:
#     """
#     Placeholder for the real ProxySPEX + perplexity pipeline.
#     Raises until the attribution backend is implemented.
#     """
#     missing = [
#         name
#         for name, fn in {
#             "get_model": get_model,
#             "get_masker": get_masker,
#             "mask_text": mask_text,
#             "run_proxyspex": run_proxyspex,
#             "mobius_to_shapley": mobius_to_shapley,
#             "mobius_to_banzhaf": mobius_to_banzhaf,
#             "shapley_interactions": shapley_interactions,
#             "banzhaf_interactions": banzhaf_interactions,
#         }.items()
#         if fn is None
#     ]
#     if missing:
#         raise RuntimeError(
#             "Missing backend dependencies: " + ", ".join(sorted(missing))
#         )
#     raise NotImplementedError(
#         "Live attribution pipeline not wired yet. Integrate once ProxySPEX is ready."
#     )
def _compute_live_attributions(
    *,
    context: str,
    prompt: str,
    correct_answer: str,
    model_size: str,
    level: str,
    method: str,
    order: int,
    scalarizer: str = "logprob",
    embedding_model: str | None = None,
    progress=None,
) -> Tuple[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]:
    """
    Call the FastAPI /api/attributions + /api/interactions backends and turn
    the JSON into figures / table / HTML for Gradio.
    This version is very defensive and tries hard to extract interactions
    from whatever shape the backend returns.
    """
    method = _normalize_method(method)
    level = _normalize_level(level)
    order = 3 if int(order or 2) >= 3 else 2

    context = context or ""
    prompt = prompt or ""
    correct_answer = correct_answer or ""
    text_source = context or prompt or correct_answer or _DEMO_TEXT

    payload = {
        "context": context,
        "answer": correct_answer,
        "reference_answer": correct_answer,
        "prompt": prompt,
        "method": method,
        "mask_level": level,
        "order": int(order),
        "model_size": model_size,
        "scalarizer": scalarizer,
        "embedding_model": embedding_model,
        "debug": False,
    }

    if progress is not None:
        progress(0.1, desc="Calling attribution backend")

    # ---------- 1. /api/attributions  ----------
    url_attr = BACKEND_URL.rstrip("/") + "/api/attributions"
    try:
        resp_attr = requests.post(url_attr, json=payload, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.ReadTimeout as exc:
        raise gr.Error(
            "Attribution request timed out. The backend may still be running. "
            "Consider reducing feature granularity or set ATTRLLM_REQUEST_TIMEOUT to a higher value."
        ) from exc
    if resp_attr.status_code >= 400:
        _raise_backend_error(resp_attr, "Attribution request")
    data_attr = resp_attr.json()

    if progress is not None:
        progress(0.35, desc="Received attribution payload")

    # ---------- 2. FEATURES + MARGINAL VALUES ----------
    features, feature_values = _extract_feature_series(data_attr)
    if not features:
        features = ["<no features>"]
        feature_values = [0.0]
    marginals = {feat: float(feature_values[idx]) for idx, feat in enumerate(features)}

    # ---------- 3. /api/interactions  ----------
    if progress is not None:
        progress(0.45, desc="Calling interactions backend")

    url_int = BACKEND_URL.rstrip("/") + "/api/interactions"
    try:
        resp_int = requests.post(url_int, json=payload, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.ReadTimeout as exc:
        raise gr.Error(
            "Interaction request timed out. The backend may still be running. "
            "Consider reducing order or set ATTRLLM_REQUEST_TIMEOUT to a higher value."
        ) from exc
    if resp_int.status_code >= 400:
        _raise_backend_error(resp_int, "Interaction request")
    data_int = resp_int.json()

    # DEBUG: see top-level keys
    print("data_int keys:", list(data_int.keys()))

    inter_list_all = _extract_interactions_from_response(data_int, method, features)
    pairwise_for_network = [item for item in inter_list_all if len(item[0]) == 2]
    used_pairwise_fallback = False

    inter_list = inter_list_all
    if inter_list:
        filtered: List[Tuple[Tuple[str, ...], float]] = []
        for feats, val in inter_list:
            if len(feats) == order:
                filtered.append((feats, val))
        if filtered:
            inter_list = filtered

    if order != 2 and not pairwise_for_network:
        try:
            payload_pair = dict(payload)
            payload_pair["order"] = 2
            try:
                resp_pair = requests.post(url_int, json=payload_pair, timeout=REQUEST_TIMEOUT)
            except requests.exceptions.ReadTimeout as exc:
                raise gr.Error(
                    "Interaction request timed out. The backend may still be running. "
                    "Consider reducing order or set ATTRLLM_REQUEST_TIMEOUT to a higher value."
                ) from exc
            if resp_pair.status_code >= 400:
                _raise_backend_error(resp_pair, "Interaction request")
            data_pair = resp_pair.json()
            pairwise_for_network = [
                item for item in _extract_interactions_from_response(data_pair, method, features)
                if len(item[0]) == 2
            ]
        except Exception as exc:
            print("Pairwise interaction fetch failed:", exc)
    if not pairwise_for_network:
        if method == "influence":
            pairwise_for_network = []
        else:
            used_pairwise_fallback = True
            pairwise_for_network = _fallback_pairwise_from_values(features, feature_values)

    print("LIVE features:", features)
    print("LIVE inter_list (first 3):", inter_list[:3])
    if method == "influence":
        top_singletons = sorted(
            list(zip(features, feature_values)),
            key=lambda kv: abs(float(kv[1])),
            reverse=True,
        )[:10]
        top_pairs = sorted(
            pairwise_for_network,
            key=lambda kv: abs(float(kv[1])),
            reverse=True,
        )[:10]
        print(
            "[influence-ui-debug] "
            f"pairwise_source={'fallback_neighbors' if used_pairwise_fallback else 'backend'} "
            f"feature_count={len(features)} pair_count={len(pairwise_for_network)}"
        , flush=True)
        print("[influence-ui-debug] top_singletons:", top_singletons, flush=True)
        print("[influence-ui-debug] top_pairwise:", top_pairs, flush=True)

    text_interaction_html = create_text_interaction_html(
        features,
        feature_values,
        _pairwise_to_index_interactions(pairwise_for_network, features),
        method=method,
        top_k=20,
        threshold=0.0,
    )

    # ---------- 4. RESCALE VERY SMALL VALUES ----------
    max_abs = max((abs(v) for v in marginals.values()), default=0.0)
    scale = 1.0
    if 0 < max_abs < 1e-3:
        scale = 1e3
    if scale != 1.0:
        marginals = {k: v * scale for k, v in marginals.items()}
        inter_list = [(feats, val * scale) for feats, val in inter_list]
        feature_values = [val * scale for val in feature_values]

    # ---------- 5. INLINE TEXT HEATMAP ----------
    spans = None
    masking = data_attr.get("masking") or data_attr.get("mask") or {}
    if isinstance(masking, dict):
        spans = masking.get("feature_spans") or masking.get("spans")

    html = None
    if spans and len(spans) == len(feature_values):
        html = create_interactive_text_heatmap(
            context or text_source,
            spans,
            feature_values,
            method=method,
        )

    # ---------- 6. PLOTS + TABLES + META ----------
    inter_fig = plot_top_interactions(inter_list, order=order, method=method)

    if progress is not None:
        progress(0.8, desc="Rendering visualizations")

    y_len_tokens = data_attr.get("y_len_tokens")
    scoring_target_source = data_attr.get("scoring_target_source") or "model_output"
    scoring_target_text = data_attr.get("scoring_target_text")
    if scoring_target_text is None:
        scoring_target_text = correct_answer or data_attr.get("y_full") or ""

    meta = {
        "mode": "live",
        "backend_url_attr": url_attr,
        "backend_url_int": url_int,
        "method": method,
        "feature_level": level,
        "interaction_order": order,
        "model_size": model_size,
        "feature_count": len(features),
        "max_abs_value": max_abs,
        "scale_applied": scale,
        "scalarizer": data_attr.get("scalarizer_used", payload.get("scalarizer")),
        "scoring_target_source": scoring_target_source,
        "scoring_target_text_preview": str(scoring_target_text)[:200],
        "score_full": data_attr.get("score_full"),
        "score_empty": data_attr.get("score_empty"),
        "y_len_tokens": y_len_tokens,
        "logprob_full": data_attr.get("logprob_full"),
        "logprob_empty": data_attr.get("logprob_empty"),
        "min_logprob_seen": data_attr.get("min_logprob_seen"),
        "reference_answer_received": data_attr.get("reference_answer_received"),
        "answer_received": data_attr.get("answer_received"),
        "raw_attr_keys": list(data_attr.keys()),
        "raw_int_keys": list(data_int.keys()),
    }
    reference_answer = correct_answer
    unmasked_answer = data_attr.get("y_full") or data_attr.get("unmasked_answer") or ""
    debug_scores = data_attr.get("debug_scores") or None

    interaction_chips_html = create_interaction_token_view(
        features,
        feature_values,
        pairwise_for_network,
        method=method,
        layout="sentence" if level == "sentence" else "token",
    )
    figs = {
        "interactions": inter_fig,
    }

    if progress is not None:
        progress(1.0, desc="Done")

    return update(
        figs=figs,
        meta=meta,
        html=html,
        interaction_html=interaction_chips_html,
        interaction_text_html=text_interaction_html,
        scoring_target_source=scoring_target_source,
        scoring_target_text=str(scoring_target_text),
        reference_answer=reference_answer,
        unmasked_answer=unmasked_answer,
        debug_scores=debug_scores,
        scalarizer_used=data_attr.get("scalarizer_used", payload.get("scalarizer")),
        score_full=data_attr.get("score_full"),
        score_empty=data_attr.get("score_empty"),
        y_len_tokens=y_len_tokens,
    )


# ═══════════════════════════════════════════════════════════════════════════
# CLIP-based live compute helpers  (Custom Image / Custom Multimodal tabs)
# ═══════════════════════════════════════════════════════════════════════════

_CLIP_MODEL_MAP: Dict[str, str] = {
    "CLIP (openai/clip-vit-base-patch32)": "openai/clip-vit-base-patch32",
    "BiomedCLIP": "microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224",
}


def _get_clip_scorer(model_display: str) -> "CrossModalCLIPScorer":
    """Load or return cached CLIP scorer.  Apply dot-mask fix."""
    model_name = _CLIP_MODEL_MAP.get(model_display, model_display)
    if model_name in _clip_scorer_cache:
        return _clip_scorer_cache[model_name]

    import torch as _torch
    device = "cuda" if _torch.cuda.is_available() else "cpu"
    cfg = PipelineConfig(clip_model_name=model_name, device=device)
    scorer = CrossModalCLIPScorer(cfg)

    # Dot-mask fix: use "." (ID 269) instead of EOS to avoid CLIP argmax-pooling shift
    _neutral_ids = scorer.processor.tokenizer.encode(".", add_special_tokens=False)
    if _neutral_ids:
        scorer.unk_token_id = _neutral_ids[0]

    _clip_scorer_cache[model_name] = scorer
    return scorer


def _run_clip_attribution(
    image: Image.Image,
    caption: str,
    clip_model: str,
    seg_mode: str,
    grid_size: int,
    method: str,
    seed: int,
    progress=None,
) -> Dict[str, Any]:
    """
    Core CLIP cross-modal attribution pipeline shared by both custom tabs.

    Returns a dict with regions, token_players, values, interactions,
    overlay images, masked images, and influence matrix.
    """
    import numpy as np

    if not _CLIP_PIPELINE_AVAILABLE:
        raise gr.Error(
            "CLIP pipeline not available. Ensure attribution.set_mm is importable "
            "(requires transformers, lightgbm, numpy, scipy)."
        )

    method = _normalize_method(method)

    # Check LaMa availability, fall back to blur
    try:
        from simple_lama_inpainting import SimpleLama  # noqa: F401
        mask_style = "lama"
    except ImportError:
        mask_style = "blur"

    import torch as _torch
    device = "cuda" if _torch.cuda.is_available() else "cpu"
    model_name = _CLIP_MODEL_MAP.get(clip_model, clip_model)

    cfg = PipelineConfig(
        mode="patch" if seg_mode == "Patch Grid" else "unsam",
        grid_size=int(grid_size),
        mask_style=mask_style,
        clip_model_name=model_name,
        max_tokens=15,
        method=method,
        max_order=2,
        top_k_interactions=15,
        random_seed=int(seed),
        device=device,
    )

    if progress is not None:
        progress(0.05, desc="Loading CLIP model...")
    scorer = _get_clip_scorer(clip_model)

    # Step 1: Featurise image
    if progress is not None:
        progress(0.10, desc="Segmenting image...")
    try:
        regions = featurise(image, cfg)
    except Exception as exc:
        if seg_mode != "Patch Grid":
            raise gr.Error(
                f"UnSAM segmentation failed: {exc}. "
                "Try using 'Patch Grid' instead."
            ) from exc
        raise

    # Step 2: Tokenise caption
    if progress is not None:
        progress(0.15, desc="Tokenising caption...")
    token_players, full_token_ids = tokenise_caption(
        caption, scorer.processor, cfg, offset=len(regions)
    )

    n_img = len(regions)
    n_tok = len(token_players)
    n_total = n_img + n_tok

    # Step 3: Build cross-modal set function
    if progress is not None:
        progress(0.20, desc="Building set function...")
    game = build_cross_modal_set_function(
        image, regions, token_players, full_token_ids, scorer, cfg
    )

    # Step 4: Run ProxySPEX (run_proxyspex wraps the set function for 2D batch calls)
    _raw_labels = [r.label for r in regions] + [tp.label for tp in token_players]
    # all_labels is rebuilt after tok_vals disambiguation below; _raw_labels for ProxySPEX
    if progress is not None:
        progress(0.25, desc=f"Running ProxySPEX (n={n_total})...")
    mobius = run_proxyspex(game, _raw_labels, max_order=2, seed=int(seed))

    # Step 5: Derive Shapley/Banzhaf values
    if progress is not None:
        progress(0.70, desc="Computing values...")
    if method == "banzhaf":
        values = mobius_to_banzhaf(mobius)
    else:
        values = mobius_to_shapley(mobius)

    # Split into image and token values
    # Disambiguate duplicate labels (e.g., two "the" tokens) by appending #N
    img_vals = {regions[i].label: float(values.get((i,), 0.0)) for i in range(n_img)}
    tok_vals = {}
    _tok_label_counts: Dict[str, int] = {}
    for j, tp in enumerate(token_players):
        label = tp.label
        count = _tok_label_counts.get(label, 0)
        _tok_label_counts[label] = count + 1
        key = f"{label}#{count}" if count > 0 else label
        tok_vals[key] = float(values.get((n_img + j,), 0.0))

    # Rebuild all_labels with disambiguated token labels
    all_labels = list(img_vals.keys()) + list(tok_vals.keys())

    # Step 6: Extract interactions
    interactions = extract_interactions(mobius, order=2, top_k=15)
    cross_per_token, cross_global_top5 = extract_cross_per_token(mobius, n_img, n_tok)

    # Image-image and token-token interactions
    img_filter = lambda loc: all(i < n_img for i in loc)
    tok_filter = lambda loc: all(i >= n_img for i in loc)
    interactions_img = extract_interactions(mobius, order=2, top_k=10, player_filter=img_filter)
    interactions_tok = extract_interactions(mobius, order=2, top_k=10, player_filter=tok_filter)

    # Cross-modal interactions (for bar chart)
    cross_filter = lambda loc: any(i < n_img for i in loc) and any(i >= n_img for i in loc)
    cross_interactions = extract_interactions(mobius, order=2, top_k=15, player_filter=cross_filter)

    # Step 7: Build influence matrix [n_img x n_tok]
    influence_matrix = np.zeros((n_img, n_tok))
    for loc, val in cross_interactions:
        img_indices = [i for i in loc if i < n_img]
        tok_indices = [i - n_img for i in loc if i >= n_img]
        for ii in img_indices:
            for tj in tok_indices:
                if 0 <= ii < n_img and 0 <= tj < n_tok:
                    influence_matrix[ii, tj] += float(val)

    # Step 8: Render overlay and segmap
    if progress is not None:
        progress(0.75, desc="Rendering overlay...")
    img_val_list = [float(values.get((i,), 0.0)) for i in range(n_img)]
    overlay_rgba = render_overlay(image, regions, img_val_list)
    base_rgba = image.convert("RGBA")
    overlay_img = Image.alpha_composite(base_rgba, overlay_rgba).convert("RGB")
    overlay_b64 = _encode_image_to_b64(overlay_img)

    segmap_img = render_segmentation_map(image, regions)
    segmap_b64 = _encode_image_to_b64(segmap_img)

    # Step 9: Build segment bboxes (% coordinates for interactive view)
    w, h = image.size
    segment_bboxes = []
    for reg in regions:
        x0, y0, x1, y1 = reg.bbox
        segment_bboxes.append({
            "x0_pct": 100.0 * x0 / w,
            "y0_pct": 100.0 * y0 / h,
            "w_pct": 100.0 * (x1 - x0) / w,
            "h_pct": 100.0 * (y1 - y0) / h,
            "cx_pct": 100.0 * (x0 + x1) / 2 / w,
            "cy_pct": 100.0 * (y0 + y1) / 2 / h,
        })

    # Step 10: Generate masked images for browser
    if progress is not None:
        progress(0.80, desc="Generating masked images...")
    masked_images: Dict[str, Image.Image] = {}
    for i, reg in enumerate(regions):
        # "removed" — mask only this region
        coal_removed = [1] * n_img
        coal_removed[i] = 0
        removed_img = apply_image_mask(
            image, regions, coal_removed, style=cfg.mask_style,
            blur_radius=cfg.blur_radius, cfg=cfg,
        )
        masked_images[f"{reg.label} removed"] = removed_img

    if progress is not None:
        progress(0.90, desc="Done computing.")

    return {
        "regions": regions,
        "token_players": token_players,
        "all_labels": all_labels,
        "image_values": img_vals,
        "token_values": tok_vals,
        "values": values,
        "mobius": mobius,
        "interactions": interactions,
        "interactions_img": interactions_img,
        "interactions_tok": interactions_tok,
        "cross_interactions": cross_interactions,
        "cross_per_token": cross_per_token,
        "cross_global_top5": cross_global_top5,
        "influence_matrix": influence_matrix,
        "overlay_img": overlay_img,
        "overlay_b64": overlay_b64,
        "segmap_img": segmap_img,
        "segmap_b64": segmap_b64,
        "segment_bboxes": segment_bboxes,
        "masked_images": masked_images,
        "method": method,
        "n_img": n_img,
        "n_tok": n_tok,
        "mask_style": mask_style,
        "seg_mode": seg_mode,
        "grid_size": int(grid_size),
    }


def _build_masked_choices(masked_images: Dict[str, Image.Image]) -> List[str]:
    """Return sorted list of masked image choice labels."""
    return sorted(masked_images.keys())


def _on_masked_image_select(choice: str, state: Dict) -> Optional[Image.Image]:
    """Return the masked PIL image for a dropdown choice."""
    if not state or not choice:
        return None
    return state.get(choice)


def _compute_image_attributions_clip(
    image: Image.Image,
    caption: str,
    clip_model: str,
    seg_mode: str,
    grid_size: int,
    method: str,
    seed: int,
    progress=None,
):
    """Compute image-only attributions using CLIP pipeline.  Returns UI outputs."""
    if image is None:
        raise gr.Error("Please upload an image.")
    if not caption or not caption.strip():
        raise gr.Error("Please provide a caption or description.")

    result = _run_clip_attribution(
        image, caption.strip(), clip_model, seg_mode, int(grid_size),
        method, int(seed or 0), progress=progress,
    )

    # Build region bar chart
    seg_labels = list(result["image_values"].keys())
    seg_vals = list(result["image_values"].values())
    region_chart = create_shapley_bar_chart(seg_labels, seg_vals, "Region Attribution")

    # Build masked image state and dropdown choices
    masked_state = result["masked_images"]
    choices = _build_masked_choices(masked_state)

    meta = {
        "mode": "image_clip",
        "method": result["method"],
        "clip_model": clip_model,
        "seg_mode": result["seg_mode"],
        "grid_size": result["grid_size"],
        "mask_style": result["mask_style"],
        "n_regions": result["n_img"],
        "n_tokens": result["n_tok"],
    }

    if progress is not None:
        progress(1.0, desc="Done")

    # Returns: original_img, overlay_img, region_chart, masked_dropdown, masked_img, masked_state, meta
    return (
        image,
        result["overlay_img"],
        region_chart,
        gr.update(choices=choices, value=choices[0] if choices else None),
        masked_state.get(choices[0]) if choices else None,
        masked_state,
        meta,
    )


def _compute_mm_attributions_clip(
    image: Image.Image,
    caption: str,
    clip_model: str,
    seg_mode: str,
    grid_size: int,
    method: str,
    seed: int,
    progress=None,
):
    """Compute cross-modal attributions using CLIP pipeline.  Returns UI outputs."""
    import numpy as np

    if image is None:
        raise gr.Error("Please upload an image.")
    if not caption or not caption.strip():
        raise gr.Error("Please provide a caption or description.")

    result = _run_clip_attribution(
        image, caption.strip(), clip_model, seg_mode, int(grid_size),
        method, int(seed or 0), progress=progress,
    )

    all_labels = result["all_labels"]
    n_img = result["n_img"]
    n_tok = result["n_tok"]

    # Region bar chart
    seg_labels = list(result["image_values"].keys())
    seg_vals = list(result["image_values"].values())
    region_chart = create_shapley_bar_chart(seg_labels, seg_vals, "Region Attribution")

    # Token bar chart
    tok_labels = list(result["token_values"].keys())
    tok_vals = list(result["token_values"].values())
    token_chart = create_shapley_bar_chart(tok_labels, tok_vals, "Token Attribution")

    # Cross-modal bar chart — expects List[Tuple[Tuple[str, str], float]]
    cross_pairs = []
    for loc, val in result["cross_interactions"]:
        img_parts = [all_labels[i] for i in loc if i < n_img]
        tok_parts = [all_labels[i] for i in loc if i >= n_img]
        if img_parts and tok_parts:
            cross_pairs.append(((img_parts[0], tok_parts[0]), float(val)))
    cross_chart = create_cross_modal_bar_chart(cross_pairs, "Cross-Modal Interactions", top_k=15)

    # Influence heatmap
    heatmap = create_influence_heatmap(
        seg_labels, tok_labels, result["influence_matrix"],
        "Influence Heatmap (Regions x Tokens)"
    )

    # Interactive cross-modal HTML view
    # Build clip_summary dict matching what benchmark_interaction expects
    clip_summary = {
        "image_region_values": [
            {"label": seg_labels[i], "value": float(seg_vals[i])} for i in range(n_img)
        ],
        "token_values": [
            {"label": tok_labels[j], "value": float(tok_vals[j])} for j in range(n_tok)
        ],
        "cross_modal_interactions": [
            {"label": " x ".join(all_labels[i] for i in loc), "value": float(val)}
            for loc, val in result["cross_global_top5"]
        ],
    }
    image_b64 = _encode_image_to_b64(image)
    interaction_html = create_benchmark_interaction_html(
        image_b64=image_b64,
        clip_summary=clip_summary,
        vllm_logprob=None,
        caption=caption,
        all_cross_modal_pairs=[
            {
                "pair": (
                    all_labels[loc[0]] if loc[0] < n_img else all_labels[loc[1]],
                    all_labels[loc[1]] if loc[1] >= n_img else all_labels[loc[0]],
                ),
                "value": float(val),
            }
            for loc, val in result["cross_interactions"]
        ],
        segmap_b64=result["segmap_b64"],
        overlay_b64=result["overlay_b64"],
        segment_bboxes=result["segment_bboxes"],
        label_map_b64="",
        image_width=image.size[0],
        image_height=image.size[1],
        title="Cross-Modal Interaction View",
    )

    # Masked image state
    masked_state = result["masked_images"]
    choices = _build_masked_choices(masked_state)

    meta = {
        "mode": "multimodal_clip",
        "method": result["method"],
        "clip_model": clip_model,
        "seg_mode": result["seg_mode"],
        "grid_size": result["grid_size"],
        "mask_style": result["mask_style"],
        "n_regions": n_img,
        "n_tokens": n_tok,
    }

    if progress is not None:
        progress(1.0, desc="Done")

    # Returns: original_img, overlay_img, region_chart, token_chart,
    #          cross_chart, heatmap, interaction_html,
    #          masked_dropdown, masked_img, masked_state, meta
    return (
        image,
        result["overlay_img"],
        region_chart,
        token_chart,
        cross_chart,
        heatmap,
        interaction_html,
        gr.update(choices=choices, value=choices[0] if choices else None),
        masked_state.get(choices[0]) if choices else None,
        masked_state,
        meta,
    )

def on_select_example(
    dataset,
    ex_id,
    model_size,
    order,
    method,
    scalarizer=None,
    feature_level=None,
):
    """
    Public mode handler: load a precomputed example and render figures.

    Args:
        dataset (str): dataset name
        ex_id (str): example id
        model_size (str): "small" | "medium" | "large"
        order (int): interaction order (2 or 3)
        method (str): "shapley" | "banzhaf" | "influence"

    Returns:
        tuple ordered as:
            (
                context,
                prompt,
                answer,
                interactions_plot,
                interactions_token_html,
                text_html,
                meta_json,
            )
    """
    get_res = get_result_by_id if get_result_by_id is not None else _public_get_result_from_file
    model_size = _normalize_model_size(model_size)
    example = {"context": "", "prompt": "", "answer": ""}
    if get_example_by_id is not None:
        try:
            example = get_example_by_id(dataset, ex_id)
        except Exception:
            pass
    result = get_res(
        model_size,
        dataset,
        ex_id,
        scalarizer=scalarizer,
        feature_level=feature_level,
    ) or {}
    payload = result.get(method, {})
    # Your JSON: features (list of strings) + mobius_dict. Convert to UI format if needed.
    feats = payload.get("features") if isinstance(payload, dict) else None
    if isinstance(feats, list) and feats and not isinstance(feats[0], dict):
        payload = _normalize_public_payload_fallback(payload, method)

    features, feature_values = _extract_feature_series(payload)
    if not features:
        features = ["<no features>"]
        feature_values = [0.0]
    # Influence scores are non-negative (squared Fourier coefficients)
    if method == "influence":
        feature_values = [abs(v) for v in feature_values]
    marginals = {feat: float(feature_values[idx]) for idx, feat in enumerate(features)}
    interactions = _resolve_interactions(payload, order)
    if method == "influence":
        interactions = [(feats, abs(val)) for feats, val in interactions]
        pairwise = _resolve_interactions(payload, 2)
        if not pairwise:
            mixed = payload.get("interactions")
            normalized = _normalize_interactions(mixed)
            if normalized:
                pairwise = [item for item in normalized if len(item[0]) == 2]
        pairwise = [(feats, abs(val)) for feats, val in pairwise]
    else:
        pairwise = _resolve_pairwise(payload, features, feature_values)
    if method == "influence":
        top_singletons = sorted(
            list(zip(features, feature_values)),
            key=lambda kv: abs(float(kv[1])),
            reverse=True,
        )[:10]
        top_pairs = sorted(
            pairwise,
            key=lambda kv: abs(float(kv[1])),
            reverse=True,
        )[:10]
        print(
            "[influence-ui-debug][public] "
            f"dataset={dataset} ex_id={ex_id} feature_count={len(features)} pair_count={len(pairwise)}"
        , flush=True)
        print("[influence-ui-debug][public] top_singletons:", top_singletons, flush=True)
        print("[influence-ui-debug][public] top_pairwise:", top_pairs, flush=True)
    payload_level = (
            payload.get("mask_level")
        or payload.get("feature_level")
        or payload.get("level")
        or (result.get("meta", {}) if isinstance(result, dict) else {}).get("feature_level")
    )
    layout_mode = "sentence" if _normalize_level(payload_level) == "sentence" else "token"

    inter = plot_top_interactions(interactions, order=order, method=method)

    spans = payload.get("feature_spans") or payload.get("spans")
    if not spans:
        # Precomputed JSON payloads may not include explicit spans.
        # Reconstruct spans from context + feature level so Text View can render.
        _, fallback_spans, _ = _chunk_text_for_visualization(
            example.get("context", ""),
            _normalize_level(payload_level),
        )
        if fallback_spans and len(fallback_spans) == len(feature_values):
            spans = fallback_spans
    html = None
    if spans and len(spans) == len(feature_values):
        html = create_interactive_text_heatmap(
            example.get("context", ""),
            spans,
            feature_values,
            method=method,
        )

    # Compute the wrong-answer payload up-front so the dual heatmap branch
    # (which rewrites text_interaction_html below) has it ready.
    _wrong_values_for_dual: Optional[List[float]] = None
    _wrong_pairwise_for_dual: Optional[List[Any]] = None
    _wrong_features_for_dual: Optional[List[str]] = None
    try:
        from visualization.wrong_answer_examples import has_wrong_answer_view as _has_wrong_view
    except Exception:
        _has_wrong_view = None
    _is_wrong_view = bool(
        _has_wrong_view is not None
        and html is not None
        and spans
        and _has_wrong_view(dataset, ex_id, scalarizer or "", feature_level or "")
    )
    if _is_wrong_view:
        wrong_result = _public_get_model_answer_short_from_file(
            model_size, dataset, ex_id, scalarizer or "geomean_jointprob",
            feature_level or "word",
        )
        wrong_payload = wrong_result.get(method, {}) if wrong_result else {}
        wrong_features_local, wrong_values_local = _extract_feature_series(wrong_payload)
        if method == "influence":
            wrong_values_local = [abs(v) for v in wrong_values_local]
        if wrong_features_local and len(wrong_values_local) == len(feature_values):
            # Build wrong-side pairwise edges, mirroring the GT logic above.
            wrong_pairwise = _resolve_interactions(wrong_payload, 2)
            if not wrong_pairwise:
                mixed = wrong_payload.get("interactions") if isinstance(wrong_payload, dict) else None
                normalized = _normalize_interactions(mixed)
                if normalized:
                    wrong_pairwise = [item for item in normalized if len(item[0]) == 2]
            if method == "influence":
                wrong_pairwise = [(f, abs(v)) for f, v in (wrong_pairwise or [])]
            else:
                # Best-effort: if no explicit pairwise, derive from wrong feature values
                if not wrong_pairwise:
                    wrong_pairwise = _resolve_pairwise(wrong_payload, wrong_features_local, wrong_values_local)
            _wrong_values_for_dual = wrong_values_local
            _wrong_pairwise_for_dual = wrong_pairwise or []
            _wrong_features_for_dual = wrong_features_local
        else:
            _is_wrong_view = False

    meta = {
        "dataset": dataset,
        "example_id": ex_id,
        "model_size": model_size,
        "method": method,
        "order": order,
        "feature_count": len(features),
        "payload_keys": sorted(payload.keys()),
    }
    if "meta" in result:
        meta["source_meta"] = result["meta"]

    interaction_chips_html = create_interaction_token_view(
        features,
        feature_values,
        pairwise or [item for item in interactions if len(item[0]) == 2],
        method=method,
        layout=layout_mode,
    )
    text_interaction_html = create_text_interaction_html(
        features,
        feature_values,
        _pairwise_to_index_interactions(
            pairwise or [item for item in interactions if len(item[0]) == 2],
            features,
        ),
        method=method,
        top_k=20,
        threshold=0.0,
    )

    # For the 30 wrong-answer examples, replace the visible Text Interaction
    # view with two chip+arc panels side-by-side (vs Ground Truth | vs Model
    # Answer (Wrong)) plus a single shared legend + RAW TEXT below.
    if (
        _is_wrong_view
        and _wrong_values_for_dual is not None
        and _wrong_features_for_dual is not None
    ):
        gt_view = create_text_interaction_html(
            features,
            feature_values,
            _pairwise_to_index_interactions(
                pairwise or [item for item in interactions if len(item[0]) == 2],
                features,
            ),
            method=method,
            top_k=20,
            threshold=0.0,
        )
        wrong_view = create_text_interaction_html(
            _wrong_features_for_dual,
            _wrong_values_for_dual,
            _pairwise_to_index_interactions(
                _wrong_pairwise_for_dual or [],
                _wrong_features_for_dual,
            ),
            method=method,
            top_k=20,
            threshold=0.0,
        )

        method_label = (method or "attribution").title()
        gt_max_abs = max((abs(v) for v in feature_values), default=0.0) or 1.0
        wrong_max_abs = max((abs(v) for v in _wrong_values_for_dual), default=0.0) or 1.0

        from html import escape as _escape
        raw_text = example.get("context", "") or ""
        raw_text_html = _escape(raw_text).replace("\n", "<br/>") if raw_text else ""

        # CSS scoped to .dual-heatmap-row hides the per-side legend so we can
        # show one shared legend below; tightens the per-card max width so two
        # views fit comfortably side-by-side.
        dual_css = (
            "<style>"
            ".dual-heatmap-row{display:grid;grid-template-columns:1fr 1fr;gap:16px;align-items:start;}"
            ".dual-heatmap-row .text-interaction-side-panel{display:none !important;}"
            ".dual-heatmap-row .text-interaction-root{flex:1 1 100%;}"
            ".dual-heatmap-row .text-interaction-card{flex:1 1 100%;max-width:100%;}"
            ".dual-heatmap-shared{margin-top:16px;display:grid;grid-template-columns:1fr 1fr;gap:16px;}"
            ".dual-heatmap-shared .shared-card{background:#f8f5ff;border:1px solid #e2d6f3;"
            "border-radius:12px;padding:12px 14px;box-shadow:0 4px 10px rgba(80,50,140,0.05);}"
            ".dual-heatmap-shared .shared-legend-bar{display:flex;align-items:center;gap:8px;margin:6px 0;}"
            ".dual-heatmap-shared .shared-legend-label{font-size:12px;color:#6f5a72;text-transform:uppercase;letter-spacing:.04em;}"
            ".dual-heatmap-shared .shared-legend-gradient{flex:1;height:10px;border-radius:999px;"
            "background:linear-gradient(90deg,#dd1313 0%,#d8c6f0 50%,#4a1c87 100%);}"
            ".dual-heatmap-shared .shared-legend-note{font-size:12px;color:#6f5a72;margin:4px 0 0 0;}"
            ".dual-heatmap-shared .shared-raw-text p{margin:6px 0 0 0;line-height:1.5;color:#3a2b4a;}"
            "@media (prefers-color-scheme: dark){"
            ".dual-heatmap-shared .shared-card{background:#111a2b;border-color:#33435f;}"
            ".dual-heatmap-shared .shared-legend-label,.dual-heatmap-shared .shared-legend-note,"
            ".dual-heatmap-shared .shared-raw-text p{color:#a9b6cb;}}"
            "@media (max-width: 900px){"
            ".dual-heatmap-row,.dual-heatmap-shared{grid-template-columns:1fr;}}"
            "</style>"
        )

        shared_block = (
            '<div class="dual-heatmap-shared">'
            '<div class="shared-card">'
            f'<strong>{method_label} legend</strong>'
            '<div class="shared-legend-bar">'
            '<span class="shared-legend-label">Negative</span>'
            '<div class="shared-legend-gradient"></div>'
            '<span class="shared-legend-label">Positive</span>'
            '</div>'
            '<p class="shared-legend-note">'
            f'Ground-truth max |value| = {gt_max_abs:.4f}; '
            f'wrong-answer max |value| = {wrong_max_abs:.4f}. '
            'Hover tokens for exact scores.'
            '</p>'
            '</div>'
            '<div class="shared-card shared-raw-text">'
            '<strong>Raw text</strong>'
            f'<p>{raw_text_html or "<em>No context available.</em>"}</p>'
            '</div>'
            '</div>'
        )

        text_interaction_html = (
            f'{dual_css}'
            '<div class="dual-heatmap-row">'
            '<div>'
            '<div class="heatmap-caption" '
            'style="font-weight:600;margin-bottom:6px;">vs Ground Truth</div>'
            f'{gt_view}'
            '</div>'
            '<div>'
            '<div class="heatmap-caption" '
            'style="font-weight:600;margin-bottom:6px;">vs Model Answer (Wrong)</div>'
            f'{wrong_view}'
            '</div>'
            '</div>'
            f'{shared_block}'
        )
        print(
            f"[wrong-answer] dual chip+lines view rendered for {dataset}/{ex_id} "
            f"(gt_features={len(features)} wrong_features={len(_wrong_features_for_dual)} "
            f"gt_pairs={len(pairwise or [])} wrong_pairs={len(_wrong_pairwise_for_dual or [])})",
            flush=True,
        )

    figs = {
        "interactions": inter,
    }
    outputs = update(
        figs=figs,
        meta=meta,
        html=html,
        interaction_html=interaction_chips_html,
        interaction_text_html=text_interaction_html,
    )
    return (
        example.get("context", ""),
        example.get("prompt", ""),
        _extract_answer(example),
        *outputs,
    )


def on_click_compute(
    context,
    prompt,
    correct_answer,
    model_size,
    level,
    method,
    scalarizer,
    embedding_model,
    progress=gr.Progress(track_tqdm=True),
):
#     """
#     Developer mode handler: compute (or mock) attributions and render figures.
#     """
#     method = _normalize_method(method)
#     level = _normalize_level(level)
#     order = 3 if int(order or 2) >= 3 else 2

#     context = context or ""
#     prompt = prompt or ""
#     correct_answer = correct_answer or ""

#     try:
#         return _compute_live_attributions(
#             context=context,
#             prompt=prompt,
#             correct_answer=correct_answer,
#             model_size=model_size,
#             level=level,
#             method=method,
#             order=order,
#             progress=progress,
#         )
#     except Exception as exc:  # pragma: no cover - best-effort fallback
#         return _synthetic_attribution_pipeline(
#             context,
#             prompt,
#             correct_answer,
#             method=method,
#             level=level,
#             order=order,
#             reason=str(exc),
#         )

    method = _normalize_method(method)
    level = _normalize_level(level)
    model_size = _normalize_model_size(model_size)
    order = 2

    context = context or ""
    prompt = prompt or ""
    correct_answer = correct_answer or ""

    return _compute_live_attributions(
        context=context,
        prompt=prompt,
        correct_answer=correct_answer,
        model_size=model_size,
        level=level,
        method=method,
        order=order,
        scalarizer=scalarizer,
        embedding_model=embedding_model,
        progress=progress,
    )


# ---------------------------------------------------------------------------
# Multimodal precomputed example handlers (MIMIC-CXR, ISIC, MS-COCO)
# ---------------------------------------------------------------------------

# ── MIMIC-CXR Tab Handlers ────────────────────────────────────────────────

_MIMIC_METHOD_NAMES = [
    "BiomedCLIP Cross-Modal",
    "LLaVA-Med Log-Prob",
    "LLaVA-Med Generation",
]


def _on_select_mimic_example(example_id, method_label: str = "Influence"):
    """Load a MIMIC-CXR example and return data for the MIMIC tab."""
    # 15 outputs: caption, original, findings, interpretation,
    #   biomedclip_overlay, biomedclip_token_plot, biomedclip_region_plot,
    #   llavamed_unsam_lp_overlay, llavamed_unsam_lp_plot,
    #   llavamed_unsam_gen_overlay, llavamed_unsam_gen_plot,
    #   biomedclip_interaction_html, meta, results_state, compare_method_a
    n_outputs = 15
    empty = tuple([""] + [None] * (n_outputs - 1))
    if not _MIMIC_AVAILABLE or not example_id:
        return empty

    method = (method_label or "Influence").lower()
    method_display = "Influence" if method == "influence" else "Shapley"

    _base_chart = globals()["create_shapley_bar_chart"]
    _base_html = globals()["create_benchmark_interaction_html"]

    def create_shapley_bar_chart(labels, values, title="Shapley Values", **kwargs):  # noqa: F811
        kwargs.setdefault("method_label", method_display)
        return _base_chart(labels, values, title.replace("Shapley", method_display), **kwargs)

    def create_benchmark_interaction_html(**kwargs):  # noqa: F811
        kwargs.setdefault("method_label", method_display)
        return _base_html(**kwargs)

    try:
        data = load_mimic_example(example_id, method=method)
    except Exception:
        return empty

    caption = data.get("caption", "")
    findings = data.get("findings", "")
    original_img = data.get("original_image_path")
    meta = data.get("meta", {})
    category = meta.get("category", "")

    # ── BiomedCLIP ───────────────────────────────────────────────────
    biomedclip_overlay_labeled = None
    biomedclip_region_plot = None
    biomedclip_token_plot = None
    biomedclip_interaction_html = ""

    segment_bboxes = None
    label_map_b64 = ""

    if data.get("has_biomedclip"):
        bc_summary = data["biomedclip"]["summary"]
        bc_overlay_raw = data["biomedclip"]["image_paths"].get("overlay")
        bc_original = data["biomedclip"]["image_paths"].get("original", "")
        bc_segmap = data["biomedclip"]["image_paths"].get("segmap", "")
        bc_n_segs = len(bc_summary.get("image_region_values", []))

        bc_bboxes, bc_label_map_b64 = None, ""
        if bc_original and bc_segmap and bc_n_segs > 0:
            try:
                bc_bboxes, bc_label_map_b64 = extract_segment_regions(
                    bc_original, bc_segmap, bc_n_segs)
            except Exception:
                pass

        if bc_overlay_raw:
            biomedclip_overlay_labeled = draw_segment_labels(
                bc_overlay_raw, bc_summary.get("image_region_values", []),
                segment_bboxes=bc_bboxes,
                label_map_b64=bc_label_map_b64,
                original_path=bc_original)

        bc_r_labels = [v["label"] for v in bc_summary.get("image_region_values", [])]
        bc_r_values = [v["value"] for v in bc_summary.get("image_region_values", [])]
        if bc_r_labels:
            biomedclip_region_plot = create_shapley_bar_chart(
                bc_r_labels, bc_r_values, "BiomedCLIP — Image Region Shapley Values")

        bc_merged = merge_subword_token_values(bc_summary.get("token_values", []), caption)
        bc_t_labels = [v["label"] for v in bc_merged]
        bc_t_values = [v["value"] for v in bc_merged]
        if bc_t_labels:
            biomedclip_token_plot = create_shapley_bar_chart(
                bc_t_labels, bc_t_values, "BiomedCLIP — Caption Word Shapley Values")

        # Interactive cross-modal HTML
        bc_image_b64 = data["biomedclip"].get("image_b64", {}).get("original", "")
        bc_overlay_b64 = data["biomedclip"].get("image_b64", {}).get("overlay", "")
        bc_all_cross = data["biomedclip"].get("all_cross_modal_pairs", [])
        bc_segmap_b64 = ""
        if bc_segmap:
            import os as _os
            if _os.path.exists(bc_segmap):
                import base64 as _b64
                with open(bc_segmap, "rb") as _f:
                    bc_segmap_b64 = _b64.b64encode(_f.read()).decode("ascii")
        biomedclip_interaction_html = create_benchmark_interaction_html(
            image_b64=bc_image_b64,
            clip_summary=bc_summary,
            vllm_logprob=None,
            caption=caption,
            all_cross_modal_pairs=bc_all_cross,
            segmap_b64=bc_segmap_b64,
            overlay_b64=bc_overlay_b64,
            segment_bboxes=bc_bboxes,
            label_map_b64=bc_label_map_b64,
            title="BiomedCLIP Cross-Modal Interaction View — click segments or words",
        )
        segment_bboxes = bc_bboxes
        label_map_b64 = bc_label_map_b64

    # ── LLaVA-Med UnSAM ─────────────────────────────────────────────
    # Draw two separate overlays — one colored by Log-Prob values, one by
    # Generation values — since the signs often differ between methods.
    llavamed_unsam_lp_overlay_img = None
    llavamed_unsam_gen_overlay_img = None
    llavamed_unsam_lp_plot = None
    llavamed_unsam_gen_plot = None

    if data.get("has_llavamed_unsam_logprob") or data.get("has_llavamed_unsam_gen"):
        lu_segmap = data.get("llavamed_unsam_segmap_path", "")
        lu_original = data.get("llavamed_unsam_original_path", "") or (original_img or "")
        lu_bboxes, lu_label_map_b64 = None, ""
        if lu_segmap and lu_original:
            n_lu_segs = 0
            if data.get("has_llavamed_unsam_logprob"):
                n_lu_segs = len(data["llavamed_unsam_logprob"].get("image_region_values", []))
            elif data.get("has_llavamed_unsam_gen"):
                n_lu_segs = len(data["llavamed_unsam_gen"].get("image_region_values", []))
            if n_lu_segs > 0:
                try:
                    lu_bboxes, lu_label_map_b64 = extract_segment_regions(
                        lu_original, lu_segmap, n_lu_segs)
                except Exception:
                    pass

        if data.get("has_llavamed_unsam_logprob"):
            lu_lp = rename_patch_labels(
                data["llavamed_unsam_logprob"].get("image_region_values", []))
            if lu_lp:
                llavamed_unsam_lp_plot = create_shapley_bar_chart(
                    [v["label"] for v in lu_lp],
                    [v["value"] for v in lu_lp],
                    "LLaVA-Med Log-Prob — Segment Shapley Values",
                )
                overlay_path = data["llavamed_unsam_logprob"].get("overlay_path", "")
                if overlay_path:
                    llavamed_unsam_lp_overlay_img = draw_segment_labels(
                        overlay_path, lu_lp,
                        segment_bboxes=lu_bboxes,
                        label_map_b64=lu_label_map_b64,
                        original_path=lu_original)

        if data.get("has_llavamed_unsam_gen"):
            lu_gen = rename_patch_labels(
                data["llavamed_unsam_gen"].get("image_region_values", []))
            if lu_gen:
                llavamed_unsam_gen_plot = create_shapley_bar_chart(
                    [v["label"] for v in lu_gen],
                    [v["value"] for v in lu_gen],
                    "LLaVA-Med Generation — Segment Shapley Values",
                )
                # Use the log-prob overlay as the base image and recolor by gen values
                overlay_path = (data["llavamed_unsam_gen"].get("overlay_path", "")
                                or data.get("llavamed_unsam_logprob", {}).get("overlay_path", ""))
                if overlay_path:
                    llavamed_unsam_gen_overlay_img = draw_segment_labels(
                        overlay_path, lu_gen,
                        segment_bboxes=lu_bboxes,
                        label_map_b64=lu_label_map_b64,
                        original_path=lu_original)

    # ── Interpretation text ──────────────────────────────────────────
    interpretation = ""
    try:
        bc_data = data.get("biomedclip", {}).get("summary") if data.get("has_biomedclip") else None
        interpretation = generate_interpretation_text(
            clip_summary=bc_data,
            vllm_logprob=data.get("llavamed_unsam_logprob") if data.get("has_llavamed_unsam_logprob") else None,
            modality="Chest X-ray",
            body_part=category,
            caption=caption,
            cross_method_name="BiomedCLIP",
            vlm_method_name="LLaVA-Med",
            vlm_region_type="UnSAM segments",
        )
    except Exception:
        pass

    # If no precomputed results at all, show informative message
    if not any(data.get(k) for k in ("has_biomedclip", "has_llavamed_unsam_logprob",
                                      "has_llavamed_unsam_gen", "has_clip")):
        interpretation = (
            "**No precomputed attribution results yet.**\n\n"
            "Run the attribution pipeline on this MIMIC-CXR example to see results here. "
            "The image and report are shown above for reference."
        )

    # ── Build results state for comparison ───────────────────────────
    _results_state = {}
    if biomedclip_overlay_labeled:
        _results_state["BiomedCLIP Cross-Modal"] = {
            "overlay": biomedclip_overlay_labeled, "plot": biomedclip_region_plot}
    if llavamed_unsam_lp_overlay_img:
        _results_state["LLaVA-Med Log-Prob"] = {
            "overlay": llavamed_unsam_lp_overlay_img, "plot": llavamed_unsam_lp_plot}
    if llavamed_unsam_gen_overlay_img:
        _results_state["LLaVA-Med Generation"] = {
            "overlay": llavamed_unsam_gen_overlay_img, "plot": llavamed_unsam_gen_plot}

    return (
        caption,                        # 1
        original_img,                   # 2
        findings,                       # 3
        interpretation,                 # 4
        biomedclip_overlay_labeled,     # 5
        biomedclip_token_plot,          # 6
        biomedclip_region_plot,         # 7
        llavamed_unsam_lp_overlay_img,  # 8
        llavamed_unsam_lp_plot,         # 9
        llavamed_unsam_gen_overlay_img, # 10a
        llavamed_unsam_gen_plot,        # 10b
        biomedclip_interaction_html,    # 11
        {  # 12 — metadata
            "example_id": example_id,
            "category": category,
            "has_biomedclip": data.get("has_biomedclip", False),
            "has_llavamed_unsam_logprob": data.get("has_llavamed_unsam_logprob", False),
            "has_llavamed_unsam_gen": data.get("has_llavamed_unsam_gen", False),
        },
        _results_state,                 # 13
        gr.update(),                    # 14 (placeholder)
    )


def _on_mimic_compare_methods(method_a, method_b, results_state):
    """Pick two MIMIC methods from state and display side by side."""
    if not method_a or not method_b or not results_state:
        return None, None, None, None
    a = results_state.get(method_a, {})
    b = results_state.get(method_b, {})
    return a.get("overlay"), b.get("overlay"), a.get("plot"), b.get("plot")


# ── ISIC Dermoscopy Tab Handlers ──────────────────────────────────────────

_ISIC_METHOD_NAMES = [
    "BiomedCLIP Cross-Modal",
    "LLaVA-Med Log-Prob",
    "LLaVA-Med Generation",
]


def _on_select_isic_example(example_id, method_label: str = "Influence"):
    """Load an ISIC dermoscopy example and return data for the ISIC tab.

    Mirrors _on_select_mimic_example — same 14 outputs, same layout.
    ISIC has no separate "findings" field, so slot 3 (findings) is empty.
    """
    n_outputs = 14
    empty = tuple([""] + [None] * (n_outputs - 1))
    if not _ISIC_AVAILABLE or not example_id:
        return empty

    method = (method_label or "Influence").lower()
    method_display = "Influence" if method == "influence" else "Shapley"

    _base_chart = globals()["create_shapley_bar_chart"]
    _base_html = globals()["create_benchmark_interaction_html"]

    def create_shapley_bar_chart(labels, values, title="Shapley Values", **kwargs):  # noqa: F811
        kwargs.setdefault("method_label", method_display)
        return _base_chart(labels, values, title.replace("Shapley", method_display), **kwargs)

    def create_benchmark_interaction_html(**kwargs):  # noqa: F811
        kwargs.setdefault("method_label", method_display)
        return _base_html(**kwargs)

    try:
        data = load_isic_example(example_id, method=method)
    except Exception:
        return empty

    caption = data.get("caption", "")
    original_img = data.get("original_image_path")
    meta = data.get("meta", {})
    category = meta.get("category", "")

    # ── BiomedCLIP ───────────────────────────────────────────────────
    biomedclip_overlay_labeled = None
    biomedclip_region_plot = None
    biomedclip_token_plot = None
    biomedclip_interaction_html = ""

    if data.get("has_biomedclip"):
        bc_summary = data["biomedclip"]["summary"]
        bc_overlay_raw = data["biomedclip"]["image_paths"].get("overlay")
        bc_original = data["biomedclip"]["image_paths"].get("original", "")
        bc_segmap = data["biomedclip"]["image_paths"].get("segmap", "")
        bc_n_segs = len(bc_summary.get("image_region_values", []))

        bc_bboxes, bc_label_map_b64 = None, ""
        if bc_original and bc_segmap and bc_n_segs > 0:
            try:
                bc_bboxes, bc_label_map_b64 = extract_segment_regions(
                    bc_original, bc_segmap, bc_n_segs)
            except Exception:
                pass

        if bc_overlay_raw:
            biomedclip_overlay_labeled = draw_segment_labels(
                bc_overlay_raw, bc_summary.get("image_region_values", []),
                segment_bboxes=bc_bboxes,
                label_map_b64=bc_label_map_b64,
                original_path=bc_original)

        bc_r_labels = [v["label"] for v in bc_summary.get("image_region_values", [])]
        bc_r_values = [v["value"] for v in bc_summary.get("image_region_values", [])]
        if bc_r_labels:
            biomedclip_region_plot = create_shapley_bar_chart(
                bc_r_labels, bc_r_values, "BiomedCLIP — Image Region Shapley Values")

        bc_merged = merge_subword_token_values(bc_summary.get("token_values", []), caption)
        bc_t_labels = [v["label"] for v in bc_merged]
        bc_t_values = [v["value"] for v in bc_merged]
        if bc_t_labels:
            biomedclip_token_plot = create_shapley_bar_chart(
                bc_t_labels, bc_t_values, "BiomedCLIP — Caption Word Shapley Values")

        bc_image_b64 = data["biomedclip"].get("image_b64", {}).get("original", "")
        bc_overlay_b64 = data["biomedclip"].get("image_b64", {}).get("overlay", "")
        bc_all_cross = data["biomedclip"].get("all_cross_modal_pairs", [])
        bc_segmap_b64 = ""
        if bc_segmap:
            import os as _os
            if _os.path.exists(bc_segmap):
                import base64 as _b64
                with open(bc_segmap, "rb") as _f:
                    bc_segmap_b64 = _b64.b64encode(_f.read()).decode("ascii")
        biomedclip_interaction_html = create_benchmark_interaction_html(
            image_b64=bc_image_b64,
            clip_summary=bc_summary,
            vllm_logprob=None,
            caption=caption,
            all_cross_modal_pairs=bc_all_cross,
            segmap_b64=bc_segmap_b64,
            overlay_b64=bc_overlay_b64,
            segment_bboxes=bc_bboxes,
            label_map_b64=bc_label_map_b64,
            title="BiomedCLIP Cross-Modal Interaction View — click segments or words",
        )

    # ── LLaVA-Med UnSAM ─────────────────────────────────────────────
    llavamed_unsam_lp_overlay_img = None
    llavamed_unsam_gen_overlay_img = None
    llavamed_unsam_lp_plot = None
    llavamed_unsam_gen_plot = None

    if data.get("has_llavamed_unsam_logprob") or data.get("has_llavamed_unsam_gen"):
        lu_segmap = data.get("llavamed_unsam_segmap_path", "")
        lu_original = data.get("llavamed_unsam_original_path", "") or (original_img or "")
        lu_bboxes, lu_label_map_b64 = None, ""
        if lu_segmap and lu_original:
            n_lu_segs = 0
            if data.get("has_llavamed_unsam_logprob"):
                n_lu_segs = len(data["llavamed_unsam_logprob"].get("image_region_values", []))
            elif data.get("has_llavamed_unsam_gen"):
                n_lu_segs = len(data["llavamed_unsam_gen"].get("image_region_values", []))
            if n_lu_segs > 0:
                try:
                    lu_bboxes, lu_label_map_b64 = extract_segment_regions(
                        lu_original, lu_segmap, n_lu_segs)
                except Exception:
                    pass

        if data.get("has_llavamed_unsam_logprob"):
            lu_lp = rename_patch_labels(
                data["llavamed_unsam_logprob"].get("image_region_values", []))
            if lu_lp:
                llavamed_unsam_lp_plot = create_shapley_bar_chart(
                    [v["label"] for v in lu_lp],
                    [v["value"] for v in lu_lp],
                    "LLaVA-Med Log-Prob — Segment Shapley Values",
                )
                overlay_path = data["llavamed_unsam_logprob"].get("overlay_path", "")
                if overlay_path:
                    llavamed_unsam_lp_overlay_img = draw_segment_labels(
                        overlay_path, lu_lp,
                        segment_bboxes=lu_bboxes,
                        label_map_b64=lu_label_map_b64,
                        original_path=lu_original)

        if data.get("has_llavamed_unsam_gen"):
            lu_gen = rename_patch_labels(
                data["llavamed_unsam_gen"].get("image_region_values", []))
            if lu_gen:
                llavamed_unsam_gen_plot = create_shapley_bar_chart(
                    [v["label"] for v in lu_gen],
                    [v["value"] for v in lu_gen],
                    "LLaVA-Med Generation — Segment Shapley Values",
                )
                overlay_path = (data["llavamed_unsam_gen"].get("overlay_path", "")
                                or data.get("llavamed_unsam_logprob", {}).get("overlay_path", ""))
                if overlay_path:
                    llavamed_unsam_gen_overlay_img = draw_segment_labels(
                        overlay_path, lu_gen,
                        segment_bboxes=lu_bboxes,
                        label_map_b64=lu_label_map_b64,
                        original_path=lu_original)

    # ── Interpretation text ──────────────────────────────────────────
    interpretation = ""
    try:
        bc_data = data.get("biomedclip", {}).get("summary") if data.get("has_biomedclip") else None
        interpretation = generate_interpretation_text(
            clip_summary=bc_data,
            vllm_logprob=data.get("llavamed_unsam_logprob") if data.get("has_llavamed_unsam_logprob") else None,
            modality="Dermoscopy",
            body_part=category,
            caption=caption,
            cross_method_name="BiomedCLIP",
            vlm_method_name="LLaVA-Med",
            vlm_region_type="UnSAM segments",
        )
    except Exception:
        pass

    if not any(data.get(k) for k in ("has_biomedclip", "has_llavamed_unsam_logprob",
                                      "has_llavamed_unsam_gen", "has_clip")):
        interpretation = (
            "**No precomputed attribution results yet.**\n\n"
            "Run the attribution pipeline on this ISIC example to see results here. "
            "The image and caption are shown above for reference."
        )

    # ── Results state for comparison dropdowns ──────────────────────
    _results_state = {}
    if biomedclip_overlay_labeled:
        _results_state["BiomedCLIP Cross-Modal"] = {
            "overlay": biomedclip_overlay_labeled, "plot": biomedclip_region_plot}
    if llavamed_unsam_lp_overlay_img:
        _results_state["LLaVA-Med Log-Prob"] = {
            "overlay": llavamed_unsam_lp_overlay_img, "plot": llavamed_unsam_lp_plot}
    if llavamed_unsam_gen_overlay_img:
        _results_state["LLaVA-Med Generation"] = {
            "overlay": llavamed_unsam_gen_overlay_img, "plot": llavamed_unsam_gen_plot}

    return (
        caption,                        # 1
        original_img,                   # 2
        interpretation,                 # 3
        biomedclip_overlay_labeled,     # 4
        biomedclip_token_plot,          # 5
        biomedclip_region_plot,         # 6
        llavamed_unsam_lp_overlay_img,  # 7
        llavamed_unsam_lp_plot,         # 8
        llavamed_unsam_gen_overlay_img, # 9
        llavamed_unsam_gen_plot,        # 10
        biomedclip_interaction_html,    # 11
        {  # 12 — metadata
            "example_id": example_id,
            "category": category,
            "has_biomedclip": data.get("has_biomedclip", False),
            "has_llavamed_unsam_logprob": data.get("has_llavamed_unsam_logprob", False),
            "has_llavamed_unsam_gen": data.get("has_llavamed_unsam_gen", False),
        },
        _results_state,                 # 13
        gr.update(),                    # 14 (placeholder for compare dropdown)
    )


def _on_isic_compare_methods(method_a, method_b, results_state):
    """Pick two ISIC methods from state and display side by side."""
    if not method_a or not method_b or not results_state:
        return None, None, None, None
    a = results_state.get(method_a, {})
    b = results_state.get(method_b, {})
    return a.get("overlay"), b.get("overlay"), a.get("plot"), b.get("plot")


def _on_select_coco_example(example_id, method_label: str = "Influence"):
    """Load a precomputed MS-COCO example and return outputs for the COCO tab."""
    n_outputs = 12
    empty = ("",) + (None,) * (n_outputs - 2) + (gr.update(),)
    if not _COCO_AVAILABLE or not _MEDICAL_AVAILABLE or not example_id:
        return empty

    method = (method_label or "Influence").lower()
    method_display = "Influence" if method == "influence" else "Shapley"

    _base_chart = globals()["create_shapley_bar_chart"]
    _base_html = globals()["create_benchmark_interaction_html"]

    def create_shapley_bar_chart(labels, values, title="Shapley Values", **kwargs):  # noqa: F811
        kwargs.setdefault("method_label", method_display)
        return _base_chart(labels, values, title.replace("Shapley", method_display), **kwargs)

    def create_benchmark_interaction_html(**kwargs):  # noqa: F811
        kwargs.setdefault("method_label", method_display)
        return _base_html(**kwargs)

    try:
        data = load_coco_example(example_id, method=method)
    except Exception as exc:
        print(f"[coco] Error loading {example_id}: {exc}")
        return empty

    caption = data.get("caption", "")
    summary = data.get("summary", {})
    original_img = data.get("image_paths", {}).get("original")
    overlay_img = data.get("image_paths", {}).get("overlay")

    # Segment bboxes from segmap
    segment_bboxes, label_map_b64 = None, ""
    clip_original = data["image_paths"].get("original", "")
    clip_segmap = data["image_paths"].get("segmap", "")
    n_segs = len(summary.get("image_region_values", []))
    if clip_original and clip_segmap and n_segs > 0:
        try:
            segment_bboxes, label_map_b64 = extract_segment_regions(
                clip_original, clip_segmap, n_segs)
        except Exception:
            pass

    # Draw segment labels on overlay
    overlay_labeled = overlay_img
    if overlay_img:
        try:
            labeled = draw_segment_labels(
                overlay_img,
                summary.get("image_region_values", []),
                segment_bboxes=segment_bboxes,
            )
            if labeled:
                overlay_labeled = labeled
        except Exception:
            pass

    # Bar charts
    r_vals = summary.get("image_region_values", [])
    r_labels = [v["label"] for v in r_vals]
    r_values = [v["value"] for v in r_vals]
    region_plot = create_shapley_bar_chart(
        r_labels, r_values, "CLIP — Image Region Shapley Values") if r_labels else None

    t_vals = summary.get("token_values", [])
    merged_toks = merge_subword_token_values(t_vals, caption)
    t_labels = [v["label"] for v in merged_toks]
    t_values = [v["value"] for v in merged_toks]
    token_plot = create_shapley_bar_chart(
        t_labels, t_values, "CLIP — Caption Word Shapley Values") if t_labels else None

    # Cross-modal pairs + chart + table
    all_cross = data.get("all_cross_modal_pairs", [])
    cross_plot = None
    cross_table = []
    if all_cross:
        cross_pairs = [
            ((item["pair"][0], _tok_to_word(item["pair"][1], caption)), item["value"])
            for item in all_cross
        ]
        cross_plot = create_cross_modal_bar_chart(
            cross_pairs, "CLIP — Top Image x Word Interactions", top_k=20)
        cross_table = [
            [item["pair"][0], _tok_to_word(item["pair"][1], caption), f"{item['value']:+.4f}"]
            for item in all_cross[:30]
        ]

    # Heatmap
    heatmap = None
    influence_matrix = data.get("influence_matrix")
    tok_labels_hm = [t.replace("tok:", "").lstrip("#") for t in data.get("tok_labels", [])]
    if influence_matrix is not None and influence_matrix.size > 0:
        heatmap = create_influence_heatmap(
            data.get("seg_labels", r_labels), tok_labels_hm, influence_matrix,
            "Image Regions x Caption Words — Influence Scores")

    # Interactive cross-modal HTML
    image_b64 = data.get("image_b64", {}).get("original", "")
    overlay_b64 = data.get("image_b64", {}).get("overlay", "")
    segmap_b64 = data.get("image_b64", {}).get("segmap", "")

    interaction_html = ""
    try:
        interaction_html = create_benchmark_interaction_html(
            image_b64=image_b64,
            clip_summary=summary,
            vllm_logprob=None,
            caption=caption,
            all_cross_modal_pairs=all_cross,
            segmap_b64=segmap_b64,
            overlay_b64=overlay_b64,
            segment_bboxes=segment_bboxes,
            label_map_b64=label_map_b64,
            title="MS-COCO — Click a region or word to explore interactions",
        )
    except Exception as exc:
        interaction_html = f"<p>Error building interaction view: {exc}</p>"

    note = (
        "**Note:** These results used the original UNK mask token "
        "(same as `<|endoftext|>`, CLIP token ID 49407). "
        "A first-token dominance artifact may be visible in the token Shapley chart. "
        "This will be corrected when scaling to 100 images with the dot-mask fix."
    )

    # Masked Image Browser
    region_choices = data.get("region_choices", [])
    masked_dd_update = gr.update(
        choices=region_choices,
        value=region_choices[0] if region_choices else None,
    )
    # Pre-load the first masked image (all_masked) so the viewer isn't blank
    first_masked_img = None
    if region_choices:
        try:
            first_masked_img = get_coco_masked_image_path(example_id, region_choices[0])
        except Exception:
            pass

    return (
        caption,           # 1
        original_img,      # 2
        overlay_labeled,   # 3
        interaction_html,  # 4
        token_plot,        # 5
        region_plot,       # 6
        cross_plot,        # 7
        cross_table,       # 8
        heatmap,           # 9
        note,              # 10
        first_masked_img,  # 11 — masked image viewer
        masked_dd_update,  # 12 — masked dropdown choices
    )


def _on_select_coco_masked(example_id, choice):
    """Return a masked image path for the COCO Masked Image Browser."""
    if not _COCO_AVAILABLE or not example_id or not choice:
        return None
    return get_coco_masked_image_path(example_id, choice)


def on_click_image_compute(
    image,
    caption,
    clip_model,
    seg_mode,
    grid_size,
    method,
    seed,
    progress=gr.Progress(track_tqdm=True),
):
    return _compute_image_attributions_clip(
        image=image,
        caption=caption,
        clip_model=clip_model,
        seg_mode=seg_mode,
        grid_size=grid_size,
        method=method,
        seed=seed,
        progress=progress,
    )


def on_click_mm_compute(
    image,
    caption,
    clip_model,
    seg_mode,
    grid_size,
    method,
    seed,
    progress=gr.Progress(track_tqdm=True),
):
    return _compute_mm_attributions_clip(
        image=image,
        caption=caption,
        clip_model=clip_model,
        seg_mode=seg_mode,
        grid_size=grid_size,
        method=method,
        seed=seed,
        progress=progress,
    )


# ---------------------------------------------------------------------------
# Demo helpers (used to quickly validate visualization components locally)
# ---------------------------------------------------------------------------

_DEMO_TEXT = "The quick brown fox jumps over the lazy dog in a sunny meadow."
_DEMO_FEATURES = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "in", "a", "sunny", "meadow"]
_DEMO_SPANS = [
    (0, 3), (4, 9), (10, 15), (16, 19), (20, 25), (26, 30), (31, 34),
    (35, 39), (40, 43), (44, 46), (47, 48), (49, 54), (55, 61)
]
_DEMO_ATTRIBUTIONS: Dict[str, Dict[str, float]] = {
    "shapley": {
        "The": -0.04,
        "quick": 0.18,
        "brown": 0.12,
        "fox": 0.27,
        "jumps": 0.15,
        "over": 0.05,
        "the": -0.02,
        "lazy": -0.11,
        "dog": -0.07,
        "in": 0.03,
        "a": 0.02,
        "sunny": 0.09,
        "meadow": 0.21,
    }
}
_DEMO_ATTRIBUTIONS["banzhaf"] = {
    token: round(value * 0.8, 3)
    for token, value in _DEMO_ATTRIBUTIONS["shapley"].items()
}
_DEMO_ATTRIBUTIONS["influence"] = {
    token: round(abs(value), 3)
    for token, value in _DEMO_ATTRIBUTIONS["shapley"].items()
}

_DEMO_INTERACTIONS_2: List[Tuple[Tuple[str, ...], float]] = [
    (("quick", "fox"), 0.24),
    (("fox", "jumps"), 0.19),
    (("sunny", "meadow"), 0.22),
    (("lazy", "dog"), -0.17),
    (("the", "lazy"), -0.12),
]

_DEMO_INTERACTIONS_3: List[Tuple[Tuple[str, ...], float]] = [
    (("quick", "brown", "fox"), 0.28),
    (("fox", "jumps", "over"), 0.18),
    (("sunny", "meadow", "dog"), 0.11),
    (("the", "lazy", "dog"), -0.21),
]

_DEMO_INTERACTION_MATRIX: List[Tuple[Tuple[int, int], float]] = [
    ((1, 3), 0.23),
    ((3, 4), 0.17),
    ((7, 8), -0.18),
    ((11, 12), 0.2),
    ((2, 5), 0.09),
]

_DEMO_DATASETS = {
    "squad_demo": [
        [
            "The quick brown fox jumps over the lazy dog.",
            "Who jumps over the dog?",
            "The quick brown fox",
        ],
        [
            "AttrLLM explains attributions for large language models.",
            "What does AttrLLM explain?",
            "Attributions",
        ],
    ],
    "truthfulqa_demo": [
        [
            "Water boils at 100 degrees Celsius at sea level.",
            "At what temperature does water boil?",
            "100 degrees Celsius",
        ]
    ],
}


def _render_demo(method: str = "shapley"):
    method = (method or "shapley").lower()
    order = 2
    attributions = _DEMO_ATTRIBUTIONS.get(method, _DEMO_ATTRIBUTIONS["shapley"])
    interactions = _DEMO_INTERACTIONS_3 if order == 3 else _DEMO_INTERACTIONS_2

    interactions_fig = plot_top_interactions(interactions, order=order, method=method)
    demo_pairwise = _DEMO_INTERACTIONS_2 or _fallback_pairwise_from_values(
        _DEMO_FEATURES,
        [attributions[token] for token in _DEMO_FEATURES],
    )
    text_html = create_interactive_text_heatmap(
        _DEMO_TEXT,
        _DEMO_SPANS,
        [attributions[token] for token in _DEMO_FEATURES],
        method=method,
    )
    text_interaction_html = create_text_interaction_html(
        _DEMO_FEATURES,
        [attributions[token] for token in _DEMO_FEATURES],
        [
            {"indices": [i, j], "value": float(val)}
            for (i, j), val in _DEMO_INTERACTION_MATRIX
        ],
        method=method,
        top_k=20,
        threshold=0.0,
    )

    meta = {
        "method": method,
        "order": order,
        "feature_count": len(_DEMO_FEATURES),
        "scalarizer": "logprob",
    }

    return update(
        figs={
            "interactions": interactions_fig,
        },
        meta=meta,
        html=text_html,
        interaction_text_html=text_interaction_html,
        scoring_target_source="model_output",
        scoring_target_text="",
        reference_answer="",
        unmasked_answer="",
        debug_scores=None,
        scalarizer_used="logprob",
        score_full=None,
        score_empty=None,
        y_len_tokens=None,
    )


def _render_additional_plots(method: str = "shapley"):
    return plot_interaction_matrix(_DEMO_FEATURES, _DEMO_INTERACTION_MATRIX)


def _records_for_dataset(dataset_name: str) -> List[Dict[str, Any]]:
    if get_examples is not None:
        try:
            records = get_examples(dataset_name, n=10)
            if records:
                return records
        except KeyError:
            pass
        except Exception:
            pass

    fallback_csv = _fallback_load_dataset(dataset_name, max_rows=10)
    if fallback_csv:
        return fallback_csv

    fallback = []
    for idx, row in enumerate(_DEMO_DATASETS.get(dataset_name, []), start=1):
        context, prompt, answer = row
        fallback.append(
            {
                "id": f"{dataset_name}_demo_{idx}",
                "context": context,
                "prompt": prompt,
                "correct_answer": answer,
            }
        )
    return fallback


def _available_datasets() -> List[str]:
    if list_datasets is not None:
        try:
            datasets = list_datasets()
            if datasets:
                return datasets
        except Exception:
            pass
    fallback = [k for k, v in _FALLBACK_DATASET_FILES.items() if (_fallback_datasets_dir() / v).exists()]
    if fallback:
        return sorted(fallback)
    return list(_DEMO_DATASETS.keys())


def _format_examples(records: List[Dict[str, Any]]) -> List[List[str]]:
    formatted = []
    for rec in records:
        formatted.append([
            rec.get("context", ""),
            rec.get("prompt", ""),
            rec.get("correct_answer")
            or rec.get("answer")
            or rec.get("target")
            or "",
        ])
    return formatted


def _load_examples_for_demo(dataset_name: str):
    # Convert display name to internal key if needed
    if get_dataset_key_from_display_name is not None:
        dataset_key = get_dataset_key_from_display_name(dataset_name)
    else:
        dataset_key = dataset_name
    
    records = _records_for_dataset(dataset_key)
    formatted = _format_examples(records)
    samples = formatted if formatted else _DEMO_DATASETS.get(dataset_key, [])
    return gr.update(samples=samples or [])


def _resolve_example_fields(record: Dict[str, Any]) -> Tuple[str, str, str]:
    context = record.get("context", "")
    prompt = record.get("prompt", "")
    answer = (
        record.get("correct_answer")
        or record.get("answer")
        or record.get("target")
        or ""
    )
    return context, prompt, answer


def _resolve_dataset_key(dataset_name: str) -> str:
    if dataset_name in _available_datasets():
        return dataset_name
    for key, label in DATASET_DISPLAY_LABELS.items():
        if dataset_name == label:
            return key
    if get_dataset_key_from_display_name is not None:
        return get_dataset_key_from_display_name(dataset_name)
    return dataset_name


def _dataset_choice_labels(dataset_keys: List[str]) -> List[str]:
    labels: List[str] = []
    for key in dataset_keys:
        if get_dataset_display_name is not None:
            try:
                labels.append(get_dataset_display_name(key))
                continue
            except Exception:
                pass
        labels.append(DATASET_DISPLAY_LABELS.get(key, key.replace("_", " ").title()))
    return labels


def _resolve_example_index(example_number: Any, records: List[Dict[str, Any]]) -> int:
    if not records:
        return 0
    try:
        index = int(example_number) - 1
    except Exception:
        index = 0
    return max(0, min(index, len(records) - 1))


def _resolve_example_id(example_number: Any, records: List[Dict[str, Any]]) -> str:
    if _public_only_mode():
        return f"example_{int(example_number or 1)}"
    index = _resolve_example_index(example_number, records)
    record = records[index] if records else {}
    return str(record.get("id") or f"example_{index + 1}")


def _build_model_answer_panel(dataset_name: str, example_number: Any) -> str:
    """Render Model's Answer + Justification HTML for the 30 wrong-answer
    examples; return empty string for everything else so the gr.HTML slot
    stays visually empty."""
    try:
        from visualization.wrong_answer_examples import WRONG_ANSWER_EXAMPLES
    except Exception:
        return ""
    dataset_key = _resolve_dataset_key(dataset_name) if dataset_name else ""
    try:
        ex_id = f"example_{int(example_number or 1)}"
    except Exception:
        ex_id = "example_1"
    if (dataset_key, ex_id) not in WRONG_ANSWER_EXAMPLES:
        return ""
    path = (
        _get_results_dir() / "model_answers" / "small" / dataset_key / f"{ex_id}.json"
    )
    if not path.exists():
        return ""
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        return ""

    from html import escape as _escape

    letter = (data.get("model_answer_parsed") or "").strip()
    raw = (data.get("model_answer_raw") or "").strip()
    gt_letter = (data.get("ground_truth_letter") or data.get("ground_truth") or "").strip()
    is_match = bool(data.get("is_match"))
    similarity = data.get("similarity")
    try:
        sim_str = f"{float(similarity):.3f}" if similarity is not None else "—"
    except Exception:
        sim_str = "—"

    if is_match:
        chip_bg, chip_fg, chip_text = "#e7f6ec", "#1f8d4a", "✓ MATCH"
    else:
        chip_bg, chip_fg, chip_text = "#fdecea", "#c0392b", "✗ MISMATCH"

    # Split off the leading letter+rationale prefix for cleaner reading.
    justification = raw
    if raw.lower().startswith("justification:"):
        justification = raw.split(":", 1)[1].strip()
    elif "Justification:" in raw:
        justification = raw.split("Justification:", 1)[1].strip()

    return (
        '<div style="display:flex;flex-direction:column;gap:8px;'
        'background:#fdf7ff;border:1px solid #e2d6f3;border-radius:10px;'
        'padding:12px 14px;margin-top:6px;">'
        '<div style="display:flex;align-items:center;gap:10px;flex-wrap:wrap;">'
        '<strong style="color:#4a1c87;">Model\'s Answer</strong>'
        f'<span style="background:#fff;border:1px solid #d8c6f0;border-radius:6px;'
        f'padding:2px 8px;font-weight:600;color:#4a1c87;">{_escape(letter or "—")}</span>'
        f'<span style="color:#6f5a72;font-size:12px;">vs Ground Truth: '
        f'<strong>{_escape(gt_letter or "—")}</strong></span>'
        f'<span style="background:{chip_bg};color:{chip_fg};border-radius:999px;'
        f'padding:2px 10px;font-size:12px;font-weight:600;">{chip_text}</span>'
        f'<span style="color:#6f5a72;font-size:12px;">sim={sim_str}</span>'
        '</div>'
        '<div style="font-size:13px;color:#3a2b4a;line-height:1.55;'
        'background:#fff;border:1px solid #ece4f8;border-radius:8px;padding:10px 12px;">'
        f'<strong style="display:block;margin-bottom:4px;color:#4a1c87;">Justification</strong>'
        f'{_escape(justification) if justification else "<em>No justification captured.</em>"}'
        '</div>'
        '</div>'
    )


def _load_examples_for_slider(dataset_name: str):
    dataset_key = _resolve_dataset_key(dataset_name)
    records = _records_for_dataset(dataset_key)
    slider_max = max(1, min(10, len(records) or 10))
    context = prompt = answer = ""
    if records:
        context, prompt, answer = _resolve_example_fields(records[0])
    slider_update = gr.update(minimum=1, maximum=slider_max, step=1, value=1)
    return slider_update, records, context, prompt, answer


def _update_example_preview(example_number: Any, records):
    if not records:
        return "", "", ""
    index = _resolve_example_index(example_number, records)
    return _resolve_example_fields(records[index])


def _results_output_list(results: Dict[str, Any]) -> List[Any]:
    return [
        results["interactions"],
        results["interactions_tokens_html"],
        results["interactions_text_html"],
        results["text_html"],
        results["meta"],
        results["scoring_target_source"],
        results["scoring_target_text"],
        results["reference_answer"],
        results["unmasked_answer"],
        results["debug_scores"],
        results["scalarizer_used"],
        results["score_full"],
        results["score_empty"],
        results["y_len_tokens"],
    ]


def build_demo_app() -> gr.Blocks:
    datasets = _available_datasets()
    default_dataset = datasets[0] if datasets else "demo"

    # Apply the same colorful CSS theme
    custom_css = """
    .gradio-container {
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, "Segoe UI", "Helvetica Neue", Arial, sans-serif !important;
        background: linear-gradient(135deg, #fef5f0 0%, #f0e8ff 50%, #e8f5ff 100%) !important;
        padding: 24px !important;
    }
    
    .gradio-container h1, .gradio-container h2 {
        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a6f 30%, #c44569 60%, #6c5ce7 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
        font-weight: 900;
        font-size: 42px !important;
        margin: 20px 0 16px 0;
        letter-spacing: -0.03em;
    }
    
    label, .gr-label {
        font-weight: 700 !important;
        font-size: 16px !important;
        color: #2d1f4a !important;
    }
    
    .gr-button {
        border-radius: 16px !important;
        font-weight: 700 !important;
        font-size: 17px !important;
        padding: 16px 32px !important;
        background: linear-gradient(135deg, #6c5ce7 0%, #a29bfe 100%) !important;
        color: white !important;
        border: none !important;
    }
    
    .gr-box, .gr-input, .gr-dropdown, .gr-textbox {
        border-radius: 14px !important;
        border: 3px solid #e8dff5 !important;
        font-size: 17px !important;
    }
    
    .gr-markdown p {
        font-size: 17px !important;
        font-weight: 500 !important;
    }
    """

    _demo_kwargs = {"title": "AttrLLM Visualization Demo"}
    if _supports_kwarg(gr.Blocks, "css"):
        _demo_kwargs["css"] = custom_css
    with gr.Blocks(**_demo_kwargs) as demo:
        gr.Markdown(
            "# 🎨 AttrLLM Visualization Demo\n\n"
            "**Preview the attribution widgets** before wiring real backends. "
            "Use the controls below to explore the interface."
        )

        with gr.Row():
            with gr.Column(scale=1):
                # Prepare initial choices and value before creating component
                initial_choices = _dataset_choice_labels(datasets)
                initial_value = initial_choices[0] if initial_choices else None
                
                dataset_selector = gr.Dropdown(
                    choices=initial_choices,
                    value=initial_value,
                    label="Dataset",
                    interactive=True,
                    allow_custom_value=False,
                    elem_id="dataset-selector-demo",
                    elem_classes=["bubble-select"],
                )
                example_browser = create_example_browser()
                with gr.Column(scale=1):
                    model_selector = create_model_selector()
                    scalarizer_selector = gr.Dropdown(
                        choices=SCALARIZER_CHOICES,
                        value="logprob",
                        label="Scalarizer",
                        interactive=True,
                    )
                    embedding_model_box = gr.Textbox(
                        label="Embedding Model (for scalarizer=embedding)",
                        value="Qwen/Qwen3-Embedding-0.6B",
                        lines=1,
                    )
                    feature_level_selector = create_feature_level_selector()
                    method_toggle = create_attribution_method_toggle()

        dataset_selector.change(
            fn=_load_examples_for_demo,
            inputs=dataset_selector,
            outputs=example_browser,
        )
        demo.load(
            fn=_load_examples_for_demo,
            inputs=[dataset_selector],
            outputs=[example_browser],
        )

        render_button = gr.Button("Render Demo Visuals", variant="primary")

        outputs = create_results_display()
        extra_matrix = gr.Plot(label="Interaction Matrix (demo)")

        render_button.click(
            fn=_render_demo,
            inputs=[method_toggle],
            outputs=_results_output_list(outputs),
        )

        render_button.click(
            fn=_render_additional_plots,
            inputs=[method_toggle],
            outputs=[extra_matrix],
        )

    return demo


def _patch_gradio_schema_generation() -> None:
    """Prevent Gradio 5.x /info crash caused by additionalProperties: true in schemas."""
    try:
        from gradio_client import utils as client_utils
    except Exception:
        return
    if getattr(client_utils, "_attrllm_schema_patch", False):
        return
    original_inner = getattr(client_utils, "_json_schema_to_python_type", None)
    original_outer = getattr(client_utils, "json_schema_to_python_type", None)
    if not callable(original_inner) or not callable(original_outer):
        return

    def _normalize_schema(schema):
        if isinstance(schema, bool):
            return {} if schema else {"type": "null"}
        if isinstance(schema, list):
            return [_normalize_schema(item) for item in schema]
        if not isinstance(schema, dict):
            return schema
        normalized = dict(schema)
        if isinstance(normalized.get("additionalProperties"), bool):
            normalized["additionalProperties"] = _normalize_schema(normalized["additionalProperties"])
        for key in ("properties", "$defs", "definitions", "patternProperties"):
            value = normalized.get(key)
            if isinstance(value, dict):
                normalized[key] = {k: _normalize_schema(v) for k, v in value.items()}
        for key in ("items", "contains", "not", "if", "then", "else"):
            if key in normalized:
                normalized[key] = _normalize_schema(normalized[key])
        for key in ("anyOf", "allOf", "oneOf", "prefixItems"):
            value = normalized.get(key)
            if isinstance(value, list):
                normalized[key] = [_normalize_schema(item) for item in value]
        return normalized

    client_utils._json_schema_to_python_type = lambda s, d=None: original_inner(_normalize_schema(s), d)
    client_utils.json_schema_to_python_type = lambda s: original_outer(_normalize_schema(s))
    client_utils._attrllm_schema_patch = True


_patch_gradio_schema_generation()


def build_app() -> gr.Blocks:
    datasets = _available_datasets()
    default_dataset = datasets[0] if datasets else ""
    public_only = _public_only_mode()
    mm_only = _mm_only_mode()

    # Custom CSS for prettier UI - Inspired by modern, colorful design
    custom_css = """
    /* Main container styling - Warm gradient background */
    .gradio-container {
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, "Segoe UI", "Helvetica Neue", Arial, sans-serif !important;
        background: linear-gradient(135deg, #fef5f0 0%, #f0e8ff 50%, #e8f5ff 100%) !important;
        padding: 24px !important;
    }
    
    /* Header styling - Large, bold, colorful */
    .gradio-container h1 {
        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a6f 30%, #c44569 60%, #6c5ce7 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
        font-weight: 900;
        font-size: 48px !important;
        margin: 20px 0 16px 0;
        letter-spacing: -0.03em;
        text-align: left;
    }
    
    .gradio-container h2 {
        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a6f 30%, #c44569 60%, #6c5ce7 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
        font-weight: 900;
        font-size: 42px !important;
        margin: 20px 0 16px 0;
        letter-spacing: -0.03em;
    }
    
    .gradio-container h3 {
        color: #2d1f4a;
        font-weight: 800;
        font-size: 24px !important;
        margin: 24px 0 16px 0;
    }
    
    /* Tab styling - Bold and colorful */
    .tab-nav {
        border: none !important;
        background: transparent !important;
        gap: 8px !important;
        padding: 8px 0 !important;
    }
    
    .tab-nav button {
        font-size: 18px !important;
        font-weight: 700 !important;
        padding: 16px 32px !important;
        border-radius: 16px !important;
        transition: all 0.3s ease !important;
        border: 3px solid #e0d0f0 !important;
        background: white !important;
        color: #6c5ce7 !important;
        margin-right: 8px !important;
    }
    
    .tab-nav button:hover {
        background: #f8f4ff !important;
        border-color: #b8a8db !important;
        transform: translateY(-2px) !important;
    }
    
    .tab-nav button.selected {
        background: linear-gradient(135deg, #6c5ce7 0%, #a29bfe 100%) !important;
        color: white !important;
        border: 3px solid #6c5ce7 !important;
        box-shadow: 0 6px 20px rgba(108, 92, 231, 0.3) !important;
    }
    
    /* Button styling - Vibrant and interactive */
    .gr-button {
        border-radius: 16px !important;
        font-weight: 700 !important;
        font-size: 17px !important;
        padding: 16px 32px !important;
        transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
        box-shadow: 0 6px 20px rgba(108, 92, 231, 0.2) !important;
        border: none !important;
    }
    
    .gr-button-primary {
        background: linear-gradient(135deg, #6c5ce7 0%, #a29bfe 100%) !important;
        color: white !important;
    }
    
    .gr-button-secondary {
        background: linear-gradient(135deg, #fd79a8 0%, #ff7675 100%) !important;
        color: white !important;
    }
    
    .gr-button:hover {
        transform: translateY(-3px) scale(1.02) !important;
        box-shadow: 0 10px 30px rgba(108, 92, 231, 0.35) !important;
    }
    
    .gr-button-primary:hover {
        background: linear-gradient(135deg, #5e4ec7 0%, #9089e8 100%) !important;
    }
    
    /* Input/Dropdown styling - Clear and modern */
    .gr-box, .gr-input, .gr-dropdown {
        border-radius: 14px !important;
        border: 3px solid #e8dff5 !important;
        background: white !important;
        font-size: 17px !important;
        padding: 12px 16px !important;
        transition: all 0.3s ease !important;
        font-weight: 500 !important;
    }
    
    .gr-box:focus, .gr-input:focus, .gr-dropdown:focus {
        border-color: #6c5ce7 !important;
        box-shadow: 0 0 0 4px rgba(108, 92, 231, 0.15) !important;
        transform: translateY(-1px) !important;
    }
    
    /* Textbox styling - Larger text */
    .gr-textbox {
        border-radius: 16px !important;
        border: 3px solid #e8dff5 !important;
        font-size: 17px !important;
        line-height: 1.6 !important;
    }
    
    .gr-textbox textarea {
        font-size: 17px !important;
        line-height: 1.6 !important;
        padding: 14px !important;
    }
    
    .gr-textbox:focus-within {
        border-color: #6c5ce7 !important;
        box-shadow: 0 6px 24px rgba(108, 92, 231, 0.2) !important;
    }
    
    /* Radio button styling - Colorful pills */
    .gr-radio {
        gap: 12px !important;
    }
    
    .gr-radio label {
        font-size: 17px !important;
        font-weight: 600 !important;
        padding: 14px 28px !important;
        border-radius: 14px !important;
        border: 3px solid #e8dff5 !important;
        transition: all 0.3s ease !important;
        background: white !important;
        cursor: pointer !important;
    }
    
    .gr-radio label:hover {
        border-color: #b8a8db !important;
        background: #faf8ff !important;
        transform: translateY(-2px) !important;
        box-shadow: 0 4px 12px rgba(108, 92, 231, 0.15) !important;
    }
    
    .gr-radio input:checked + label {
        background: linear-gradient(135deg, #6c5ce7 0%, #a29bfe 100%) !important;
        color: white !important;
        border-color: #6c5ce7 !important;
        font-weight: 800 !important;
        box-shadow: 0 6px 20px rgba(108, 92, 231, 0.3) !important;
    }
    
    /* Panel/Accordion styling - Clean cards */
    .gr-panel {
        border-radius: 20px !important;
        border: 3px solid #e8dff5 !important;
        padding: 24px !important;
        background: white !important;
        box-shadow: 0 6px 24px rgba(108, 92, 231, 0.1) !important;
        margin: 16px 0 !important;
    }
    
    .gr-accordion {
        border-radius: 18px !important;
        border: 3px solid #e8dff5 !important;
        background: white !important;
    }
    
    /* Label styling - Bold and readable */
    label, .gr-label {
        font-weight: 700 !important;
        font-size: 16px !important;
        color: #2d1f4a !important;
        margin-bottom: 10px !important;
        letter-spacing: -0.01em !important;
    }
    
    /* Dropdown options */
    .gr-dropdown-menu {
        border-radius: 14px !important;
        border: 3px solid #e8dff5 !important;
        box-shadow: 0 8px 32px rgba(108, 92, 231, 0.15) !important;
        font-size: 17px !important;
    }
    
    .gr-dropdown-menu .item {
        font-size: 17px !important;
        padding: 12px 16px !important;
        font-weight: 500 !important;
    }
    
    .gr-dropdown-menu .item:hover {
        background: linear-gradient(135deg, #f3f0ff 0%, #e8f5ff 100%) !important;
    }
    
    /* Plot container - Prominent */
    .gr-plot {
        border-radius: 20px !important;
        border: 3px solid #e8dff5 !important;
        overflow: hidden !important;
        box-shadow: 0 8px 30px rgba(108, 92, 231, 0.12) !important;
        background: white !important;
        width: 100% !important;
    }

    /* Force the inner Plotly canvas + svg to fill its container so the Bar
       View doesn't render in a half-width column when the Text Interaction
       view above it is wide. */
    .gr-plot .js-plotly-plot,
    .gr-plot .plot-container,
    .gr-plot .svg-container,
    .gr-plot .main-svg {
        width: 100% !important;
        max-width: 100% !important;
    }
    .interaction-stack > .gradio-plot,
    .interaction-stack > .block.gradio-plot,
    .interaction-stack .gr-plot {
        width: 100% !important;
        max-width: 100% !important;
        flex: 1 1 100% !important;
    }
    
    /* JSON viewer */
    .gr-json {
        border-radius: 16px !important;
        border: 3px solid #e8dff5 !important;
        background: #faf8ff !important;
        padding: 20px !important;
        font-family: 'Monaco', 'Menlo', 'Consolas', monospace !important;
        font-size: 15px !important;
    }
    
    /* Column styling */
    .gr-column {
        padding: 20px !important;
    }
    
    /* Row styling */
    .gr-row {
        gap: 24px !important;
        margin: 12px 0 !important;
    }
    
    /* Markdown content - Larger, more readable */
    .gr-markdown {
        line-height: 1.8 !important;
        color: #2d1f4a !important;
    }
    
    .gr-markdown p {
        font-size: 17px !important;
        margin: 12px 0 !important;
        font-weight: 500 !important;
    }
    
    .gr-markdown strong {
        font-weight: 800 !important;
        color: #6c5ce7 !important;
    }
    
    /* Status/info messages - Colorful notifications */
    .gr-info {
        border-radius: 16px !important;
        border-left: 5px solid #6c5ce7 !important;
        background: linear-gradient(135deg, #f8f6ff 0%, #f0f4ff 100%) !important;
        padding: 18px 24px !important;
        font-size: 16px !important;
        font-weight: 600 !important;
        color: #2d1f4a !important;
        box-shadow: 0 4px 16px rgba(108, 92, 231, 0.1) !important;
    }
    
    /* Error messages */
    .gr-error {
        border-radius: 16px !important;
        border-left: 5px solid #ff6b6b !important;
        background: linear-gradient(135deg, #fff5f5 0%, #ffe8e8 100%) !important;
        padding: 18px 24px !important;
        font-size: 16px !important;
        font-weight: 600 !important;
        color: #c44569 !important;
    }
    
    /* Loading spinner */
    .loading {
        border: 4px solid #f3f0ff !important;
        border-top: 4px solid #6c5ce7 !important;
    }
    
    /* Scrollbar styling */
    ::-webkit-scrollbar {
        width: 12px !important;
        height: 12px !important;
    }
    
    ::-webkit-scrollbar-track {
        background: #f8f6ff !important;
        border-radius: 10px !important;
    }
    
    ::-webkit-scrollbar-thumb {
        background: linear-gradient(135deg, #6c5ce7 0%, #a29bfe 100%) !important;
        border-radius: 10px !important;
    }
    
    ::-webkit-scrollbar-thumb:hover {
        background: linear-gradient(135deg, #5e4ec7 0%, #9089e8 100%) !important;
    }

    .results-shell {
        margin-top: 16px !important;
        background: transparent !important;
        border: none !important;
        border-radius: 0 !important;
        padding: 0 !important;
        box-shadow: none !important;
    }

    .results-shell,
    .results-shell > div,
    .results-shell .gr-group,
    .results-shell .gr-box,
    .results-shell .gr-panel,
    .results-shell .block {
        background: transparent !important;
        border: none !important;
        box-shadow: none !important;
    }

    .interaction-stack {
        gap: 20px !important;
        padding: 0 8px 6px !important;
    }

    .interaction-stack h3 {
        margin-left: 24px !important;
        margin-bottom: 12px !important;
    }

    .public-controls {
        align-items: stretch !important;
        gap: 20px !important;
        margin-top: 8px !important;
    }

    .control-card {
        background: linear-gradient(180deg, rgba(255, 255, 255, 0.82) 0%, rgba(250, 246, 255, 0.96) 100%) !important;
        border: 2px solid rgba(224, 208, 240, 0.78) !important;
        border-radius: 26px !important;
        padding: 18px 20px 14px !important;
        box-shadow: 0 14px 30px rgba(108, 92, 231, 0.07) !important;
    }

    .control-card-primary {
        background: linear-gradient(180deg, rgba(255, 255, 255, 0.86) 0%, rgba(244, 248, 255, 0.96) 100%) !important;
    }

    .control-card-secondary {
        background: linear-gradient(180deg, rgba(255, 255, 255, 0.86) 0%, rgba(250, 244, 255, 0.96) 100%) !important;
    }

    .control-card .gradio-container,
    .control-card .gr-group {
        background: transparent !important;
    }

    .control-card > div,
    .control-card .block,
    .control-card .wrap,
    .control-card .gr-form,
    .control-card .form {
        background: transparent !important;
        border: none !important;
        box-shadow: none !important;
    }

    .control-card .gr-box,
    .control-card .gr-panel {
        background: transparent !important;
        box-shadow: none !important;
    }

    .bubble-select {
        border: 3px solid #8f5cff !important;
        border-radius: 18px !important;
        box-shadow: 0 8px 20px rgba(143, 92, 255, 0.10) !important;
        transition: box-shadow 0.2s ease, border-color 0.2s ease !important;
    }

    .bubble-select:focus-within {
        border-color: #7a3dff !important;
        box-shadow: 0 0 0 4px rgba(143, 92, 255, 0.14), 0 10px 24px rgba(143, 92, 255, 0.16) !important;
    }

    .example-id-slider {
        margin-top: 8px !important;
        padding: 10px 2px 2px !important;
    }

    .example-id-slider input[type="range"] {
        accent-color: #4f7cff !important;
    }

    .example-id-slider .number-input,
    .example-id-slider input[type="number"] {
        border-radius: 16px !important;
        border: 2px solid #d8dcee !important;
        background: linear-gradient(180deg, #ffffff 0%, #f7f9ff 100%) !important;
        font-weight: 700 !important;
        min-width: 72px !important;
    }

    .example-id-slider .wrap {
        gap: 14px !important;
    }

    @media (prefers-color-scheme: dark) {
        .gradio-container {
            background: radial-gradient(circle at top, #1e2a44 0%, #0d1422 52%, #090f19 100%) !important;
        }

        .gradio-container h3,
        label, .gr-label,
        .gr-markdown,
        .gr-markdown p {
            color: #e8eefc !important;
        }

        .gr-markdown strong {
            color: #cbd7ff !important;
        }

        .tab-nav button,
        .gr-box, .gr-input, .gr-dropdown, .gr-textbox,
        .gr-panel, .gr-accordion,
        .gr-plot, .gr-json {
            background: rgba(16, 24, 39, 0.88) !important;
            border-color: rgba(148, 163, 184, 0.24) !important;
            color: #e8eefc !important;
        }

        .tab-nav button {
            color: #d7e1ff !important;
        }

        .tab-nav button:hover {
            background: rgba(37, 52, 79, 0.96) !important;
            border-color: rgba(199, 210, 254, 0.36) !important;
        }

        .gr-radio label {
            background: rgba(16, 24, 39, 0.9) !important;
            border-color: rgba(148, 163, 184, 0.26) !important;
            color: #e8eefc !important;
        }

        .gr-radio label:hover {
            background: rgba(37, 52, 79, 0.96) !important;
        }

        .gr-textbox textarea,
        .gr-input input {
            background: transparent !important;
            color: #e8eefc !important;
        }

        .gr-dropdown-menu {
            background: #101827 !important;
            border-color: rgba(148, 163, 184, 0.24) !important;
        }

        .gr-dropdown-menu .item {
            color: #e8eefc !important;
        }

        .gr-dropdown-menu .item:hover {
            background: rgba(37, 52, 79, 0.96) !important;
        }

        .gr-plot .main-svg,
        .gr-plot .svg-container,
        .gr-plot .plot-container,
        .gr-plot .user-select-none {
            background: transparent !important;
        }

        .gr-plot .xtick text,
        .gr-plot .ytick text,
        .gr-plot .gtitle text,
        .gr-plot .xtitle text,
        .gr-plot .ytitle text,
        .gr-plot .annotation-text,
        .gr-plot .legend text {
            fill: #e8eefc !important;
            color: #e8eefc !important;
        }

        .gr-plot .gridlayer path,
        .gr-plot .zerolinelayer path,
        .gr-plot .xlines-above path,
        .gr-plot .ylines-above path {
            stroke: rgba(148, 163, 184, 0.22) !important;
        }

        .gr-info {
            background: linear-gradient(135deg, rgba(30, 41, 59, 0.95) 0%, rgba(17, 24, 39, 0.95) 100%) !important;
            color: #dbe7ff !important;
            border-left-color: #9db4ff !important;
        }

        .control-card {
            background: linear-gradient(180deg, rgba(16, 24, 39, 0.9) 0%, rgba(18, 28, 45, 0.96) 100%) !important;
            border-color: rgba(148, 163, 184, 0.2) !important;
            box-shadow: 0 18px 36px rgba(0, 0, 0, 0.24) !important;
        }

        .results-shell,
        .results-shell > div,
        .results-shell .gr-group,
        .results-shell .gr-box,
        .results-shell .gr-panel,
        .results-shell .block {
            background: transparent !important;
            border: none !important;
            box-shadow: none !important;
        }

        .bubble-select {
            border-color: #a06cff !important;
            box-shadow: 0 10px 24px rgba(143, 92, 255, 0.18) !important;
        }

        .bubble-select:focus-within {
            border-color: #c29cff !important;
            box-shadow: 0 0 0 4px rgba(143, 92, 255, 0.16), 0 12px 26px rgba(143, 92, 255, 0.22) !important;
        }

        .example-id-slider .number-input,
        .example-id-slider input[type="number"] {
            background: linear-gradient(180deg, #162031 0%, #111827 100%) !important;
            border-color: rgba(148, 163, 184, 0.24) !important;
            color: #e8eefc !important;
        }

        .gr-error {
            background: linear-gradient(135deg, rgba(68, 18, 32, 0.95) 0%, rgba(39, 12, 20, 0.95) 100%) !important;
            color: #ffd5dc !important;
        }

        ::-webkit-scrollbar-track {
            background: #111827 !important;
        }
    }
    """

    _app_kwargs = {"title": "LLM Reasoning Explorer Studio"}
    if _supports_kwarg(gr.Blocks, "css"):
        _app_kwargs["css"] = custom_css
    with gr.Blocks(**_app_kwargs) as app:
        gr.Markdown(
            "# LLM Reasoning Explorer Studio\n\n"
            "**Explore attribution results and feature interactions** with our interactive visualization tools. "
            "Browse pre-computed examples or analyze your own text in real-time with powerful AI insights."
        )
        gr.Markdown(f"**Build:** {BUILD_ID} ({BUILD_TS})")

        example_state = gr.State([])

        with (gr.Column(visible=not mm_only) if (public_only or mm_only) else gr.Tab("Public Mode")):
            with gr.Accordion("How to Use", open=False):
                gr.Markdown(
                    "1. **Select a dataset** from 10 available datasets (100 total examples, 10 per dataset)\n"
                    "2. **Choose a model** to compare: Qwen3-4B, Qwen3-30B, or Mistral-7B\n"
                    "3. **Pick a scoring method:** Perplexity or Semantic Similarity\n"
                    "4. **Set the feature level:** Word, Sentence, or Paragraph\n"
                    "5. **Choose an attribution method:** Shapley, Banzhaf, or Influence\n"
                    "6. **View results** in the Text Interaction View (inline highlights) and Bar View (ranked interactions)"
                )
            with gr.Row(elem_classes=["public-controls"]):
                with gr.Column(scale=1, elem_classes=["control-card", "control-card-primary"]):
                    # Prepare initial choices and value before creating component
                    initial_choices = _dataset_choice_labels(datasets)
                    # In mm_only mode the text attribution tab is hidden — no default value
                    # prevents the .change() callback from firing on page load.
                    _preferred_default = "BBQ Disambiguation"
                    if mm_only:
                        initial_value = None
                    elif _preferred_default in initial_choices:
                        initial_value = _preferred_default
                    else:
                        initial_value = initial_choices[0] if initial_choices else None

                    dataset_selector = gr.Dropdown(
                        choices=initial_choices,
                        value=initial_value,
                        label="Dataset",
                        interactive=True,
                        allow_custom_value=False,
                        elem_id="dataset-selector",
                        elem_classes=["bubble-select"],
                    )

                    example_selector = gr.Slider(
                        label="Example ID",
                        minimum=1,
                        maximum=10,
                        step=1,
                        value=1,
                        interactive=True,
                        elem_classes=["example-id-slider"],
                    )
                with gr.Column(scale=1, elem_classes=["control-card", "control-card-secondary"]):
                    model_selector = create_model_selector()
                    scalarizer_selector = gr.Dropdown(
                        choices=PUBLIC_SCALARIZER_CHOICES,
                        value="geomean_jointprob",
                        label="Scalarizer",
                        interactive=True,
                        elem_classes=["bubble-select"],
                    )
                    public_feature_level_selector = create_feature_level_selector(value="word")
                    method_toggle = create_attribution_method_toggle()

            with gr.Accordion("Example Preview", open=True):
                with gr.Row():
                    with gr.Column(scale=3):
                        context_box = gr.Textbox(
                            label="Context",
                            lines=8,
                            interactive=False,
                        )
                    with gr.Column(scale=2):
                        prompt_box = gr.Textbox(
                            label="Prompt",
                            lines=4,
                            interactive=False,
                        )
                        answer_box = gr.Textbox(
                            label="Ground Truth Answer",
                            lines=3,
                            interactive=False,
                        )
                # Empty for examples outside the 30-pair allow-list; renders
                # the model's parsed letter + justification for the others.
                try:
                    model_answer_html = gr.HTML(value="", sanitize_html=False)
                except TypeError:
                    model_answer_html = gr.HTML(value="")

            public_results = create_results_display()

            def _public_mode_compute(
                dataset,
                example_number,
                records,
                model_size,
                scalarizer,
                feature_level,
                method,
                progress=gr.Progress(track_tqdm=True),
            ):
                if mm_only:
                    return tuple([None] * 14)
                if not dataset:
                    raise gr.Error("Please select a dataset.")
                if not example_number:
                    raise gr.Error("Please select an example.")

                dataset_key = _resolve_dataset_key(dataset)
                ex_id = _resolve_example_id(example_number, records)

                method = _normalize_method(method)
                level = _normalize_level(feature_level)
                model_size = _normalize_model_size(model_size)

                # Prefer precomputed results: use loader if available, else load from file (Space-friendly).
                get_res = get_result_by_id if get_result_by_id is not None else _public_get_result_from_file
                result = get_res(
                    model_size,
                    dataset_key,
                    ex_id,
                    scalarizer=scalarizer,
                    feature_level=level,
                ) or {}
                payload = result.get(method, {})

                if not payload:
                    alt_size = _find_available_model_size(dataset_key, ex_id, scalarizer, level)
                    if alt_size and alt_size != model_size:
                        result = get_res(
                            alt_size,
                            dataset_key,
                            ex_id,
                            scalarizer=scalarizer,
                            feature_level=level,
                        ) or {}
                        payload = result.get(method, {})
                        if payload:
                            model_size = alt_size

                # If still no payload, try any available (model_size, scalarizer, level) for this example
                if not payload:
                    alt_size, alt_scalarizer, alt_level, result = _find_any_available_result(
                        dataset_key, ex_id, get_res, method
                    )
                    if alt_size and alt_scalarizer and alt_level and result:
                        payload = result.get(method, {})
                        model_size, scalarizer, level = alt_size, alt_scalarizer, alt_level

                if payload and (payload.get("features") or payload.get("heatmap")):
                    _, _, _, *outputs = on_select_example(
                        dataset_key,
                        ex_id,
                        model_size,
                        2,
                        method,
                        scalarizer=scalarizer,
                        feature_level=level,
                    )
                    return outputs

                # Public-only mode: do not attempt live compute
                if _public_only_mode() or get_example_by_id is None:
                    expected_ref = _reference_results_file(model_size, dataset_key, ex_id, scalarizer, level)
                    raise gr.Error(
                        "No precomputed results found.\n\n"
                        f"Expected (reference_answer):\n{expected_ref}\n\n"
                        "On Hugging Face Space: make sure the 'results' folder is in your repo "
                        "(commit & push it). If you use Git LFS, enable 'LFS' in Space Settings → "
                        "Repository and ensure files are pulled. You can also try another "
                        "scalarizer (e.g. Perplexity) or feature level (e.g. word)."
                    )

                # Fallback to live compute if no precomputed payload or non-word level
                get_ex = _ensure_backend("loader.data.get_example_by_id", get_example_by_id)
                record = get_ex(dataset_key, ex_id)
                context = record.get("context", "")
                prompt = record.get("prompt", "")
                answer = _extract_answer(record)

                return _compute_live_attributions(
                    context=context,
                    prompt=prompt,
                    correct_answer=answer,
                    model_size=model_size,
                    scalarizer=scalarizer,
                    embedding_model=None,
                    level=level,
                    method=method,
                    order=2,
                    progress=progress,
                )

            public_preview_outputs = [context_box, prompt_box, answer_box]
            public_compute_inputs = [
                dataset_selector,
                example_selector,
                example_state,
                model_selector,
                scalarizer_selector,
                public_feature_level_selector,
                method_toggle,
            ]
            public_compute_outputs = _results_output_list(public_results)

            dataset_change_event = dataset_selector.change(
                fn=_load_examples_for_slider,
                inputs=[dataset_selector],
                outputs=[
                    example_selector,
                    example_state,
                    context_box,
                    prompt_box,
                    answer_box,
                ],
                queue=False,
            ).then(
                fn=_build_model_answer_panel,
                inputs=[dataset_selector, example_selector],
                outputs=[model_answer_html],
                queue=False,
            )
            load_event = app.load(
                fn=_load_examples_for_slider,
                inputs=[dataset_selector],
                outputs=[
                    example_selector,
                    example_state,
                    context_box,
                    prompt_box,
                    answer_box,
                ],
            ).then(
                fn=_build_model_answer_panel,
                inputs=[dataset_selector, example_selector],
                outputs=[model_answer_html],
                queue=False,
            )

            dataset_change_event.then(
                fn=_public_mode_compute,
                inputs=public_compute_inputs,
                outputs=public_compute_outputs,
                show_progress="full",
            )
            load_event.then(
                fn=_public_mode_compute,
                inputs=public_compute_inputs,
                outputs=public_compute_outputs,
                show_progress="full",
            )

            example_selector.release(
                fn=_update_example_preview,
                inputs=[example_selector, example_state],
                outputs=public_preview_outputs,
                queue=False,
            ).then(
                fn=_build_model_answer_panel,
                inputs=[dataset_selector, example_selector],
                outputs=[model_answer_html],
                queue=False,
            ).then(
                fn=_public_mode_compute,
                inputs=public_compute_inputs,
                outputs=public_compute_outputs,
                show_progress="full",
            )

            for component in (
                model_selector,
                scalarizer_selector,
                public_feature_level_selector,
                method_toggle,
            ):
                component.change(
                    fn=_public_mode_compute,
                    inputs=public_compute_inputs,
                    outputs=public_compute_outputs,
                    show_progress="full",
                )

        # ── MULTIMODAL TAB ──────────────────────────────────────────
        with gr.Tab("Multimodal"):

                with gr.Accordion("How to Use", open=False):
                    gr.Markdown(
                        "1. **Choose a dataset** from the three sub-tabs:\n"
                        "   - **MIMIC-CXR (10 Samples)** — chest X-rays across 10 pathology categories\n"
                        "   - **Dermoscopy ISIC (10 Samples)** — skin-lesion dermoscopy across 8 diagnostic classes\n"
                        "   - **MS-COCO (5 Samples)** — natural-image cross-modal benchmark\n"
                        "2. **Pick an example** from the dropdown (each is an image + caption pair)\n"
                        "3. **Choose an attribution method:** Influence (default, non-negative — clearer for clinicians) or Shapley (signed)\n"
                        "4. **Read the four panels side-by-side:**\n"
                        "   - **Interactive Cross-Modal View** — click any image patch or caption word to see its strongest cross-modal partners\n"
                        "   - **BiomedCLIP Cross-Modal Attribution** — patch-level overlay + bar charts (cosine-similarity scoring)\n"
                        "   - **LLaVA-Med Attribution** — log-prob and generation Shapley charts from the medical 7B VLM\n"
                        "   - **Compare Two Methods Side-by-Side** — pick any two of the above to overlay their rankings\n"
                        "5. **Hover token chips and patches** for exact attribution values; hover SVG arcs for pairwise interaction strength"
                    )

                with gr.Tab("MIMIC-CXR (10 Samples)"):
                    gr.Markdown(
                        "**10-sample MIMIC-CXR chest X-ray attribution benchmark** "
                        "(10 pathology categories).  \n"
                        "Source: [MIMIC-CXR-JPG](https://huggingface.co/datasets/itsanmolgupta/mimic-cxr-dataset-cleaned) "
                        "— de-identified chest radiographs from Beth Israel Deaconess Medical Center.  \n"
                        "Each example has a radiology report (impression = caption, findings = detail)."
                    )

                    # Build (category_name, example_id) choices so picking a
                    # pathology directly loads its example (1:1 mapping).
                    _mimic_choices = (
                        [(v["category"], k) for k, v in MIMIC_EXAMPLES.items()]
                        if _MIMIC_AVAILABLE else []
                    )

                    mimic_selector = gr.Dropdown(
                        choices=_mimic_choices,
                        value=None,
                        label="Filter by Pathology",
                        interactive=True,
                    )

                    mimic_method_toggle = gr.Radio(
                        choices=["Influence", "Shapley"],
                        value="Influence",
                        label="Attribution method",
                        info=(
                            "Influence (default) is always positive — clearer for clinicians. "
                            "Shapley is signed (green = supports caption, red = detracts)."
                        ),
                        interactive=True,
                    )

                    mimic_caption = gr.Textbox(
                        label="Radiology Impression (Caption)",
                        interactive=False,
                        lines=2,
                    )
                    with gr.Accordion("Full Radiology Findings", open=False):
                        mimic_findings = gr.Textbox(
                            label="Detailed Findings",
                            interactive=False,
                            lines=5,
                        )

                    # ── Original Image ────────────────────────────────
                    mimic_original = gr.Image(label="Chest X-ray", type="filepath")

                    mimic_interpretation = gr.Markdown(
                        value="*Select an example above to see the attribution analysis.*",
                        label="Interpretation",
                    )

                    # ── Table of Contents ─────────────────────────────
                    _mimic_pill = (
                        'style="display:inline-block;padding:6px 14px;background:#e3f2fd;'
                        'border-radius:16px;text-decoration:none;color:#1565c0;font-size:0.9em;'
                        'border:1px solid #bbdefb;"'
                    )
                    _mimic_toc_html = (
                        '<div style="background:#f8f9fa;border:1px solid #dee2e6;border-radius:8px;'
                        'padding:16px;margin:12px 0;">'
                        '<strong style="font-size:1.05em;">Jump to Section:</strong>'
                        '<div style="display:flex;flex-wrap:wrap;gap:8px;margin-top:10px;">'
                        f'<a href="#mimic-method-biomedclip" {_mimic_pill}>BiomedCLIP Cross-Modal</a>'
                        f'<a href="#mimic-method-llavamed" {_mimic_pill}>LLaVA-Med (UnSAM)</a>'
                        f'<a href="#mimic-interactive" {_mimic_pill}>Interactive View</a>'
                        f'<a href="#mimic-compare" {_mimic_pill}>Compare Methods</a>'
                        '</div></div>'
                    )
                    gr.HTML(value=_mimic_toc_html)

                    mimic_results_state = gr.State({})

                    # ════════════════════════════════════════════════════
                    # ── Interactive Cross-Modal View ───────────────────
                    # ════════════════════════════════════════════════════
                    with gr.Column(elem_id="mimic-interactive"):
                        gr.Markdown("---\n### BiomedCLIP Cross-Modal Interaction View — click segments or words")
                        gr.Markdown(
                            "**How to use:** Click any **image region** to see which caption words "
                            "it connects to, or click a **word** to see which regions activate.  \n"
                            "**Green** arrows = positive interaction. **Red** arrows = negative."
                        )
                        mimic_biomedclip_interaction_html = _html_component(
                            "BiomedCLIP Cross-Modal Interaction View")

                    # ════════════════════════════════════════════════════
                    # ── BiomedCLIP Cross-Modal Attribution ─────────────
                    # ════════════════════════════════════════════════════
                    with gr.Column(elem_id="mimic-method-biomedclip"):
                        gr.Markdown("---\n### BiomedCLIP Cross-Modal Attribution")
                        gr.Markdown(
                            "**What it does:** Uses [BiomedCLIP](https://huggingface.co/microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224) "
                            "— a CLIP model trained on **15 million biomedical figure-caption pairs** — "
                            "to jointly score image regions (via UnSAM segmentation) and caption tokens.  \n"
                            "**How to read:** **Green** = positive Shapley value (contributes to alignment). "
                            "**Red** = negative (hurts alignment)."
                        )
                        mimic_biomedclip_overlay = gr.Image(
                            label="BiomedCLIP Overlay (labeled segments)", type="filepath")
                        mimic_biomedclip_token_plot = gr.Plot(
                            label="BiomedCLIP — Caption Word Shapley Values")
                        mimic_biomedclip_region_plot = gr.Plot(
                            label="BiomedCLIP — Image Region Shapley Values")

                    # ════════════════════════════════════════════════════
                    # ── LLaVA-Med Attribution (UnSAM Segments) ─────────
                    # ════════════════════════════════════════════════════
                    with gr.Column(elem_id="mimic-method-llavamed"):
                        gr.Markdown("---\n### LLaVA-Med Attribution (4×4 Patch Grid, P1–P16)")
                        gr.Markdown(
                            "**What it does:** Uses [LLaVA-Med](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) "
                            "— a **7B parameter** medical VLM — evaluated over a **uniform 4×4 patch grid** "
                            "(16 cells labeled **P1–P16**, row-major).  \n"
                            "**Two scoring approaches:**  \n"
                            "- **Log-Prob:** How removing a region affects confidence in the correct caption  \n"
                            "- **Generation:** How removing a region changes what the model describes"
                        )
                        gr.Markdown(
                            "Each method colors segments by its own Shapley values — "
                            "**green** = positive, **red** = negative. Signs often differ "
                            "between Log-Prob and Generation, so each has its own overlay."
                        )
                        with gr.Row(equal_height=True):
                            mimic_llavamed_unsam_lp_overlay = gr.Image(
                                label="LLaVA-Med Log-Prob — Overlay",
                                type="filepath", height=600)
                            mimic_llavamed_unsam_gen_overlay = gr.Image(
                                label="LLaVA-Med Generation — Overlay",
                                type="filepath", height=600)
                        with gr.Row():
                            mimic_llavamed_unsam_lp_plot = gr.Plot(
                                label="LLaVA-Med Log-Prob — Segment Shapley Values")
                            mimic_llavamed_unsam_gen_plot = gr.Plot(
                                label="LLaVA-Med Generation — Segment Shapley Values")

                    # ════════════════════════════════════════════════════
                    # ── Compare Two Methods Side-by-Side ──────────────
                    # ════════════════════════════════════════════════════
                    with gr.Column(elem_id="mimic-compare"):
                        gr.Markdown("---\n### Compare Two Methods Side-by-Side")
                        gr.Markdown(
                            "Select two methods to compare their attribution overlays "
                            "and Shapley value distributions on the same image."
                        )
                        with gr.Row():
                            mimic_compare_method_a = gr.Dropdown(
                                choices=_MIMIC_METHOD_NAMES,
                                label="Method A",
                                interactive=True,
                            )
                            mimic_compare_method_b = gr.Dropdown(
                                choices=_MIMIC_METHOD_NAMES,
                                label="Method B",
                                interactive=True,
                            )
                        with gr.Row():
                            mimic_compare_img_a = gr.Image(label="Method A — Overlay", type="filepath")
                            mimic_compare_img_b = gr.Image(label="Method B — Overlay", type="filepath")
                        with gr.Row():
                            mimic_compare_plot_a = gr.Plot(label="Method A — Shapley Values")
                            mimic_compare_plot_b = gr.Plot(label="Method B — Shapley Values")

                    mimic_meta = gr.JSON(label="Example Info", visible=False)

                    _mimic_outputs = [
                        mimic_caption, mimic_original, mimic_findings, mimic_interpretation,
                        mimic_biomedclip_overlay, mimic_biomedclip_token_plot,
                        mimic_biomedclip_region_plot, mimic_llavamed_unsam_lp_overlay,
                        mimic_llavamed_unsam_lp_plot, mimic_llavamed_unsam_gen_overlay,
                        mimic_llavamed_unsam_gen_plot, mimic_biomedclip_interaction_html,
                        mimic_meta, mimic_results_state, mimic_compare_method_a,
                    ]
                    mimic_selector.change(
                        fn=_on_select_mimic_example,
                        inputs=[mimic_selector, mimic_method_toggle],
                        outputs=_mimic_outputs,
                    )
                    mimic_method_toggle.change(
                        fn=_on_select_mimic_example,
                        inputs=[mimic_selector, mimic_method_toggle],
                        outputs=_mimic_outputs,
                    )

                    # Wire: comparison dropdowns -> side-by-side display
                    for _mimic_cmp_dd in [mimic_compare_method_a, mimic_compare_method_b]:
                        _mimic_cmp_dd.change(
                            fn=_on_mimic_compare_methods,
                            inputs=[mimic_compare_method_a, mimic_compare_method_b,
                                    mimic_results_state],
                            outputs=[mimic_compare_img_a, mimic_compare_img_b,
                                     mimic_compare_plot_a, mimic_compare_plot_b],
                        )

                # ── ISIC Dermoscopy Tab ────────────────────────
                with gr.Tab("Dermoscopy ISIC (10 Samples)"):
                    gr.Markdown(
                        "**10-sample ISIC-2019 dermoscopy attribution benchmark** "
                        "(8 diagnostic classes: MEL × 2, NV × 2, BCC, AK, BKL, DF, VASC, SCC).  \n"
                        "Source: [ISIC_2019_224](https://huggingface.co/datasets/MKZuziak/ISIC_2019_224) "
                        "— dermoscopic skin-lesion images from the International Skin Imaging Collaboration.  \n"
                        "Captions are synthesized from class labels (clinical descriptions of each diagnosis)."
                    )

                    _isic_choices = (
                        [(v["category"], k) for k, v in ISIC_EXAMPLES.items()]
                        if _ISIC_AVAILABLE else []
                    )

                    isic_selector = gr.Dropdown(
                        choices=_isic_choices,
                        value=None,
                        label="Filter by Diagnosis",
                        interactive=True,
                    )

                    isic_method_toggle = gr.Radio(
                        choices=["Influence", "Shapley"],
                        value="Influence",
                        label="Attribution method",
                        info=(
                            "Influence (default) is always positive — clearer for clinicians. "
                            "Shapley is signed (green = supports caption, red = detracts)."
                        ),
                        interactive=True,
                    )

                    isic_caption = gr.Textbox(
                        label="Diagnostic Caption",
                        interactive=False,
                        lines=3,
                    )

                    isic_original = gr.Image(label="Dermoscopic Image", type="filepath")

                    isic_interpretation = gr.Markdown(
                        value="*Select an example above to see the attribution analysis.*",
                        label="Interpretation",
                    )

                    _isic_pill = (
                        'style="display:inline-block;padding:6px 14px;background:#e3f2fd;'
                        'border-radius:16px;text-decoration:none;color:#1565c0;font-size:0.9em;'
                        'border:1px solid #bbdefb;"'
                    )
                    _isic_toc_html = (
                        '<div style="background:#f8f9fa;border:1px solid #dee2e6;border-radius:8px;'
                        'padding:16px;margin:12px 0;">'
                        '<strong style="font-size:1.05em;">Jump to Section:</strong>'
                        '<div style="display:flex;flex-wrap:wrap;gap:8px;margin-top:10px;">'
                        f'<a href="#isic-method-biomedclip" {_isic_pill}>BiomedCLIP Cross-Modal</a>'
                        f'<a href="#isic-method-llavamed" {_isic_pill}>LLaVA-Med (UnSAM)</a>'
                        f'<a href="#isic-interactive" {_isic_pill}>Interactive View</a>'
                        f'<a href="#isic-compare" {_isic_pill}>Compare Methods</a>'
                        '</div></div>'
                    )
                    gr.HTML(value=_isic_toc_html)

                    isic_results_state = gr.State({})

                    with gr.Column(elem_id="isic-interactive"):
                        gr.Markdown("---\n### BiomedCLIP Cross-Modal Interaction View — click segments or words")
                        gr.Markdown(
                            "**How to use:** Click any **image region** to see which caption words "
                            "it connects to, or click a **word** to see which regions activate."
                        )
                        isic_biomedclip_interaction_html = _html_component(
                            "BiomedCLIP Cross-Modal Interaction View")

                    with gr.Column(elem_id="isic-method-biomedclip"):
                        gr.Markdown("---\n### BiomedCLIP Cross-Modal Attribution")
                        gr.Markdown(
                            "**What it does:** Uses [BiomedCLIP](https://huggingface.co/microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224) "
                            "to jointly score dermoscopic image regions (via UnSAM segmentation) "
                            "and caption tokens.  \n"
                            "**How to read:** **Influence** bars (default) show positive importance. "
                            "Switch to **Shapley** above for signed values (green/red)."
                        )
                        isic_biomedclip_overlay = gr.Image(
                            label="BiomedCLIP Overlay (labeled segments)", type="filepath")
                        isic_biomedclip_token_plot = gr.Plot(
                            label="BiomedCLIP — Caption Word Values")
                        isic_biomedclip_region_plot = gr.Plot(
                            label="BiomedCLIP — Image Region Values")

                    with gr.Column(elem_id="isic-method-llavamed"):
                        gr.Markdown("---\n### LLaVA-Med Attribution (4×4 Patch Grid, P1–P16)")
                        gr.Markdown(
                            "**What it does:** Uses [LLaVA-Med](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) "
                            "— a **7B parameter** medical VLM — evaluated over a **uniform 4×4 patch grid** "
                            "(16 cells labeled **P1–P16**, row-major).  \n"
                            "**Two scoring approaches:**  \n"
                            "- **Log-Prob:** How removing a region affects confidence in the caption  \n"
                            "- **Generation:** How removing a region changes what the model describes"
                        )
                        with gr.Row(equal_height=True):
                            isic_llavamed_unsam_lp_overlay = gr.Image(
                                label="LLaVA-Med Log-Prob — Overlay",
                                type="filepath", height=600)
                            isic_llavamed_unsam_gen_overlay = gr.Image(
                                label="LLaVA-Med Generation — Overlay",
                                type="filepath", height=600)
                        with gr.Row():
                            isic_llavamed_unsam_lp_plot = gr.Plot(
                                label="LLaVA-Med Log-Prob — Segment Values")
                            isic_llavamed_unsam_gen_plot = gr.Plot(
                                label="LLaVA-Med Generation — Segment Values")

                    with gr.Column(elem_id="isic-compare"):
                        gr.Markdown("---\n### Compare Two Methods Side-by-Side")
                        gr.Markdown(
                            "Select two methods to compare their attribution overlays "
                            "and value distributions on the same image."
                        )
                        with gr.Row():
                            isic_compare_method_a = gr.Dropdown(
                                choices=_ISIC_METHOD_NAMES,
                                label="Method A",
                                interactive=True,
                            )
                            isic_compare_method_b = gr.Dropdown(
                                choices=_ISIC_METHOD_NAMES,
                                label="Method B",
                                interactive=True,
                            )
                        with gr.Row():
                            isic_compare_img_a = gr.Image(label="Method A — Overlay", type="filepath")
                            isic_compare_img_b = gr.Image(label="Method B — Overlay", type="filepath")
                        with gr.Row():
                            isic_compare_plot_a = gr.Plot(label="Method A — Values")
                            isic_compare_plot_b = gr.Plot(label="Method B — Values")

                    isic_meta = gr.JSON(label="Example Info", visible=False)

                    _isic_outputs = [
                        isic_caption, isic_original, isic_interpretation,
                        isic_biomedclip_overlay, isic_biomedclip_token_plot,
                        isic_biomedclip_region_plot, isic_llavamed_unsam_lp_overlay,
                        isic_llavamed_unsam_lp_plot, isic_llavamed_unsam_gen_overlay,
                        isic_llavamed_unsam_gen_plot, isic_biomedclip_interaction_html,
                        isic_meta, isic_results_state, isic_compare_method_a,
                    ]
                    isic_selector.change(
                        fn=_on_select_isic_example,
                        inputs=[isic_selector, isic_method_toggle],
                        outputs=_isic_outputs,
                    )
                    isic_method_toggle.change(
                        fn=_on_select_isic_example,
                        inputs=[isic_selector, isic_method_toggle],
                        outputs=_isic_outputs,
                    )

                    for _isic_cmp_dd in [isic_compare_method_a, isic_compare_method_b]:
                        _isic_cmp_dd.change(
                            fn=_on_isic_compare_methods,
                            inputs=[isic_compare_method_a, isic_compare_method_b,
                                    isic_results_state],
                            outputs=[isic_compare_img_a, isic_compare_img_b,
                                     isic_compare_plot_a, isic_compare_plot_b],
                        )

                # ── MS-COCO Tab ─────────────────────────────────
                with gr.Tab("MS-COCO (5 Samples)"):
                    gr.Markdown(
                        "**CLIP cross-modal attribution on MS-COCO natural images.**  \n"
                        "Click an **image region** or **caption word** below to explore "
                        "which parts of the image and text are most strongly linked via "
                        "CLIP's visual-language similarity score."
                    )

                    _coco_choices = (
                        [(v["title"], k) for k, v in COCO_EXAMPLES.items()]
                        if _COCO_AVAILABLE else []
                    )
                    _coco_default = _coco_choices[0][1] if _coco_choices else None
                    coco_selector = gr.Radio(
                        choices=_coco_choices,
                        value=_coco_default,
                        label="Select MS-COCO Example",
                        interactive=True,
                    )
                    coco_method_toggle = gr.Radio(
                        choices=["Influence", "Shapley"],
                        value="Influence",
                        label="Attribution method",
                        info="Influence (default) is always positive. Shapley is signed.",
                        interactive=True,
                    )
                    coco_caption = gr.Textbox(
                        label="Caption", interactive=False, lines=2,
                    )

                    gr.Markdown("---\n#### Interactive Cross-Modal View")
                    gr.Markdown(
                        "Click a colored **image region** (left) to highlight the caption "
                        "words it interacts with, or click a **word** (right) to highlight "
                        "linked regions. Green = positive, red = negative."
                    )
                    coco_interaction_html = _html_component(
                        "COCO Cross-Modal Interaction View")

                    gr.Markdown("---\n#### Attribution Details")
                    with gr.Row():
                        coco_original = gr.Image(
                            label="Original Image", type="filepath")
                        coco_overlay = gr.Image(
                            label="CLIP Overlay (labeled segments)", type="filepath")

                    with gr.Row():
                        coco_token_plot = gr.Plot(
                            label="Caption Word Shapley Values")
                        coco_region_plot = gr.Plot(
                            label="Image Region Shapley Values")

                    with gr.Row():
                        coco_cross_plot = gr.Plot(
                            label="Top Image x Word Interactions")
                        coco_cross_table = gr.Dataframe(
                            headers=["Image Region", "Caption Word", "Score"],
                            label="Cross-Modal Interaction Table",
                            interactive=False,
                        )

                    with gr.Accordion("Influence Heatmap (Regions x Words)",
                                      open=False):
                        coco_heatmap = gr.Plot(
                            label="Full Heatmap: Regions x Caption Words")

                    gr.Markdown("---\n#### Masked Image Browser")
                    gr.Markdown(
                        "Browse ablation images: **solo** shows only the selected region "
                        "(everything else inpainted away); **removed** shows the image with "
                        "that region inpainted out."
                    )
                    with gr.Row():
                        coco_masked_dd = gr.Dropdown(
                            choices=[],
                            label="Region / View",
                            interactive=True,
                        )
                        coco_masked_img = gr.Image(
                            label="Masked View", type="filepath")

                    coco_note = gr.Markdown(value="")

                    _coco_outputs = [
                        coco_caption,
                        coco_original,
                        coco_overlay,
                        coco_interaction_html,
                        coco_token_plot,
                        coco_region_plot,
                        coco_cross_plot,
                        coco_cross_table,
                        coco_heatmap,
                        coco_note,
                        coco_masked_img,
                        coco_masked_dd,
                    ]
                    coco_selector.change(
                        fn=_on_select_coco_example,
                        inputs=[coco_selector, coco_method_toggle],
                        outputs=_coco_outputs,
                    )
                    coco_method_toggle.change(
                        fn=_on_select_coco_example,
                        inputs=[coco_selector, coco_method_toggle],
                        outputs=_coco_outputs,
                    )
                    coco_masked_dd.change(
                        fn=_on_select_coco_masked,
                        inputs=[coco_selector, coco_masked_dd],
                        outputs=[coco_masked_img],
                    )

        # NOTE: auto-load removed — too much data on startup crashes the browser.
        # Users select an example via the Radio to trigger loading.

        gr.HTML(
            '<div style="margin-top:32px;padding:20px;border-top:1px solid #e5e7eb;text-align:center;">'
            '<p style="font-weight:600;margin-bottom:10px;">Contributors — University of California, Berkeley</p>'
            '<p style="display:flex;justify-content:center;gap:40px;flex-wrap:wrap;font-size:0.9em;">'
            '<span><strong>Stephen Tao</strong> · Loader Layer · '
            '<a href="mailto:stephen_tao@berkeley.edu" style="color:#6366f1;">stephen_tao@berkeley.edu</a></span>'
            '<span><strong>Yiting Gao</strong> · Attribution Layer · '
            '<a href="mailto:yg2025@berkeley.edu" style="color:#6366f1;">yg2025@berkeley.edu</a></span>'
            '<span><strong>Qingpeng Kong</strong> · Visualization Layer · '
            '<a href="mailto:qpkong@berkeley.edu" style="color:#6366f1;">qpkong@berkeley.edu</a></span>'
            '</p>'
            '<p style="display:flex;justify-content:center;gap:40px;flex-wrap:wrap;font-size:0.9em;margin-top:6px;">'
            '<span><strong>Advisor:</strong> Kannan Ramchandran · '
            '<a href="mailto:kannanr@berkeley.edu" style="color:#6366f1;">kannanr@berkeley.edu</a></span>'
            '<span><strong>Mentor:</strong> Landon Butler · '
            '<a href="mailto:landonb@berkeley.edu" style="color:#6366f1;">landonb@berkeley.edu</a></span>'
            '</p></div>'
        )

    # Stash CSS for Gradio 6.x launch() (Blocks(css=) is deprecated in 6.x)
    app._custom_css = custom_css
    return app


def _launch_kwargs(app_or_demo, **kwargs):
    """Build common launch kwargs, injecting CSS for Gradio 6.x."""
    lk = dict(
        server_name=kwargs.pop("server_name", os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")),
        server_port=int(kwargs.pop("server_port", os.getenv("GRADIO_SERVER_PORT", "7860"))),
        share=kwargs.pop("share", _env_flag("GRADIO_SHARE", False)),
        show_error=kwargs.pop("show_error", True),
    )
    css = getattr(app_or_demo, "_custom_css", None)
    if css and _supports_kwarg(app_or_demo.launch, "css"):
        lk["css"] = css
    lk.update(kwargs)
    return lk


def launch_demo(**kwargs):
    demo = build_demo_app()
    demo.launch(**_launch_kwargs(demo, **kwargs))


def launch_app(**kwargs):
    app = build_app()
    app.launch(**_launch_kwargs(app, **kwargs))


if __name__ == "__main__":
    launch_app()