'''
streamlitapp.py — Vision Transformer Interpretability Dashboard (Streamlit app)

This Streamlit app provides interpretability tools for vision transformer and CNN models.
Features:
- LIME explanations for image classification predictions
- Uncertainty analysis via MC Dropout and Test-Time Augmentation (TTA)
- Switch between Hugging Face (ViT, Swin, DeiT) and timm (ResNet, EfficientNet, ConvNeXt) models
- Support for custom finetuned models and class mappings
- Interactive sidebar for model selection and checkpoint upload
- Feynman-style explanations and cheat-sheet for interpretability concepts

Inspired by and reuses code from:
- vit_and_captum.py (Integrated Gradients with Captum)
- vit_lime_uncertainty.py (LIME explanations and uncertainty)
- detr_and_interp.py (Grad-CAM for DETR, logging setup)
'''

import streamlit as st
import html
import numpy as np, torch, matplotlib.pyplot as plt
from PIL import Image
from transformers import AutoModelForImageClassification, AutoImageProcessor, PreTrainedModel
from lime import lime_image
import torchvision.transforms as T
import timm
from skimage.segmentation import slic, mark_boundaries
import streamlit.components.v1 as components


# Add logging
import logging, os
from logging.handlers import RotatingFileHandler

LOG_DIR = os.path.join(os.path.dirname(__file__), "logs")
os.makedirs(LOG_DIR, exist_ok=True)
logfile = os.path.join(LOG_DIR, "interp.log")

logger = logging.getLogger("interp")
if not logger.handlers:
    logger.setLevel(logging.INFO)
    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    fh = RotatingFileHandler(logfile, maxBytes=5_000_000, backupCount=3, encoding="utf-8")
    fh.setLevel(logging.INFO)
    fmt = logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
    sh.setFormatter(fmt)
    fh.setFormatter(fmt)
    logger.addHandler(sh)
    logger.addHandler(fh)


# ---------------- Setup ----------------
MODEL_NAME = "google/vit-base-patch16-224"
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------- Sidebar model selectors ----------
# Quick lists you can edit to test other HF / timm models
HF_MODELS = [
    "google/vit-base-patch16-224",
    "facebook/deit-base-patch16-224",
    "microsoft/swin-tiny-patch4-window7-224",
    "google/vit-large-patch16-224",
]
TIMM_MODELS = [
    "convnext_base",
    "resnet50",
    "efficientnet_b0",
]

def model_selector(slot_key: str, default_source="hf"):
    source = st.sidebar.selectbox(
        f"{slot_key} source",
        ["hf", "timm"],
        index=0 if default_source == "hf" else 1,
        key=f"{slot_key}_source",
    )
    if source == "hf":
        hf_choice = st.sidebar.selectbox(
            f"{slot_key} Hugging Face model",
            HF_MODELS,
            index=0,
            key=f"{slot_key}_hf",
        )
        return f"hf:{hf_choice}"
    else:
        timm_choice = st.sidebar.selectbox(
            f"{slot_key} timm model",
            TIMM_MODELS,
            index=0,
            key=f"{slot_key}_timm",
        )
        return f"timm:{timm_choice}"

# ---------- Model Loader ----------
# Use Streamlit caching when available to avoid repeated downloads
try:
    cache_decorator = st.cache_resource
except Exception:
    from functools import lru_cache
    cache_decorator = lru_cache(maxsize=8)

@cache_decorator
def load_model(choice, checkpoint=None, class_map=None, num_classes=None):
    """
    Load a model from HF, timm, or a custom checkpoint
    Args:
        choice: Model identifier ('hf:model_name' or 'timm:model_name')
        checkpoint: Optional path to custom checkpoint file
        class_map: Optional dict mapping class indices to labels
        num_classes: Optional number of classes for custom models
    """
    logger.info("Loading model: %s", choice)
    is_hf = choice.startswith("hf:")
    
    # Parse model identifier
    if is_hf:
        hf_name = choice.split("hf:")[1]
        if checkpoint:  # Custom checkpoint
            # For custom HF model, first load the architecture then apply weights
            try:
                if num_classes:
                    model = AutoModelForImageClassification.from_pretrained(
                        hf_name, num_labels=num_classes, ignore_mismatched_sizes=True
                    ).to(device)
                else:
                    model = AutoModelForImageClassification.from_pretrained(hf_name).to(device)
                
                # Load checkpoint with error handling
                state_dict = torch.load(checkpoint, map_location=device)
                # If state_dict is wrapped (common in training checkpoints)
                if "model" in state_dict:
                    state_dict = state_dict["model"]
                elif "state_dict" in state_dict:
                    state_dict = state_dict["state_dict"]
                    
                # Handle any prefix differences by checking and stripping if needed
                if all(k.startswith('model.') for k in state_dict if k != 'config'):
                    state_dict = {k[6:]: v for k, v in state_dict.items() if k != 'config'}
                
                # Load with flexible partial loading (ignore missing/unexpected)
                model.load_state_dict(state_dict, strict=False)
                logger.info("Custom checkpoint loaded for HF model")
                
                # If custom class mapping provided, update config
                if class_map:
                    model.config.id2label = class_map
                    model.config.label2id = {v: int(k) for k, v in class_map.items()}
            except Exception as e:
                logger.error(f"Error loading custom HF model: {e}")
                st.error(f"Failed to load custom model: {e}")
                # Fallback to base model
                model = AutoModelForImageClassification.from_pretrained(hf_name).to(device)
        else:
            # Standard HF model
            model = AutoModelForImageClassification.from_pretrained(hf_name).to(device)
        
        processor = AutoImageProcessor.from_pretrained(hf_name)
    
    elif choice.startswith("timm:"):
        name = choice.split("timm:")[1]
        if checkpoint:  # Custom checkpoint
            try:
                # For timm, specify custom number of classes if provided
                if num_classes:
                    model = timm.create_model(name, pretrained=False, num_classes=num_classes).to(device)
                else:
                    model = timm.create_model(name, pretrained=True).to(device)
                
                # Load checkpoint
                state_dict = torch.load(checkpoint, map_location=device)
                # Handle common checkpoint formats
                if "model" in state_dict:
                    state_dict = state_dict["model"]
                elif "state_dict" in state_dict:
                    state_dict = state_dict["state_dict"]
                
                # Handle any prefix differences
                if all(k.startswith('module.') for k in state_dict):
                    state_dict = {k[7:]: v for k, v in state_dict}
                
                model.load_state_dict(state_dict, strict=False)
                logger.info("Custom checkpoint loaded for timm model")
            except Exception as e:
                logger.error(f"Error loading custom timm model: {e}")
                st.error(f"Failed to load custom model: {e}")
                # Fallback to pretrained
                model = timm.create_model(name, pretrained=True).to(device)
        else:
            # Standard timm model
            model = timm.create_model(name, pretrained=True).to(device)
        
        # Use a standard processor for timm
        processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
        
    # Set model to eval mode
    model.eval()
    logger.info("Model %s loaded (eval mode)", choice)
    
    # Return model, processor, flag for HF, and class map
    return model, processor, is_hf, class_map

# Add sidebar with clear sections
st.sidebar.title("Model Selection")

# Enhanced sidebar with custom model support
with st.sidebar:
    # Add tabs for standard vs custom models
    tab1, tab2 = st.tabs(["Standard Models", "Custom Finetuned Models"])
    
    with tab1:
        st.markdown("### 📊 Standard Models")
        st.markdown("Choose from pre-trained models:")
        m1 = model_selector("Active Model", default_source="hf")
        
        # Button to apply standard model change
        if st.button("📋 Set as Active Model", help="Click to use the selected model for analysis", key="std_model_btn"):
            with st.spinner(f"Loading {m1}..."):
                model, processor, is_hf_model, _ = load_model(m1)
                st.session_state.model = model
                st.session_state.processor = processor
                st.session_state.is_hf_model = is_hf_model
                st.session_state.active_model = m1
                st.session_state.using_custom = False
                st.session_state.class_map = None
                st.success(f"✅ Model activated: {m1}")
    
    with tab2:
        st.markdown("### 🔧 Custom Finetuned Model")
        st.markdown("Use your own finetuned model:")
        
        # Select base architecture
        custom_source = st.selectbox(
            "Base architecture source",
            ["hf", "timm"],
            key="custom_source"
        )
        
        if custom_source == "hf":
            custom_base = st.selectbox(
                "Hugging Face base model",
                HF_MODELS,
                key="custom_hf_base"
            )
            base_model = f"hf:{custom_base}"
        else:
            custom_base = st.selectbox(
                "timm base model",
                TIMM_MODELS,
                key="custom_timm_base"
            )
            base_model = f"timm:{custom_base}"
        
        # Upload checkpoint file
        uploaded_checkpoint = st.file_uploader(
            "Upload model checkpoint (.pth, .bin)",
            type=["pth", "bin", "pt", "ckpt"],
            help="Upload your finetuned model weights"
        )
        
        # Optional class mapping
        custom_classes = st.number_input(
            "Number of classes (if different from base model)",
            min_value=0, max_value=1000, value=0,
            help="Leave at 0 to use default classes from base model"
        )
        
        uploaded_labels = st.file_uploader(
            "Upload class labels (optional JSON)",
            type=["json"],
            help="JSON file mapping class indices to labels: {\"0\": \"cat\", \"1\": \"dog\"}"
        )
        
        # Process label mapping
        class_map = None
        if uploaded_labels:
            try:
                import json
                class_map = json.loads(uploaded_labels.getvalue().decode("utf-8"))
                st.success(f"✓ Loaded {len(class_map)} class labels")
            except Exception as e:
                st.error(f"Error loading class labels: {e}")
        
        # Store uploaded file in session state if provided
        if uploaded_checkpoint:
            # Save to a temporary file
            import tempfile
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pth') as tmp_file:
                tmp_file.write(uploaded_checkpoint.getvalue())
                checkpoint_path = tmp_file.name
                
                # Store in session state
                if 'checkpoint_path' not in st.session_state:
                    st.session_state.checkpoint_path = checkpoint_path
            
            st.success("✓ Checkpoint ready to use")
            
            # Button to apply custom model
            if st.button("🚀 Load Custom Model", help="Click to use your custom model"):
                with st.spinner(f"Loading custom model based on {base_model}..."):
                    try:
                        num_classes = custom_classes if custom_classes > 0 else None
                        model, processor, is_hf_model, class_map = load_model(
                            base_model, checkpoint_path, class_map, num_classes
                        )
                        st.session_state.model = model
                        st.session_state.processor = processor
                        st.session_state.is_hf_model = is_hf_model
                        st.session_state.active_model = f"Custom {base_model}"
                        st.session_state.using_custom = True
                        st.session_state.class_map = class_map
                        st.success(f"✅ Custom model activated!")
                    except Exception as e:
                        st.error(f"Failed to load custom model: {str(e)}")
    
    # Explanation section
    st.markdown("---")
    st.markdown("### ℹ️ Model Types")
    st.markdown("""
    - **HF (Hugging Face)**: Vision Transformer models with standard interpretability
    - **timm (PyTorch Image Models)**: Classical CNN architectures like ResNet, EfficientNet
    
    *Custom models must match the base architecture's format.*
    """)

# Initialize model and processor from session state
if 'active_model' not in st.session_state:
    # First time loading - use default model
    m1 = "hf:google/vit-base-patch16-224" 
    st.session_state.active_model = m1
    model, processor, is_hf_model, _ = load_model(m1)
    st.session_state.model = model
    st.session_state.processor = processor
    st.session_state.is_hf_model = is_hf_model
    st.session_state.using_custom = False
    st.session_state.class_map = None
else:
    # Get from session state
    model = st.session_state.model
    processor = st.session_state.processor
    is_hf_model = st.session_state.is_hf_model

# Initialize explainer
explainer = lime_image.LimeImageExplainer()

st.title("🧠 Vision Transformer Interpretability Dashboard")
st.write("Upload an image and explore explanations with **LIME** and **Uncertainty Analysis**.")

# Add a Feynman-style "How it works" explanation as a collapsible expander
with st.expander("How it works — Feynman-style explanations (click to expand)", expanded=False):
    st.markdown("""
        ## 🧠 Vision Transformer Interpretability — Feynman-Style Explanations

        ### Why do we care about interpretability & uncertainty?

        Imagine you ask a kid to identify whether a picture is a cat. They point to the fur, ears, maybe whiskers. But what if the kid always focused on shadows, or background trees, instead of the cat itself? We want two things:

        1. **Why** did the model say “cat”? What parts of the image made it decide so?
        2. **How confident** is the model in that decision? Could small changes flip it?

        Interpretable methods show us #1. Uncertainty estimation shows us #2. Together, they help us see not just *what* the model does, but *whether* we should trust it.

        ### Key techniques, in plain analogies

        - **LIME (Local Interpretable Model-agnostic Explanations)**: For a single image & prediction, LIME perturbs (changes) parts of the image, watches how the prediction changes, and fits a simple model locally to understand which parts are most influential.
            - Analogy: Like shining small spotlights on different parts of a stage during a play: you dim a section, see how the actor’s reaction changes. The parts whose dimming changes the reaction most are parts the actor depends on.

        - **Uncertainty in LIME (multiple LIME runs)**: Because LIME uses randomness (perturbing patches), different runs can give different “important” regions. Measuring how much they differ tells you how stable/fragile the explanation is.
            - Analogy: If you ask several cooks what the dominant spice in a stew is and everyone agrees, you're confident; if opinions vary, your knowledge is shakier.

        - **MC Dropout (Monte Carlo Dropout)**: Leave dropout on at inference time and run the model multiple times. The spread of predictions is a proxy for epistemic uncertainty.
            - Analogy: Like a jury where each juror occasionally misses a sentence; if the verdict remains the same across many "faulty hearing" runs, trust it more.

        - **Test-Time Augmentation (TTA) Uncertainty**: Apply small transforms (crops, flips) at inference and watch prediction variance. High variance → brittle model.
            - Analogy: Take photos under slightly different lighting/angles; if the label flips, the model may depend on superficial cues.

        ### How to read the visuals

        - LIME highlights: bright / colored superpixels = influential regions. If background or artifacts light up, that's a red flag.
        - LIME uncertainty heatmap: high std in a region means attributions are unstable there.
        - MC Dropout / TTA histograms: narrow/tall peak = confident, wide/multi-modal = uncertain.

        ### Limitations & caveats

        - Stable explanations can still be consistently wrong if the model learned a bias.
        - MC Dropout is an approximation — it helps but doesn't fully replace calibrated probabilistic methods.
        - TTA shows input sensitivity, not full distributional shift robustness.

        ### Quick example (walkthrough)

        1. Upload image → model predicts label with some probability.
        2. LIME finds important superpixels; multiple LIME runs give mean + std maps.
        3. MC Dropout produces a histogram over runs; use it to judge epistemic uncertainty.
        4. TTA shows sensitivity to small input changes.

        ### Practical tips

        - Use explanation + uncertainty to guide active learning: label cases where the model is uncertain or explanations are unstable.
        - For safety-critical systems, combine these visual signals with human review and stricter failure thresholds.

        ### Where to read more

        - Christoph Molnar — Interpretable Machine Learning (chapter on LIME): https://christophm.github.io/interpretable-ml-book/lime.html
        - Ribeiro et al., "Why Should I Trust You?" (original LIME paper): https://homes.cs.washington.edu/~marcotcr/blog/lime/
        - Zhang et al., "Why Should You Trust My Explanation?" (LIME reliability): https://arxiv.org/abs/1904.12991
        - MC Dropout practical guide & notes: https://medium.com/@ciaranbench/monte-carlo-dropout-a-practical-guide-4b4dc18014b5
        """)

# Compact one-page cheat-sheet (quick flags & checks)
with st.expander("Cheat-sheet — Quick flags & warnings", expanded=False):
        cheat_text = """
Quick checks when an explanation looks suspicious

- Red flag: LIME highlights background or repeated dataset artifacts (logos, borders) — model may have learned spurious cues.
- Red flag: LIME attribution std is high in key regions — explanation unstable; try different segmentations or more samples.
- Red flag: MC Dropout or TTA histograms are multi-modal or very wide — model uncertain; consider human review or abstain.
- Quick fixes: increase dataset diversity, add regularization, try different segmentation_fn parameters, or collect more labels for uncertain cases.

One-line definitions
- LIME: perturb + fit simple local model to explain a single prediction.
- MC Dropout: enable dropout at inference and sample to estimate epistemic uncertainty.
- TTA: apply small input transforms at inference to measure sensitivity / aleatoric uncertainty.

Pro-tip: Use explanation + uncertainty to drive active learning: pick instances with high prediction uncertainty or unstable explanations for labeling.
"""

        # Show the cheat-sheet as markdown
        st.markdown(cheat_text)

        # Download button for the cheat-sheet as plain text
        try:
                st.download_button(
                        label="Download cheat-sheet (.txt)",
                        data=cheat_text,
                        file_name="cheat_sheet.txt",
                        mime="text/plain",
                )
        except Exception:
                # Streamlit may raise if download_button isn't available in some environments; ignore gracefully
                pass

        # Copy-to-clipboard button using a small HTML+JS snippet
        escaped = html.escape(cheat_text)
        copy_html = f"""
        <div>
            <button id='copy-btn' style='padding:6px 10px;border-radius:4px;'>Copy cheat-sheet</button>
            <script>
                const btn = document.getElementById('copy-btn');
                btn.addEventListener('click', async () => {{
                    try {{
                        await navigator.clipboard.writeText(`{escaped}`);
                        btn.innerText = 'Copied!';
                        setTimeout(() => btn.innerText = 'Copy cheat-sheet', 1500);
                    }} catch (e) {{
                        btn.innerText = 'Copy failed';
                    }}
                }});
            </script>
        </div>
        """
        components.html(copy_html, height=70)

# Display active model clearly in the main panel
is_custom = st.session_state.get('using_custom', False)
custom_badge = " 🔧 Custom" if is_custom else ""
st.markdown(f"### Active Model: `{st.session_state.active_model}{custom_badge}`")
model_type = "Hugging Face Transformer" if is_hf_model else "timm CNN Architecture"
st.caption(f"Model type: {model_type}")

# ---------------- Helpers ----------------
def classifier_fn(images_batch):
    # Use current model/processor from session state
    inputs = processor(images=[Image.fromarray(x.astype(np.uint8)) for x in images_batch],
                      return_tensors="pt").to(device)
    with torch.no_grad():
        if is_hf_model:
            outputs = model(**inputs)
            logits = outputs.logits
        else:
            x = inputs['pixel_values']
            logits = model(x)
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
    return probs

def predict_probs(pil_img):
    # Use current model/processor from session state
    inputs = processor(images=pil_img, return_tensors="pt").to(device)
    with torch.no_grad():
        if is_hf_model:
            outputs = model(**inputs)
            logits = outputs.logits
        else:
            x = inputs['pixel_values']
            logits = model(x)
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    return probs

# ---------------- Upload ----------------
uploaded = st.file_uploader("Upload an image", type=["png","jpg","jpeg"])
if uploaded:
    img = Image.open(uploaded).convert("RGB").resize((224,224))
    logger.info("Uploaded image received (size=%s)", img.size)
    # Streamlit 1.XX: replace deprecated `use_container_width` with `width`
    # For full-width behavior use width='stretch' (or 'content' for intrinsic size)
    st.image(img, caption="Uploaded image", width='stretch')

    # ---------------- Prediction ----------------
    probs = predict_probs(img)
    pred_idx = int(np.argmax(probs))
    
    # Get label - handle models differently based on source
    if is_hf_model:
        # Use model's config.id2label if available
        pred_label = model.config.id2label[pred_idx]
    elif st.session_state.get('class_map'):
        # Use custom class map if provided (access defensively)
        _class_map = st.session_state.get('class_map')
        pred_label = _class_map.get(str(pred_idx), f"Class {pred_idx}") if _class_map is not None else f"Class {pred_idx}"
    else:
        # For timm models without labels
        pred_label = f"Class {pred_idx}"
    
    pred_prob = float(probs[pred_idx])
    logger.info("Prediction: %s (%.3f)", pred_label, pred_prob)

    st.subheader("🔮 Prediction")
    st.write(f"**Top-1:** {pred_label} ({pred_prob:.3f})")
    
    if not is_hf_model and not st.session_state.get('class_map'):
        st.info("ℹ️ Using model without class names. Upload a class mapping in the sidebar for friendly labels.")

    # ---------------- LIME ----------------
    st.subheader("📍 LIME Attribution")
    st.markdown("""
    **Local Interpretable Model-agnostic Explanations (LIME)** is a technique that approximates how a complex model (like ViT or ResNet) makes decisions for a specific input by creating a simpler, interpretable model around it. 
    It perturbs the image into segments and sees which ones most influence the prediction, revealing what the model "sees" as important. 
    This is crucial for debugging biases or understanding if the model focuses on relevant features vs. artifacts.
    """)
    img_np = np.array(img)
    
    with st.spinner("Generating LIME explanation..."):
        exp = explainer.explain_instance(
            img_np, classifier_fn=classifier_fn, top_labels=1, num_samples=1000,
            segmentation_fn=lambda x: slic(x, n_segments=60, compactness=9, start_label=0)
        )
        temp, mask = exp.get_image_and_mask(pred_idx, positive_only=True,
                                            num_features=8, hide_rest=False)
        lime_img = mark_boundaries(temp/255.0, mask)

    st.image(lime_img, caption=f"LIME highlights regions important for '{pred_label}'")
    st.info("""
    **How to read:** Bright (or colored) segments show areas the model relied on most for its prediction – these are the "superpixels" that, when altered, change the output the most. 
    Green/red overlays often indicate positive/negative contributions. If irrelevant background or edges light up, it might signal the model learned spurious correlations (e.g., from training data artifacts). 
    Furthermore, this builds trust by showing if AI decisions align with human intuition.
    """)

    # ---------------- LIME Uncertainty ----------------
    st.subheader("📊 LIME Attribution Uncertainty")
    st.markdown("""
    Uncertainty in explanations arises because LIME is stochastic – it samples perturbations randomly. By running LIME multiple times, we can measure variability in attributions, 
    highlighting if the model's reasoning is consistent or fragile for this image. High variability suggests the explanation (and thus model confidence) isn't robust.
    """)
    logger.info("Starting LIME uncertainty runs (n=5)")
    maps = []
    for i in range(5):
        logger.debug("LIME run %d", i+1)
        exp = explainer.explain_instance(
            img_np, classifier_fn=classifier_fn, top_labels=1, num_samples=500,
            segmentation_fn=lambda x: slic(x, n_segments=60, compactness=9, start_label=0)
        )
        local_exp = dict(exp.local_exp)[pred_idx]
        segments = exp.segments
        attr_map = np.zeros(segments.shape)
        for seg_id, weight in local_exp:
            attr_map[segments == seg_id] = weight
        maps.append(attr_map)
    maps = np.stack(maps)
    mean_attr, std_attr = maps.mean(0), maps.std(0)

    fig, ax = plt.subplots(1,2, figsize=(8,4))
    im1 = ax[0].imshow(mean_attr, cmap="jet"); ax[0].set_title("Mean attribution"); ax[0].axis("off")
    plt.colorbar(im1, ax=ax[0], fraction=0.046)
    im2 = ax[1].imshow(std_attr, cmap="hot"); ax[1].set_title("Attribution std (uncertainty)"); ax[1].axis("off")
    plt.colorbar(im2, ax=ax[1], fraction=0.046)
    st.pyplot(fig)
    st.info("""
    **How to read:** The left heatmap shows average importance across runs (hotter = more influential). The right shows standard deviation – high std (yellow/red) means unstable explanations for those regions. 
    If uncertainty is high in key areas, the model might overfit or need more diverse training data. This helps ML practitioners quantify explanation reliability.
    """)
    logger.info("Completed LIME uncertainty runs")

    # ---------------- MC Dropout ----------------
    st.subheader("🎲 MC Dropout Uncertainty")
    st.markdown("""
    Monte Carlo (MC) Dropout treats dropout layers (normally off during inference) as a Bayesian approximation to estimate epistemic uncertainty – how much the model "doesn't know" due to limited training. 
    By enabling dropout and sampling predictions multiple times, we see if the model consistently agrees on the class or wavers, indicating potential unreliability.
    """)
    logger.info("Starting MC Dropout sampling")
    model.train()  # enable dropout
    mc_preds = []
    with torch.no_grad():
        for _ in range(30):
            probs_mc = predict_probs(img)
            mc_preds.append(probs_mc)
    model.eval()
    mc_preds = np.stack(mc_preds)
    mc_mean = mc_preds.mean(0)
    mc_top = mc_mean.argmax()
    if is_hf_model:
        mc_label = model.config.id2label[mc_top] 
    elif st.session_state.get('class_map'):
        _class_map = st.session_state.get('class_map')
        mc_label = _class_map.get(str(mc_top), f"Class {mc_top}") if _class_map is not None else f"Class {mc_top}"
    else:
        mc_label = f"Class {mc_top}"
    p = mc_preds[:, mc_top]

    fig, ax = plt.subplots()
    ax.hist(p, bins=15, color="C0")
    ax.set_title(f"MC Dropout: p({mc_label}) across samples")
    st.pyplot(fig)
    st.info("""
    **How to read:** This histogram shows probability distributions for the top class across 30 samples. A narrow, peaked distribution means stable confidence (low uncertainty). 
    A wide spread or multiple modes suggests the model is unsure, possibly due to out-of-distribution inputs. For devs, this flags cases needing human review; it highlights risky predictions.
    """)
    logger.info("Completed MC Dropout: top=%s", mc_label)

    # ---------------- Test-Time Augmentation (TTA) Uncertainty ----------------
    st.subheader("🔄 Test-Time Augmentation (TTA) Uncertainty")
    st.markdown("""
    Test-Time Augmentation (TTA) applies random transformations (crops, flips) at inference to probe aleatoric uncertainty – noise inherent in the input or model. 
    If predictions vary wildly under small changes, the model relies on brittle features, revealing data-related issues rather than model knowledge gaps.
    """)
    logger.info("Starting TTA sampling")
    tta_tfms = T.Compose([T.Resize(256), T.RandomResizedCrop(224, scale=(0.9,1.0)), T.RandomHorizontalFlip(p=0.5)])
    tta_preds = []
    with torch.no_grad():
        for _ in range(20):
            aug = tta_tfms(img)
            probs_tta = predict_probs(aug)
            tta_preds.append(probs_tta)
    tta_preds = np.stack(tta_preds)
    tta_mean = tta_preds.mean(0)
    tta_top = tta_mean.argmax()
    if is_hf_model:
        tta_label = model.config.id2label[tta_top]
    elif st.session_state.get('class_map'):
        _class_map = st.session_state.get('class_map')
        tta_label = _class_map.get(str(tta_top), f"Class {tta_top}") if _class_map is not None else f"Class {tta_top}"
    else:
        tta_label = f"Class {tta_top}"
    p_tta = tta_preds[:, tta_top]

    fig, ax = plt.subplots()
    ax.hist(p_tta, bins=15, color="C1")
    ax.set_title(f"TTA: p({tta_label}) across augmentations")
    st.pyplot(fig)
    st.info("""
    **How to read:** Similar to MC Dropout, but focused on input variations. Low variance means the prediction is robust to perturbations (good sign). High variance indicates sensitivity to details like lighting/position, 
    common in overfitted models. Use this to assess if your AI system handles real-world variability well.
    """)
    logger.info("Completed TTA: top=%s", tta_label)
# ---------------- Summary ----------------